In [119]:
# importing relevant libraries, loading datasets and previewing datasets
import pandas as pd
import matplotlib as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Per-park sighting counts: one row per (scientific_name, park_name) record.
observations = pd.read_csv(filepath_or_buffer='observations.csv')
print(observations.head(n=5))

# Species metadata: category, scientific/common names, conservation status.
species_info = pd.read_csv(filepath_or_buffer='species_info.csv')
print(species_info.head(n=5))

            scientific_name                            park_name  observations
0        Vicia benghalensis  Great Smoky Mountains National Park            68
1            Neovison vison  Great Smoky Mountains National Park            77
2         Prunus subcordata               Yosemite National Park           138
3      Abutilon theophrasti                  Bryce National Park            84
4  Githopsis specularioides  Great Smoky Mountains National Park            85
  category                scientific_name  \
0   Mammal  Clethrionomys gapperi gapperi   
1   Mammal                      Bos bison   
2   Mammal                     Bos taurus   
3   Mammal                     Ovis aries   
4   Mammal                 Cervus elaphus   

                                        common_names conservation_status  
0                           Gapper's Red-Backed Vole                 NaN  
1                              American Bison, Bison                 NaN  
2  Aurochs, Aurochs, Domestic 

In [121]:
# Attach species metadata to every observation record and preview the result.
# The join key is spelled out: 'scientific_name' is the only column the two
# frames have in common, so this matches what the implicit merge resolved to.
# NOTE(review): if 'scientific_name' is not unique in species_info, matching
# observation rows are repeated once per duplicate and any per-park sums
# computed from comb_table will be inflated -- confirm uniqueness.
comb_table = pd.merge(
    left=species_info,
    right=observations,
    how='inner',
    on='scientific_name',
)
print(comb_table.head())

  category                scientific_name              common_names  \
0   Mammal  Clethrionomys gapperi gapperi  Gapper's Red-Backed Vole   
1   Mammal  Clethrionomys gapperi gapperi  Gapper's Red-Backed Vole   
2   Mammal  Clethrionomys gapperi gapperi  Gapper's Red-Backed Vole   
3   Mammal  Clethrionomys gapperi gapperi  Gapper's Red-Backed Vole   
4   Mammal                      Bos bison     American Bison, Bison   

  conservation_status                            park_name  observations  
0                 NaN                  Bryce National Park           130  
1                 NaN            Yellowstone National Park           270  
2                 NaN  Great Smoky Mountains National Park            98  
3                 NaN               Yosemite National Park           117  
4                 NaN               Yosemite National Park           128  


In [127]:
# Build final_table: one row per unique species, one column per park holding
# the summed observation count for that species in that park, plus the
# species' category.
parks_name_list = comb_table.park_name.unique()
print(parks_name_list)

# Add one helper column per park to comb_table: the row's observation count
# when the row belongs to that park, otherwise 0. Series.where is vectorized,
# so no Python-level lambda is invoked once per row for every park.
for park in parks_name_list:
    comb_table[park] = comb_table['observations'].where(comb_table['park_name'] == park, 0)

# Derive the aggregation spec from parks_name_list instead of repeating the
# park names by hand, so the table stays in sync with whatever parks are in
# the data. Resulting column order: scientific_name, the parks in
# first-appearance order, then category.
park_sums = {park: 'sum' for park in parks_name_list}
final_table = (
    comb_table
    .groupby('scientific_name')
    .agg({**park_sums, 'category': 'first'})
    .reset_index()
)
print(comb_table.head())
print(final_table.head())

['Bryce National Park' 'Yellowstone National Park'
 'Great Smoky Mountains National Park' 'Yosemite National Park']
  category                scientific_name              common_names  \
0   Mammal  Clethrionomys gapperi gapperi  Gapper's Red-Backed Vole   
1   Mammal  Clethrionomys gapperi gapperi  Gapper's Red-Backed Vole   
2   Mammal  Clethrionomys gapperi gapperi  Gapper's Red-Backed Vole   
3   Mammal  Clethrionomys gapperi gapperi  Gapper's Red-Backed Vole   
4   Mammal                      Bos bison     American Bison, Bison   

  conservation_status                            park_name  observations  \
0                 NaN                  Bryce National Park           130   
1                 NaN            Yellowstone National Park           270   
2                 NaN  Great Smoky Mountains National Park            98   
3                 NaN               Yosemite National Park           117   
4                 NaN               Yosemite National Park           128   



In [129]:
# Split final_table into model inputs (features) and targets (labels), then
# hold out 20% of the species as a test set.
# Features are the per-park count columns, i.e. everything except the species
# identifier and the label. Selecting by name avoids depending on column
# positions.
features = final_table.drop(columns=['scientific_name', 'category'])
labels = final_table['category']
# A fixed random_state makes the shuffle, and therefore the reported accuracy,
# reproducible across kernel restarts.
training_features, testing_features, training_labels, testing_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

In [131]:
# Fit a K-Nearest Neighbors classifier (k = 9) that predicts a species'
# category from its per-park observation counts, then report accuracy on the
# held-out test set.
# NOTE(review): KNN is distance-based and the features are raw counts; if the
# parks have different typical count magnitudes, one park may dominate the
# distance -- consider scaling the features. TODO confirm.
classifier = KNeighborsClassifier(n_neighbors=9)
classifier.fit(training_features, training_labels)
guesses = classifier.predict(testing_features)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
print(accuracy_score(testing_labels, guesses))

0.7709648331830478
