In [4]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [5]:
# Load the two input tables from CSV files located relative to the working directory.
species = pd.read_csv('species_info.csv')
observations = pd.read_csv('observations.csv')

In [6]:
# Preview the first rows of the species table.
species.head()

Unnamed: 0,category,scientific_name,common_names,conservation_status
0,Mammal,Clethrionomys gapperi gapperi,Gapper's Red-Backed Vole,
1,Mammal,Bos bison,"American Bison, Bison",
2,Mammal,Bos taurus,"Aurochs, Aurochs, Domestic Cattle (Feral), Dom...",
3,Mammal,Ovis aries,"Domestic Sheep, Mouflon, Red Sheep, Sheep (Feral)",
4,Mammal,Cervus elaphus,Wapiti Or Elk,


In [7]:
# Summary statistics for each column of the species table as loaded.
species.describe()

Unnamed: 0,category,scientific_name,common_names,conservation_status
count,5824,5824,5824,191
unique,7,5541,5504,4
top,Vascular Plant,Castor canadensis,Brachythecium Moss,Species of Concern
freq,4470,3,7,161


In [8]:
# Distinct values found in the category column.
species['category'].unique()

array(['Mammal', 'Bird', 'Reptile', 'Amphibian', 'Fish', 'Vascular Plant',
       'Nonvascular Plant'], dtype=object)

In [9]:
# Distinct values found in the conservation_status column (missing values included).
species['conservation_status'].unique()

array([nan, 'Species of Concern', 'Endangered', 'Threatened',
       'In Recovery'], dtype=object)

The IUCN Red List of Threatened Species has a "Least Concern" category. A species is considered Least Concern if it is not threatened with extinction and is widespread and abundant.

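The NatureServe Conservation Status Ranks also have a "Secure" category. A species is considered Secure if it has a very low risk of extinction and abundant populations. In this notebook, species with no recorded `conservation_status` are assumed to belong to this not-at-risk group, so the missing values are filled with "Least Concern" in the next cell.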

In [10]:
# Label species without a recorded status as 'Least Concern'.
# The fill is scoped to conservation_status: a frame-wide fillna would also write
# 'Least Concern' into any other column that happened to contain a missing value.
species = species.fillna({'conservation_status': 'Least Concern'})
species.head()

Unnamed: 0,category,scientific_name,common_names,conservation_status
0,Mammal,Clethrionomys gapperi gapperi,Gapper's Red-Backed Vole,Least Concern
1,Mammal,Bos bison,"American Bison, Bison",Least Concern
2,Mammal,Bos taurus,"Aurochs, Aurochs, Domestic Cattle (Feral), Dom...",Least Concern
3,Mammal,Ovis aries,"Domestic Sheep, Mouflon, Red Sheep, Sheep (Feral)",Least Concern
4,Mammal,Cervus elaphus,Wapiti Or Elk,Least Concern


In [13]:
# Rows whose common_names value already appeared on an earlier row.
# duplicated() defaults to keep='first', so the first occurrence of each name is not listed.
species[species.duplicated(subset=['common_names'])]
# Drill-down example for inspecting every row that shares one common name:
# species[species.common_names == 'Marsh Rice Rat']

Unnamed: 0,category,scientific_name,common_names,conservation_status
59,Mammal,Oryzomys palustris palustris,Marsh Rice Rat,Least Concern
124,Bird,Melanitta fusca,White-Winged Scoter,Least Concern
243,Bird,Anthus spinoletta,Water Pipit,Least Concern
278,Bird,Vermivora chrysoptera X pinus,Brewster's Warbler,Least Concern
280,Bird,Vermivora lawrencii,Lawrence's Warbler,Least Concern
...,...,...,...,...
5753,Vascular Plant,Rosa woodsii var. ultramontana,Woods' Rose,Least Concern
5757,Vascular Plant,Rubus parviflorus,"Thimbleberry, Western Thimbleberry",Least Concern
5788,Vascular Plant,Ribes viscosissimum,Sticky Currant,Least Concern
5795,Vascular Plant,Heuchera rubescens var. glandulosa,Pink Alumroot,Least Concern


In [14]:
# Rows whose scientific_name already appeared on an earlier row.
# duplicated() defaults to keep='first', so the first occurrence of each name is not listed.
species[species.duplicated(subset=['scientific_name'])]
# Drill-down examples for inspecting all rows of a particular species:
# species[species.scientific_name.str.contains('Odocoileus virginianus')]
# species[species.common_names.str.contains('Elk')]

Unnamed: 0,category,scientific_name,common_names,conservation_status
3017,Mammal,Cervus elaphus,Rocky Mountain Elk,Least Concern
3019,Mammal,Odocoileus virginianus,"White-Tailed Deer, White-Tailed Deer",Least Concern
3020,Mammal,Canis lupus,"Gray Wolf, Wolf",In Recovery
3022,Mammal,Puma concolor,"Cougar, Mountain Lion, Puma",Least Concern
3025,Mammal,Lutra canadensis,River Otter,Least Concern
...,...,...,...,...
5619,Vascular Plant,Panicum rigidulum var. rigidulum,"Redtop Panicgrass, Redtop Panicum",Least Concern
5638,Vascular Plant,Setaria pumila,"Cattail Grass, Yellow Bristle Grass, Yellow Br...",Least Concern
5640,Vascular Plant,Vulpia bromoides,"Brome Fescue, Brome Six-Weeks Grass, Desert Fe...",Least Concern
5643,Vascular Plant,Vulpia myuros,"Foxtail Fescue, Rattail Fescue, Rat-Tail Fescu...",Least Concern


In [23]:
# De-duplicate by reassignment instead of inplace=True, so the cell builds a new
# frame rather than mutating the object that earlier cells displayed.
# drop_duplicates keeps the first row of every group by default.
# NOTE(review): the common_names pass also removes rows whose scientific_name is
# distinct but whose common name matches an earlier row, so some species are lost.
# NOTE(review): a dropped duplicate can carry a different conservation_status than
# the row that is kept (e.g. row 3020, Canis lupus, 'In Recovery') - confirm intended.
species = (
    species
    .drop_duplicates('scientific_name')
    .drop_duplicates('common_names')
)
# Sanity check: this should display an empty frame.
species[species.scientific_name.duplicated()]

Unnamed: 0,category,scientific_name,common_names,conservation_status


In [24]:
# Summary statistics for the species table in its current (filled, de-duplicated) state.
species.describe()

Unnamed: 0,category,scientific_name,common_names,conservation_status
count,5229,5229,5229,5229
unique,7,5229,5229,5
top,Vascular Plant,Clethrionomys gapperi gapperi,Gapper's Red-Backed Vole,Least Concern
freq,4086,1,1,5054


In [16]:
# Preview the first rows of the observations table.
observations.head()

Unnamed: 0,scientific_name,park_name,observations
0,Vicia benghalensis,Great Smoky Mountains National Park,68
1,Neovison vison,Great Smoky Mountains National Park,77
2,Prunus subcordata,Yosemite National Park,138
3,Abutilon theophrasti,Bryce National Park,84
4,Githopsis specularioides,Great Smoky Mountains National Park,85


5541

5541

In [48]:
# Count of non-null scientific_name entries recorded for each park.
observations.groupby('park_name')['scientific_name'].count()

park_name
Bryce National Park                    5824
Great Smoky Mountains National Park    5824
Yellowstone National Park              5824
Yosemite National Park                 5824
Name: scientific_name, dtype: int64

In [56]:
# Rows repeating a (scientific_name, park_name) pair already seen on an earlier row.
observations.loc[observations.duplicated(['scientific_name', 'park_name'])]

Unnamed: 0,scientific_name,park_name,observations
483,Agrostis gigantea,Yellowstone National Park,235
490,Agrostis mertensii,Yosemite National Park,128
945,Rumex crispus,Yellowstone National Park,255
1213,Dianthus barbatus,Bryce National Park,110
1259,Riparia riparia,Bryce National Park,91
...,...,...,...
23258,Microtus longicaudus,Yellowstone National Park,244
23259,Agrostis scabra,Bryce National Park,118
23267,Oxalis corniculata,Yosemite National Park,164
23273,Dactylis glomerata,Bryce National Park,89


In [55]:
# All rows for one species/park pair that was reported as duplicated.
observations.query(
    "scientific_name == 'Agrostis gigantea' and park_name == 'Yellowstone National Park'"
)

Unnamed: 0,scientific_name,park_name,observations
449,Agrostis gigantea,Yellowstone National Park,253
483,Agrostis gigantea,Yellowstone National Park,235


In [57]:
# All rows for a second species/park pair that was reported as duplicated.
observations.query(
    "scientific_name == 'Agrostis mertensii' and park_name == 'Yosemite National Park'"
)

Unnamed: 0,scientific_name,park_name,observations
111,Agrostis mertensii,Yosemite National Park,135
490,Agrostis mertensii,Yosemite National Park,128


In [58]:
# Summary statistics for the numeric observations column.
observations.describe()

Unnamed: 0,observations
count,23296.0
mean,142.287904
std,69.890532
min,9.0
25%,86.0
50%,124.0
75%,195.0
max,321.0


In [60]:
# Number of non-null values in each column of the observations table.
observations.count()

scientific_name    23296
park_name          23296
observations       23296
dtype: int64