# Dinosaur Project

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

# Inital Data Exploration & Cleaning
A high level walk though of the dataset's contents and cleaning.

## Exploration

In [18]:
#read in data 
df = pd.read_csv('dinosaur.csv') 
df

Unnamed: 0,Name,Period,Diet,Country
0,Aardonyx,Jurassic,herbivore,South Africa
1,Abelisaurus,Cretaceous,carnivore,South America
2,Abrictosaurus,Jurassic,herbivore,South Africa
3,Abrosaurus,Jurassic,herbivore,China
4,Abydosaurus,Cretaceous,herbivore,North America
...,...,...,...,...
1149,Zizhongosaurus,Jurassic,herbivore,China
1150,Zuniceratops,Cretaceous,herbivore,North America
1151,Zuolong,Jurassic,carnivore,China
1152,Zupaysaurus,Triassic,carnivore,South America


In [19]:
#nan value check
df.isna().sum()

Name       0
Period     0
Diet       0
Country    0
dtype: int64

In [20]:
#unique values in each column count
print('Number of unique values in the Name Column: ', df['Name'].nunique()) 
print('Number of unique values in the Period Column: ', 
      df['Period'].nunique()) 
print('Number of unique values in the Diet Column: ', df['Diet'].nunique()) 
print('Number of unique values in the Country Column: ', 
      df['Country'].nunique())

Number of unique values in the Name Column:  1143
Number of unique values in the Period Column:  14
Number of unique values in the Diet Column:  16
Number of unique values in the Country Column:  104


In [21]:
df['Period'].unique().tolist()

['Jurassic',
 'Cretaceous\xa0\xa0\xa0\xa0\xa0\xa0\xa0',
 'Cretaceous',
 'Triassic',
 'Jurassic/Cretaceous',
 'Late Triassic',
 'Late Cretaceous',
 'Triassic or Jurassic',
 'Middle Jurassic',
 'Early Cretaceous',
 'Early-Late Cretaceous',
 'Early Jurassic',
 'Triassic/Jurassic',
 '(unknown)']

In [23]:
df['Diet'].unique().tolist()

['herbivore',
 'carnivore',
 'carnivore/insectivore',
 '(herbivore)',
 'omnivore',
 'herbivore/omnivore',
 '(unknown)',
 '(carnivore)',
 'unknown',
 'Carnivore',
 'omnivorous',
 '?',
 'Herbivore',
 'carnivore/omnivore',
 'carnivore?',
 'insectivore']

In [24]:
df['Country'].unique().tolist()

['South Africa',
 'South America',
 'China',
 'North America',
 'England',
 'Mongolia',
 'Morocco',
 'Spain',
 'Egypt, Niger',
 'France',
 'South Africa, Lesotho, Zimbabwe',
 'Niger',
 'Hungary',
 'Japan',
 'Mongolia (& China?)',
 'Tanzania, USA, & Portugal',
 'Portugal, USA, & Tanzania',
 'Portugal',
 'India',
 'Russia & China',
 'Angola',
 'Antarctica',
 'Kazakhstan',
 'Madagascar',
 'Germany',
 'Russia',
 'Mongolia & China',
 'China & Mongolia',
 'Australia',
 'China & South Korea',
 'Tanzania',
 'Egypt,Niger, Morocco, Algeria',
 'Romania',
 'Pakistan',
 'Spain, Portugal, England',
 'Netherlands',
 'Uzbekistan',
 'Czech Republic',
 'Egypt, Niger, Algeria, Morocco',
 'Portugal & USA',
 'England, France, Switzerland, Morocco',
 'China, Russia, & Mongolia(?)',
 'Algeria',
 'USA, South Africa, & Zimbabwe',
 'Germany & France',
 'Belgium',
 'England, France, Spain, Portugal',
 'South Korea',
 'Morocco, Algeria, Egypt',
 'Denmark',
 'Portugal & Uzbekistan',
 'South Africa, Zimbabwe, Lesot

The data set contains: 
* 1154 rows 
* 4 columns - 'Name', 'Period', 'Diet', 'Country' 
* Name is a unique identifier that does not repeat 
* Period is a categorical column that contains either delineations within either the Jurassic, Cretaceous or Triassic Periods 
* Diet is a categorical column containing either herbivore, carnivore, omnivore, insectivore
* Country contains a mixture of countries, continents and text data. Further examination is needed.

No nan values, but unknown values, ? or duplicate but differently formatted data that will need to be identified and cleaned

## Cleaning 

In [34]:
#fix period column 
df['Period'] = df['Period'].replace('Cretaceous\xa0\xa0\xa0\xa0\xa0\xa0\xa0',
                                    'Cretaceous') 

#identified through external research 
df['Period'] = df['Period'].replace('(unknown)',
                                    'Middle Jurassic')

In [35]:
df['Period'].unique().tolist()

['Jurassic',
 'Cretaceous',
 'Triassic',
 'Jurassic/Cretaceous',
 'Late Triassic',
 'Late Cretaceous',
 'Triassic or Jurassic',
 'Middle Jurassic',
 'Early Cretaceous',
 'Early-Late Cretaceous',
 'Early Jurassic',
 'Triassic/Jurassic']

In [40]:
#fix diet column 
df['Diet'] = df['Diet'].astype(str).apply(lambda x: x.strip('(')
                                          .strip(')').lower()) 

Both dinosaurs listed in the dataset with a diet of ? have very little information (singular skeleton) and have been reclassified, both are believed to be carnivores, for this project I will classify them as such. 

In [47]:
#identified through external research 
df['Diet'] = df['Diet'].replace('?', 'carnivore').replace('carnivore?',
                                                          'carnivore')

In [56]:
#Lihkoelesauus - this is apparently an unconfirmed dinosaur so I will drop it
df = df.drop([596], axis=0)

KeyError: '[596] not found in axis'

In [57]:
#externally researched classifications 
herbivores = [138, 237] 
carnivores = [395, 593, 916, 1036]
insectivore= [1108, 1143] 

for dinosaur in herbivores: 
    df.loc[dinosaur, 'Diet'] = 'herbivore'  
    
for dinosaur in carnivores: 
    df.loc[dinosaur, 'Diet'] = 'carnivore'  
    
    
for dinosaur in insectivore: 
    df.loc[dinosaur, 'Diet'] = 'insectivore' 

Finally I will reformat omnivorous to omnivore and generalize the categories if a dinosaur is both a herbivore and an omnivore then it is more generally able to be classified as an omnivore.

In [61]:
#identified through external research 
df['Diet'] = df['Diet'].replace('omnivorous', 
                                'omnivore').replace('carnivore/insectivore',
                                                   'carnivore') 
df['Diet'] = df['Diet'].replace('herbivore/omnivore', 
                                'omnivore').replace('carnivore/omnivore',
                                                   'carnivore')