In [1]:
import pandas as pd

In [4]:
#Read in the CSV file
file = "Resources/zoo.csv"
df = pd.read_csv(file)
df

Unnamed: 0,animal,uniq_id,water_need,gender
0,elephant,1001,500.0,F
1,elephant,1002,600.0,F
2,elephant,1003,550.0,M
3,tiger,1004,300.0,M
4,tiger,1005,320.0,F
5,tiger,1006,330.0,F
6,tiger,1007,290.0,M
7,tiger,1008,310.0,F
8,zebra,1009,200000000.0,F
9,zebra,1010,220.0,M


In [6]:
# Count all the items in the colums. The count "should" be the same for
# all columns, but notice they are not the same.
#This means there is bad data somewhere that we need to clean.
df.count()

animal        23
uniq_id       23
water_need    22
gender        22
dtype: int64

In [11]:
# Remove any roes with a NaN (something that is not a number)
df = df.dropna()
df

Unnamed: 0,animal,uniq_id,water_need,gender
0,elephant,1001,500.0,F
1,elephant,1002,600.0,F
2,elephant,1003,550.0,M
3,tiger,1004,300.0,M
4,tiger,1005,320.0,F
5,tiger,1006,330.0,F
6,tiger,1007,290.0,M
7,tiger,1008,310.0,F
8,zebra,1009,200000000.0,F
9,zebra,1010,220.0,M


In [7]:
# Look at the averages, do they seem reasonable?
# No! the average is really large. There must be a
# really large number somewhere.
df.describe()

Unnamed: 0,uniq_id,water_need
count,23.0,22.0
mean,1055.043478,9091248.0
std,208.923535,42640070.0
min,1001.0,80.0
25%,1006.5,252.5
50%,1012.0,360.0
75%,1017.5,482.5
max,2013.0,200000000.0


In [12]:
# The water_need average is still high. Remove any values outside
# a reasonable range. Say, 10000. 
df = df[df['water_need'] < 10000] 
df

Unnamed: 0,animal,uniq_id,water_need,gender
0,elephant,1001,500.0,F
1,elephant,1002,600.0,F
2,elephant,1003,550.0,M
3,tiger,1004,300.0,M
4,tiger,1005,320.0,F
5,tiger,1006,330.0,F
6,tiger,1007,290.0,M
7,tiger,1008,310.0,F
9,zebra,1010,220.0,M
10,zebra,1011,240.0,F


In [14]:
# Now use describe and see if everything looks OK:
df.describe()

Unnamed: 0,uniq_id,water_need
count,20.0,20.0
mean,1011.2,352.0
std,6.509911,150.633749
min,1001.0,80.0
25%,1005.75,237.5
50%,1011.5,325.0
75%,1016.25,447.5
max,1022.0,600.0


 # Great! Now the data is clean, we can continue...

In [16]:
# Get a list of all the animals
# Note that df['animal'] returns a Pandas Series, which has to be turned into a Python list
animals_series = df['animal']
animals_list = animals_series.tolist()
animals_list

['elephant',
 'elephant',
 'elephant',
 'tiger',
 'tiger',
 'tiger',
 'tiger',
 'tiger',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'lion',
 'lion',
 'lion',
 'lion',
 'kangaroo',
 'kangaroo',
 'kangaroo']

In [17]:
# Get a list of all the animals
# Note that df['animal'] returns a Pandas Series, which has to be turned into a Python list
# Note that df['animal'] is the same as df.animal
animals_series = df.animal
animals_list = animals_series.tolist()
animals_list

['elephant',
 'elephant',
 'elephant',
 'tiger',
 'tiger',
 'tiger',
 'tiger',
 'tiger',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'lion',
 'lion',
 'lion',
 'lion',
 'kangaroo',
 'kangaroo',
 'kangaroo']

In [20]:
# Note that df['animal'] returns a Pandas Series, which has to be turned into a Python list
# Note that df['animal'] is the same as df.animal
# Note you can shorten this to one line
animals_list = df.animal.tolist()
animals_list

['elephant',
 'elephant',
 'elephant',
 'tiger',
 'tiger',
 'tiger',
 'tiger',
 'tiger',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'lion',
 'lion',
 'lion',
 'lion',
 'kangaroo',
 'kangaroo',
 'kangaroo']

In [21]:
# Loop through the list of animals, print them each on a line
for animal in animals_list:
    print(animal)

elephant
elephant
elephant
tiger
tiger
tiger
tiger
tiger
zebra
zebra
zebra
zebra
zebra
zebra
zebra
lion
lion
lion
lion
kangaroo
kangaroo
kangaroo


In [13]:
# Use the Pandas unique() function to get a list of unique animals.
df.animal.unique().tolist()

['elephant', 'tiger', 'zebra', 'lion', 'kangaroo']

In [17]:
# Get the number of differnt animals. Use len()
len(df.animal.unique())

5

In [30]:
# Get a count of gender
df[['gender','animal']].groupby('gender').count()

Unnamed: 0_level_0,animal
gender,Unnamed: 1_level_1
F,12
M,8


In [40]:
# Change all the water needs to 800
df.loc[:,'water_need'] = 600
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,animal,uniq_id,water_need,gender
0,elephant,1001,600,F
1,elephant,1002,600,F
2,elephant,1003,600,M
3,tiger,1004,600,M
4,tiger,1005,600,F
5,tiger,1006,600,F
6,tiger,1007,600,M
7,tiger,1008,600,F
9,zebra,1010,600,M
10,zebra,1011,600,F
