# Pandas

In [34]:
import pandas as pd

First, let's import our dataset

In [35]:
df = pd.read_csv('data/0101_Pandas_Basics/gapminder.tsv', sep='\t')

In [36]:
type(df)

pandas.core.frame.DataFrame

In [37]:
df.shape

(1704, 6)

In pandas, different columns can have different types, but all the values within a single column share one type. If a column's dtype is shown as `object`, it typically means string data; in a larger data pipeline, however, it could hold any generic Python object, for example another data frame or a request object.

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [39]:
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [40]:
df.tail()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


A data frame has 3 essential elements. The first is the index: the leftmost column, which looks like the row number. Across the top are the column names. Finally, the body holds the actual values.

In [41]:
df.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [42]:
df.index

RangeIndex(start=0, stop=1704, step=1)

In [43]:
# The body of the frame as a NumPy array. `to_numpy()` is the accessor pandas
# recommends over the older `.values` attribute; with mixed column types the
# result is an object-dtype array, as the output below shows.
df.to_numpy()

array([['Afghanistan', 'Asia', 1952, 28.801, 8425333, 779.4453145],
       ['Afghanistan', 'Asia', 1957, 30.331999999999997, 9240934,
        820.8530296],
       ['Afghanistan', 'Asia', 1962, 31.997, 10267083, 853.1007099999999],
       ...,
       ['Zimbabwe', 'Africa', 1997, 46.809, 11404948, 792.4499602999999],
       ['Zimbabwe', 'Africa', 2002, 39.989000000000004, 11926563,
        672.0386227000001],
       ['Zimbabwe', 'Africa', 2007, 43.486999999999995, 12311143,
        469.70929810000007]], dtype=object)

In [44]:
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

## Subsetting

In [45]:
countries = df['country']
countries

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object

In [46]:
type(countries)

pandas.core.series.Series

Each data frame column is essentially a pandas Series, and each Series is essentially a NumPy array with an index attached.

In [47]:
countries_df = df[['country']]
countries_df

Unnamed: 0,country
0,Afghanistan
1,Afghanistan
2,Afghanistan
3,Afghanistan
4,Afghanistan
...,...
1699,Zimbabwe
1700,Zimbabwe
1701,Zimbabwe
1702,Zimbabwe


In [48]:
type(countries_df)

pandas.core.frame.DataFrame

In [49]:
df.drop(['continent', 'country'], axis='columns')

Unnamed: 0,year,lifeExp,pop,gdpPercap
0,1952,28.801,8425333,779.445314
1,1957,30.332,9240934,820.853030
2,1962,31.997,10267083,853.100710
3,1967,34.020,11537966,836.197138
4,1972,36.088,13079460,739.981106
...,...,...,...,...
1699,1987,62.351,9216418,706.157306
1700,1992,60.377,10704340,693.420786
1701,1997,46.809,11404948,792.449960
1702,2002,39.989,11926563,672.038623


Dropping columns doesn't change the original data frame unless you specify the `inplace` parameter. This is to prevent accidental mutations to the original data frame.

In [50]:
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


To subset rows

In [51]:
df.loc[0]

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 0, dtype: object

`loc` doesn't look for the 1st and 2nd elements by position; rather, it matches against the index labels. Here, it returns the rows whose index label is 0 or 1.

In [52]:
df.loc[[0, 1]]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303


For the same reason, it does not accept negative values here: -1 is not a label in this index.

In [53]:
# Deliberately failing example. `loc` looks rows up by index label, so -1 is
# treated as a label rather than "the last row". This frame's index is
# RangeIndex(start=0, stop=1704), which has no label -1, hence the KeyError
# that is caught and reported here.
try:
    df.loc[-1]
except KeyError:
    print('KeyError')

KeyError


If you want to select by actual position instead (including negative positions counted from the end), you can use `iloc`.

In [54]:
df.iloc[-1]

country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object

In [55]:
df.loc[:, ['year']].head()

Unnamed: 0,year
0,1952
1,1957
2,1962
3,1967
4,1972


In [56]:
# Deliberately failing example. `iloc` is purely positional, so passing the
# column label 'year' instead of an integer position is rejected with an
# IndexError. The result is not assigned to a name because the lookup never
# succeeds.
try:
    df.iloc[:, ['year']]
except IndexError:
    print('IndexError')

IndexError


In [57]:
df.iloc[:, [2]].head()

Unnamed: 0,year
0,1952
1,1957
2,1962
3,1967
4,1972


In [58]:
df.loc[df['country'] == 'Philippines']

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1212,Philippines,Asia,1952,47.752,22438691,1272.880995
1213,Philippines,Asia,1957,51.334,26072194,1547.944844
1214,Philippines,Asia,1962,54.757,30325264,1649.552153
1215,Philippines,Asia,1967,56.393,35356600,1814.12743
1216,Philippines,Asia,1972,58.065,40850141,1989.37407
1217,Philippines,Asia,1977,60.06,46850962,2373.204287
1218,Philippines,Asia,1982,62.082,53456774,2603.273765
1219,Philippines,Asia,1987,64.151,60017788,2189.634995
1220,Philippines,Asia,1992,66.458,67185766,2279.324017
1221,Philippines,Asia,1997,68.564,75012988,2536.534925


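If you just care about subsetting columns, use square brackets, but if you want to subset rows and columns, use `loc`. You can also combine several conditions into one boolean mask. Pandas uses the bitwise operators (`&`, `|`) for this, and they bind more tightly than comparisons such as `==`, so each condition must be wrapped in parentheses. Without the parentheses, the expression below raises a `TypeError`.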

In [59]:
# Deliberately failing example. `&` binds more tightly than `==`, so without
# parentheses Python evaluates `'Philippines' & df['year']` first; combining a
# string with the `year` column that way is what raises the TypeError caught
# below.
try:
    df.loc[df['country'] == 'Philippines' & df['year'] == 1982]
except TypeError:
    print('TypeError')

TypeError


In [60]:
df.loc[(df['country'] == 'Philippines') & (df['year'] == 1982)]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1218,Philippines,Asia,1982,62.082,53456774,2603.273765


## Aggregate Functions

In [61]:
# Select the numeric columns explicitly before aggregating. Older pandas
# silently dropped the string columns (country, continent) when computing the
# mean; current releases raise a TypeError instead. The result is the same
# table: one row per year with the mean of each numeric column.
df.groupby('year')[['lifeExp', 'pop', 'gdpPercap']].agg('mean')

Unnamed: 0_level_0,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,49.05762,16950400.0,3725.276046
1957,51.507401,18763410.0,4299.408345
1962,53.609249,20421010.0,4725.812342
1967,55.67829,22658300.0,5483.653047
1972,57.647386,25189980.0,6770.082815
1977,59.570157,27676380.0,7313.166421
1982,61.533197,30207300.0,7518.901673
1987,63.212613,33038570.0,7900.920218
1992,64.160338,35990920.0,8158.608521
1997,65.014676,38839470.0,9090.175363


In [62]:
import numpy as np
# `agg` also accepts a callable such as np.mean in place of the string 'mean'.
# The numeric columns are selected explicitly because current pandas no longer
# drops the string columns (country, continent) automatically and would raise
# a TypeError on them.
# NOTE(review): recent pandas releases emit a FutureWarning for NumPy
# callables here and recommend the string alias -- confirm against the
# installed version.
df.groupby('year')[['lifeExp', 'pop', 'gdpPercap']].agg(np.mean)

Unnamed: 0_level_0,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,49.05762,16950400.0,3725.276046
1957,51.507401,18763410.0,4299.408345
1962,53.609249,20421010.0,4725.812342
1967,55.67829,22658300.0,5483.653047
1972,57.647386,25189980.0,6770.082815
1977,59.570157,27676380.0,7313.166421
1982,61.533197,30207300.0,7518.901673
1987,63.212613,33038570.0,7900.920218
1992,64.160338,35990920.0,8158.608521
1997,65.014676,38839470.0,9090.175363


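Grouping by more than one column returns a result with a hierarchical index (one level per grouping column). Call `reset_index()` to turn those levels back into regular columns.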

In [63]:
df.groupby(['year', 'continent'])[['lifeExp', 'gdpPercap']].agg(np.mean)

Unnamed: 0_level_0,Unnamed: 1_level_0,lifeExp,gdpPercap
year,continent,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,Africa,39.1355,1252.572466
1952,Americas,53.27984,4079.062552
1952,Asia,46.314394,5195.484004
1952,Europe,64.4085,5661.057435
1952,Oceania,69.255,10298.08565
1957,Africa,41.266346,1385.236062
1957,Americas,55.96028,4616.043733
1957,Asia,49.318544,5787.73294
1957,Europe,66.703067,6963.012816
1957,Oceania,70.295,11598.522455


In [64]:
df.groupby(['year', 'continent'])[['lifeExp', 'gdpPercap']].agg(np.mean).reset_index()

Unnamed: 0,year,continent,lifeExp,gdpPercap
0,1952,Africa,39.1355,1252.572466
1,1952,Americas,53.27984,4079.062552
2,1952,Asia,46.314394,5195.484004
3,1952,Europe,64.4085,5661.057435
4,1952,Oceania,69.255,10298.08565
5,1957,Africa,41.266346,1385.236062
6,1957,Americas,55.96028,4616.043733
7,1957,Asia,49.318544,5787.73294
8,1957,Europe,66.703067,6963.012816
9,1957,Oceania,70.295,11598.522455


In [65]:
import seaborn as sns

tips = sns.load_dataset('tips')

In [66]:
# Filter rows by smoker == 'No' and total_bill >= 10
tips.loc[(tips['smoker'] == 'No') & (tips['total_bill'] >= 10)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
235,10.07,1.25,Male,No,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
242,17.82,1.75,Male,No,Sat,Dinner,2


In [67]:
# Mean total bill for every smoker/day/time combination.
# The NaN rows in the output indicate the grouping keys are categorical and
# that unobserved combinations are kept. `observed=False` is spelled out so
# that this stays true (and no deprecation warning is raised) on pandas
# versions where the default is changing.
tips.groupby(['smoker', 'day', 'time'], observed=False)['total_bill'].agg('mean').reset_index()

Unnamed: 0,smoker,day,time,total_bill
0,Yes,Thur,Lunch,19.190588
1,Yes,Thur,Dinner,
2,Yes,Fri,Lunch,12.323333
3,Yes,Fri,Dinner,19.806667
4,Yes,Sat,Lunch,
5,Yes,Sat,Dinner,21.276667
6,Yes,Sun,Lunch,
7,Yes,Sun,Dinner,24.12
8,No,Thur,Lunch,17.075227
9,No,Thur,Dinner,18.78


## Data Cleansing

In [68]:
df = pd.read_csv('data/0101_Pandas_Basics/pew.csv')
df.head()

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
0,Agnostic,27,34,60,81,76,137,122,109,84,96
1,Atheist,12,27,37,52,35,70,73,59,74,76
2,Buddhist,27,21,30,34,33,58,62,39,53,54
3,Catholic,418,617,732,670,638,1116,949,792,633,1489
4,Don’t know/refused,15,14,15,11,10,35,21,17,18,116


In [69]:
df.melt(id_vars=['religion'])

Unnamed: 0,religion,variable,value
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15
...,...,...,...
175,Orthodox,Don't know/refused,73
176,Other Christian,Don't know/refused,18
177,Other Faiths,Don't know/refused,71
178,Other World Religions,Don't know/refused,8


In [70]:
df.melt(id_vars=['religion'], var_name='income', value_name='count')

Unnamed: 0,religion,income,count
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15
...,...,...,...
175,Orthodox,Don't know/refused,73
176,Other Christian,Don't know/refused,18
177,Other Faiths,Don't know/refused,71
178,Other World Religions,Don't know/refused,8


In [71]:
df = pd.read_csv('data/0101_Pandas_Basics/billboard.csv')
df.head()

Unnamed: 0,year,artist,track,time,date.entered,wk1,wk2,wk3,wk4,wk5,...,wk67,wk68,wk69,wk70,wk71,wk72,wk73,wk74,wk75,wk76
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,87,82.0,72.0,77.0,87.0,...,,,,,,,,,,
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,91,87.0,92.0,,,...,,,,,,,,,,
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,81,70.0,68.0,67.0,66.0,...,,,,,,,,,,
3,2000,3 Doors Down,Loser,4:24,2000-10-21,76,76.0,72.0,69.0,67.0,...,,,,,,,,,,
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,57,34.0,25.0,17.0,17.0,...,,,,,,,,,,


In [72]:
df.melt(id_vars=['year', 'artist', 'track', 'time', 'date.entered'], value_name='rank', var_name='week').head(3)

Unnamed: 0,year,artist,track,time,date.entered,week,rank
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,wk1,91.0
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,wk1,81.0


In [73]:
(df
 .melt(id_vars=['year', 'artist', 'track', 'time', 'date.entered'], value_name='rank', var_name='week')
 .groupby('artist')['rank']
 .mean()
)

artist
2 Pac               85.428571
2Ge+her             90.000000
3 Doors Down        37.602740
504 Boyz            56.222222
98^0                37.650000
                      ...    
Yankee Grey         83.125000
Yearwood, Trisha    84.166667
Ying Yang Twins     88.857143
Zombie Nation       99.000000
matchbox twenty     18.641026
Name: rank, Length: 228, dtype: float64

In [74]:
df = pd.read_csv('data/0101_Pandas_Basics/ebola.csv')
df.head()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,


In [75]:
df_long = df.melt(id_vars=['Date', 'Day'], var_name='Country', value_name='Count')
df_long.head()

Unnamed: 0,Date,Day,Country,Count
0,1/5/2015,289,Cases_Guinea,2776.0
1,1/4/2015,288,Cases_Guinea,2775.0
2,1/3/2015,287,Cases_Guinea,2769.0
3,1/2/2015,286,Cases_Guinea,
4,12/31/2014,284,Cases_Guinea,2730.0


In [76]:
df_long['Country'].str.split('_', expand=True)

Unnamed: 0,0,1
0,Cases,Guinea
1,Cases,Guinea
2,Cases,Guinea
3,Cases,Guinea
4,Cases,Guinea
...,...,...
1947,Deaths,Mali
1948,Deaths,Mali
1949,Deaths,Mali
1950,Deaths,Mali


In [77]:
# Split labels such as 'Cases_Guinea' into a status part and a country part.
# n=1 splits on the first underscore only, so the result always has exactly
# two columns even if a country name were to contain an underscore itself.
# Note: this overwrites `Country` in place, so re-running this cell on its own
# fails; re-run the melt cell that builds `df_long` first.
df_long[['Status', 'Country']] = df_long['Country'].str.split('_', n=1, expand=True)
df_long.head()

Unnamed: 0,Date,Day,Country,Count,Status
0,1/5/2015,289,Guinea,2776.0,Cases
1,1/4/2015,288,Guinea,2775.0,Cases
2,1/3/2015,287,Guinea,2769.0,Cases
3,1/2/2015,286,Guinea,,Cases
4,12/31/2014,284,Guinea,2730.0,Cases


In [78]:
df = pd.read_csv('data/0101_Pandas_Basics/table2.csv')
df.head()

Unnamed: 0,country,year,type,count
0,Afghanistan,1999,cases,745
1,Afghanistan,1999,population,19987071
2,Afghanistan,2000,cases,2666
3,Afghanistan,2000,population,20595360
4,Brazil,1999,cases,37737


In [79]:
# Separate type into columns
df.pivot_table(index=['country', 'year'], columns='type', values='count').reset_index()

type,country,year,cases,population
0,Afghanistan,1999,745,19987071
1,Afghanistan,2000,2666,20595360
2,Brazil,1999,37737,172006362
3,Brazil,2000,80488,174504898
4,China,1999,212258,1272915272
5,China,2000,213766,1280428583


In [80]:
df = pd.read_csv('data/0101_Pandas_Basics/table3.csv')
df.head()

Unnamed: 0,country,year,rate
0,Afghanistan,1999,745/19987071
1,Afghanistan,2000,2666/20595360
2,Brazil,1999,37737/172006362
3,Brazil,2000,80488/174504898
4,China,1999,212258/1272915272


In [81]:
# `rate` holds text of the form 'cases/population'. Take the part after the
# slash and cast it to an integer so `population` is a numeric column rather
# than text (int64 is wide enough for the values in this table).
df['population'] = df['rate'].str.split('/', expand=True)[1].astype('int64')
df

Unnamed: 0,country,year,rate,population
0,Afghanistan,1999,745/19987071,19987071
1,Afghanistan,2000,2666/20595360,20595360
2,Brazil,1999,37737/172006362,172006362
3,Brazil,2000,80488/174504898,174504898
4,China,1999,212258/1272915272,1272915272
5,China,2000,213766/1280428583,1280428583


In [84]:
df = pd.DataFrame([1,2,3,4,5,6], columns=['value'])
df

Unnamed: 0,value
0,1
1,2
2,3
3,4
4,5
5,6


In [85]:
def raise_to(x, e):
    """Return ``x`` raised to the power ``e``.

    Suitable for ``Series.apply``, which passes each element as ``x`` and
    forwards extra keyword arguments such as ``e``.
    """
    return pow(x, e)

df['value'].apply(raise_to, e=2)

0     1
1     4
2     9
3    16
4    25
5    36
Name: value, dtype: int64