In [1]:
import pandas as pd
from calendar import isleap
import numpy as np

In [2]:
# Read the Gapminder table; the file is tab-separated, hence the explicit separator.
data = pd.read_csv(
    '../data/gapminder.tsv',
    sep='\t',
)

In [3]:
data.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


# Apply

In [18]:
def formula(row):
    """Compute the index for a single record.

    Parameters
    ----------
    row : mapping
        Anything that can be indexed with the keys 'pop' and 'lifeExp'
        (for example one row handed over by ``DataFrame.apply``).

    Returns
    -------
    The 'pop' value divided by one million, plus 20, minus the 'lifeExp' value.
    """
    pop_scaled = row['pop'] / 1000000
    return pop_scaled + 20 - row['lifeExp']

In [10]:
# Vectorised computation of the index: each step operates on whole columns.
data['dawg_index'] = (
    data['pop']
    .div(1000000)
    .add(20)
    .sub(data['lifeExp'])
)

In [11]:
data.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,dawg_index
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,-0.375667
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,-1.091066
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,-1.729917
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,-2.482034
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,-3.00854


In [20]:
# Same index computed row by row: axis=1 passes each row to formula in turn.
data['dawg_index_2'] = data.apply(formula, axis=1)

In [21]:
data.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,dawg_index,dawg_index_2
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,-0.375667,-0.375667
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,-1.091066,-1.091066
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,-1.729917,-1.729917
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,-2.482034,-2.482034
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,-3.00854,-3.00854


- How many observations were recorded in leap years?
- The function `isleap` can be used to determine whether a year is a leap year

In [30]:
# Flag every observation whose year is a leap year (element-wise isleap call).
data['leap'] = data['year'].apply(isleap)

In [31]:
data.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,dawg_index,dawg_index_2,leap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,-0.375667,-0.375667,True
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,-1.091066,-1.091066,False
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,-1.729917,-1.729917,False
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,-2.482034,-2.482034,False
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,-3.00854,-3.00854,True


In [43]:
# NOTE(review): the result of set_index is neither assigned nor displayed (the
# trailing `;` hides it), so `data` appears to keep its original index here —
# confirm whether this cell is still needed.
data.set_index(['country', 'leap']);

In [44]:
# Non-null counts per (continent, leap) group; the trailing `;` suppresses the
# table that would otherwise be displayed.
data.groupby(['continent','leap']).count();

In [24]:
isleap(1999)

False

In [25]:
isleap(2000)

True

In [26]:
isleap(2010)

False

In [27]:
isleap(2020)

True

# Binning

The international poverty line is defined at `$1.9` per day.
That is `$677` per year.

We want to group our observations into three categories:
- POOR: gdpPercap <= 677
- MIDDLE: 677 < gdpPercap <= 2 * 677
- RICH: gdpPercap > 2 * 677

In [62]:
# data.head(10)

In [52]:
# Label each observation by GDP per capita. pd.cut closes intervals on the
# right by default, so a value exactly on a threshold falls in the lower class.
data['wealth'] = pd.cut(
    data['gdpPercap'],
    bins=[0.0, 677, 2 * 677, np.inf],
    labels=['POOR', 'MIDDLE', 'RICH'],
)

In [65]:
# data

In [63]:
# data.head(10)

In [59]:
# Average every numeric column within each wealth class. numeric_only=True
# leaves out the text columns (country, continent): they cannot be averaged,
# and pandas >= 2.0 raises a TypeError instead of dropping them silently.
data.groupby('wealth').mean(numeric_only=True)

Unnamed: 0_level_0,year,lifeExp,pop,gdpPercap,dawg_index,dawg_index_2,leap
wealth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
POOR,1974.57764,43.865437,42123040.0,522.420669,18.257603,18.257603,0.285714
MIDDLE,1978.048387,47.760985,33325280.0,958.571831,5.564294,5.564294,0.267742
RICH,1980.507705,64.457586,27029860.0,9662.327941,-17.427724,-17.427724,0.240876


In [60]:
# Non-null entries per column within each wealth class.
data.groupby(by='wealth').count()

Unnamed: 0_level_0,country,continent,year,lifeExp,pop,gdpPercap,dawg_index,dawg_index_2,leap
wealth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
POOR,161,161,161,161,161,161,161,161,161
MIDDLE,310,310,310,310,310,310,310,310,310
RICH,1233,1233,1233,1233,1233,1233,1233,1233,1233


In [61]:
# Non-null `country` entries per wealth class: one number per class instead of
# a full table.
data.groupby('wealth')['country'].count()

wealth
POOR       161
MIDDLE     310
RICH      1233
Name: country, dtype: int64