In [1]:
import pandas as pd
import numpy as np
import timeit

In [2]:
# Load the census table; the path is relative to the notebook's working directory.
df = pd.read_csv('datasets/census.csv')
# Preview the first rows to check the columns that the cells below rely on.
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


In [None]:
# The first of the pandas idioms is method chaining: every step returns a
# new object, so the whole pipeline reads top to bottom and `df` itself is
# left untouched.
(
    df
    .where(df['SUMLEV'].eq(50))    # rows failing the condition become all-NaN
    .dropna()                      # ...and are removed here (as is any row holding a NaN)
    .set_index(['STNAME', 'CTYNAME'])
    .rename(columns={'ESTIMATESBASE2010': 'Estimates base 2010'})
)

# ----> 1.2581288990000417

In [None]:
# Compare to the non-idiomatic version: the same three steps, written as
# separate statements that modify a frame in place.
#
# The work is done on a separate copy (`df_counties`) rather than on `df`
# itself. Rebinding and mutating `df` here would drop the state-level rows
# and move STNAME/CTYNAME into the index, which breaks every later cell
# that reads df['STNAME'] and makes this cell fail when run a second time.
# The explicit .copy() also keeps the in-place calls from operating on a
# slice of `df`.
df_counties = df[df['SUMLEV'] == 50].copy()
df_counties.set_index(['STNAME', 'CTYNAME'], inplace = True)
df_counties.rename(columns = {'ESTIMATESBASE2010': 'Estimates base 2010'}, inplace = True)

# ----> 0.08998235999990811

In [4]:
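# The second approach is actually faster, but the
# idiomatic approach is more readable, so it's a
# trade-off. Note that the two versions also filter
# differently (where() + dropna() versus a boolean
# mask), which likely accounts for much of the gap
# rather than the chaining itself.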

In [3]:
# A second idiom is the pandas equivalent of the map()
# function, called applymap(), which passes a function to
# operate over each cell of a DataFrame, with the result
# also being a DataFrame (pandas 2.1+ renames it DataFrame.map())

# To map across all rows of a DataFrame we can use the apply()
# function

# Let's take a dataframe with six columns, each containing
# the population estimate for a given year (2010-2015), and use
# apply() to create two extra columns for the minimum and maximum values
def min_max(row):
    """Summarise one row's yearly population estimates.

    `row` must be indexable by the six POPESTIMATE2010..POPESTIMATE2015
    labels (for example a row handed over by DataFrame.apply).

    Returns a two-entry Series labelled 'min' and 'max', in that order.
    """
    estimate_labels = [
        'POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012',
        'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015',
    ]
    estimates = row[estimate_labels]

    lowest = np.min(estimates)
    highest = np.max(estimates)
    return pd.Series({'min': lowest, 'max': highest})

# axis='columns' hands min_max one row at a time; the Series it returns
# become the rows of a new frame with just `min` and `max` columns.
df.apply(min_max, axis ='columns').head()

Unnamed: 0,min,max
0,4785161,4858979
1,54660,55347
2,183193,203709
3,26489,27341
4,22512,22861


In [4]:
# Here's a modified version that creates and adds the
# columns to the original DataFrame
def min_max(row):
    """Return a copy of `row` extended with 'max' and 'min' entries.

    `row` must be indexable by the six POPESTIMATE2010..POPESTIMATE2015
    labels (for example a row handed over by DataFrame.apply). The largest
    and smallest of those six values are appended, in that order, under the
    labels 'max' and 'min'.

    The caller's `row` object is left unmodified.
    """
    data = row[['POPESTIMATE2010',
                'POPESTIMATE2011',
                'POPESTIMATE2012',
                'POPESTIMATE2013',
                'POPESTIMATE2014',
                'POPESTIMATE2015']]

    # pandas does not support apply() callbacks that mutate the object they
    # are handed, so add the new labels to a private copy instead.
    row = row.copy()
    row['max'] = np.max(data)
    row['min'] = np.min(data)

    return row

# Each returned row carries the two extra labels, so the result has all of
# the original columns followed by `max` and `min`; `df` is not reassigned.
df.apply(min_max, axis = 'columns').head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,max,min
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594,4858979,4785161
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333,55347,54660
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499,203709,183193
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299,27341,26489
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861,22861,22512


In [6]:
# The most popular way to use apply() is with lambdas.
# Despite its name, `rows` holds the column labels that are reduced
# within each row.
rows = [
    'POPESTIMATE2010',
    'POPESTIMATE2011',
    'POPESTIMATE2012',
    'POPESTIMATE2013',
    'POPESTIMATE2014',
    'POPESTIMATE2015',
]

df.apply(lambda record: np.max(record[rows]), axis='columns').head()

0    4858979
1      55347
2     203709
3      27341
4      22861
dtype: int64

In [7]:
# Let's divide the states into four categories: Northeast,
# Midwest, South, and West
_REGION_STATES = {
    'Northeast': [
        'Connecticut', 'Maine', 'Massachusetts', 'New Hampshire',
        'Rhode Island', 'Vermont', 'New York', 'New Jersey', 'Pennsylvania',
    ],
    'Midwest': [
        'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Michigan', 'Minnesota',
        'Missouri', 'Nebraska', 'North Dakota', 'Ohio', 'South Dakota',
        'Wisconsin',
    ],
    'South': [
        'Alabama', 'Arkansas', 'Delaware', 'District of Columbia', 'Florida',
        'Georgia', 'Kentucky', 'Louisiana', 'Maryland', 'Mississippi',
        'North Carolina', 'Oklahoma', 'South Carolina', 'Tennessee', 'Texas',
        'Virginia', 'West Virginia',
    ],
    'West': [
        # Colorado used to be absent from every list and was only labelled
        # 'West' by the catch-all branch; it is now listed explicitly.
        'Alaska', 'Arizona', 'California', 'Colorado', 'Hawaii', 'Idaho',
        'Montana', 'Nevada', 'New Mexico', 'Oregon', 'Utah', 'Washington',
        'Wyoming',
    ],
}

# Flatten once into a state -> region lookup, so each call is a single dict
# access instead of rebuilding four lists and scanning them one by one.
_STATE_TO_REGION = {
    state: region
    for region, states in _REGION_STATES.items()
    for state in states
}


def get_state_region(x):
    """Return the region label for the state name `x`.

    Parameters
    ----------
    x : str
        Full state name exactly as spelled in the lookup table,
        e.g. 'Alabama' or 'District of Columbia'.

    Returns
    -------
    str or None
        'Northeast', 'Midwest', 'South' or 'West' for a listed name.
        None for anything else (misspelling, missing value, territory),
        so bad input shows up as a missing value rather than being
        silently counted as 'West'.
    """
    return _STATE_TO_REGION.get(x)

# Run every state name through get_state_region and keep the answers in a
# new column called State_Region
df['State_Region'] = df['STNAME'].apply(get_state_region)

# Show each state name next to the region derived for it
df[['STNAME', 'State_Region']].head()

Unnamed: 0,STNAME,State_Region
0,Alabama,South
1,Alabama,South
2,Alabama,South
3,Alabama,South
4,Alabama,South
