In [389]:
import numpy as np
import pandas as pd
from os import walk # to get list of filenames
import sys # to print function name within function

#for country name cleanup
import unidecode
import re

# Helper functions

In [390]:
# create dfs for later merging

def make_df(filename):
    name = filename.split('.')[0]
    f_type = filename.split('.')[1]
    if f_type=='xlsx':
        df = pd.read_excel(mypath + filename)
    elif f_type=='csv':
        df = pd.read_csv(mypath + filename)
    else:
        print 'Unknown file type'
    #val_name = df.name.split('_')[0]
    df.rename(columns={df.columns[0]:'country'}, inplace=True)
    df.name = name
    #df.columns = map(lambda col: 
                    #'{}_{}'.format(str(col), name), df.columns)
    
    return df


In [525]:
# clean up country names before merging

# helper functions

# https://gist.github.com/gornostal/1f123aaf838506038710
def force_to_unicode(text):
    "If text is unicode, it is returned as is. If it's str, convert it to Unicode using UTF-8 encoding"
    return text if isinstance(text, unicode) else text.decode('utf8')

def remove_special_char(series):
    clean_series = re.sub('\W+',' ', series)
    return clean_series

def fix_country_col(df):
    country_col = str(df.columns[0])
    df.rename(columns = {country_col : 'country'}, inplace = True)
    return df

def add_regions(df):
    regions = pd.read_csv('data/countries_with_regions.csv')
    df = regions.merge(df, on='country', how='outer')
    return df

def clean_gapminder_df(df):
    name = df.name
    df = fix_country_col(df)
    df.name = name

    # remove special chars & accents
    df['country'] = df['country'].apply(force_to_unicode)
    df.columns = df.columns.apply(force_to_unicode)
    #df['country'] = df['country'].apply(unidecode.unidecode, 'utf_8')
    df['country'] = df['country'].apply(remove_special_char)
    
    # add continent & sub-continent data
    #df = add_regions(df)
    
    return df

# create df
#regions = pd.read_json('data/all_countries.json')
#regions = regions[['name', 'region', 'sub-region']]

#regions = clean_gapminder_df(regions)

# save as csv
#regions.to_csv('data/regions_cleaned.csv', index = False)

In [392]:
# create list of filenames

def list_files(mypath):
    files = []
    [files.extend(filenames) for (dirpath, dirnames, filenames) in walk(mypath)]
    return files    

In [393]:
# inspect DF

def df_min_max(df):
    #print '\n'
    print sys._getframe().f_code.co_name
    print df.columns[1:].min()
    print df.columns[1:].max()

# any years with missing data?
def df_yrs_nan_vals(df):
    #print '\n'  
    print sys._getframe().f_code.co_name
    print df.isnull().any().sum()

# any countries that have no data at all?
def df_countries_no_data(df):
    #print '\n'   
    print sys._getframe().f_code.co_name
    print df.iloc[:,3:].isnull().all().sum()

def inspect_df(df):
    name = str.upper(df.name)
    #print '\n'  
    print 'Inspecting %s:' % name
    df_min_max(df)
    df_yrs_nan_vals(df)
    df_countries_no_data(df)
    raw_input('Press <ENTER> to continue')
    print '\n'
    print df.head()
    raw_input('Press <ENTER> to continue')

In [394]:
def get_val_names(dfs):
    names = [df.name.split('_')[0] for df in dfs]
    return names

In [670]:
# reshape into long format for easier plotting

def reshape_for_plot(df):
    #df = df.reset_index().copy()
    #df = df.set_index(['region', 'sub-region', 'country'])
    name = df.name.split('_')[0]
    df = df.set_index(['country'])
    df = df.sort_index(level = 0)

    t = df.T
    t = t.unstack(level = 1)
    df = pd.DataFrame(t)
    
    df = df.reset_index()
    df = df.rename(columns={'level_1':'year', 0: name})
    
    # make sure year column is int type
    df['year'] = df['year'].astype('int64')
    df.name = name

    return df

# Look at source files

In [526]:
originalpath = 'data/original/'
cleanpath = 'data/cleaned/'
mypath = originalpath

originals = list_files(mypath)
originals

['1indicator CDIAC carbon_dioxide_emissions_per_capita.csv',
 'energy use per person.xlsx',
 'hdi_human_development_index.csv',
 'income_per_person_gdppercapita_ppp_inflation_adjusted.csv',
 'motor_vehicles_per_1000_pop2010.xlsx',
 'population.xlsx',
 'roads_paved_percent_of_total_roads.csv',
 'sulfur_emissions_per_person_kg.csv',
 'surviving_kids_per_woman.csv']

In [648]:
# convert to DF by splitting file type as new column
files_df = pd.Series(originals).str.split('.', expand=True).rename(columns={0:'clean_name', 1:'type'})

# keep original filename
files_df['orig_name']=originals

# convert files to DF
dfs = files_df['orig_name'].apply(make_df)

# remove accents and special chars
dfs = dfs.apply(clean_gapminder_df)

In [635]:
# all files converted to df?
print len(originals)
len(dfs)

9


9

In [636]:
files_df

Unnamed: 0,clean_name,type,orig_name
0,1indicator CDIAC carbon_dioxide_emissions_per_...,csv,1indicator CDIAC carbon_dioxide_emissions_per_...
1,energy use per person,xlsx,energy use per person.xlsx
2,hdi_human_development_index,csv,hdi_human_development_index.csv
3,income_per_person_gdppercapita_ppp_inflation_a...,csv,income_per_person_gdppercapita_ppp_inflation_a...
4,motor_vehicles_per_1000_pop2010,xlsx,motor_vehicles_per_1000_pop2010.xlsx
5,population,xlsx,population.xlsx
6,roads_paved_percent_of_total_roads,csv,roads_paved_percent_of_total_roads.csv
7,sulfur_emissions_per_person_kg,csv,sulfur_emissions_per_person_kg.csv
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv


## Clean filenames

In [649]:
def clean_names(series):
    # replace spaces with undescores,
    # remove or shorten meaningless words
    series = series.str.upper()\
    .str.replace(' ', '_')\
    .str.replace('INDICATOR_', '')\
    .str.replace('PER_CAPITA', 'pc')\
    .str.replace('PER_PERSON', 'pc')\
    .str.lower()

    return series

In [650]:
files_df['clean_name'] = clean_names(files_df['clean_name'])

In [651]:
files_df

Unnamed: 0,clean_name,type,orig_name
0,1cdiac_carbon_dioxide_emissions_pc,csv,1indicator CDIAC carbon_dioxide_emissions_per_...
1,energy_use_pc,xlsx,energy use per person.xlsx
2,hdi_human_development_index,csv,hdi_human_development_index.csv
3,income_pc_gdppercapita_ppp_inflation_adjusted,csv,income_per_person_gdppercapita_ppp_inflation_a...
4,motor_vehicles_per_1000_pop2010,xlsx,motor_vehicles_per_1000_pop2010.xlsx
5,population,xlsx,population.xlsx
6,roads_paved_percent_of_total_roads,csv,roads_paved_percent_of_total_roads.csv
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv


In [652]:
# manually fix certain rows
fix_these_rows = [0,2,3,4,6]
needs_fixing = files_df.iloc[fix_these_rows]['clean_name']
needs_fixing.values

array(['1cdiac_carbon_dioxide_emissions_pc',
       'hdi_human_development_index',
       'income_pc_gdppercapita_ppp_inflation_adjusted',
       'motor_vehicles_per_1000_pop2010',
       'roads_paved_percent_of_total_roads'], dtype=object)

In [653]:
# copy-paste-modify
fixed = np.array(['CO2_pc','hdi',
       'income_pc',
       'motor_vehicles_per_1000',
       'roads_paved_%'])


In [654]:
# update rows with cleaned names
files_df['clean_name'].iloc[fix_these_rows] = fixed

In [655]:
# verify
files_df['clean_name']

0                      CO2_pc
1               energy_use_pc
2                         hdi
3                   income_pc
4     motor_vehicles_per_1000
5                  population
6               roads_paved_%
7      sulfur_emissions_pc_kg
8    surviving_kids_per_woman
Name: clean_name, dtype: object

In [656]:
# assign clean name as the df.name attribute
def update_name(df, clean_name):
    df.name = clean_name

# could not figure out how to do this with pd.Series.apply
map(update_name, dfs, files_df['clean_name'].values)

[None, None, None, None, None, None, None, None, None]

In [436]:
# save as CSV using cleaned name
'''def clean_to_csv(df):
    cleanname = files_df['clean_name'].str.cat(files_df['type'], sep='.')
    df.to_csv(cleanpath + cleanname, index=False)
    return none'''

In [657]:
# IS THIS STEP EVEN NEEDED????
# Save to csv files
dfs.apply(lambda df: df.to_csv(cleanpath + df.name + '.' + 'csv', index = False))

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
Name: orig_name, dtype: object

# Create wide DFs

In [255]:
# create "driving" DF, ie the left-most DF when merging
#co2_regions = pd.read_pickle('data/co2_regions.pkl')
#co2_regions.to_csv('data/cleaned/!co2_regions.csv')

## Inspect resulting DFs

In [540]:
#len(dfs)
[inspect_df(df) for df in dfs]

Inspecting CO2_PC:
df_min_max
1751
2012
df_yrs_nan_vals
253
df_countries_no_data
0
Press <ENTER> to continue


                 country  1751  1755  1762  1763  1764  1765  1766  1767  \
0               Abkhazia   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
1            Afghanistan   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
2  Akrotiri and Dhekelia   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
3                Albania   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
4                Algeria   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   

   1768    ...         2003      2004      2005      2006      2007      2008  \
0   NaN    ...          NaN       NaN       NaN       NaN       NaN       NaN   
1   NaN    ...     0.022704  0.027472  0.036780  0.047090  0.068312  0.131602   
2   NaN    ...          NaN       NaN       NaN       NaN       NaN       NaN   
3   NaN    ...     1.382066  1.332966  1.353789  1.224310  1.279420  1.297753   
4   NaN    ...     2.899236

Press <ENTER> to continue
Inspecting SULFUR_EMISSIONS_PC_KG:
df_min_max
1850
2000
df_yrs_nan_vals
0
df_countries_no_data
0
Press <ENTER> to continue


               country     1850     1851     1852     1853     1854     1855  \
0          Afghanistan  0.00000  0.00007  0.00013  0.00020  0.00026  0.00033   
1              Albania  0.00017  0.00019  0.00024  0.00026  0.00030  0.00034   
2              Algeria  0.00000  0.00200  0.00398  0.00593  0.00786  0.00976   
3               Angola  0.00003  0.00003  0.00005  0.00008  0.00011  0.00013   
4  Antigua and Barbuda  0.00240  0.00160  0.00301  0.00271  0.00336  0.00325   

      1856     1857     1858  ...     1991   1992   1993   1994   1995   1996  \
0  0.00039  0.00045  0.00051  ...     3.99   3.73   3.40   3.23   3.16   3.14   
1  0.00039  0.00042  0.00048  ...    10.40   3.77   2.96   2.46   2.20   2.19   
2  0.01170  0.01350  0.01540  ...     4.76   4.93   5.01   4.92   5.17   4.98   
3  0.00016  0.00018  0.00021  ...     7.61  

[None, None, None, None, None, None, None, None, None]

Not sure what happened here. The same problem is in the original CSV, so I downloaded it again from Gapminder. Problem solved.

## Surviving DF contains projections

Unlike the other DFs, this one looks into the future, which will create NANs all over the place in the merged DF.

Let's drop these projections.

In [541]:
# find DF
files_df['clean_name']

0                      CO2_pc
1               energy_use_pc
2                         hdi
3                   income_pc
4     motor_vehicles_per_1000
5                  population
6               roads_paved_%
7      sulfur_emissions_pc_kg
8    surviving_kids_per_woman
Name: clean_name, dtype: object

In [542]:
# it's the final (8th) df
# remove projection years (post 2015) for Surviving DF

start = dfs[8].columns.get_loc('2016')
end = dfs[8].columns.get_loc('2099') + 1

dfs[8].drop(dfs[8].columns[start:end], axis=1, inplace=True)

In [543]:
# verify
dfs[8].tail()

Unnamed: 0,country,1760,1761,1762,1763,1764,1765,1766,1767,1768,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
175,Venezuela,,,,,,,,,,...,2.44,2.42,2.39,2.36,2.33,,,,,
176,Vietnam,,,,,,,,,,...,1.78,1.77,1.76,1.75,1.73,,,,,
177,Yemen,,,,,,,,,,...,4.82,4.77,4.68,4.64,4.55,,,,,
178,Zambia,,,,,,,,,,...,3.92,4.02,4.12,4.22,4.33,,,,,
179,Zimbabwe,,,,,,,,,,...,2.26,2.27,2.28,2.32,2.32,,,,,


# Create long DFs

In [671]:
long_dfs = [reshape_for_plot(dfs[i]) for i, _ in enumerate(dfs)]
len(long_dfs)

9

# Join long DFs

In [673]:
# thank you http://notconfusing.com/joining-many-dataframes-at-once-in-pandas-n-ary-join/
def merge_dfs(ldf, rdf):
    right_on = list(rdf.columns[0:2])
    left_on = list(ldf.columns[0:2])
    return ldf.merge(rdf, how='left', left_on=left_on, right_on=right_on)
    

final_df = reduce(merge_dfs, long_dfs) #that's the magic
final_df.tail(20)

Unnamed: 0,country,year,CO2,energy,hdi,income,motor,population,roads,sulfur,surviving
59435,Zimbabwe,1993,1.441291,0.865807,0.479,2350.0,,11256512.0,17.0,8.27,3.69
59436,Zimbabwe,1994,1.538291,0.842736,0.475,2520.0,,11476807.0,54.9,7.72,3.47
59437,Zimbabwe,1995,1.294742,0.842225,0.465,2480.0,,11683136.0,48.0,7.25,3.24
59438,Zimbabwe,1996,1.260839,0.824112,0.46,2690.0,,11877664.0,47.4,6.82,3.04
59439,Zimbabwe,1997,1.191934,0.802734,0.451,2710.0,,12059858.0,47.4,6.3,2.85
59440,Zimbabwe,1998,1.162342,0.809675,0.442,2750.0,,12226742.0,47.4,6.64,2.67
59441,Zimbabwe,1999,1.276289,0.853539,0.434,2690.0,,12374019.0,,7.3,2.56
59442,Zimbabwe,2000,1.110012,0.790319,0.427,2570.0,,12499981.0,,7.15,2.46
59443,Zimbabwe,2001,0.998965,0.772111,0.427,2580.0,,12603988.0,,,2.36
59444,Zimbabwe,2002,0.946346,0.770138,0.418,2320.0,,12691431.0,19.0,,2.28


In [700]:
# add region data as columns
countries_regions = pd.read_csv('data/countries_with_regions.csv')
final_df = final_df.merge(countries_regions, on='country', how='left')
final_df.tail(25)

Unnamed: 0,country,region_x,sub-region_x,year,CO2,energy,hdi,income,motor,population,roads,sulfur,surviving,region_y,sub-region_y
15047,Zimbabwe,Africa,Eastern Africa,1986,1.428493,0.85218,,2410.0,,9198874.0,,7.86,5.08,Africa,Eastern Africa
15048,Zimbabwe,Africa,Eastern Africa,1987,1.600133,0.89783,,2350.0,,9535657.0,,9.41,4.89,Africa,Eastern Africa
15049,Zimbabwe,Africa,Eastern Africa,1988,1.634179,0.878731,,2450.0,,9866776.0,,8.3,4.71,Africa,Eastern Africa
15050,Zimbabwe,Africa,Eastern Africa,1989,1.59154,0.870441,,2490.0,,10184966.0,,8.3,4.49,Africa,Eastern Africa
15051,Zimbabwe,Africa,Eastern Africa,1990,1.480788,0.888059,0.499,2590.0,,10484771.0,14.0,8.07,4.33,Africa,Eastern Africa
15052,Zimbabwe,Africa,Eastern Africa,1991,1.472027,0.916924,0.501,2670.0,,10763036.0,15.0,9.28,4.13,Africa,Eastern Africa
15053,Zimbabwe,Africa,Eastern Africa,1992,1.535539,0.924668,0.486,2370.0,,11019717.0,16.0,9.8,3.88,Africa,Eastern Africa
15054,Zimbabwe,Africa,Eastern Africa,1993,1.441291,0.865807,0.479,2350.0,,11256512.0,17.0,8.27,3.69,Africa,Eastern Africa
15055,Zimbabwe,Africa,Eastern Africa,1994,1.538291,0.842736,0.475,2520.0,,11476807.0,54.9,7.72,3.47,Africa,Eastern Africa
15056,Zimbabwe,Africa,Eastern Africa,1995,1.294742,0.842225,0.465,2480.0,,11683136.0,48.0,7.25,3.24,Africa,Eastern Africa


In [677]:
# rearrange column order so region info beside country col
cols = final_df.columns.tolist()

newcols = [cols[0]]
newcols.extend(cols[-2:])
newcols.extend(cols[1:-2])

newcols

['country',
 'region',
 'sub-region',
 'year',
 'CO2',
 'energy',
 'hdi',
 'income',
 'motor',
 'population',
 'roads',
 'sulfur',
 'surviving']

In [678]:
final_df = final_df[newcols]
final_df.head()

Unnamed: 0,country,region,sub-region,year,CO2,energy,hdi,income,motor,population,roads,sulfur,surviving
0,Abkhazia,,,1751,,,,,,,,,
1,Abkhazia,,,1755,,,,,,,,,
2,Abkhazia,,,1762,,,,,,,,,
3,Abkhazia,,,1763,,,,,,,,,
4,Abkhazia,,,1764,,,,,,,,,


In [679]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59455 entries, 0 to 59454
Data columns (total 13 columns):
country       59455 non-null object
region        50853 non-null object
sub-region    50853 non-null object
year          59455 non-null int64
CO2           15072 non-null float64
energy        5139 non-null float64
hdi           3562 non-null float64
income        38553 non-null float64
motor         440 non-null float64
population    17631 non-null float64
roads         1931 non-null float64
sulfur        23556 non-null float64
surviving     29952 non-null float64
dtypes: float64(9), int64(1), object(3)
memory usage: 6.4+ MB


In [701]:
# how many countries missing CO2 emission values?
final_df.loc[final_df['CO2'].isnull()]

Unnamed: 0,country,region_x,sub-region_x,year,CO2,energy,hdi,income,motor,population,roads,sulfur,surviving,region_y,sub-region_y


In [697]:
# drop all countries missing CO2 data
no_co2 = final_df[final_df['CO2'].isnull()].index
final_df.drop(no_co2, inplace=True)

In [698]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15072 entries, 442 to 59452
Data columns (total 13 columns):
country       15072 non-null object
region        15072 non-null object
sub-region    15072 non-null object
year          15072 non-null int64
CO2           15072 non-null float64
energy        5082 non-null float64
hdi           3283 non-null float64
income        13823 non-null float64
motor         439 non-null float64
population    10943 non-null float64
roads         1863 non-null float64
sulfur        10775 non-null float64
surviving     13244 non-null float64
dtypes: float64(9), int64(1), object(3)
memory usage: 1.6+ MB


In [699]:
# save as csv
final_df.to_csv('data/final/final_df.csv', index=False)