In [1]:
import pandas as pd
from os import walk # to get list of filenames
import sys # to print function name within function

#for country name cleanup
import unidecode
import re

# Helper functions

In [2]:
# create dfs for later merging

def make_df(filename):
    df = pd.read_csv(mypath + filename)
    df.name = filename.split('.')[0]
    val_name = df.name.split('_')[0]
    df.rename(columns={df.columns[0]:'country'}, inplace=True)
    #df.columns = map(lambda col: 
                    #'{}_{}'.format(str(col), name), df.columns)
    
    return df


In [3]:
# clean up country names before merging

# helper functions
def remove_special_char(series):
    clean_series = re.sub('\W+',' ', series)
    return clean_series

def fix_country_col(df):
    country_col = str(df.columns[0])
    df.rename(columns = {country_col : 'country'}, inplace = True)
    return df

def add_regions(df):
    regions = pd.read_csv('data/countries_with_regions.csv')
    df = regions.merge(df, on='country', how='outer')
    return df

def clean_gapminder_df(df):
    df = fix_country_col(df)

    # remove special chars & accents
    df['country'] = df['country'].apply(unidecode.unidecode)
    df['country'] = df['country'].apply(remove_special_char)
    
    # add continent & sub-continent data
    df = add_regions(df)
    
    return df

# create df
#regions = pd.read_json('data/all_countries.json')
#regions = regions[['name', 'region', 'sub-region']]

#regions = clean_gapminder_df(regions)

# save as csv
#regions.to_csv('data/regions_cleaned.csv', index = False)

In [4]:
# create list of filenames

mypath = 'data/original/'
def list_files(mypath):
    files = []
    [files.extend(filenames) for (dirpath, dirnames, filenames) in walk(mypath)]
    return files    

In [5]:
# inspect DF

def df_min_max(df):
    #print '\n'
    print sys._getframe().f_code.co_name
    print df.columns[1:].min()
    print df.columns[1:].max()

# any years with missing data?
def df_yrs_nan_vals(df):
    #print '\n'  
    print sys._getframe().f_code.co_name
    print df.isnull().any().sum()

# any countries that have no data at all?
def df_countries_no_data(df):
    #print '\n'   
    print sys._getframe().f_code.co_name
    print df.iloc[:,3:].isnull().all().sum()

def inspect_df(df):
    name = df.name
    #print '\n'  
    print 'Inspect %s dataframe:' % name
    df_min_max(df)
    df_yrs_nan_vals(df)
    df_countries_no_data(df)
    print '\n'
    print df.head()
    raw_input('Press <ENTER> to continue')

In [6]:
def get_val_names(dfs):
    names = [df.name.split('_')[0] for df in dfs]
    return names

In [7]:
# reshape into long format for easier plotting

def reshape_for_plot(df, val_name):
    #df = df.reset_index().copy()
    #df = df.set_index(['region', 'sub-region', 'country'])
    name = df.name.split('_')[0]
    df = df.set_index(['country'])
    df = df.sort_index(level = 0)

    t = df.T
    t = t.unstack(level = 1)
    df = pd.DataFrame(t)
    
    df = df.reset_index()
    df = df.rename(columns={'level_1':'year', 0: val_name})
    df.name = name

    return df

# Look at source files

In [41]:
originals = list_files(mypath)
originals

['Coal Consumption per capita.xlsx',
 'energy use per person.xlsx',
 'hdi_human_development_index.csv',
 'income_per_person_gdppercapita_ppp_inflation_adjusted.csv',
 'indicator CDIAC carbon_dioxide_emissions_per_capita.csv',
 'indicator undata total_fertility.xlsx',
 'Indicator_Electricity consumption per capita.xlsx',
 'motor_vehicles_per_1000_pop2010.xlsx',
 'population.xlsx',
 'roads_paved_percent_of_total_roads.csv',
 'sulfur_emissions_per_person_kg.csv',
 'surviving_kids_per_woman.csv']

In [44]:
# replace spaces with undescores,
# remove or shorten meaningless words
fixed_filenames = pd.Series(originals).str.upper()\
.str.replace(' ', '_').str.replace('INDICATOR_', '')\
.str.replace('PER_CAPITA', 'pc').str.lower()

fixed_filenames

0                              coal_consumption_pc.xlsx
1                            energy_use_per_person.xlsx
2                       hdi_human_development_index.csv
3     income_per_person_gdppercapita_ppp_inflation_a...
4                 cdiac_carbon_dioxide_emissions_pc.csv
5                           undata_total_fertility.xlsx
6                       electricity_consumption_pc.xlsx
7                  motor_vehicles_per_1000_pop2010.xlsx
8                                       population.xlsx
9                roads_paved_percent_of_total_roads.csv
10                   sulfur_emissions_per_person_kg.csv
11                         surviving_kids_per_woman.csv
dtype: object

In [50]:
# identify type of file

filetype = fixed_filenames.str.split('.', expand=True)
filetype[1]


0     xlsx
1     xlsx
2      csv
3      csv
4      csv
5     xlsx
6     xlsx
7     xlsx
8     xlsx
9      csv
10     csv
11     csv
Name: 1, dtype: object

In [57]:
files_df = pd.concat([fixed_filenames, filetype[1]], axis=1)
files_df

Unnamed: 0,0,1
0,coal_consumption_pc.xlsx,xlsx
1,energy_use_per_person.xlsx,xlsx
2,hdi_human_development_index.csv,csv
3,income_per_person_gdppercapita_ppp_inflation_a...,csv
4,cdiac_carbon_dioxide_emissions_pc.csv,csv
5,undata_total_fertility.xlsx,xlsx
6,electricity_consumption_pc.xlsx,xlsx
7,motor_vehicles_per_1000_pop2010.xlsx,xlsx
8,population.xlsx,xlsx
9,roads_paved_percent_of_total_roads.csv,csv


# Create wide DFs

In [7]:
co2_regions = pd.read_pickle('data/co2_regions.pkl')
co2_regions.to_csv('data/original/co2_regions.csv')

In [92]:
files = list_files(mypath)
print len(files)
files

8


['!co2_regions.csv',
 'children_per_woman_total_fertility.csv',
 'energy_regions_matched.csv',
 'hdi_human_development_index.csv',
 'income_per_person_gdppercapita_ppp_inflation_adjusted.csv',
 'pop_regions.csv',
 'roads_paved_percent_of_total_roads.csv',
 'surviving_kids_per_woman.csv']

In [82]:
# verify
dfs = [make_df(filename) for filename in files]
len(dfs)

8

In [39]:
[inspect_df(df) for df in dfs]

Inspect !co2_regions dataframe:
df_min_max
1751
sub-region
df_yrs_nan_vals
249
df_countries_no_data
0


       country  region       sub-region  1751  1755  1762  1763  1764  1765  \
0  Afghanistan    Asia    Southern Asia   NaN   NaN   NaN   NaN   NaN   NaN   
1      Albania  Europe  Southern Europe   NaN   NaN   NaN   NaN   NaN   NaN   
2      Algeria  Africa  Northern Africa   NaN   NaN   NaN   NaN   NaN   NaN   
3      Andorra  Europe  Southern Europe   NaN   NaN   NaN   NaN   NaN   NaN   
4       Angola  Africa    Middle Africa   NaN   NaN   NaN   NaN   NaN   NaN   

   1766    ...         2003      2004      2005      2006      2007      2008  \
0   NaN    ...     0.022704  0.027472  0.036780  0.047090  0.068312  0.131602   
1   NaN    ...     1.382066  1.332966  1.353789  1.224310  1.279420  1.297753   
2   NaN    ...     2.899236  2.762220  3.257010  3.113135  3.312875  3.328945   
3   NaN    ...     7.414281  7.499690  7.390955  6.839940  6.622435  6.527241   
4   NaN    ...  

Press <ENTER> to continue
Inspect surviving_kids_per_woman dataframe:
df_min_max
1760
2099
df_yrs_nan_vals
275
df_countries_no_data
85


       country  1760  1761  1762  1763  1764  1765  1766  1767  1768  ...   \
0  Afghanistan   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
1      Albania   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
2      Algeria   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
3       Angola   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
4    Argentina   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    

   2090  2091  2092  2093  2094  2095  2096  2097  2098  2099  
0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  1.78  
1   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  1.94  
2   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  1.93  
3   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  1.75  
4   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  1.92  

[5 rows x

[None, None, None, None, None, None, None, None]

## Surviving DF contains projections

Unlike the other DFs, this one looks into the future, which will create NANs all over the place in the merged DF.

Let's drop these projections.

In [86]:
# remove projection years (post 2015) for Surviving DF

start = dfs[7].columns.get_loc('2016')
end = dfs[7].columns.get_loc('2099') + 1

dfs[7].drop(dfs[7].columns[start:end], axis=1, inplace=True)

In [88]:
# verify
dfs[7].tail()

Unnamed: 0,country,1760,1761,1762,1763,1764,1765,1766,1767,1768,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
175,Venezuela,,,,,,,,,,...,2.44,2.42,2.39,2.36,2.33,,,,,
176,Vietnam,,,,,,,,,,...,1.78,1.77,1.76,1.75,1.73,,,,,
177,Yemen,,,,,,,,,,...,4.82,4.77,4.68,4.64,4.55,,,,,
178,Zambia,,,,,,,,,,...,3.92,4.02,4.12,4.22,4.33,,,,,
179,Zimbabwe,,,,,,,,,,...,2.26,2.27,2.28,2.32,2.32,,,,,


# Create long DFs

In [94]:
# extract first word of each file
# these will be the label for'value' column 
# of each long DF
val_names = get_val_names(dfs)
val_names

['!co2', 'children', 'energy', 'hdi', 'income', 'pop', 'roads', 'surviving']

In [96]:
long_dfs = [reshape_for_plot(dfs[i], val_names[i]) for i, _ in enumerate(dfs)]
len(long_dfs)

8

# Join long DFs

In [18]:
#new_df = pd.merge(A_df, B_df,  how='left', left_on=['A_c1','c2'], right_on = ['B_c1','c2'])

In [97]:
# thank you http://notconfusing.com/joining-many-dataframes-at-once-in-pandas-n-ary-join/
def merge_dfs(ldf, rdf):
    right_on = list(rdf.columns[0:2])
    left_on = list(ldf.columns[0:2])
    return ldf.merge(rdf, how='outer', left_on=left_on, right_on=right_on)
    

final_df = reduce(merge_dfs, long_dfs) #that's the magic
final_df.head()

Unnamed: 0,country,year,!co2,children,energy,hdi,income,pop,roads,surviving
0,Afghanistan,region,Asia,,,,,Asia,,
1,Afghanistan,sub-region,Southern Asia,,,,,Southern Asia,,
2,Afghanistan,1751,,,,,,,,
3,Afghanistan,1755,,,,,,,,
4,Afghanistan,1762,,,,,,,,


In [98]:
# drop region & subregion rows
final_df.drop(final_df.index[:2], inplace=True)
final_df.reset_index(inplace=True, drop=True)
final_df.head()

Unnamed: 0,country,year,!co2,children,energy,hdi,income,pop,roads,surviving
0,Afghanistan,1751,,,,,,,,
1,Afghanistan,1755,,,,,,,,
2,Afghanistan,1762,,,,,,,,
3,Afghanistan,1763,,,,,,,,
4,Afghanistan,1764,,,,,,,,


In [99]:
# add region data as columns
countries_regions = pd.read_csv('data/countries_with_regions.csv')
final_df = final_df.merge(countries_regions, on='country', how='left')


In [102]:
# rearrange column order so region info beside country col
cols = final_df.columns.tolist()

newcols = [cols[0]]
newcols.extend(cols[-2:])
newcols.extend(cols[1:-2])

newcols

['country',
 'region',
 'sub-region',
 'year',
 '!co2',
 'children',
 'energy',
 'hdi',
 'income',
 'pop',
 'roads',
 'surviving']

In [None]:
final_df = final_df[newcols]
final_df.head()

In [104]:
# save as csv
final_df.to_csv('data/cleaned/final_df.csv', index=False)