In [452]:
import numpy as np
import pandas as pd
from os import walk # to get list of filenames
import sys # to print function name within function

#for country name cleanup
import unidecode
import re

# Helper functions

In [453]:
# create dfs for later merging

def make_df(filename):
    """Load one source file into a DataFrame ready for merging.

    The file is read from the directory held in the module-level ``mypath``
    (set elsewhere in the notebook before this function is called).

    Parameters
    ----------
    filename : str
        File name including extension. The text before the first '.' becomes
        the frame's label; the text right after it selects the reader
        ('xlsx' or 'csv').

    Returns
    -------
    pandas.DataFrame
        The loaded data with its first column renamed to 'country' and the
        label stored on the ``name`` attribute.

    Raises
    ------
    ValueError
        If the file name has no extension or the extension is not supported.
    """
    parts = filename.split('.')
    name = parts[0]
    f_type = parts[1] if len(parts) > 1 else ''

    readers = {'xlsx': pd.read_excel, 'csv': pd.read_csv}
    if f_type not in readers:
        # fail here with a clear message instead of continuing with no frame
        raise ValueError('Unknown file type: %r' % filename)
    df = readers[f_type](mypath + filename)

    # make sure all DFs have a 'country' column
    # to simplify .join() later
    df.rename(columns={df.columns[0]: 'country'}, inplace=True)

    # the .name attribute will be used when reshaping,
    # as the label for the value column
    df.name = name

    return df


In [761]:
# clean up country names before merging

# helper functions

# https://gist.github.com/gornostal/1f123aaf838506038710
def force_to_unicode(text):
    """Return ``text`` as a text (unicode) string.

    Byte strings are decoded as UTF-8. Any other value, including text that
    is already unicode, is returned unchanged. ``bytes`` is the same type as
    ``str`` on Python 2, so this check behaves the same on Python 2 and 3
    without relying on the Python 2-only ``unicode`` builtin.
    """
    if isinstance(text, bytes):
        return text.decode('utf8')
    return text

def remove_special_char(series):
    """Replace every run of non-word characters in a string with one space.

    Despite the parameter name, ``series`` is a single string: the function
    is applied element-wise through ``Series.apply``.

    NOTE(review): on Python 2 a pattern compiled without ``re.UNICODE`` treats
    non-ASCII letters as non-word characters, so accented letters become
    spaces rather than being transliterated -- confirm this is intended.
    """
    # raw string: '\W' in a plain literal is an invalid escape sequence
    return re.sub(r'\W+', ' ', series)

def fix_country_col(df):
    """Rename the first column of ``df`` to 'country' and return ``df``.

    The rename is done in place, so the caller's frame is modified.
    """
    # use the label as-is: coercing it with str() would make the rename
    # silently miss labels that are not plain str (e.g. integers)
    first_col = df.columns[0]
    df.rename(columns={first_col: 'country'}, inplace=True)
    return df

def clean_country_col(df):
    """Normalise the 'country' column of ``df`` and return ``df``.

    Each value is passed through ``force_to_unicode`` and
    ``remove_special_char``, then lower-cased and stripped of surrounding
    whitespace. The column is overwritten on the frame that was passed in.
    """
    as_text = df['country'].apply(force_to_unicode)
    without_specials = as_text.apply(remove_special_char)
    df['country'] = without_specials.str.lower().str.strip()
    return df

def add_regions(df):
    """Join ``df`` onto the region lookup table by country.

    Reads 'data/countries_with_regions.csv', cleans its 'country' column with
    ``clean_country_col`` and left-joins ``df`` on 'country', so every row of
    the lookup table is kept.
    """
    region_lookup = clean_country_col(
        pd.read_csv('data/countries_with_regions.csv')
    )
    return region_lookup.merge(df, on='country', how='left')

def clean_gapminder_df(df):
    """Standardise the country column of a freshly loaded frame.

    Renames the first column to 'country', normalises the names with
    ``clean_country_col`` and converts them to title case. The frame is
    modified in place and returned with its ``name`` label intact.
    """
    # remember the label so it can be re-assigned at the end
    name = df.name
    df = fix_country_col(df)

    # remove special chars, lower-case and trim.
    # One pass is enough: the original repeated the same chain a second time.
    df = clean_country_col(df)

    # restore title case
    df['country'] = df['country'].str.title()

    # re-assign original df name
    df.name = name

    return df

# create regions df
#regions = pd.read_json('data/all_countries.json')
#regions = regions[['name', 'region', 'sub-region']]

#regions = clean_gapminder_df(regions)

# save as csv
#regions.to_csv('data/regions_cleaned.csv', index = False)

In [455]:
# create list of filenames

def list_files(mypath):
    """Return the names of all files found under ``mypath``.

    Walks ``mypath`` recursively and collects bare file names (no directory
    part). Names from each directory are sorted, so the result does not
    depend on the order in which the filesystem lists entries; later cells
    address the files by position.
    """
    files = []
    for _dirpath, _dirnames, filenames in walk(mypath):
        files.extend(sorted(filenames))
    return files

In [456]:
# inspect DF

def df_min_max(df):
    """Print this function's name, then the smallest and largest column label.

    The first column is skipped; only the labels after it are compared.
    """
    labels = df.columns[1:]
    # single-argument print(...) prints the same on Python 2 and Python 3
    print(sys._getframe().f_code.co_name)
    print(labels.min())
    print(labels.max())

# any years with missing data?
def df_yrs_nan_vals(df):
    """Print this function's name, then how many columns contain a missing value.

    Every column of ``df`` is checked, including the first one.
    """
    # single-argument print(...) prints the same on Python 2 and Python 3
    print(sys._getframe().f_code.co_name)
    print(df.isnull().any().sum())

# any countries that have no data at all?
def df_countries_no_data(df):
    """Print this function's name, then how many rows have no data at all.

    A row counts as empty when every value after the first column is
    missing. The original expression looked at columns from the 4th onward
    and tested whole columns, so it counted empty columns, not countries.
    """
    values = df.iloc[:, 1:]
    # single-argument print(...) prints the same on Python 2 and Python 3
    print(sys._getframe().f_code.co_name)
    print(values.isnull().all(axis=1).sum())

def inspect_df(df):
    """Interactively print a short report about ``df``.

    Prints the upper-cased ``df.name``, runs the three summary helpers,
    waits for <ENTER>, prints ``df.head()`` and waits for <ENTER> again.
    """
    try:
        wait_for_enter = raw_input  # Python 2
    except NameError:
        wait_for_enter = input  # Python 3

    # .upper() on the value works for both str and unicode names
    name = df.name.upper()
    print('Inspecting %s:' % name)
    df_min_max(df)
    df_yrs_nan_vals(df)
    df_countries_no_data(df)
    wait_for_enter('Press <ENTER> to continue')
    print('\n')
    print(df.head())
    wait_for_enter('Press <ENTER> to continue')

In [33]:
def get_val_names(dfs):
    """Return the short value label of each frame in ``dfs``.

    The label is the part of the frame's ``name`` attribute that precedes
    the first underscore.
    """
    names = []
    for frame in dfs:
        names.append(frame.name.split('_')[0])
    return names

In [399]:
# reshape into long format for easier plotting

def reshape_for_plot(df):
    """Reshape a wide frame into long format, labelled by the frame's short name.

    ``df`` must have 'region', 'sub-region' and 'country' columns (they are
    moved into the index) and a ``name`` attribute; the part of the name
    before the first underscore becomes the label of the value column.
    Presumably every remaining column is one year -- TODO confirm.

    Returns a new DataFrame with the index reset and the short name stored
    on its ``name`` attribute.
    """
    #df = df.reset_index().copy()
    #df = df.set_index(['region', 'sub-region', 'country'])
    name = df.name.split('_')[0]
    df = df.set_index(['region', 'sub-region', 'country'])
    df = df.sort_index(level = 0)

    # transpose, then unstack and wrap the result back into a DataFrame
    # NOTE(review): the transposed frame's own index looks single-level, so
    # confirm what unstack(level = 1) yields on the pandas version in use
    t = df.T
    t = t.unstack(level = 1)
    df = pd.DataFrame(t)
    
    # turn the index levels back into columns; 'level_1' is presumably the
    # unnamed year level and 0 the single value column -- verify
    df = df.reset_index()
    df = df.rename(columns={'level_1':'year', 0: name})
    
    # make sure year column is int type
    #df['year'] = df['year'].astype('int64')
    df.name = name

    return df

# Look at source files

In [759]:
# raw source files live in originalpath; cleanpath is only referenced by a
# commented-out CSV export further down
originalpath = 'data/original/'
cleanpath = 'data/cleaned/'
# make_df() reads the module-level `mypath`, so it must be set before loading
mypath = originalpath

# names of every file under mypath (displayed below)
originals = list_files(mypath)
originals

['!CO2_2013.csv',
 'energy use per person.xlsx',
 'hdi_human_development_index.csv',
 'income_per_person_gdppercapita_ppp_inflation_adjusted.csv',
 'motor_vehicles_per_1000_pop2010.xlsx',
 'population.xlsx',
 'roads_paved_percent_of_total_roads.csv',
 'sulfur_emissions_per_person_kg.csv',
 'surviving_kids_per_woman.csv']

# Convert to DF

In [762]:
# batch convert files to df's
# and save filename info in another df

# split each filename on '.' into a label part ('clean_name') and an
# extension part ('type')
# NOTE(review): assumes every filename contains exactly one '.' -- confirm
files_df = pd.Series(originals).str.split('.', expand=True).rename(columns={0:'clean_name', 1:'type'})

# keep original filename
files_df['orig_name']=originals

# convert files to DF: `dfs` is a Series holding one DataFrame per file,
# in the same order as the rows of files_df
dfs = files_df['orig_name'].apply(make_df)

# standardise the country column of every frame (see clean_gapminder_df)
dfs = dfs.apply(clean_gapminder_df)

files_df

Unnamed: 0,clean_name,type,orig_name
0,!CO2_2013,csv,!CO2_2013.csv
1,energy use per person,xlsx,energy use per person.xlsx
2,hdi_human_development_index,csv,hdi_human_development_index.csv
3,income_per_person_gdppercapita_ppp_inflation_a...,csv,income_per_person_gdppercapita_ppp_inflation_a...
4,motor_vehicles_per_1000_pop2010,xlsx,motor_vehicles_per_1000_pop2010.xlsx
5,population,xlsx,population.xlsx
6,roads_paved_percent_of_total_roads,csv,roads_paved_percent_of_total_roads.csv
7,sulfur_emissions_per_person_kg,csv,sulfur_emissions_per_person_kg.csv
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv


# Clean filenames for use as labels

In [763]:
def clean_names(series):
    """Turn file-name stems into short lower-case labels.

    Works on the upper-cased text: spaces become underscores, 'INDICATOR_'
    is removed, and 'PER_CAPITA' / 'PER_PERSON' are shortened to 'pc'.
    The result is lower-cased.
    """
    substitutions = [
        (' ', '_'),
        ('INDICATOR_', ''),
        ('PER_CAPITA', 'pc'),
        ('PER_PERSON', 'pc'),
    ]
    cleaned = series.str.upper()
    for old, new in substitutions:
        cleaned = cleaned.str.replace(old, new)
    return cleaned.str.lower()

In [764]:
files_df['clean_name'] = clean_names(files_df['clean_name'])

In [765]:
files_df

Unnamed: 0,clean_name,type,orig_name
0,!co2_2013,csv,!CO2_2013.csv
1,energy_use_pc,xlsx,energy use per person.xlsx
2,hdi_human_development_index,csv,hdi_human_development_index.csv
3,income_pc_gdppercapita_ppp_inflation_adjusted,csv,income_per_person_gdppercapita_ppp_inflation_a...
4,motor_vehicles_per_1000_pop2010,xlsx,motor_vehicles_per_1000_pop2010.xlsx
5,population,xlsx,population.xlsx
6,roads_paved_percent_of_total_roads,csv,roads_paved_percent_of_total_roads.csv
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv


In [766]:
# manually fix certain rows
fix_these_rows = [0,2,3,4,6]
needs_fixing = files_df.iloc[fix_these_rows]['clean_name']
needs_fixing.values

array(['!co2_2013', 'hdi_human_development_index',
       'income_pc_gdppercapita_ppp_inflation_adjusted',
       'motor_vehicles_per_1000_pop2010',
       'roads_paved_percent_of_total_roads'], dtype=object)

In [767]:
# copy-paste-modify
fixed = np.array(['CO2_pc','hdi',
       'income_pc',
       'motor_vehicles_per_1000',
       'roads_paved_%'])


In [768]:
# update rows with cleaned names
# one positional .iloc assignment on files_df itself: the chained form
# files_df['clean_name'].iloc[...] = ... may write to a temporary copy
files_df.iloc[fix_these_rows, files_df.columns.get_loc('clean_name')] = fixed

In [769]:
# verify
files_df['clean_name']

0                      CO2_pc
1               energy_use_pc
2                         hdi
3                   income_pc
4     motor_vehicles_per_1000
5                  population
6               roads_paved_%
7      sulfur_emissions_pc_kg
8    surviving_kids_per_woman
Name: clean_name, dtype: object

In [770]:
# assign clean name as the df.name attribute
# these will be used later in plot titles
def update_name(df, clean_name):
    """Label ``df`` by storing ``clean_name`` on its ``name`` attribute."""
    df.name = clean_name

# explicit loop: the call is made only for its side effect, and unlike a
# bare map() it also runs on Python 3, where map() is lazy
for frame, label in zip(dfs, files_df['clean_name'].values):
    update_name(frame, label)

[None, None, None, None, None, None, None, None, None]

In [772]:
# update files_df to list df.names
dfnames = [df.name for df in dfs]
files_df['df_name'] = dfnames
files_df

Unnamed: 0,clean_name,type,orig_name,val_name,df_name
0,CO2_pc,csv,!CO2_2013.csv,CO2,CO2_pc
1,energy_use_pc,xlsx,energy use per person.xlsx,energy,energy_use_pc
2,hdi,csv,hdi_human_development_index.csv,hdi,hdi
3,income_pc,csv,income_per_person_gdppercapita_ppp_inflation_a...,income,income_pc
4,motor_vehicles_per_1000,xlsx,motor_vehicles_per_1000_pop2010.xlsx,motor,motor_vehicles_per_1000
5,population,xlsx,population.xlsx,population,population
6,roads_paved_%,csv,roads_paved_percent_of_total_roads.csv,roads,roads_paved_%
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv,sulfur,sulfur_emissions_pc_kg
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv,surviving,surviving_kids_per_woman


In [771]:
# add value names to files_df
# for use later in long format df's
valnames = [name.split('_')[0] for name in dfnames]
files_df['val_name'] = valnames
files_df

Unnamed: 0,clean_name,type,orig_name,val_name
0,CO2_pc,csv,!CO2_2013.csv,CO2
1,energy_use_pc,xlsx,energy use per person.xlsx,energy
2,hdi,csv,hdi_human_development_index.csv,hdi
3,income_pc,csv,income_per_person_gdppercapita_ppp_inflation_a...,income
4,motor_vehicles_per_1000,xlsx,motor_vehicles_per_1000_pop2010.xlsx,motor
5,population,xlsx,population.xlsx,population
6,roads_paved_%,csv,roads_paved_percent_of_total_roads.csv,roads
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv,sulfur
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv,surviving


In [773]:
# IS THIS STEP EVEN NEEDED?
# Save all to new CSV files
#dfs.apply(lambda df: df.to_csv(cleanpath + df.name + '.' + 'csv', index = False))

# Inspect each DF

In [774]:
[inspect_df(df) for df in dfs]

Inspecting CO2_PC:
df_min_max
1751
2013
df_yrs_nan_vals
263
df_countries_no_data
0
Press <ENTER> to continue


       country  1751  1752  1753  1754  1755  1756  1757  1758  1759   ...     \
0  Afghanistan   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      
1      Albania   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      
2      Algeria   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      
3      Andorra   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      
4       Angola   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      

      2004     2005     2006     2007     2008     2009     2010     2011  \
0  0.04041  0.05440  0.06552  0.08785  0.15895  0.24905  0.30291  0.42522   
1  1.34203  1.37998  1.27761  1.30428  1.47382  1.49426  1.58449  1.81554   
2  2.72677  3.21986  2.99727  3.19557  3.16824  3.42982  3.30686  3.30026   
3  7.49969  7.39095  6.83994  6.62244  6.52724  6.17852  6.09210  5.70224   
4  1.08651  1.069

Inspecting SULFUR_EMISSIONS_PC_KG:
df_min_max
1850
2000
df_yrs_nan_vals
0
df_countries_no_data
0
Press <ENTER> to continue


               country     1850     1851     1852     1853     1854     1855  \
0          Afghanistan  0.00000  0.00007  0.00013  0.00020  0.00026  0.00033   
1              Albania  0.00017  0.00019  0.00024  0.00026  0.00030  0.00034   
2              Algeria  0.00000  0.00200  0.00398  0.00593  0.00786  0.00976   
3               Angola  0.00003  0.00003  0.00005  0.00008  0.00011  0.00013   
4  Antigua And Barbuda  0.00240  0.00160  0.00301  0.00271  0.00336  0.00325   

      1856     1857     1858  ...     1991   1992   1993   1994   1995   1996  \
0  0.00039  0.00045  0.00051  ...     3.99   3.73   3.40   3.23   3.16   3.14   
1  0.00039  0.00042  0.00048  ...    10.40   3.77   2.96   2.46   2.20   2.19   
2  0.01170  0.01350  0.01540  ...     4.76   4.93   5.01   4.92   5.17   4.98   
3  0.00016  0.00018  0.00021  ...     7.61   7.62   6.98   7.58   6.89

[None, None, None, None, None, None, None, None, None]

## Surviving DF contains projections

Unlike the other DFs, this one looks into the future, which will create NANs all over the place in the merged DF.

Let's drop these projections.

In [775]:
# find which DF this is
files_df

Unnamed: 0,clean_name,type,orig_name,val_name,df_name
0,CO2_pc,csv,!CO2_2013.csv,CO2,CO2_pc
1,energy_use_pc,xlsx,energy use per person.xlsx,energy,energy_use_pc
2,hdi,csv,hdi_human_development_index.csv,hdi,hdi
3,income_pc,csv,income_per_person_gdppercapita_ppp_inflation_a...,income,income_pc
4,motor_vehicles_per_1000,xlsx,motor_vehicles_per_1000_pop2010.xlsx,motor,motor_vehicles_per_1000
5,population,xlsx,population.xlsx,population,population
6,roads_paved_%,csv,roads_paved_percent_of_total_roads.csv,roads,roads_paved_%
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv,sulfur,sulfur_emissions_pc_kg
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv,surviving,surviving_kids_per_woman


In [776]:
# it's the last df (position 8)
# remove projection years (post 2015) for Surviving DF

# select by year value instead of the fixed '2016'..'2099' labels, so the
# cell can be re-run without a KeyError once those columns are gone
LAST_OBSERVED_YEAR = 2015
projection_cols = [col for col in dfs[8].columns[1:] if int(col) > LAST_OBSERVED_YEAR]

dfs[8].drop(projection_cols, axis=1, inplace=True)

In [777]:
# verify
dfs[8].tail()

Unnamed: 0,country,1760,1761,1762,1763,1764,1765,1766,1767,1768,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
175,Venezuela,,,,,,,,,,...,2.44,2.42,2.39,2.36,2.33,,,,,
176,Vietnam,,,,,,,,,,...,1.78,1.77,1.76,1.75,1.73,,,,,
177,Yemen,,,,,,,,,,...,4.82,4.77,4.68,4.64,4.55,,,,,
178,Zambia,,,,,,,,,,...,3.92,4.02,4.12,4.22,4.33,,,,,
179,Zimbabwe,,,,,,,,,,...,2.26,2.27,2.28,2.32,2.32,,,,,


In [781]:
# are there still rows that are all NaNs?
dfs[8].set_index('country').loc[dfs[8].set_index('country').isnull().all(1)]

Unnamed: 0_level_0,1760,1761,1762,1763,1764,1765,1766,1767,1768,1769,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# Create long DFs

In [783]:
dfs[0].head()

Unnamed: 0,country,1751,1752,1753,1754,1755,1756,1757,1758,1759,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,Afghanistan,,,,,,,,,,...,0.04041,0.0544,0.06552,0.08785,0.15895,0.24905,0.30291,0.42522,0.68802,0.69312
1,Albania,,,,,,,,,,...,1.34203,1.37998,1.27761,1.30428,1.47382,1.49426,1.58449,1.81554,1.63562,1.66974
2,Algeria,,,,,,,,,,...,2.72677,3.21986,2.99727,3.19557,3.16824,3.42982,3.30686,3.30026,3.47163,3.51446
3,Andorra,,,,,,,,,,...,7.49969,7.39095,6.83994,6.62244,6.52724,6.17852,6.0921,5.70224,5.61408,5.52625
4,Angola,,,,,,,,,,...,1.08651,1.06932,1.20077,1.31098,1.29557,1.35427,1.36921,1.38263,1.47212,1.38437


In [854]:
def make_df_long(df):
    """Melt a wide country-by-year frame into long format.

    Every column after the first is treated as a value column. The result
    has the columns 'country', 'year' (as strings) and one value column
    named after ``df.name``, sorted by country and year with a fresh index.
    The label is copied to the result's ``name`` attribute.
    """
    label = df.name
    year_cols = df.columns[1:].values

    melted = pd.melt(df, id_vars=['country'], value_vars=year_cols)
    ordered = melted.sort_values(['country', 'variable'])
    renamed = ordered.rename(columns={'value': label, 'variable': 'year'})
    df_long = renamed.reset_index(drop=True)

    df_long.name = label
    df_long['year'] = df_long['year'].astype('str')
    return df_long

In [855]:
# one long-format frame per source frame, in the same order as dfs
long_dfs = [make_df_long(frame) for frame in dfs]
len(long_dfs)

9

# Join long DFs

In [856]:
# thank you http://notconfusing.com/joining-many-dataframes-at-once-in-pandas-n-ary-join/
def merge_dfs(ldf, rdf):
    """Left-join ``rdf`` onto ``ldf``, keyed on each frame's first two columns.

    Rows of ``ldf`` are always kept; rows of ``rdf`` without a match are
    dropped.
    """
    join_keys = {
        'left_on': list(ldf.columns[0:2]),
        'right_on': list(rdf.columns[0:2]),
    }
    return ldf.merge(rdf, how='left', **join_keys)
    

final_df = reduce(merge_dfs, long_dfs) #that's the magic
final_df.tail(20)

Unnamed: 0,country,year,CO2_pc,energy_use_pc,hdi,income_pc,motor_vehicles_per_1000,population,roads_paved_%,sulfur_emissions_pc_kg,surviving_kids_per_woman
60207,Zimbabwe,1994,1.53736,0.842736,0.475,2520.0,,11476807.0,54.9,7.72,3.47
60208,Zimbabwe,1995,1.29491,0.842225,0.465,2480.0,,11683136.0,48.0,7.25,3.24
60209,Zimbabwe,1996,1.26229,0.824112,0.46,2690.0,,11877664.0,47.4,6.82,3.04
60210,Zimbabwe,1997,1.19457,0.802734,0.451,2710.0,,12059858.0,47.4,6.3,2.85
60211,Zimbabwe,1998,1.16567,0.809675,0.442,2750.0,,12226742.0,47.4,6.64,2.67
60212,Zimbabwe,1999,1.27951,0.853539,0.434,2690.0,,12374019.0,,7.3,2.56
60213,Zimbabwe,2000,1.11379,0.790319,0.427,2570.0,,12499981.0,,7.15,2.46
60214,Zimbabwe,2001,0.99929,0.772111,0.427,2580.0,,12603988.0,,,2.36
60215,Zimbabwe,2002,0.94271,0.770138,0.418,2320.0,,12691431.0,19.0,,2.28
60216,Zimbabwe,2003,0.83413,0.754592,0.407,1910.0,,12774162.0,,,2.25


## Add region & sub_region columns

In [857]:
countries_regions = pd.read_csv('data/countries_with_regions.csv')
final_df = final_df.merge(countries_regions, on='country', how='left')
final_df.tail(25)

Unnamed: 0,country,year,CO2_pc,energy_use_pc,hdi,income_pc,motor_vehicles_per_1000,population,roads_paved_%,sulfur_emissions_pc_kg,surviving_kids_per_woman,region,sub-region
60202,Zimbabwe,1989,1.58907,0.870441,,2490.0,,10184966.0,,8.3,4.49,Africa,Eastern Africa
60203,Zimbabwe,1990,1.47859,0.888059,0.499,2590.0,,10484771.0,14.0,8.07,4.33,Africa,Eastern Africa
60204,Zimbabwe,1991,1.47,0.916924,0.501,2670.0,,10763036.0,15.0,9.28,4.13,Africa,Eastern Africa
60205,Zimbabwe,1992,1.53358,0.924668,0.486,2370.0,,11019717.0,16.0,9.8,3.88,Africa,Eastern Africa
60206,Zimbabwe,1993,1.43976,0.865807,0.479,2350.0,,11256512.0,17.0,8.27,3.69,Africa,Eastern Africa
60207,Zimbabwe,1994,1.53736,0.842736,0.475,2520.0,,11476807.0,54.9,7.72,3.47,Africa,Eastern Africa
60208,Zimbabwe,1995,1.29491,0.842225,0.465,2480.0,,11683136.0,48.0,7.25,3.24,Africa,Eastern Africa
60209,Zimbabwe,1996,1.26229,0.824112,0.46,2690.0,,11877664.0,47.4,6.82,3.04,Africa,Eastern Africa
60210,Zimbabwe,1997,1.19457,0.802734,0.451,2710.0,,12059858.0,47.4,6.3,2.85,Africa,Eastern Africa
60211,Zimbabwe,1998,1.16567,0.809675,0.442,2750.0,,12226742.0,47.4,6.64,2.67,Africa,Eastern Africa


In [858]:
# rearrange column order so region info beside country col
cols = final_df.columns.tolist()

newcols = [cols[0]]
newcols.extend(cols[-2:])
newcols.extend(cols[1:-2])

newcols

['country',
 'region',
 'sub-region',
 'year',
 'CO2_pc',
 'energy_use_pc',
 'hdi',
 'income_pc',
 'motor_vehicles_per_1000',
 'population',
 'roads_paved_%',
 'sulfur_emissions_pc_kg',
 'surviving_kids_per_woman']

In [859]:
final_df = final_df[newcols]
final_df.head()

Unnamed: 0,country,region,sub-region,year,CO2_pc,energy_use_pc,hdi,income_pc,motor_vehicles_per_1000,population,roads_paved_%,sulfur_emissions_pc_kg,surviving_kids_per_woman
0,Afghanistan,Asia,Southern Asia,1751,,,,,,,,,
1,Afghanistan,Asia,Southern Asia,1752,,,,,,,,,
2,Afghanistan,Asia,Southern Asia,1753,,,,,,,,,
3,Afghanistan,Asia,Southern Asia,1754,,,,,,,,,
4,Afghanistan,Asia,Southern Asia,1755,,,,,,,,,


In [860]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60227 entries, 0 to 60226
Data columns (total 13 columns):
country                     60227 non-null object
region                      49444 non-null object
sub-region                  49444 non-null object
year                        60227 non-null object
CO2_pc                      17055 non-null float64
energy_use_pc               5366 non-null float64
hdi                         3948 non-null float64
income_pc                   40660 non-null float64
motor_vehicles_per_1000     457 non-null float64
population                  17769 non-null float64
roads_paved_%               2031 non-null float64
sulfur_emissions_pc_kg      24613 non-null float64
surviving_kids_per_woman    31527 non-null float64
dtypes: float64(9), object(4)
memory usage: 6.4+ MB


In [861]:
# how many rows are missing CO2 emission values?,
final_df.loc[final_df['CO2_pc'].isnull()]

Unnamed: 0,country,region,sub-region,year,CO2_pc,energy_use_pc,hdi,income_pc,motor_vehicles_per_1000,population,roads_paved_%,sulfur_emissions_pc_kg,surviving_kids_per_woman
0,Afghanistan,Asia,Southern Asia,1751,,,,,,,,,
1,Afghanistan,Asia,Southern Asia,1752,,,,,,,,,
2,Afghanistan,Asia,Southern Asia,1753,,,,,,,,,
3,Afghanistan,Asia,Southern Asia,1754,,,,,,,,,
4,Afghanistan,Asia,Southern Asia,1755,,,,,,,,,
5,Afghanistan,Asia,Southern Asia,1756,,,,,,,,,
6,Afghanistan,Asia,Southern Asia,1757,,,,,,,,,
7,Afghanistan,Asia,Southern Asia,1758,,,,,,,,,
8,Afghanistan,Asia,Southern Asia,1759,,,,,,,,,
9,Afghanistan,Asia,Southern Asia,1760,,,,,,,,,


In [862]:
# drop every (country, year) row that has no CO2 value
# dropna returns a new frame, avoiding an in-place drop on a frame that was
# itself produced by column selection
final_df = final_df.dropna(subset=['CO2_pc'])

In [863]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17055 entries, 198 to 60226
Data columns (total 13 columns):
country                     17055 non-null object
region                      14970 non-null object
sub-region                  14970 non-null object
year                        17055 non-null object
CO2_pc                      17055 non-null float64
energy_use_pc               5324 non-null float64
hdi                         3898 non-null float64
income_pc                   14969 non-null float64
motor_vehicles_per_1000     456 non-null float64
population                  12517 non-null float64
roads_paved_%               1977 non-null float64
sulfur_emissions_pc_kg      11230 non-null float64
surviving_kids_per_woman    13919 non-null float64
dtypes: float64(9), object(4)
memory usage: 1.8+ MB


In [865]:
# save as csv
final_df.to_csv('data/final/final_df.csv', index=False)

# Updates

The original CO2 dataset went up to 2012, but 2010 was the last year there was substantial data.
I'd seen references online to 2014 data, but finding the actual data files was a challenge. I had several false starts.

Finally, it was the [Open Numbers github account](https://github.com/open-numbers/ddf--gapminder--co2_emission) that helped me.

In [746]:
# load data
url_co2 = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--co2_emission/master/ddf--datapoints--co2_emissions_tonnes_per_person--by--country--year.csv'
url_countries = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--co2_emission/master/ddf--entities--country.csv'

co2 = pd.read_csv(url_co2) # emissions values
countries = pd.read_csv(url_countries) # country key

df = countries.merge(co2, on='country')

# hoping this fixes concat issues when trying add rows from previous version of DF
df['year'] = df['year'].astype('str')

# put into standard wide format: replace the country key with the full name
df['country'] = df['name']
df = df.drop('name', axis=1).rename(columns={'co2_emissions_tonnes_per_person': 'CO2'})
# keyword arguments: recent pandas releases no longer accept positional
# index/columns/values in DataFrame.pivot
df = df.pivot(index='country', columns='year', values='CO2').reset_index()
df.name = 'CO2_2013'
df.head()

year,country,1751,1752,1753,1754,1755,1756,1757,1758,1759,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,Afghanistan,,,,,,,,,,...,0.04041,0.0544,0.06552,0.08785,0.15895,0.24905,0.30291,0.42522,0.68802,0.69312
1,Albania,,,,,,,,,,...,1.34203,1.37998,1.27761,1.30428,1.47382,1.49426,1.58449,1.81554,1.63562,1.66974
2,Algeria,,,,,,,,,,...,2.72677,3.21986,2.99727,3.19557,3.16824,3.42982,3.30686,3.30026,3.47163,3.51446
3,Andorra,,,,,,,,,,...,7.49969,7.39095,6.83994,6.62244,6.52724,6.17852,6.0921,5.70224,5.61408,5.52625
4,Angola,,,,,,,,,,...,1.08651,1.06932,1.20077,1.31098,1.29557,1.35427,1.36921,1.38263,1.47212,1.38437


In [747]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229 entries, 0 to 228
Columns: 264 entries, country to 2013
dtypes: float64(263), object(1)
memory usage: 472.4+ KB


In [748]:
# compare with previous co2 data (last complete year: 2010)
df_old = pd.read_csv('data/archive/1indicator CDIAC carbon_dioxide_emissions_per_capita.csv')
df_old.rename(columns={'CO2 per capita': 'country'}, inplace=True)
df_old.head()

Unnamed: 0,country,1751,1755,1762,1763,1764,1765,1766,1767,1768,...,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012
0,Abkhazia,,,,,,,,,,...,,,,,,,,,,
1,Afghanistan,,,,,,,,,,...,0.022704,0.027472,0.03678,0.04709,0.068312,0.131602,0.213325,0.262174,,
2,Akrotiri and Dhekelia,,,,,,,,,,...,,,,,,,,,,
3,Albania,,,,,,,,,,...,1.382066,1.332966,1.353789,1.22431,1.27942,1.297753,1.215055,1.336544,,
4,Algeria,,,,,,,,,,...,2.899236,2.76222,3.25701,3.113135,3.312875,3.328945,3.564361,3.480977,3.562504,3.785654


In [743]:
df_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Columns: 254 entries, country to 2012
dtypes: float64(253), object(1)
memory usage: 466.4+ KB


In [749]:
# df_old has 235 countries, 6 more than the current df
# But the current df has _9_ more columns (years) than df old.

# the most crucial diff is the countries in the old df that are not in the current one.

# first clean the country column in each df
# clean_country_col overwrites the 'country' column of the frame it is given
# and returns that same object, so df_old and df themselves now hold the
# cleaned names (df_old_clean / df_clean are just additional references)
df_old_clean = clean_country_col(df_old)
df_clean = clean_country_col(df)

In [752]:
# 2010 countries NOT in 2013 dataset
notin_2013 = df_old.loc[~df_old['country'].isin(df['country'])]
notin_2013

Unnamed: 0,country,1751,1755,1762,1763,1764,1765,1766,1767,1768,...,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012
0,abkhazia,,,,,,,,,,...,,,,,,,,,,
2,akrotiri and dhekelia,,,,,,,,,,...,,,,,,,,,,
5,american samoa,,,,,,,,,,...,,,,,,,,,,
41,channel islands,,,,,,,,,,...,,,,,,,,,,
44,cocos island,,,,,,,,,,...,,,,,,,,,,
63,eritrea and ethiopia,,,,,,,,,,...,,,,,,,,,,
82,guam,,,,,,,,,,...,,,,,,,,,,
84,guernsey,,,,,,,,,,...,,,,,,,,,,
89,holy see,,,,,,,,,,...,,,,,,,,,,
99,isle of man,,,,,,,,,,...,,,,,,,,,,


OMG. 30 countries. But they seem to be mostly NaNs.
Let's verify.

In [753]:
# of these countries, how many actually have data?
# re-bind instead of set_index(inplace=True): notin_2013 is a .loc selection
# of df_old, and in-place changes on a selection trigger copy warnings
notin_2013 = notin_2013.set_index('country')
# True for a country whose every year is missing
notin_2013.isnull().all(axis=1)

country
abkhazia                     True
akrotiri and dhekelia        True
american samoa               True
channel islands              True
cocos island                 True
eritrea and ethiopia         True
guam                         True
guernsey                     True
holy see                     True
isle of man                  True
jersey                       True
kosovo                       True
mayotte                      True
monaco                       True
norfolk island               True
northern cyprus              True
northern mariana islands     True
pitcairn                     True
san marino                   True
serbia excluding kosovo      True
somaliland                   True
south ossetia                True
svalbard                     True
tokelau                      True
transnistria                 True
tuvalu                       True
west bank and gaza          False
western sahara              False
virgin islands u s           True
land  

In [754]:
# drop all-NaN rows
notin_2013 = notin_2013.dropna(how='all')
notin_2013

Unnamed: 0_level_0,1751,1755,1762,1763,1764,1765,1766,1767,1768,1769,...,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
west bank and gaza,,,,,,,,,,,...,0.374235,0.535766,0.771369,0.622923,0.623526,0.536602,0.531716,0.585513,,
western sahara,,,,,,,,,,,...,0.612034,0.57268,0.54109,0.516309,0.496383,0.479618,0.464255,0.449262,,


In [755]:
# add notin_2013 rows to the current df

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Indexing by country on the fly (instead of set_index(inplace=True)) leaves
# df untouched, so the cell can be re-run.
df_appended = pd.concat([df.set_index('country'), notin_2013])

# there now should be 231 rows instead of 229
df_appended.info()

<class 'pandas.core.frame.DataFrame'>
Index: 231 entries, afghanistan to western sahara
Columns: 263 entries, 1751 to 2013
dtypes: float64(263)
memory usage: 476.4+ KB


In [757]:
df_appended.tail()

Unnamed: 0_level_0,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
yugoslavia,,,,,,,,,,,...,,,,,,,,,,
zambia,,,,,,,,,,,...,0.18293,0.18998,0.16939,0.1396,0.16635,0.18703,0.1947,0.20451,0.23533,0.25084
zimbabwe,,,,,,,,,,,...,0.73716,0.82938,0.79573,0.74338,0.57572,0.60741,0.66307,0.81561,0.88788,0.92491
west bank and gaza,,,,,,,,,,,...,0.535766,0.771369,0.622923,0.623526,0.536602,0.531716,0.585513,,,
western sahara,,,,,,,,,,,...,0.57268,0.54109,0.516309,0.496383,0.479618,0.464255,0.449262,,,


In [758]:
# save as a csv so it can be 
# batched-processed with the other datasets
# NOTE(review): `co2_2013` is not assigned anywhere in the visible notebook
# (the frame built above is `df_appended`) -- confirm which frame is meant.
# NOTE(review): with index=False a frame indexed by country would be written
# without its country labels -- confirm the frame has a 'country' column.
co2_2013.to_csv(originalpath + '!CO2_2013.csv', index=False)