In [53]:
import numpy as np
import pandas as pd
from os import walk # to get list of filenames
import sys # to print function name within function
import pprint as pp

#for country name cleanup
import unidecode
import re

# Helper functions

In [2]:
# create dfs for later merging

def make_df(filename):
    name = filename.split('.')[0]
    f_type = filename.split('.')[1]
    if f_type=='xlsx':
        df = pd.read_excel(mypath + filename)
    elif f_type=='csv':
        df = pd.read_csv(mypath + filename)
    else:
        print 'Unknown file type'
    
    # make sure all DFs have a 'country' column
    # to simplify .join() later
    df.rename(columns={df.columns[0]:'country'}, inplace=True)
    
    # the .name attribute will provide be used
    # when reshaping, as the label for the value column
    df.name = name
    
    return df


In [3]:
# clean up country names before merging

# helper functions

# https://gist.github.com/gornostal/1f123aaf838506038710
def force_to_unicode(text):
    "If text is unicode, it is returned as is. If it's str, convert it to Unicode using UTF-8 encoding"
    return text if isinstance(text, unicode) else text.decode('utf8')

def remove_special_char(series):
    clean_series = re.sub('\W+',' ', series)
    return clean_series

def fix_country_col(df):
    country_col = str(df.columns[0])
    df.rename(columns = {country_col : 'country'}, inplace = True)
    return df

def clean_country_col(df):
    # trim, remove accents & special chars
    # make lowercase
    df['country'] = (
        df['country'].apply(force_to_unicode)
        .apply(remove_special_char)
        .str.lower().str.strip()
    )
    return df

def add_regions(df):
    regions = pd.read_csv('data/countries_with_regions.csv')
    regions = clean_country_col(regions)
    df = regions.merge(df, on='country', how='left')
    
    return df

def clean_gapminder_df(df):
    name = df.name
    df = fix_country_col(df)

    # remove special chars & accents
    df = clean_country_col(df)
    df['country'] = df['country'].apply(force_to_unicode).apply(remove_special_char).str.lower().str.strip()
    
    # restore title case
    df['country'] = df['country'].str.title()
    
    # re-assign original df name
    df.name = name

    return df

# create regions df
#regions = pd.read_json('data/all_countries.json')
#regions = regions[['name', 'region', 'sub-region']]

#regions = clean_gapminder_df(regions)

# save as csv
#regions.to_csv('data/regions_cleaned.csv', index = False)

In [4]:
# create list of filenames

def list_files(mypath):
    files = []
    [files.extend(filenames) for (dirpath, dirnames, filenames) in walk(mypath)]
    return files    

In [38]:
# formatting stuff

def repeat_to_length(s, wanted):
    return (s * (wanted//len(s) + 1))[:wanted]

In [46]:
# inspect gapminder DF

def df_min_max(df):
    #print '\n'
    print sys._getframe().f_code.co_name
    print df.columns[1:].min()
    print df.columns[1:].max()

# any years with missing data?
def df_yrs_nan_vals(df):
    #print '\n'  
    print sys._getframe().f_code.co_name
    print df.isnull().any().sum()

# any countries that have no data at all?
def df_countries_no_data(df):
    #print '\n'   
    print sys._getframe().f_code.co_name
    print df.iloc[:,3:].isnull().all().sum()

def inspect_df(df):
    name = str.upper(df.name)
    #print '\n'  
    print 'Inspecting %s:' % name
    df_min_max(df)
    df_yrs_nan_vals(df)
    df_countries_no_data(df)
    print repeat_to_length('*', 30)
    raw_input('Press <ENTER> to continue')
    print '\n'
    print df.info()
    print repeat_to_length('*', 30)
    raw_input('Press <ENTER> to continue')
    print '\n'
    print df.head()
    raw_input('Press <ENTER> to continue')

In [6]:
def get_val_names(dfs):
    names = [df.name.split('_')[0] for df in dfs]
    return names

In [7]:
# reshape into long format for easier plotting

def reshape_for_plot(df):
    #df = df.reset_index().copy()
    #df = df.set_index(['region', 'sub-region', 'country'])
    name = df.name.split('_')[0]
    df = df.set_index(['region', 'sub-region', 'country'])
    df = df.sort_index(level = 0)

    t = df.T
    t = t.unstack(level = 1)
    df = pd.DataFrame(t)
    
    df = df.reset_index()
    df = df.rename(columns={'level_1':'year', 0: name})
    
    # make sure year column is int type
    #df['year'] = df['year'].astype('int64')
    df.name = name

    return df

# Look at source files

In [8]:
originalpath = 'data/original/'
cleanpath = 'data/cleaned/'
mypath = originalpath

originals = list_files(mypath)
originals

['!CO2_2013.csv',
 'energy use per person.xlsx',
 'hdi_human_development_index.csv',
 'income_per_person_gdppercapita_ppp_inflation_adjusted.csv',
 'motor_vehicles_per_1000_pop2010.xlsx',
 'population.xlsx',
 'roads_paved_percent_of_total_roads.csv',
 'sulfur_emissions_per_person_kg.csv',
 'surviving_kids_per_woman.csv']

# Convert to DF

In [9]:
# batch convert files to df's
# and save filename info in another df

# extract file extension as new column
files_df = pd.Series(originals).str.split('.', expand=True).rename(columns={0:'clean_name', 1:'type'})

# keep original filename
files_df['orig_name']=originals

# convert files to DF
dfs = files_df['orig_name'].apply(make_df)

# remove accents and special chars
dfs = dfs.apply(clean_gapminder_df)

files_df

Unnamed: 0,clean_name,type,orig_name
0,!CO2_2013,csv,!CO2_2013.csv
1,energy use per person,xlsx,energy use per person.xlsx
2,hdi_human_development_index,csv,hdi_human_development_index.csv
3,income_per_person_gdppercapita_ppp_inflation_a...,csv,income_per_person_gdppercapita_ppp_inflation_a...
4,motor_vehicles_per_1000_pop2010,xlsx,motor_vehicles_per_1000_pop2010.xlsx
5,population,xlsx,population.xlsx
6,roads_paved_percent_of_total_roads,csv,roads_paved_percent_of_total_roads.csv
7,sulfur_emissions_per_person_kg,csv,sulfur_emissions_per_person_kg.csv
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv


# Clean filenames for use as labels

In [10]:
def clean_names(series):
    # replace spaces with undescores,
    # remove or shorten meaningless words
    series = series.str.upper()\
    .str.replace(' ', '_')\
    .str.replace('INDICATOR_', '')\
    .str.replace('PER_CAPITA', 'pc')\
    .str.replace('PER_PERSON', 'pc')\
    .str.lower()

    return series

In [11]:
files_df['clean_name'] = clean_names(files_df['clean_name'])

In [12]:
files_df

Unnamed: 0,clean_name,type,orig_name
0,!co2_2013,csv,!CO2_2013.csv
1,energy_use_pc,xlsx,energy use per person.xlsx
2,hdi_human_development_index,csv,hdi_human_development_index.csv
3,income_pc_gdppercapita_ppp_inflation_adjusted,csv,income_per_person_gdppercapita_ppp_inflation_a...
4,motor_vehicles_per_1000_pop2010,xlsx,motor_vehicles_per_1000_pop2010.xlsx
5,population,xlsx,population.xlsx
6,roads_paved_percent_of_total_roads,csv,roads_paved_percent_of_total_roads.csv
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv


In [13]:
# manually fix certain rows
fix_these_rows = [0,2,3,4,6]
needs_fixing = files_df.iloc[fix_these_rows]['clean_name']
needs_fixing.values

array(['!co2_2013', 'hdi_human_development_index',
       'income_pc_gdppercapita_ppp_inflation_adjusted',
       'motor_vehicles_per_1000_pop2010',
       'roads_paved_percent_of_total_roads'], dtype=object)

In [14]:
# copy-paste-modify
fixed = np.array(['CO2_pc','hdi',
       'income_pc',
       'motor_vehicles_per_1000',
       'roads_paved_%'])


In [15]:
# update rows with cleaned names
files_df['clean_name'].iloc[fix_these_rows] = fixed

In [16]:
# verify
files_df['clean_name']

0                      CO2_pc
1               energy_use_pc
2                         hdi
3                   income_pc
4     motor_vehicles_per_1000
5                  population
6               roads_paved_%
7      sulfur_emissions_pc_kg
8    surviving_kids_per_woman
Name: clean_name, dtype: object

In [17]:
# assign clean name as the df.name attribute
# these will be used later in plot titles
def update_name(df, clean_name):
    df.name = clean_name

# could not figure out how to do this with pd.Series.apply
map(update_name, dfs, files_df['clean_name'].values)

[None, None, None, None, None, None, None, None, None]

In [18]:
# update files_df to list df.names
dfnames = [df.name for df in dfs]
files_df['df_name'] = dfnames
files_df

Unnamed: 0,clean_name,type,orig_name,df_name
0,CO2_pc,csv,!CO2_2013.csv,CO2_pc
1,energy_use_pc,xlsx,energy use per person.xlsx,energy_use_pc
2,hdi,csv,hdi_human_development_index.csv,hdi
3,income_pc,csv,income_per_person_gdppercapita_ppp_inflation_a...,income_pc
4,motor_vehicles_per_1000,xlsx,motor_vehicles_per_1000_pop2010.xlsx,motor_vehicles_per_1000
5,population,xlsx,population.xlsx,population
6,roads_paved_%,csv,roads_paved_percent_of_total_roads.csv,roads_paved_%
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv,sulfur_emissions_pc_kg
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv,surviving_kids_per_woman


In [19]:
# add value names to files_df
# for use later in long format df's
valnames = [name.split('_')[0] for name in dfnames]
files_df['val_name'] = valnames
files_df

Unnamed: 0,clean_name,type,orig_name,df_name,val_name
0,CO2_pc,csv,!CO2_2013.csv,CO2_pc,CO2
1,energy_use_pc,xlsx,energy use per person.xlsx,energy_use_pc,energy
2,hdi,csv,hdi_human_development_index.csv,hdi,hdi
3,income_pc,csv,income_per_person_gdppercapita_ppp_inflation_a...,income_pc,income
4,motor_vehicles_per_1000,xlsx,motor_vehicles_per_1000_pop2010.xlsx,motor_vehicles_per_1000,motor
5,population,xlsx,population.xlsx,population,population
6,roads_paved_%,csv,roads_paved_percent_of_total_roads.csv,roads_paved_%,roads
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv,sulfur_emissions_pc_kg,sulfur
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv,surviving_kids_per_woman,surviving


In [20]:
# IS THIS STEP EVEN NEEDED?
# Save all to new CSV files
#dfs.apply(lambda df: df.to_csv(cleanpath + df.name + '.' + 'csv', index = False))

# Inspect each DF

In [45]:
[inspect_df(df) for df in dfs]

Inspecting CO2_PC:
df_min_max
1751
2013
df_yrs_nan_vals
263
df_countries_no_data
0
*******************************
Press <ENTER> to continue


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229 entries, 0 to 228
Columns: 264 entries, country to 2013
dtypes: float64(263), object(1)
memory usage: 472.4+ KB
None
*******************************


       country  1751  1752  1753  1754  1755  1756  1757  1758  1759   ...     \
0  Afghanistan   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      
1      Albania   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      
2      Algeria   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      
3      Andorra   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      
4       Angola   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      

      2004     2005     2006     2007     2008     2009     2010     2011  \
0  0.04041  0.05440  0.06552  0.08785  0.15895  0.24905  0.30291  0.42522   
1  1.34203 

Press <ENTER> to continue
Inspecting MOTOR_VEHICLES_PER_1000:
df_min_max
2002
2007
df_yrs_nan_vals
6
df_countries_no_data
0
*******************************
Press <ENTER> to continue


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 7 columns):
country    161 non-null object
2002       82 non-null float64
2003       78 non-null float64
2004       65 non-null float64
2005       50 non-null float64
2006       38 non-null float64
2007       149 non-null float64
dtypes: float64(6), object(1)
memory usage: 8.9+ KB
None
*******************************


       country  2002  2003  2004       2005       2006        2007
0  Afghanistan   NaN   NaN   NaN        NaN        NaN   22.809539
1      Albania  73.0   NaN  85.0  87.475235  97.318069  102.212411
2      Algeria   NaN  88.0  89.0  91.000000        NaN         NaN
3       Angola   NaN   NaN   NaN        NaN        NaN   39.593660
4    Argentina   NaN   NaN   NaN        NaN        NaN  313.893647
P

Press <ENTER> to continue
Inspecting SURVIVING_KIDS_PER_WOMAN:
df_min_max
1760
2015
df_yrs_nan_vals
195
df_countries_no_data
5
*******************************
Press <ENTER> to continue


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Columns: 257 entries, country to 2015
dtypes: float64(256), object(1)
memory usage: 361.5+ KB
None
*******************************


       country  1760  1761  1762  1763  1764  1765  1766  1767  1768  ...   \
0  Afghanistan   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
1      Albania   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
2      Algeria   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
3       Angola   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
4    Argentina   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    

   2006  2007  2008  2009  2010  2011  2012  2013  2014  2015  
0  4.54  4.45  4.44  4.35  4.26   NaN   NaN   NaN   NaN   NaN  
1  1.61  1.

[None, None, None, None, None, None, None, None, None]

## Surviving DF contains projections

Unlike the other DFs, this one looks into the future, which will create NANs all over the place in the merged DF.

Let's drop these projections.

In [22]:
# find which DF this is
files_df

Unnamed: 0,clean_name,type,orig_name,df_name,val_name
0,CO2_pc,csv,!CO2_2013.csv,CO2_pc,CO2
1,energy_use_pc,xlsx,energy use per person.xlsx,energy_use_pc,energy
2,hdi,csv,hdi_human_development_index.csv,hdi,hdi
3,income_pc,csv,income_per_person_gdppercapita_ppp_inflation_a...,income_pc,income
4,motor_vehicles_per_1000,xlsx,motor_vehicles_per_1000_pop2010.xlsx,motor_vehicles_per_1000,motor
5,population,xlsx,population.xlsx,population,population
6,roads_paved_%,csv,roads_paved_percent_of_total_roads.csv,roads_paved_%,roads
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv,sulfur_emissions_pc_kg,sulfur
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv,surviving_kids_per_woman,surviving


In [23]:
# it's the final (8th) df
# remove projection years (post 2015) for Surviving DF

##TODO: store dfs as col in files_df?

start = dfs[8].columns.get_loc('2016')
end = dfs[8].columns.get_loc('2099') + 1

dfs[8].drop(dfs[8].columns[start:end], axis=1, inplace=True)

In [24]:
# verify
dfs[8].tail()

Unnamed: 0,country,1760,1761,1762,1763,1764,1765,1766,1767,1768,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
175,Venezuela,,,,,,,,,,...,2.44,2.42,2.39,2.36,2.33,,,,,
176,Vietnam,,,,,,,,,,...,1.78,1.77,1.76,1.75,1.73,,,,,
177,Yemen,,,,,,,,,,...,4.82,4.77,4.68,4.64,4.55,,,,,
178,Zambia,,,,,,,,,,...,3.92,4.02,4.12,4.22,4.33,,,,,
179,Zimbabwe,,,,,,,,,,...,2.26,2.27,2.28,2.32,2.32,,,,,


In [25]:
# are there still rows that are all NaNs?
dfs[8].set_index('country').loc[dfs[8].set_index('country').isnull().all(1)]

Unnamed: 0_level_0,1760,1761,1762,1763,1764,1765,1766,1767,1768,1769,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# Create long DFs

In [26]:
dfs[0].head()

Unnamed: 0,country,1751,1752,1753,1754,1755,1756,1757,1758,1759,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,Afghanistan,,,,,,,,,,...,0.04041,0.0544,0.06552,0.08785,0.15895,0.24905,0.30291,0.42522,0.68802,0.69312
1,Albania,,,,,,,,,,...,1.34203,1.37998,1.27761,1.30428,1.47382,1.49426,1.58449,1.81554,1.63562,1.66974
2,Algeria,,,,,,,,,,...,2.72677,3.21986,2.99727,3.19557,3.16824,3.42982,3.30686,3.30026,3.47163,3.51446
3,Andorra,,,,,,,,,,...,7.49969,7.39095,6.83994,6.62244,6.52724,6.17852,6.0921,5.70224,5.61408,5.52625
4,Angola,,,,,,,,,,...,1.08651,1.06932,1.20077,1.31098,1.29557,1.35427,1.36921,1.38263,1.47212,1.38437


In [27]:
def make_df_long(df):
    vals = df.columns[1:].values
    df_long = pd.melt(
        df, id_vars=['country'], value_vars=vals
    ).sort_values(
        ['country', 'variable']).rename(
        columns={'value':df.name, 'variable':'year'}
                         ).reset_index(drop=True
                                      )
    df_long.name = df.name
    df_long['year']=df_long['year'].astype('str')
    return df_long

In [28]:
long_dfs = [make_df_long(dfs[i]) for i, _ in enumerate(dfs)]
len(long_dfs)

9

# Join long DFs

In [106]:
# thank you http://notconfusing.com/joining-many-dataframes-at-once-in-pandas-n-ary-join/
def merge_dfs(ldf, rdf):
    right_on = list(rdf.columns[0:2])
    left_on = list(ldf.columns[0:2])
    return ldf.merge(rdf, how='left', left_on=left_on, right_on=right_on)
    

In [279]:
final_df = reduce(merge_dfs, long_dfs) #that's the magic
final_df.tail(20)

Unnamed: 0,country,year,CO2_pc,energy_use_pc,hdi,income_pc,motor_vehicles_per_1000,population,roads_paved_%,sulfur_emissions_pc_kg,surviving_kids_per_woman
60207,Zimbabwe,1994,1.53736,0.842736,0.475,2520.0,,11476807.0,54.9,7.72,3.47
60208,Zimbabwe,1995,1.29491,0.842225,0.465,2480.0,,11683136.0,48.0,7.25,3.24
60209,Zimbabwe,1996,1.26229,0.824112,0.46,2690.0,,11877664.0,47.4,6.82,3.04
60210,Zimbabwe,1997,1.19457,0.802734,0.451,2710.0,,12059858.0,47.4,6.3,2.85
60211,Zimbabwe,1998,1.16567,0.809675,0.442,2750.0,,12226742.0,47.4,6.64,2.67
60212,Zimbabwe,1999,1.27951,0.853539,0.434,2690.0,,12374019.0,,7.3,2.56
60213,Zimbabwe,2000,1.11379,0.790319,0.427,2570.0,,12499981.0,,7.15,2.46
60214,Zimbabwe,2001,0.99929,0.772111,0.427,2580.0,,12603988.0,,,2.36
60215,Zimbabwe,2002,0.94271,0.770138,0.418,2320.0,,12691431.0,19.0,,2.28
60216,Zimbabwe,2003,0.83413,0.754592,0.407,1910.0,,12774162.0,,,2.25


## Add region & sub_region columns

In [280]:
regions = pd.read_csv('data/countries_with_regions.csv')

# make 'country' lowercase in both df's
regions['country'] = regions['country'].str.lower()
final_df['country'] = final_df['country'].str.lower()

final_df = final_df.merge(regions, on='country', how='left')
final_df.head()

Unnamed: 0,country,year,CO2_pc,energy_use_pc,hdi,income_pc,motor_vehicles_per_1000,population,roads_paved_%,sulfur_emissions_pc_kg,surviving_kids_per_woman,region,sub-region
0,afghanistan,1751,,,,,,,,,,Asia,Southern Asia
1,afghanistan,1752,,,,,,,,,,Asia,Southern Asia
2,afghanistan,1753,,,,,,,,,,Asia,Southern Asia
3,afghanistan,1754,,,,,,,,,,Asia,Southern Asia
4,afghanistan,1755,,,,,,,,,,Asia,Southern Asia


In [273]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60227 entries, 0 to 60226
Data columns (total 13 columns):
country                     60227 non-null object
year                        60227 non-null object
CO2_pc                      17055 non-null float64
energy_use_pc               5366 non-null float64
hdi                         3948 non-null float64
income_pc                   40660 non-null float64
motor_vehicles_per_1000     457 non-null float64
population                  17769 non-null float64
roads_paved_%               2031 non-null float64
sulfur_emissions_pc_kg      24613 non-null float64
surviving_kids_per_woman    31527 non-null float64
region                      51548 non-null object
sub-region                  51548 non-null object
dtypes: float64(9), object(4)
memory usage: 6.4+ MB


## Remove rows without CO2

This process can add rows without CO2 values. 

Since I'm most primarily interested by CO2, might as well remove those rows: less visual clutter, and it saves memory.

In [281]:
# how many rows are missing CO2 emission values?,
len(final_df.loc[final_df['CO2_pc'].isnull()])

43172

In [282]:
# drop all countries missing CO2 data
no_co2 = final_df[final_df['CO2_pc'].isnull()].index
final_df.drop(no_co2, inplace=True)
final_df.reset_index(inplace=True, drop=True)

In [283]:
final_df.head()

Unnamed: 0,country,year,CO2_pc,energy_use_pc,hdi,income_pc,motor_vehicles_per_1000,population,roads_paved_%,sulfur_emissions_pc_kg,surviving_kids_per_woman,region,sub-region
0,afghanistan,1949,0.00182,,,1030.0,,,,0.0477,,Asia,Southern Asia
1,afghanistan,1950,0.01088,,,1040.0,,7752118.0,,0.0758,2.89,Asia,Southern Asia
2,afghanistan,1951,0.01169,,,1060.0,,7839426.0,,0.0932,2.89,Asia,Southern Asia
3,afghanistan,1952,0.01155,,,1070.0,,7934798.0,,0.11,3.02,Asia,Southern Asia
4,afghanistan,1953,0.01323,,,1120.0,,8038312.0,,0.126,3.02,Asia,Southern Asia


In [284]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17055 entries, 0 to 17054
Data columns (total 13 columns):
country                     17055 non-null object
year                        17055 non-null object
CO2_pc                      17055 non-null float64
energy_use_pc               5324 non-null float64
hdi                         3898 non-null float64
income_pc                   14969 non-null float64
motor_vehicles_per_1000     456 non-null float64
population                  12517 non-null float64
roads_paved_%               1977 non-null float64
sulfur_emissions_pc_kg      11230 non-null float64
surviving_kids_per_woman    13919 non-null float64
region                      15328 non-null object
sub-region                  15328 non-null object
dtypes: float64(9), object(4)
memory usage: 1.7+ MB


## Fill in missing regions

In [356]:
# troubleshooting
country = missing_regions['country'].unique()
countries = pd.DataFrame(country, columns=['country'])
countries['possible_matches'] = ""
countries

Unnamed: 0,country,possible_matches
0,central african republic,
1,christmas island,
2,congo dem rep,
3,congo rep,
4,cook is,
5,cura ao,
6,czech republic,
7,czechoslovakia,
8,dominican republic,
9,east germany,


In [404]:
# troubleshooting cont'd

def match_first_word(country):
    first_word = country.split()[0]
    possible_matches = regions['country'].loc[regions['country'].str.startswith(first_word)].to_dict()
    return possible_matches

In [405]:
countries['possible_matches'] = countries['country'].apply(match_first_word)
countries

Unnamed: 0,country,possible_matches
0,central african republic,{42: u'central african republic'}
1,christmas island,{46: u'christmas island'}
2,congo dem rep,"{50: u'congo', 51: u'congo democratic republic..."
3,congo rep,"{50: u'congo', 51: u'congo democratic republic..."
4,cook is,{52: u'cook islands'}
5,cura ao,{57: u'curacao'}
6,czech republic,{59: u'czech republic'}
7,czechoslovakia,{}
8,dominican republic,{63: u'dominican republic'}
9,east germany,{}


In [412]:
# and it continues with the troubleshooting (er, testing)

def match_second_word(country):
    if len(country.split()) > 1:
        second_word = country.split()[1]
        possible_matches = regions['country'].loc[regions['country'].str.contains(second_word)].to_dict()
        return possible_matches
    else:
        pass

In [414]:
countries['more_matches'] = countries['country'].apply(match_second_word)
countries

Unnamed: 0,country,possible_matches,more_matches
0,central african republic,{42: u'central african republic'},{42: u'central african republic'}
1,christmas island,{46: u'christmas island'},"{96: u'heard island and mcdonald islands', 1: ..."
2,congo dem rep,"{50: u'congo', 51: u'congo democratic republic...","{121: u'lao people s democratic republic', 51:..."
3,congo rep,"{50: u'congo', 51: u'congo democratic republic...",{131: u'macedonia the former yugoslav republic...
4,cook is,{52: u'cook islands'},"{0: u'afghanistan', 1: u'aland islands', 236: ..."
5,cura ao,{57: u'curacao'},"{57: u'curacao', 130: u'macao', 193: u'sao tom..."
6,czech republic,{59: u'czech republic'},{131: u'macedonia the former yugoslav republic...
7,czechoslovakia,{},
8,dominican republic,{63: u'dominican republic'},{131: u'macedonia the former yugoslav republic...
9,east germany,{},{82: u'germany'}


In [None]:
# make country & year index
final_df.set_index(['country', 'year'], inplace=True)

# how many countries missing region data?
missing_regions = final_df.loc[final_df['region'].isnull()]

# save this index to later assign filled in region data to final_df
missing_idx = final_df['region'].isnull().index

# reset index, as 'country' column is needed now
final_df.reset_index(inplace=True)
final_df.head()

In [288]:
missing_regions.reset_index(inplace=True)
print(missing_regions['country'].unique())
print(len(missing_regions['country'].unique()))

[u'central african republic' u'christmas island' u'congo dem rep'
 u'congo rep' u'cook is' u'cura ao' u'czech republic' u'czechoslovakia'
 u'dominican republic' u'east germany' u'falkland is malvinas'
 u'kyrgyz republic' u'lao' u'liechtenstein' u'micronesia fed sts'
 u'north korea' u'north yemen former' u'palestine' u'puerto rico'
 u'serbia and montenegro' u'south korea' u'south sudan'
 u'south yemen former' u'st helena' u'st kitts and nevis' u'st lucia'
 u'st pierre et miquelon' u'st vincent and the grenadines'
 u'united korea former' u'ussr' u'west germany' u'yemen' u'yugoslavia']
33


In [289]:
# try matching with original json regions file
regions = pd.read_csv('data/regions_cleaned.csv')

# make 'country' lowercase in this df
regions['country'] = regions['country'].str.lower()

# drop region & sub-region first, since this 2nd 'merge' will add them
missing_regions = missing_regions.drop( 
    ['region', 'sub-region'], axis=1).merge(
    regions, on='country', how='left'
)

In [290]:
# how many countries found matches?
len(missing_regions['country'].loc[~missing_regions['region'].isnull()].unique())

7

## Painful partial-string matching

It's not elegant, but it works in *most* cases.
Would love feedback on how to make this better.

### Troubleshooting matches

In [297]:
# since partial string matching isn't working for 
# country names where keyword is NOT the first word

print(regions.loc[regions['country'].str.contains('kitt')])

# were there matches in final_df?
print(final_df['country'].loc[final_df['country'].str.contains('kitt')]).unique()

                   country    region sub-region
186  saint kitts and nevis  Americas  Caribbean
[u'st kitts and nevis']


### Resume partial-string matching

In [318]:
# how many countries _still_ missing region data?
missing_regions
#still_missing = missing_regions['country'].loc[missing_regions['region'].isnull()].unique()
#len(still_missing)

Unnamed: 0,country,year,CO2_pc,energy_use_pc,hdi,income_pc,motor_vehicles_per_1000,population,roads_paved_%,sulfur_emissions_pc_kg,surviving_kids_per_woman,region,sub-region
0,central african republic,1959,0.06692,,,1440.0,,1479472.0,,8.24,2.92,Asia,Western Asia
1,central african republic,1960,0.05853,,,1430.0,,1503501.0,,7.51,2.94,Asia,Western Asia
2,central african republic,1961,0.05755,,,1450.0,,1529229.0,,7.74,3.04,Asia,Western Asia
3,central african republic,1962,0.04711,,,1380.0,,1556656.0,,7.90,3.06,Asia,Western Asia
4,central african republic,1963,0.04624,,,1350.0,,1585765.0,,7.50,3.16,Asia,Western Asia
5,central african republic,1964,0.04537,,,1340.0,,1616515.0,,8.36,3.17,Asia,Western Asia
6,central african republic,1965,0.05337,,,1320.0,,1648830.0,,8.88,3.26,Asia,Western Asia
7,central african republic,1966,0.05011,,,1280.0,,1682874.0,,8.54,3.27,Asia,Western Asia
8,central african republic,1967,0.05334,,,1310.0,,1718558.0,,8.07,3.36,Asia,Western Asia
9,central african republic,1968,0.10445,,,1290.0,,1755260.0,,8.55,3.44,Asia,Western Asia


In [298]:
# partial string matching
matches = []
for country in still_missing:
    country = str(country)
    poss_match = {'co2_country': country}
    country_split = str.split(country)
    
    # does the field start with country name?
    if regions['country'].str.startswith(country).any():
            poss_match['regions_country'] = dict(regions[regions['country'].str.startswith(country)].country)
    
    # does the field contain the first word of the country match?
    elif regions['country'].str.contains(country_split[0]).any():
            poss_match['regions_country'] = dict(regions[regions['country'].str.contains(country_split[0])].country)
            
    # does the field contain the second word of the country?
    elif len(country_split) > 1: 
        if regions['country'].str.contains(country_split[1]).any():
            poss_match['regions_country'] = dict(regions[regions['country'].str.contains(country_split[1])].country)
        else: 
            poss_match['regions_country'] = None
    else:
        poss_match['regions_country'] = None
        
    matches.append(poss_match)

In [299]:
# confirm matches when there is only one suggested match found
for dct in matches:
    if dct['regions_country'] == None:
        dct['matched'] = "n"
    elif  len(dct['regions_country']) > 1:
        dct['matched'] = 'n'
    elif len(dct['regions_country']) == 1:
        print dct
        dct['matched'] = raw_input('Is this a match? ([y] or n) ') or 'y'
        if dct['matched'] == 'y':
            #print dct['regions_country'].keys()[0]
            matching_key = dct['regions_country'].keys()[0]
            dct['regions_country'] = dct['regions_country'].values()[0]
            dct['matched'] = 'y'
            
            
            
            # add region & sub-region values from regions df
            missing_regions['region'] = regions['region'].at[int(matching_key)]
            missing_regions['sub-region'] = regions['sub-region'].at[int(matching_key)]
            
            # remove from this list
            matches.remove(dct)
            
    else:
        dct['matched'] = 'n'

{'co2_country': 'christmas island', 'regions_country': {46: 'christmas island'}}
Is this a match? ([y] or n) y
{'co2_country': 'cook is', 'regions_country': {52: 'cook islands'}}
Is this a match? ([y] or n) 
{'co2_country': 'east germany', 'regions_country': {82: 'germany'}}
Is this a match? ([y] or n) 
{'co2_country': 'kyrgyz republic', 'regions_country': {120: 'kyrgyzstan'}}
Is this a match? ([y] or n) 
{'co2_country': 'micronesia fed sts', 'regions_country': {144: 'micronesia federated states of '}}
Is this a match? ([y] or n) 
{'co2_country': 'palestine', 'regions_country': {169: 'palestine state of'}}
Is this a match? ([y] or n) 
{'co2_country': 'west germany', 'regions_country': {245: 'western sahara'}}
Is this a match? ([y] or n) n


In [300]:
# what does this look like now?
pp.pprint(matches)

[{'co2_country': 'congo dem rep',
  'regions_country': {50: 'congo', 51: 'congo democratic republic of the '}},
 {'co2_country': 'congo rep',
  'matched': 'n',
  'regions_country': {50: 'congo', 51: 'congo democratic republic of the '}},
 {'co2_country': 'cura ao', 'regions_country': {57: 'curacao'}},
 {'co2_country': 'czechoslovakia', 'matched': 'n', 'regions_country': None},
 {'co2_country': 'falkland is malvinas',
  'regions_country': {71: 'falkland islands malvinas '}},
 {'co2_country': 'lao',
  'regions_country': {121: 'lao people s democratic republic'}},
 {'co2_country': 'north korea',
  'regions_country': {164: 'northern mariana islands',
                      234: 'united kingdom of great britain and northern ireland'}},
 {'co2_country': 'north yemen former',
  'matched': 'n',
  'regions_country': {164: 'northern mariana islands',
                      234: 'united kingdom of great britain and northern ireland'}},
 {'co2_country': 'serbia and montenegro', 'regions_country': {1

In [314]:
##### for multiple matches (stored as dict), select correct country
## TODO: confirm selected key, then drop that key from dict

for dct in matches:
    if dct['matched'] != 'y': 
        pp.pprint(dct)
        matching_key = raw_input('Enter the key number of the matching country. If none match, type \'n\'.')
        if matching_key == 'n':
            dct['regions_country'] = None
            dct['matched'] = matching_key

        elif matching_key != 'n':
            while int(matching_key) not in dct['regions_country'].keys():
                print "Error: you entered "+ matching_key
                print "This is not a valid key number. Please try again."
                print "Valid keys are:"
                print dct['regions_country'].keys()
                matching_key = raw_input('Enter the key number of the matching country. If none match, type \'n\'.')


            print "You selected " + dct['regions_country'][int(matching_key)] + "."
            dct['regions_country'] = dct['regions_country'][int(matching_key)]
            dct['matched'] = 'y'

            # add region & sub-region values from regions df
            missing_regions['region'] = regions.region.at[int(matching_key)]
            missing_regions['sub-region'] = regions['sub-region'].at[int(matching_key)]
            matches.remove(dct)

            # remove this country from all other 'possible match' dicts in matches
            #for __,dct in enumerate(matches):
                #if int(matching_key) in dct['regions_country'].keys():
                    #del dct['regions_country'][int(matching_key)]
                #else:
                    #pass
                

{'co2_country': 'czechoslovakia', 'matched': 'n', 'regions_country': None}
Enter the key number of the matching country. If none match, type 'n'.n
{'co2_country': 'north korea', 'matched': 'n', 'regions_country': None}
Enter the key number of the matching country. If none match, type 'n'.n
{'co2_country': 'north yemen former', 'matched': 'n', 'regions_country': None}
Enter the key number of the matching country. If none match, type 'n'.n
{'co2_country': 'south korea', 'matched': 'n', 'regions_country': None}
Enter the key number of the matching country. If none match, type 'n'.n
{'co2_country': 'south yemen former', 'matched': 'n', 'regions_country': None}
Enter the key number of the matching country. If none match, type 'n'.n
{'co2_country': 'st kitts and nevis', 'matched': 'n', 'regions_country': None}
Enter the key number of the matching country. If none match, type 'n'.n
{'co2_country': 'st lucia', 'matched': 'n', 'regions_country': None}
Enter the key number of the matching countr

KeyboardInterrupt: 

In [160]:
matches

[{'co2_country': 'christmas island',
  'matched': 'y',
  'region': nan,
  'regions_country': 'christmas island',
  'sub-region': nan},
 {'co2_country': 'congo dem rep',
  'matched': 'y',
  'region': 'Africa',
  'regions_country': 'congo democratic republic of the ',
  'sub-region': 'Middle Africa'},
 {'co2_country': 'congo rep',
  'matched': 'y',
  'region': 'Africa',
  'regions_country': 'congo',
  'sub-region': 'Middle Africa'},
 {'co2_country': 'cook is',
  'matched': 'y',
  'region': 'Oceania',
  'regions_country': 'cook islands',
  'sub-region': 'Polynesia'},
 {'co2_country': 'cura ao',
  'matched': 'y',
  'region': 'Americas',
  'regions_country': 'curacao',
  'sub-region': 'Caribbean'},
 {'co2_country': 'czechoslovakia', 'matched': 'n', 'regions_country': None},
 {'co2_country': 'east germany', 'matched': 'n', 'regions_country': None},
 {'co2_country': 'falkland is malvinas',
  'matched': 'y',
  'region': 'Americas',
  'regions_country': 'falkland islands malvinas ',
  'sub-regi

In [None]:
def add_missing_regions(merged_df, matching_key, current_country):
    # update merged_df with region & sub-region values
    reg, sub_reg = regions.loc[matching_key, ['region', 'sub-region']]
    merged_df.loc[merged_df['country']== current_country, 'region'] = reg
    merged_df.loc[merged_df['country']== current_country, 'sub-region'] = sub_reg
    return merged_df

## Rearrange columns

In [31]:
# rearrange column order so region info beside country col
cols = final_df.columns.tolist()

newcols = [cols[0]]
newcols.extend(cols[-2:])
newcols.extend(cols[1:-2])

newcols

['country',
 'region',
 'sub-region',
 'year',
 'CO2_pc',
 'energy_use_pc',
 'hdi',
 'income_pc',
 'motor_vehicles_per_1000',
 'population',
 'roads_paved_%',
 'sulfur_emissions_pc_kg',
 'surviving_kids_per_woman']

In [32]:
final_df = final_df[newcols]
final_df.head()

Unnamed: 0,country,region,sub-region,year,CO2_pc,energy_use_pc,hdi,income_pc,motor_vehicles_per_1000,population,roads_paved_%,sulfur_emissions_pc_kg,surviving_kids_per_woman
0,Afghanistan,Asia,Southern Asia,1751,,,,,,,,,
1,Afghanistan,Asia,Southern Asia,1752,,,,,,,,,
2,Afghanistan,Asia,Southern Asia,1753,,,,,,,,,
3,Afghanistan,Asia,Southern Asia,1754,,,,,,,,,
4,Afghanistan,Asia,Southern Asia,1755,,,,,,,,,


In [863]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17055 entries, 198 to 60226
Data columns (total 13 columns):
country                     17055 non-null object
region                      14970 non-null object
sub-region                  14970 non-null object
year                        17055 non-null object
CO2_pc                      17055 non-null float64
energy_use_pc               5324 non-null float64
hdi                         3898 non-null float64
income_pc                   14969 non-null float64
motor_vehicles_per_1000     456 non-null float64
population                  12517 non-null float64
roads_paved_%               1977 non-null float64
sulfur_emissions_pc_kg      11230 non-null float64
surviving_kids_per_woman    13919 non-null float64
dtypes: float64(9), object(4)
memory usage: 1.8+ MB


In [865]:
# save as csv
final_df.to_csv('data/final/final_df.csv', index=False)

# Updates

The original CO2 dataset went up to 2012, but 2010 was the last year there was substantial data.
I'd seen references online to 2014 data, but finding the actual data files was a challenge. I had several false starts.

Finally, it was the [Open Numbers github account](https://github.com/open-numbers/ddf--gapminder--co2_emission) that helped me.

In [746]:
# load data
url_co2 = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--co2_emission/master/ddf--datapoints--co2_emissions_tonnes_per_person--by--country--year.csv'
url_countries = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--co2_emission/master/ddf--entities--country.csv'

co2 = pd.read_csv(url_co2) # emissions values
countries = pd.read_csv(url_countries) # country key

df = countries.merge(co2, on='country')

# hoping this fixes concat issues when trying add rows from previous version of DF
df['year'] = df['year'].astype('str')

# put into standard wide format
df['country'] = df['name']
df = df.drop('name', axis=1).rename(columns={'co2_emissions_tonnes_per_person': 'CO2'})
df = df.pivot('country', 'year', 'CO2').reset_index()
df.name = 'CO2_2013'
df.head()

year,country,1751,1752,1753,1754,1755,1756,1757,1758,1759,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,Afghanistan,,,,,,,,,,...,0.04041,0.0544,0.06552,0.08785,0.15895,0.24905,0.30291,0.42522,0.68802,0.69312
1,Albania,,,,,,,,,,...,1.34203,1.37998,1.27761,1.30428,1.47382,1.49426,1.58449,1.81554,1.63562,1.66974
2,Algeria,,,,,,,,,,...,2.72677,3.21986,2.99727,3.19557,3.16824,3.42982,3.30686,3.30026,3.47163,3.51446
3,Andorra,,,,,,,,,,...,7.49969,7.39095,6.83994,6.62244,6.52724,6.17852,6.0921,5.70224,5.61408,5.52625
4,Angola,,,,,,,,,,...,1.08651,1.06932,1.20077,1.31098,1.29557,1.35427,1.36921,1.38263,1.47212,1.38437


In [747]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229 entries, 0 to 228
Columns: 264 entries, country to 2013
dtypes: float64(263), object(1)
memory usage: 472.4+ KB


In [748]:
# compare with previous co2 data (last complete year: 2010)
df_old = pd.read_csv('data/archive/1indicator CDIAC carbon_dioxide_emissions_per_capita.csv')
df_old.rename(columns={'CO2 per capita': 'country'}, inplace=True)
df_old.head()

Unnamed: 0,country,1751,1755,1762,1763,1764,1765,1766,1767,1768,...,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012
0,Abkhazia,,,,,,,,,,...,,,,,,,,,,
1,Afghanistan,,,,,,,,,,...,0.022704,0.027472,0.03678,0.04709,0.068312,0.131602,0.213325,0.262174,,
2,Akrotiri and Dhekelia,,,,,,,,,,...,,,,,,,,,,
3,Albania,,,,,,,,,,...,1.382066,1.332966,1.353789,1.22431,1.27942,1.297753,1.215055,1.336544,,
4,Algeria,,,,,,,,,,...,2.899236,2.76222,3.25701,3.113135,3.312875,3.328945,3.564361,3.480977,3.562504,3.785654


In [743]:
df_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Columns: 254 entries, country to 2012
dtypes: float64(253), object(1)
memory usage: 466.4+ KB


In [749]:
# df_old has 235 countries, 6 more than the current df
# But the current df has _9_ more columns (years) than df old.

# the most crucial diff is the countries in the old df that are not in the current one.

# first clean colums in each df
df_old_clean = clean_country_col(df_old)
df_clean = clean_country_col(df)

In [752]:
# 2010 countries NOT in 2013 dataset
notin_2013 = df_old.loc[~df_old['country'].isin(df['country'])]
notin_2013

Unnamed: 0,country,1751,1755,1762,1763,1764,1765,1766,1767,1768,...,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012
0,abkhazia,,,,,,,,,,...,,,,,,,,,,
2,akrotiri and dhekelia,,,,,,,,,,...,,,,,,,,,,
5,american samoa,,,,,,,,,,...,,,,,,,,,,
41,channel islands,,,,,,,,,,...,,,,,,,,,,
44,cocos island,,,,,,,,,,...,,,,,,,,,,
63,eritrea and ethiopia,,,,,,,,,,...,,,,,,,,,,
82,guam,,,,,,,,,,...,,,,,,,,,,
84,guernsey,,,,,,,,,,...,,,,,,,,,,
89,holy see,,,,,,,,,,...,,,,,,,,,,
99,isle of man,,,,,,,,,,...,,,,,,,,,,


OMG. 30 countries. But they seem be mostly NANs.
Let's verify

In [753]:
# of these countries, how many actually have data?
notin_2013.set_index('country', inplace=True)
notin_2013.isnull().all(1)

country
abkhazia                     True
akrotiri and dhekelia        True
american samoa               True
channel islands              True
cocos island                 True
eritrea and ethiopia         True
guam                         True
guernsey                     True
holy see                     True
isle of man                  True
jersey                       True
kosovo                       True
mayotte                      True
monaco                       True
norfolk island               True
northern cyprus              True
northern mariana islands     True
pitcairn                     True
san marino                   True
serbia excluding kosovo      True
somaliland                   True
south ossetia                True
svalbard                     True
tokelau                      True
transnistria                 True
tuvalu                       True
west bank and gaza          False
western sahara              False
virgin islands u s           True
land  

In [754]:
# drop all-NaN rows
notin_2013 = notin_2013.dropna(how='all')
notin_2013

Unnamed: 0_level_0,1751,1755,1762,1763,1764,1765,1766,1767,1768,1769,...,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
west bank and gaza,,,,,,,,,,,...,0.374235,0.535766,0.771369,0.622923,0.623526,0.536602,0.531716,0.585513,,
western sahara,,,,,,,,,,,...,0.612034,0.57268,0.54109,0.516309,0.496383,0.479618,0.464255,0.449262,,


In [755]:
# add notin_2013 rows to the current df

df.set_index('country', inplace=True)
df_appended = df.append(notin_2013)

# there now should be 231 rows instead of 229
df_appended.info()

<class 'pandas.core.frame.DataFrame'>
Index: 231 entries, afghanistan to western sahara
Columns: 263 entries, 1751 to 2013
dtypes: float64(263)
memory usage: 476.4+ KB


In [757]:
df_appended.tail()

Unnamed: 0_level_0,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
yugoslavia,,,,,,,,,,,...,,,,,,,,,,
zambia,,,,,,,,,,,...,0.18293,0.18998,0.16939,0.1396,0.16635,0.18703,0.1947,0.20451,0.23533,0.25084
zimbabwe,,,,,,,,,,,...,0.73716,0.82938,0.79573,0.74338,0.57572,0.60741,0.66307,0.81561,0.88788,0.92491
west bank and gaza,,,,,,,,,,,...,0.535766,0.771369,0.622923,0.623526,0.536602,0.531716,0.585513,,,
western sahara,,,,,,,,,,,...,0.57268,0.54109,0.516309,0.496383,0.479618,0.464255,0.449262,,,


In [758]:
# save as a csv so it can be 
# batched-processed with the other datasets
co2_2013.to_csv(originalpath + '!CO2_2013.csv', index=False)