In [28]:
import numpy as np
import pandas as pd
from os import walk # to get list of filenames
import sys # to print function name within function

#for country name cleanup
import unidecode
import re

# Helper functions

In [29]:
# create dfs for later merging

def make_df(filename):
    name = filename.split('.')[0]
    f_type = filename.split('.')[1]
    if f_type=='xlsx':
        df = pd.read_excel(mypath + filename)
    elif f_type=='csv':
        df = pd.read_csv(mypath + filename)
    else:
        print 'Unknown file type'
    #val_name = df.name.split('_')[0]
    df.rename(columns={df.columns[0]:'country'}, inplace=True)
    df.name = name
    #df.columns = map(lambda col: 
                    #'{}_{}'.format(str(col), name), df.columns)
    
    return df


In [348]:
# clean up country names before merging

# helper functions

# https://gist.github.com/gornostal/1f123aaf838506038710
def force_to_unicode(text):
    "If text is unicode, it is returned as is. If it's str, convert it to Unicode using UTF-8 encoding"
    return text if isinstance(text, unicode) else text.decode('utf8')

def remove_special_char(series):
    clean_series = re.sub('\W+',' ', series)
    return clean_series

def fix_country_col(df):
    country_col = str(df.columns[0])
    df.rename(columns = {country_col : 'country'}, inplace = True)
    return df

def clean_country_col(df):
    # trim, remove accents & special chars
    # make lowercase
    df['country'] = (
        df['country'].apply(force_to_unicode)
        .apply(remove_special_char)
        .str.lower().str.strip()
    )
    return df

def add_regions(df):
    regions = pd.read_csv('data/countries_with_regions.csv')
    regions = clean_country_col(regions)
    df = regions.merge(df, on='country', how='left')
    
    return df

def clean_gapminder_df(df):
    name = df.name
    df = fix_country_col(df)

    # remove special chars & accents
    df = clean_country_col(df)
    #df['country'] = (
     #   df['country'].apply(force_to_unicode)
     #   .apply(remove_special_char)
     #   .str.lower().str.strip()
    #)
    # add continent & sub-continent data
    df = add_regions(df)
    
    # restore title case
    df['country'] = df['country'].str.title()
    
    # re-assign original df name
    df.name = name

    return df

# create df
#regions = pd.read_json('data/all_countries.json')
#regions = regions[['name', 'region', 'sub-region']]

#regions = clean_gapminder_df(regions)

# save as csv
#regions.to_csv('data/regions_cleaned.csv', index = False)

In [31]:
# create list of filenames

def list_files(mypath):
    files = []
    [files.extend(filenames) for (dirpath, dirnames, filenames) in walk(mypath)]
    return files    

In [32]:
# inspect DF

def df_min_max(df):
    #print '\n'
    print sys._getframe().f_code.co_name
    print df.columns[1:].min()
    print df.columns[1:].max()

# any years with missing data?
def df_yrs_nan_vals(df):
    #print '\n'  
    print sys._getframe().f_code.co_name
    print df.isnull().any().sum()

# any countries that have no data at all?
def df_countries_no_data(df):
    #print '\n'   
    print sys._getframe().f_code.co_name
    print df.iloc[:,3:].isnull().all().sum()

def inspect_df(df):
    name = str.upper(df.name)
    #print '\n'  
    print 'Inspecting %s:' % name
    df_min_max(df)
    df_yrs_nan_vals(df)
    df_countries_no_data(df)
    raw_input('Press <ENTER> to continue')
    print '\n'
    print df.head()
    raw_input('Press <ENTER> to continue')

In [33]:
def get_val_names(dfs):
    names = [df.name.split('_')[0] for df in dfs]
    return names

In [34]:
# reshape into long format for easier plotting

def reshape_for_plot(df):
    #df = df.reset_index().copy()
    #df = df.set_index(['region', 'sub-region', 'country'])
    name = df.name.split('_')[0]
    df = df.set_index(['country'])
    df = df.sort_index(level = 0)

    t = df.T
    t = t.unstack(level = 1)
    df = pd.DataFrame(t)
    
    df = df.reset_index()
    df = df.rename(columns={'level_1':'year', 0: name})
    
    # make sure year column is int type
    df['year'] = df['year'].astype('int64')
    df.name = name

    return df

# Look at source files

In [35]:
originalpath = 'data/original/'
cleanpath = 'data/cleaned/'
mypath = originalpath

originals = list_files(mypath)
originals

['1indicator CDIAC carbon_dioxide_emissions_per_capita.csv',
 'energy use per person.xlsx',
 'hdi_human_development_index.csv',
 'income_per_person_gdppercapita_ppp_inflation_adjusted.csv',
 'motor_vehicles_per_1000_pop2010.xlsx',
 'population.xlsx',
 'roads_paved_percent_of_total_roads.csv',
 'sulfur_emissions_per_person_kg.csv',
 'surviving_kids_per_woman.csv']

In [36]:
# convert to DF by splitting file type as new column
files_df = pd.Series(originals).str.split('.', expand=True).rename(columns={0:'clean_name', 1:'type'})

# keep original filename
files_df['orig_name']=originals

# convert files to DF
dfs = files_df['orig_name'].apply(make_df)

# remove accents and special chars
dfs = dfs.apply(clean_gapminder_df)

In [213]:
# all files converted to df?
print len(originals)
len(dfs)

9


9

In [38]:
files_df

Unnamed: 0,clean_name,type,orig_name
0,1indicator CDIAC carbon_dioxide_emissions_per_...,csv,1indicator CDIAC carbon_dioxide_emissions_per_...
1,energy use per person,xlsx,energy use per person.xlsx
2,hdi_human_development_index,csv,hdi_human_development_index.csv
3,income_per_person_gdppercapita_ppp_inflation_a...,csv,income_per_person_gdppercapita_ppp_inflation_a...
4,motor_vehicles_per_1000_pop2010,xlsx,motor_vehicles_per_1000_pop2010.xlsx
5,population,xlsx,population.xlsx
6,roads_paved_percent_of_total_roads,csv,roads_paved_percent_of_total_roads.csv
7,sulfur_emissions_per_person_kg,csv,sulfur_emissions_per_person_kg.csv
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv


## Clean filenames

In [39]:
def clean_names(series):
    # replace spaces with undescores,
    # remove or shorten meaningless words
    series = series.str.upper()\
    .str.replace(' ', '_')\
    .str.replace('INDICATOR_', '')\
    .str.replace('PER_CAPITA', 'pc')\
    .str.replace('PER_PERSON', 'pc')\
    .str.lower()

    return series

In [40]:
files_df['clean_name'] = clean_names(files_df['clean_name'])

In [41]:
files_df

Unnamed: 0,clean_name,type,orig_name
0,1cdiac_carbon_dioxide_emissions_pc,csv,1indicator CDIAC carbon_dioxide_emissions_per_...
1,energy_use_pc,xlsx,energy use per person.xlsx
2,hdi_human_development_index,csv,hdi_human_development_index.csv
3,income_pc_gdppercapita_ppp_inflation_adjusted,csv,income_per_person_gdppercapita_ppp_inflation_a...
4,motor_vehicles_per_1000_pop2010,xlsx,motor_vehicles_per_1000_pop2010.xlsx
5,population,xlsx,population.xlsx
6,roads_paved_percent_of_total_roads,csv,roads_paved_percent_of_total_roads.csv
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv


In [42]:
# manually fix certain rows
fix_these_rows = [0,2,3,4,6]
needs_fixing = files_df.iloc[fix_these_rows]['clean_name']
needs_fixing.values

array(['1cdiac_carbon_dioxide_emissions_pc',
       'hdi_human_development_index',
       'income_pc_gdppercapita_ppp_inflation_adjusted',
       'motor_vehicles_per_1000_pop2010',
       'roads_paved_percent_of_total_roads'], dtype=object)

In [43]:
# copy-paste-modify
fixed = np.array(['CO2_pc','hdi',
       'income_pc',
       'motor_vehicles_per_1000',
       'roads_paved_%'])


In [44]:
# update rows with cleaned names
files_df['clean_name'].iloc[fix_these_rows] = fixed

In [45]:
# verify
files_df['clean_name']

0                      CO2_pc
1               energy_use_pc
2                         hdi
3                   income_pc
4     motor_vehicles_per_1000
5                  population
6               roads_paved_%
7      sulfur_emissions_pc_kg
8    surviving_kids_per_woman
Name: clean_name, dtype: object

In [46]:
# assign clean name as the df.name attribute
def update_name(df, clean_name):
    df.name = clean_name

# could not figure out how to do this with pd.Series.apply
map(update_name, dfs, files_df['clean_name'].values)

[None, None, None, None, None, None, None, None, None]

In [47]:
# update files_df to list df.names

dfnames = [df.name for df in dfs]
files_df['df_val_name'] = dfnames
files_df.rename(columns={'df_val_name' : 'df_name'}, inplace=True)

In [48]:
valnames = [name.split('_')[0] for name in dfnames]
files_df['val_name'] = valnames
files_df

Unnamed: 0,clean_name,type,orig_name,df_name,val_name
0,CO2_pc,csv,1indicator CDIAC carbon_dioxide_emissions_per_...,CO2_pc,CO2
1,energy_use_pc,xlsx,energy use per person.xlsx,energy_use_pc,energy
2,hdi,csv,hdi_human_development_index.csv,hdi,hdi
3,income_pc,csv,income_per_person_gdppercapita_ppp_inflation_a...,income_pc,income
4,motor_vehicles_per_1000,xlsx,motor_vehicles_per_1000_pop2010.xlsx,motor_vehicles_per_1000,motor
5,population,xlsx,population.xlsx,population,population
6,roads_paved_%,csv,roads_paved_percent_of_total_roads.csv,roads_paved_%,roads
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv,sulfur_emissions_pc_kg,sulfur
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv,surviving_kids_per_woman,surviving


In [49]:
# save as CSV using cleaned name
'''def clean_to_csv(df):
    cleanname = files_df['clean_name'].str.cat(files_df['type'], sep='.')
    df.to_csv(cleanpath + cleanname, index=False)
    return none'''

"def clean_to_csv(df):\n    cleanname = files_df['clean_name'].str.cat(files_df['type'], sep='.')\n    df.to_csv(cleanpath + cleanname, index=False)\n    return none"

In [50]:
# IS THIS STEP EVEN NEEDED????
# Save to csv files
dfs.apply(lambda df: df.to_csv(cleanpath + df.name + '.' + 'csv', index = False))

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
Name: orig_name, dtype: object

# Create wide DFs

In [51]:
# create "driving" DF, ie the left-most DF when merging
#co2_regions = pd.read_pickle('data/co2_regions.pkl')
#co2_regions.to_csv('data/cleaned/!co2_regions.csv')

## Inspect resulting DFs

In [52]:



#len(dfs)
#[inspect_df(df) for df in dfs]

Not sure what happened here. The same problem is in the original CSV, so I downloaded it again from Gapminder. Problem solved.

## Surviving DF contains projections

Unlike the other DFs, this one looks into the future, which will create NANs all over the place in the merged DF.

Let's drop these projections.

In [53]:
# find DF
files_df['clean_name']

0                      CO2_pc
1               energy_use_pc
2                         hdi
3                   income_pc
4     motor_vehicles_per_1000
5                  population
6               roads_paved_%
7      sulfur_emissions_pc_kg
8    surviving_kids_per_woman
Name: clean_name, dtype: object

In [54]:
# it's the final (8th) df
# remove projection years (post 2015) for Surviving DF

start = dfs[8].columns.get_loc('2016')
end = dfs[8].columns.get_loc('2099') + 1

dfs[8].drop(dfs[8].columns[start:end], axis=1, inplace=True)

In [55]:
# verify
dfs[8].tail()

Unnamed: 0,country,1760,1761,1762,1763,1764,1765,1766,1767,1768,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
175,Venezuela,,,,,,,,,,...,2.44,2.42,2.39,2.36,2.33,,,,,
176,Vietnam,,,,,,,,,,...,1.78,1.77,1.76,1.75,1.73,,,,,
177,Yemen,,,,,,,,,,...,4.82,4.77,4.68,4.64,4.55,,,,,
178,Zambia,,,,,,,,,,...,3.92,4.02,4.12,4.22,4.33,,,,,
179,Zimbabwe,,,,,,,,,,...,2.26,2.27,2.28,2.32,2.32,,,,,


# Create long DFs

In [56]:
long_dfs = [reshape_for_plot(dfs[i]) for i, _ in enumerate(dfs)]
len(long_dfs)

9

# Join long DFs

In [57]:
# thank you http://notconfusing.com/joining-many-dataframes-at-once-in-pandas-n-ary-join/
def merge_dfs(ldf, rdf):
    right_on = list(rdf.columns[0:2])
    left_on = list(ldf.columns[0:2])
    return ldf.merge(rdf, how='left', left_on=left_on, right_on=right_on)
    

final_df = reduce(merge_dfs, long_dfs) #that's the magic
final_df.tail(20)

Unnamed: 0,country,year,CO2,energy,hdi,income,motor,population,roads,sulfur,surviving
59435,Zimbabwe,1993,1.441291,0.865807,0.479,2350.0,,11256512.0,17.0,8.27,3.69
59436,Zimbabwe,1994,1.538291,0.842736,0.475,2520.0,,11476807.0,54.9,7.72,3.47
59437,Zimbabwe,1995,1.294742,0.842225,0.465,2480.0,,11683136.0,48.0,7.25,3.24
59438,Zimbabwe,1996,1.260839,0.824112,0.46,2690.0,,11877664.0,47.4,6.82,3.04
59439,Zimbabwe,1997,1.191934,0.802734,0.451,2710.0,,12059858.0,47.4,6.3,2.85
59440,Zimbabwe,1998,1.162342,0.809675,0.442,2750.0,,12226742.0,47.4,6.64,2.67
59441,Zimbabwe,1999,1.276289,0.853539,0.434,2690.0,,12374019.0,,7.3,2.56
59442,Zimbabwe,2000,1.110012,0.790319,0.427,2570.0,,12499981.0,,7.15,2.46
59443,Zimbabwe,2001,0.998965,0.772111,0.427,2580.0,,12603988.0,,,2.36
59444,Zimbabwe,2002,0.946346,0.770138,0.418,2320.0,,12691431.0,19.0,,2.28


In [58]:
# add region data as columns
countries_regions = pd.read_csv('data/countries_with_regions.csv')
final_df = final_df.merge(countries_regions, on='country', how='left')
final_df.tail(25)

Unnamed: 0,country,year,CO2,energy,hdi,income,motor,population,roads,sulfur,surviving,region,sub-region
59430,Zimbabwe,1988,1.634179,0.878731,,2450.0,,9866776.0,,8.3,4.71,Africa,Eastern Africa
59431,Zimbabwe,1989,1.59154,0.870441,,2490.0,,10184966.0,,8.3,4.49,Africa,Eastern Africa
59432,Zimbabwe,1990,1.480788,0.888059,0.499,2590.0,,10484771.0,14.0,8.07,4.33,Africa,Eastern Africa
59433,Zimbabwe,1991,1.472027,0.916924,0.501,2670.0,,10763036.0,15.0,9.28,4.13,Africa,Eastern Africa
59434,Zimbabwe,1992,1.535539,0.924668,0.486,2370.0,,11019717.0,16.0,9.8,3.88,Africa,Eastern Africa
59435,Zimbabwe,1993,1.441291,0.865807,0.479,2350.0,,11256512.0,17.0,8.27,3.69,Africa,Eastern Africa
59436,Zimbabwe,1994,1.538291,0.842736,0.475,2520.0,,11476807.0,54.9,7.72,3.47,Africa,Eastern Africa
59437,Zimbabwe,1995,1.294742,0.842225,0.465,2480.0,,11683136.0,48.0,7.25,3.24,Africa,Eastern Africa
59438,Zimbabwe,1996,1.260839,0.824112,0.46,2690.0,,11877664.0,47.4,6.82,3.04,Africa,Eastern Africa
59439,Zimbabwe,1997,1.191934,0.802734,0.451,2710.0,,12059858.0,47.4,6.3,2.85,Africa,Eastern Africa


In [59]:
# rearrange column order so region info beside country col
cols = final_df.columns.tolist()

newcols = [cols[0]]
newcols.extend(cols[-2:])
newcols.extend(cols[1:-2])

newcols

['country',
 'region',
 'sub-region',
 'year',
 'CO2',
 'energy',
 'hdi',
 'income',
 'motor',
 'population',
 'roads',
 'sulfur',
 'surviving']

In [60]:
final_df = final_df[newcols]
final_df.head()

Unnamed: 0,country,region,sub-region,year,CO2,energy,hdi,income,motor,population,roads,sulfur,surviving
0,land,,,1751,,,,,,,,,
1,land,,,1755,,,,,,,,,
2,land,,,1762,,,,,,,,,
3,land,,,1763,,,,,,,,,
4,land,,,1764,,,,,,,,,


In [61]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59455 entries, 0 to 59454
Data columns (total 13 columns):
country       59455 non-null object
region        50853 non-null object
sub-region    50853 non-null object
year          59455 non-null int64
CO2           15072 non-null float64
energy        5139 non-null float64
hdi           3562 non-null float64
income        38553 non-null float64
motor         440 non-null float64
population    17631 non-null float64
roads         1931 non-null float64
sulfur        23556 non-null float64
surviving     29952 non-null float64
dtypes: float64(9), int64(1), object(3)
memory usage: 6.4+ MB


In [62]:
# how many countries missing CO2 emission values?
final_df.loc[final_df['CO2'].isnull()]

Unnamed: 0,country,region,sub-region,year,CO2,energy,hdi,income,motor,population,roads,sulfur,surviving
0,land,,,1751,,,,,,,,,
1,land,,,1755,,,,,,,,,
2,land,,,1762,,,,,,,,,
3,land,,,1763,,,,,,,,,
4,land,,,1764,,,,,,,,,
5,land,,,1765,,,,,,,,,
6,land,,,1766,,,,,,,,,
7,land,,,1767,,,,,,,,,
8,land,,,1768,,,,,,,,,
9,land,,,1769,,,,,,,,,


In [63]:
# drop all countries missing CO2 data
no_co2 = final_df[final_df['CO2'].isnull()].index
final_df.drop(no_co2, inplace=True)

In [64]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15072 entries, 695 to 59452
Data columns (total 13 columns):
country       15072 non-null object
region        15072 non-null object
sub-region    15072 non-null object
year          15072 non-null int64
CO2           15072 non-null float64
energy        5082 non-null float64
hdi           3283 non-null float64
income        13823 non-null float64
motor         439 non-null float64
population    10943 non-null float64
roads         1863 non-null float64
sulfur        10775 non-null float64
surviving     13244 non-null float64
dtypes: float64(9), int64(1), object(3)
memory usage: 1.6+ MB


# clean up regional descrepancies

* Greenland: make part of N. Europe, not N. America
* Mexico: add to N. America!!

In [65]:
# save as csv
#final_df.to_csv('data/final/final_df.csv', index=False)

# Updates

## A better dataset?

In [None]:
url_co2 = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--co2_emission/master/ddf--datapoints--co2_emissions_tonnes_per_person--by--country--year.csv'
url_countries = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--co2_emission/master/ddf--entities--country.csv'

co2 = pd.read_csv(url_co2)
countries = pd.read_csv(url_countries)

df = countries.merge(co2, on='country')
df_wide = df.drop('country', axis=1).pivot('name', 'year', 'co2_emissions_tonnes_per_person')
df_wide.reset_index(inplace=True)

In [385]:
test = clean_gapminder_df(df_wide)
test.head()



Unnamed: 0,country,region,sub-region,1751,1752,1753,1754,1755,1756,1757,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,Afghanistan,Asia,Southern Asia,,,,,,,,...,0.04041,0.0544,0.06552,0.08785,0.15895,0.24905,0.30291,0.42522,0.68802,0.69312
1,Albania,Europe,Southern Europe,,,,,,,,...,1.34203,1.37998,1.27761,1.30428,1.47382,1.49426,1.58449,1.81554,1.63562,1.66974
2,Algeria,Africa,Northern Africa,,,,,,,,...,2.72677,3.21986,2.99727,3.19557,3.16824,3.42982,3.30686,3.30026,3.47163,3.51446
3,Andorra,Europe,Southern Europe,,,,,,,,...,7.49969,7.39095,6.83994,6.62244,6.52724,6.17852,6.0921,5.70224,5.61408,5.52625
4,Angola,Africa,Middle Africa,,,,,,,,...,1.08651,1.06932,1.20077,1.31098,1.29557,1.35427,1.36921,1.38263,1.47212,1.38437


In [386]:
# any missing regions?
test.loc[test['region'].isnull()]

Unnamed: 0,country,region,sub-region,1751,1752,1753,1754,1755,1756,1757,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013


In [389]:
# any null rows?

test.set_index(['region', 'sub-region', 'country']).loc[test.set_index(['region', 'sub-region', 'country']).sum(1)==0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
region,sub-region,country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Asia,Western Asia,West Bank And Gaza,,,,,,,,,,,...,,,,,,,,,,
Africa,Northern Africa,Western Sahara,,,,,,,,,,,...,,,,,,,,,,


## dataset missing France & Italy :(

In [375]:
# more recent CO2

updatepath = 'data/updates/'
new_co2 = pd.read_csv(updatepath + 'co2_1751_2014.csv')

# remove meta data rows
new_co2.drop([0,1,2], inplace=True)
new_co2.head()

Unnamed: 0,Nation,Year,Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C),Emissions from solid fuel consumption,Emissions from liquid fuel consumption,Emissions from gas fuel consumption,Emissions from cement production,Emissions from gas flaring,Per capita CO2 emissions (metric tons of carbon),Emissions from bunker fuels (not included in the totals)
3,AFGHANISTAN,1949.0,4.0,4,0,0,0.0,.,.,0.0
4,AFGHANISTAN,1950.0,23.0,6,18,0,0.0,0,0,0.0
5,AFGHANISTAN,1951.0,25.0,7,18,0,0.0,0,0,0.0
6,AFGHANISTAN,1952.0,25.0,9,17,0,0.0,0,0,0.0
7,AFGHANISTAN,1953.0,29.0,10,18,0,0.0,0,0,0.0


In [378]:
# how many countries? Existing has 201

len(new_co2['Nation'].unique())

256

In [220]:
# get p.c. values
new_co2.pivot('Nation', 'Year','Per capita CO2 emissions (metric tons of carbon)').loc['CANADA',2005:2014]

Year
2005.0    4.71
2006.0    4.55
2007.0    4.58
2008.0    4.58
2009.0    4.34
2010.0    4.27
2011.0    4.25
2012.0    4.05
2013.0       4
2014.0    4.12
Name: CANADA, dtype: object

In [221]:
# compare with existing
dfs[0].set_index('country').loc['Canada','2005':'2012']

2005    17.439926
2006    16.862318
2007    17.004124
2008    16.350399
2009    15.260099
2010    14.672016
2011    14.807039
2012    14.573717
Name: Canada, dtype: float64

Holy crap, these are off. Perhaps the newer csv is actually for *carbon* instead of *CO2*.
Let's check.

In [222]:
# multiply 2005 so-called 'CO2 p.c.' value by 3.67
4.71 * 3.67

17.2857

In [111]:
# ok, that's more like it, but still not the exact same values as in the dataset from Gapminder.
# Perhaps the population data used for determine per capita values is different.
# let's take total Carbon and divide by Gapminder population

# get names of dfs again to see where population is

[df.name for df in dfs]

['CO2_pc',
 'energy_use_pc',
 'hdi',
 'income_pc',
 'motor_vehicles_per_1000',
 'population',
 'roads_paved_%',
 'sulfur_emissions_pc_kg',
 'surviving_kids_per_woman']

In [153]:
# pop is 6th df in list
# get Canada 2005-2014
y = [2005,2006,2007,2008,2009,2010,2011,2012]
can_pop = dfs[5].query("country=='Canada'")[y]
print can_pop.values[0]

[32256333. 32611436. 32982275. 33363256. 33746559. 34126173. 34499905.
 34868151.]


In [164]:
# get canada data from new CO2 df
can_totalCarbon = new_co2.pivot('Nation', 'Year','Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)')\
.loc['CANADA',2005:2012]

# convert total C in metric tonne values to CO2 values
(can_totalCarbon/can_pop) * (1000 * 3.67)

Unnamed: 0,2005,2006,2007,2008,2009,2010,2011,2012
37,17.294992,16.689381,16.821416,16.829532,15.918766,15.680282,15.581267,14.852552


In [214]:
# let's compare with the original Gapminder data again

dfs[0].set_index('country').loc['Canada','2005':'2012']

2005    17.439926
2006    16.862318
2007    17.004124
2008    16.350399
2009    15.260099
2010    14.672016
2011    14.807039
2012    14.573717
Name: Canada, dtype: float64

So the derived numbers are the same as the per capita column. In other words, still slightly different from the CO2 dataset I orignially used from the Gapminder sight. 

But it's close enough.

In [358]:
new_co2 = new_co2.loc[:,['Nation','Year','Per capita CO2 emissions (metric tons of carbon)']]

new_co2.rename(columns={'Year': 'year', 
                        'Per capita CO2 emissions (metric tons of carbon)': 'CO2'},\
              inplace=True)

# convert CO2 to float
new_co2['CO2'] = pd.to_numeric(new_co2['CO2'], errors='coerce')

# convert C to CO2 values
new_co2['CO2'] = new_co2['CO2'].apply(lambda C: C * 3.67)

# convert year to string
new_co2['year'] = new_co2['year'].astype(str).apply(lambda x: x[0:4])
new_co2.head()

Unnamed: 0,Nation,year,CO2
3,AFGHANISTAN,1949,
4,AFGHANISTAN,1950,0.0
5,AFGHANISTAN,1951,0.0
6,AFGHANISTAN,1952,0.0
7,AFGHANISTAN,1953,0.0


In [359]:
# make wide
co2_new = new_co2.pivot('Nation', 'year', 'CO2')
co2_new.head()

year,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
Nation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AFGHANISTAN,,,,,,,,,,,...,0.0367,0.0734,0.0734,0.1468,0.2569,0.2936,0.4404,0.367,0.3303,0.2936
ALBANIA,,,,,,,,,,,...,1.3579,1.2478,1.2478,1.3946,1.5047,1.5781,1.835,1.6882,1.7616,1.9818
ALGERIA,,,,,,,,,,,...,3.2663,3.0461,3.1195,3.0828,3.4498,3.303,3.303,3.4865,3.5232,3.7434
ANDORRA,,,,,,,,,,,...,7.4134,6.8629,6.606,6.7161,6.0555,6.1656,5.9821,6.1656,6.2757,6.3858
ANGOLA,,,,,,,,,,,...,1.1744,1.3212,1.4313,1.3946,1.3579,1.3579,1.3946,1.468,1.3946,1.4313


In [364]:
co2_new.loc[co2_new.sum(axis=1)==0]

year,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
Nation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DEMOCRATIC REPUBLIC OF VIETNAM,,,,,,,,,,,...,,,,,,,,,,
FORMER PANAMA CANAL ZONE,,,,,,,,,,,...,,,,,,,,,,
JAPAN (EXCLUDING THE RUYUKU ISLANDS),,,,,,,,,,,...,,,,,,,,,,
KUWAITI OIL FIRES,,,,,,,,,,,...,,,,,,,,,,
PUERTO RICO,,,,,,,,,,,...,,,,,,,,,,
REPUBLIC OF SOUTH VIETNAM,,,,,,,,,,,...,,,,,,,,,,
RYUKYU ISLANDS,,,,,,,,,,,...,,,,,,,,,,
TUVALU,,,,,,,,,,,...,,,,,,,,,,
UNITED KOREA,,,,,,,,,,,...,,,,,,,,,,


In [360]:
# sanity check
co2_new.query("Nation=='CANADA'").tail()

year,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
Nation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CANADA,,,,,,,,,,,...,17.2857,16.6985,16.8086,16.8086,15.9278,15.6709,15.5975,14.8635,14.68,15.1204


In [363]:
# more sanity checking
co2_new.query("Nation=='ITALY'")

year,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
Nation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [372]:
# WTF are Italy & France?

co2_new.reset_index().loc[co2_new.reset_index()['Nation'].str.contains('VIETNAM')]

year,Nation,1751,1752,1753,1754,1755,1756,1757,1758,1759,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
56,DEMOCRATIC REPUBLIC OF VIETNAM,,,,,,,,,,...,,,,,,,,,,
184,REPUBLIC OF SOUTH VIETNAM,,,,,,,,,,...,,,,,,,,,,


In [265]:
# prepare to merge with regions data
co2_new.name = 'CO2_2014'
co2_new.head()


year,Nation,1751,1752,1753,1754,1755,1756,1757,1758,1759,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
0,AFGHANISTAN,,,,,,,,,,...,0.0367,0.0734,0.0734,0.1468,0.2569,0.2936,0.4404,0.367,0.3303,0.2936
1,ALBANIA,,,,,,,,,,...,1.3579,1.2478,1.2478,1.3946,1.5047,1.5781,1.835,1.6882,1.7616,1.9818
2,ALGERIA,,,,,,,,,,...,3.2663,3.0461,3.1195,3.0828,3.4498,3.303,3.303,3.4865,3.5232,3.7434
3,ANDORRA,,,,,,,,,,...,7.4134,6.8629,6.606,6.7161,6.0555,6.1656,5.9821,6.1656,6.2757,6.3858
4,ANGOLA,,,,,,,,,,...,1.1744,1.3212,1.4313,1.3946,1.3579,1.3579,1.3946,1.468,1.3946,1.4313


In [None]:
#
#co2_new.set_index('Nation', inplace=True)
co2_new.reset_index(inplace=True)
co2_new.head()

## Merge with regions

In [349]:
co2_new_clean = clean_gapminder_df(co2_new)

In [350]:
# verify
co2_new_clean.tail()

Unnamed: 0,country,region,sub-region,1751,1752,1753,1754,1755,1756,1757,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
196,West Bank And Gaza,Asia,Western Asia,,,,,,,,...,,,,,,,,,,
197,Western Sahara,Africa,Northern Africa,,,,,,,,...,,,,,,,,,,
198,Vietnam,Asia,South-Eastern Asia,,,,,,,,...,,,,,,,,,,
199,Zambia,Africa,Eastern Africa,,,,,,,,...,0.1835,0.1835,0.1468,0.1835,0.1835,0.1835,0.2202,0.2569,0.2569,0.2936
200,Zimbabwe,Africa,Eastern Africa,,,,,,,,...,0.8441,0.8441,0.7707,0.6239,0.4037,0.5505,0.6606,0.5505,0.7707,0.8074


In [351]:
# how many all-null rows?
co2_new_clean.loc[co2_new_clean.sum(1).isnull()]

Unnamed: 0,country,region,sub-region,1751,1752,1753,1754,1755,1756,1757,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014


In [352]:
# how many rows without region data?
no_match = co2_new_clean.loc[co2_new_clean['region'].isnull()]['country']
no_match

Series([], Name: country, dtype: object)

In [355]:
co2_new_clean.to_csv('data/final/co2_regions_2014.csv', index=False)

In [82]:
# new sulfur
SOx_new = pd.read_csv(updatepath + 'DP_LIVE_06092018113153525.csv')
SOx_new.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,AIREMISSION,SOX,KG_CAP,A,2000,124.032,
1,AUS,AIREMISSION,SOX,KG_CAP,A,2001,131.641,
2,AUS,AIREMISSION,SOX,KG_CAP,A,2002,138.627,
3,AUS,AIREMISSION,SOX,KG_CAP,A,2003,136.647,
4,AUS,AIREMISSION,SOX,KG_CAP,A,2004,122.506,


In [96]:
SOx_new.pivot('LOCATION', 'TIME', 'Value').info()

<class 'pandas.core.frame.DataFrame'>
Index: 39 entries, AUS to USA
Data columns (total 15 columns):
2000    36 non-null float64
2001    35 non-null float64
2002    35 non-null float64
2003    36 non-null float64
2004    36 non-null float64
2005    38 non-null float64
2006    38 non-null float64
2007    38 non-null float64
2008    39 non-null float64
2009    38 non-null float64
2010    38 non-null float64
2011    38 non-null float64
2012    38 non-null float64
2013    38 non-null float64
2014    37 non-null float64
dtypes: float64(15)
memory usage: 4.9+ KB


In [91]:
dfs[7].query("country=='Canada'").head()

Unnamed: 0,country,1850,1851,1852,1853,1854,1855,1856,1857,1858,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000
28,Canada,0.237,0.291,0.561,0.808,1.03,1.25,1.5,1.72,3.75,...,64.4,55.0,44.8,43.0,44.3,42.1,41.6,41.4,40.4,38.0


In [94]:
SOx_new2 = pd.read_csv(updatepath + 'sulfur.dat', sep=' ')
SOx_new2.head()

Unnamed: 0,COUNTRY,Unnamed: 1,YEAR,Unnamed: 3,HARD,COAL,Unnamed: 6,B,COAL.1,Unnamed: 9,OIL,Unnamed: 11,GAS,Unnamed: 13,MINING,Unnamed: 15,TOTAL
0,AUSTRALIA,1850.0,0.0,0.0,0.0,0.0,6.0,6.0,,,,,,,,,
1,AUSTRIA,1850.0,5.525482,1.819125,0.0,0.0,0.041739,7.386346,,,,,,,,,
2,BELGIUM,1850.0,31.16598,0.0,0.0,0.0,0.0,31.16598,,,,,,,,,
3,CHILE,1850.0,0.0,0.0,0.0,0.0,14.4,14.4,,,,,,,,,
4,DENMARK,1850.0,1.572654,0.0,0.0,0.0,0.0,1.572654,,,,,,,,,
