In [452]:
import numpy as np
import pandas as pd
from os import walk # to get list of filenames
import sys # to print function name within function

#for country name cleanup
import unidecode
import re

# Helper functions

In [453]:
# create dfs for later merging

def make_df(filename):
    name = filename.split('.')[0]
    f_type = filename.split('.')[1]
    if f_type=='xlsx':
        df = pd.read_excel(mypath + filename)
    elif f_type=='csv':
        df = pd.read_csv(mypath + filename)
    else:
        print 'Unknown file type'
    
    # make sure all DFs have a 'country' column
    # to simplify .join() later
    df.rename(columns={df.columns[0]:'country'}, inplace=True)
    
    # the .name attribute will provide be used
    # when reshaping, as the label for the value column
    df.name = name
    
    return df


In [454]:
# clean up country names before merging

# helper functions

# https://gist.github.com/gornostal/1f123aaf838506038710
def force_to_unicode(text):
    "If text is unicode, it is returned as is. If it's str, convert it to Unicode using UTF-8 encoding"
    return text if isinstance(text, unicode) else text.decode('utf8')

def remove_special_char(series):
    clean_series = re.sub('\W+',' ', series)
    return clean_series

def fix_country_col(df):
    country_col = str(df.columns[0])
    df.rename(columns = {country_col : 'country'}, inplace = True)
    return df

def clean_country_col(df):
    # trim, remove accents & special chars
    # make lowercase
    df['country'] = (
        df['country'].apply(force_to_unicode)
        .apply(remove_special_char)
        .str.lower().str.strip()
    )
    return df

def add_regions(df):
    regions = pd.read_csv('data/countries_with_regions.csv')
    regions = clean_country_col(regions)
    df = regions.merge(df, on='country', how='left')
    
    return df

def clean_gapminder_df(df):
    name = df.name
    df = fix_country_col(df)

    # remove special chars & accents
    df = clean_country_col(df)
    df['country'] = df['country'].apply(force_to_unicode).apply(remove_special_char).str.lower().str.strip()

    
    # restore title case
    df['country'] = df['country'].str.title()
    
    # re-assign original df name
    df.name = name

    return df

# create df
#regions = pd.read_json('data/all_countries.json')
#regions = regions[['name', 'region', 'sub-region']]

#regions = clean_gapminder_df(regions)

# save as csv
#regions.to_csv('data/regions_cleaned.csv', index = False)

In [455]:
# create list of filenames

def list_files(mypath):
    files = []
    [files.extend(filenames) for (dirpath, dirnames, filenames) in walk(mypath)]
    return files    

In [456]:
# inspect DF

def df_min_max(df):
    #print '\n'
    print sys._getframe().f_code.co_name
    print df.columns[1:].min()
    print df.columns[1:].max()

# any years with missing data?
def df_yrs_nan_vals(df):
    #print '\n'  
    print sys._getframe().f_code.co_name
    print df.isnull().any().sum()

# any countries that have no data at all?
def df_countries_no_data(df):
    #print '\n'   
    print sys._getframe().f_code.co_name
    print df.iloc[:,3:].isnull().all().sum()

def inspect_df(df):
    name = str.upper(df.name)
    #print '\n'  
    print 'Inspecting %s:' % name
    df_min_max(df)
    df_yrs_nan_vals(df)
    df_countries_no_data(df)
    raw_input('Press <ENTER> to continue')
    print '\n'
    print df.head()
    raw_input('Press <ENTER> to continue')

In [33]:
def get_val_names(dfs):
    names = [df.name.split('_')[0] for df in dfs]
    return names

In [399]:
# reshape into long format for easier plotting

def reshape_for_plot(df):
    #df = df.reset_index().copy()
    #df = df.set_index(['region', 'sub-region', 'country'])
    name = df.name.split('_')[0]
    df = df.set_index(['region', 'sub-region', 'country'])
    df = df.sort_index(level = 0)

    t = df.T
    t = t.unstack(level = 1)
    df = pd.DataFrame(t)
    
    df = df.reset_index()
    df = df.rename(columns={'level_1':'year', 0: name})
    
    # make sure year column is int type
    #df['year'] = df['year'].astype('int64')
    df.name = name

    return df

# Look at source files

In [615]:
originalpath = 'data/original/'
cleanpath = 'data/cleaned/'
mypath = originalpath

originals = list_files(mypath)
originals

['!CO2_2013.csv',
 'energy use per person.xlsx',
 'hdi_human_development_index.csv',
 'income_per_person_gdppercapita_ppp_inflation_adjusted.csv',
 'motor_vehicles_per_1000_pop2010.xlsx',
 'population.xlsx',
 'roads_paved_percent_of_total_roads.csv',
 'sulfur_emissions_per_person_kg.csv',
 'surviving_kids_per_woman.csv']

In [621]:
test = pd.read_csv(originalpath + '!CO2_2013.csv' )
test.head()

Unnamed: 0,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,...,2003.1,2004.1,2005.1,2006.1,2007.1,2008.1,2009.1,2010.1,2011.1,2012.1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [620]:
# convert to DF

# extract file extension as new column
files_df = pd.Series(originals).str.split('.', expand=True).rename(columns={0:'clean_name', 1:'type'})

# keep original filename
files_df['orig_name']=originals

# convert files to DF
dfs = files_df['orig_name'].apply(make_df)

# remove accents and special chars
#dfs = dfs.apply(clean_gapminder_df)

Unnamed: 0,country,1752,1753,1754,1755,1756,1757,1758,1759,1760,...,2003.1,2004.1,2005.1,2006.1,2007.1,2008.1,2009.1,2010.1,2011.1,2012.1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [213]:
# all files converted to df?
print len(originals)
len(dfs)

9


9

In [38]:
files_df

Unnamed: 0,clean_name,type,orig_name
0,1indicator CDIAC carbon_dioxide_emissions_per_...,csv,1indicator CDIAC carbon_dioxide_emissions_per_...
1,energy use per person,xlsx,energy use per person.xlsx
2,hdi_human_development_index,csv,hdi_human_development_index.csv
3,income_per_person_gdppercapita_ppp_inflation_a...,csv,income_per_person_gdppercapita_ppp_inflation_a...
4,motor_vehicles_per_1000_pop2010,xlsx,motor_vehicles_per_1000_pop2010.xlsx
5,population,xlsx,population.xlsx
6,roads_paved_percent_of_total_roads,csv,roads_paved_percent_of_total_roads.csv
7,sulfur_emissions_per_person_kg,csv,sulfur_emissions_per_person_kg.csv
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv


## Clean filenames

In [39]:
def clean_names(series):
    # replace spaces with undescores,
    # remove or shorten meaningless words
    series = series.str.upper()\
    .str.replace(' ', '_')\
    .str.replace('INDICATOR_', '')\
    .str.replace('PER_CAPITA', 'pc')\
    .str.replace('PER_PERSON', 'pc')\
    .str.lower()

    return series

In [40]:
files_df['clean_name'] = clean_names(files_df['clean_name'])

In [41]:
files_df

Unnamed: 0,clean_name,type,orig_name
0,1cdiac_carbon_dioxide_emissions_pc,csv,1indicator CDIAC carbon_dioxide_emissions_per_...
1,energy_use_pc,xlsx,energy use per person.xlsx
2,hdi_human_development_index,csv,hdi_human_development_index.csv
3,income_pc_gdppercapita_ppp_inflation_adjusted,csv,income_per_person_gdppercapita_ppp_inflation_a...
4,motor_vehicles_per_1000_pop2010,xlsx,motor_vehicles_per_1000_pop2010.xlsx
5,population,xlsx,population.xlsx
6,roads_paved_percent_of_total_roads,csv,roads_paved_percent_of_total_roads.csv
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv


In [42]:
# manually fix certain rows
fix_these_rows = [0,2,3,4,6]
needs_fixing = files_df.iloc[fix_these_rows]['clean_name']
needs_fixing.values

array(['1cdiac_carbon_dioxide_emissions_pc',
       'hdi_human_development_index',
       'income_pc_gdppercapita_ppp_inflation_adjusted',
       'motor_vehicles_per_1000_pop2010',
       'roads_paved_percent_of_total_roads'], dtype=object)

In [43]:
# copy-paste-modify
fixed = np.array(['CO2_pc','hdi',
       'income_pc',
       'motor_vehicles_per_1000',
       'roads_paved_%'])


In [44]:
# update rows with cleaned names
files_df['clean_name'].iloc[fix_these_rows] = fixed

In [45]:
# verify
files_df['clean_name']

0                      CO2_pc
1               energy_use_pc
2                         hdi
3                   income_pc
4     motor_vehicles_per_1000
5                  population
6               roads_paved_%
7      sulfur_emissions_pc_kg
8    surviving_kids_per_woman
Name: clean_name, dtype: object

In [46]:
# assign clean name as the df.name attribute
def update_name(df, clean_name):
    df.name = clean_name

# could not figure out how to do this with pd.Series.apply
map(update_name, dfs, files_df['clean_name'].values)

[None, None, None, None, None, None, None, None, None]

In [47]:
# update files_df to list df.names

dfnames = [df.name for df in dfs]
files_df['df_val_name'] = dfnames
files_df.rename(columns={'df_val_name' : 'df_name'}, inplace=True)

In [48]:
valnames = [name.split('_')[0] for name in dfnames]
files_df['val_name'] = valnames
files_df

Unnamed: 0,clean_name,type,orig_name,df_name,val_name
0,CO2_pc,csv,1indicator CDIAC carbon_dioxide_emissions_per_...,CO2_pc,CO2
1,energy_use_pc,xlsx,energy use per person.xlsx,energy_use_pc,energy
2,hdi,csv,hdi_human_development_index.csv,hdi,hdi
3,income_pc,csv,income_per_person_gdppercapita_ppp_inflation_a...,income_pc,income
4,motor_vehicles_per_1000,xlsx,motor_vehicles_per_1000_pop2010.xlsx,motor_vehicles_per_1000,motor
5,population,xlsx,population.xlsx,population,population
6,roads_paved_%,csv,roads_paved_percent_of_total_roads.csv,roads_paved_%,roads
7,sulfur_emissions_pc_kg,csv,sulfur_emissions_per_person_kg.csv,sulfur_emissions_pc_kg,sulfur
8,surviving_kids_per_woman,csv,surviving_kids_per_woman.csv,surviving_kids_per_woman,surviving


In [49]:
# save as CSV using cleaned name
'''def clean_to_csv(df):
    cleanname = files_df['clean_name'].str.cat(files_df['type'], sep='.')
    df.to_csv(cleanpath + cleanname, index=False)
    return none'''

"def clean_to_csv(df):\n    cleanname = files_df['clean_name'].str.cat(files_df['type'], sep='.')\n    df.to_csv(cleanpath + cleanname, index=False)\n    return none"

In [50]:
# IS THIS STEP EVEN NEEDED????
# Save to csv files
dfs.apply(lambda df: df.to_csv(cleanpath + df.name + '.' + 'csv', index = False))

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
Name: orig_name, dtype: object

# Create wide DFs

In [51]:
# create "driving" DF, ie the left-most DF when merging
#co2_regions = pd.read_pickle('data/co2_regions.pkl')
#co2_regions.to_csv('data/cleaned/!co2_regions.csv')

## Inspect resulting DFs

In [52]:



#len(dfs)
#[inspect_df(df) for df in dfs]

Not sure what happened here. The same problem is in the original CSV, so I downloaded it again from Gapminder. Problem solved.

## Surviving DF contains projections

Unlike the other DFs, this one looks into the future, which will create NANs all over the place in the merged DF.

Let's drop these projections.

In [53]:
# find DF
files_df['clean_name']

0                      CO2_pc
1               energy_use_pc
2                         hdi
3                   income_pc
4     motor_vehicles_per_1000
5                  population
6               roads_paved_%
7      sulfur_emissions_pc_kg
8    surviving_kids_per_woman
Name: clean_name, dtype: object

In [54]:
# it's the final (8th) df
# remove projection years (post 2015) for Surviving DF

start = dfs[8].columns.get_loc('2016')
end = dfs[8].columns.get_loc('2099') + 1

dfs[8].drop(dfs[8].columns[start:end], axis=1, inplace=True)

In [55]:
# verify
dfs[8].tail()

Unnamed: 0,country,1760,1761,1762,1763,1764,1765,1766,1767,1768,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
175,Venezuela,,,,,,,,,,...,2.44,2.42,2.39,2.36,2.33,,,,,
176,Vietnam,,,,,,,,,,...,1.78,1.77,1.76,1.75,1.73,,,,,
177,Yemen,,,,,,,,,,...,4.82,4.77,4.68,4.64,4.55,,,,,
178,Zambia,,,,,,,,,,...,3.92,4.02,4.12,4.22,4.33,,,,,
179,Zimbabwe,,,,,,,,,,...,2.26,2.27,2.28,2.32,2.32,,,,,


# Create long DFs

In [56]:
long_dfs = [reshape_for_plot(dfs[i]) for i, _ in enumerate(dfs)]
len(long_dfs)

9

# Join long DFs

In [57]:
# thank you http://notconfusing.com/joining-many-dataframes-at-once-in-pandas-n-ary-join/
def merge_dfs(ldf, rdf):
    right_on = list(rdf.columns[0:2])
    left_on = list(ldf.columns[0:2])
    return ldf.merge(rdf, how='left', left_on=left_on, right_on=right_on)
    

final_df = reduce(merge_dfs, long_dfs) #that's the magic
final_df.tail(20)

Unnamed: 0,country,year,CO2,energy,hdi,income,motor,population,roads,sulfur,surviving
59435,Zimbabwe,1993,1.441291,0.865807,0.479,2350.0,,11256512.0,17.0,8.27,3.69
59436,Zimbabwe,1994,1.538291,0.842736,0.475,2520.0,,11476807.0,54.9,7.72,3.47
59437,Zimbabwe,1995,1.294742,0.842225,0.465,2480.0,,11683136.0,48.0,7.25,3.24
59438,Zimbabwe,1996,1.260839,0.824112,0.46,2690.0,,11877664.0,47.4,6.82,3.04
59439,Zimbabwe,1997,1.191934,0.802734,0.451,2710.0,,12059858.0,47.4,6.3,2.85
59440,Zimbabwe,1998,1.162342,0.809675,0.442,2750.0,,12226742.0,47.4,6.64,2.67
59441,Zimbabwe,1999,1.276289,0.853539,0.434,2690.0,,12374019.0,,7.3,2.56
59442,Zimbabwe,2000,1.110012,0.790319,0.427,2570.0,,12499981.0,,7.15,2.46
59443,Zimbabwe,2001,0.998965,0.772111,0.427,2580.0,,12603988.0,,,2.36
59444,Zimbabwe,2002,0.946346,0.770138,0.418,2320.0,,12691431.0,19.0,,2.28


## Add region & sub_region columns

In [58]:
countries_regions = pd.read_csv('data/countries_with_regions.csv')
final_df = final_df.merge(countries_regions, on='country', how='left')
final_df.tail(25)

Unnamed: 0,country,year,CO2,energy,hdi,income,motor,population,roads,sulfur,surviving,region,sub-region
59430,Zimbabwe,1988,1.634179,0.878731,,2450.0,,9866776.0,,8.3,4.71,Africa,Eastern Africa
59431,Zimbabwe,1989,1.59154,0.870441,,2490.0,,10184966.0,,8.3,4.49,Africa,Eastern Africa
59432,Zimbabwe,1990,1.480788,0.888059,0.499,2590.0,,10484771.0,14.0,8.07,4.33,Africa,Eastern Africa
59433,Zimbabwe,1991,1.472027,0.916924,0.501,2670.0,,10763036.0,15.0,9.28,4.13,Africa,Eastern Africa
59434,Zimbabwe,1992,1.535539,0.924668,0.486,2370.0,,11019717.0,16.0,9.8,3.88,Africa,Eastern Africa
59435,Zimbabwe,1993,1.441291,0.865807,0.479,2350.0,,11256512.0,17.0,8.27,3.69,Africa,Eastern Africa
59436,Zimbabwe,1994,1.538291,0.842736,0.475,2520.0,,11476807.0,54.9,7.72,3.47,Africa,Eastern Africa
59437,Zimbabwe,1995,1.294742,0.842225,0.465,2480.0,,11683136.0,48.0,7.25,3.24,Africa,Eastern Africa
59438,Zimbabwe,1996,1.260839,0.824112,0.46,2690.0,,11877664.0,47.4,6.82,3.04,Africa,Eastern Africa
59439,Zimbabwe,1997,1.191934,0.802734,0.451,2710.0,,12059858.0,47.4,6.3,2.85,Africa,Eastern Africa


In [59]:
# rearrange column order so region info beside country col
cols = final_df.columns.tolist()

newcols = [cols[0]]
newcols.extend(cols[-2:])
newcols.extend(cols[1:-2])

newcols

['country',
 'region',
 'sub-region',
 'year',
 'CO2',
 'energy',
 'hdi',
 'income',
 'motor',
 'population',
 'roads',
 'sulfur',
 'surviving']

In [60]:
final_df = final_df[newcols]
final_df.head()

Unnamed: 0,country,region,sub-region,year,CO2,energy,hdi,income,motor,population,roads,sulfur,surviving
0,land,,,1751,,,,,,,,,
1,land,,,1755,,,,,,,,,
2,land,,,1762,,,,,,,,,
3,land,,,1763,,,,,,,,,
4,land,,,1764,,,,,,,,,


In [61]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59455 entries, 0 to 59454
Data columns (total 13 columns):
country       59455 non-null object
region        50853 non-null object
sub-region    50853 non-null object
year          59455 non-null int64
CO2           15072 non-null float64
energy        5139 non-null float64
hdi           3562 non-null float64
income        38553 non-null float64
motor         440 non-null float64
population    17631 non-null float64
roads         1931 non-null float64
sulfur        23556 non-null float64
surviving     29952 non-null float64
dtypes: float64(9), int64(1), object(3)
memory usage: 6.4+ MB


In [62]:
# how many countries missing CO2 emission values?
final_df.loc[final_df['CO2'].isnull()]

Unnamed: 0,country,region,sub-region,year,CO2,energy,hdi,income,motor,population,roads,sulfur,surviving
0,land,,,1751,,,,,,,,,
1,land,,,1755,,,,,,,,,
2,land,,,1762,,,,,,,,,
3,land,,,1763,,,,,,,,,
4,land,,,1764,,,,,,,,,
5,land,,,1765,,,,,,,,,
6,land,,,1766,,,,,,,,,
7,land,,,1767,,,,,,,,,
8,land,,,1768,,,,,,,,,
9,land,,,1769,,,,,,,,,


In [63]:
# drop all countries missing CO2 data
no_co2 = final_df[final_df['CO2'].isnull()].index
final_df.drop(no_co2, inplace=True)

In [64]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15072 entries, 695 to 59452
Data columns (total 13 columns):
country       15072 non-null object
region        15072 non-null object
sub-region    15072 non-null object
year          15072 non-null int64
CO2           15072 non-null float64
energy        5082 non-null float64
hdi           3283 non-null float64
income        13823 non-null float64
motor         439 non-null float64
population    10943 non-null float64
roads         1863 non-null float64
sulfur        10775 non-null float64
surviving     13244 non-null float64
dtypes: float64(9), int64(1), object(3)
memory usage: 1.6+ MB


# clean up regional descrepancies

* Greenland: make part of N. Europe, not N. America
* Mexico: add to N. America!!

In [65]:
# save as csv
#final_df.to_csv('data/final/final_df.csv', index=False)

# Updates

## A better dataset?

In [626]:
url_co2 = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--co2_emission/master/ddf--datapoints--co2_emissions_tonnes_per_person--by--country--year.csv'
url_countries = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--co2_emission/master/ddf--entities--country.csv'

co2 = pd.read_csv(url_co2)
countries = pd.read_csv(url_countries)

df = countries.merge(co2, on='country')
co2_2013 = df.drop('country', axis=1).pivot('name', 'year', 'co2_emissions_tonnes_per_person')
co2_2013.reset_index(inplace=True)
co2_2013.rename(columns={'name':'country'}, inplace=True)

In [627]:
co2_2013.name = 'co2_2013'
co2_2013.head()

year,country,1751,1752,1753,1754,1755,1756,1757,1758,1759,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,Afghanistan,,,,,,,,,,...,0.04041,0.0544,0.06552,0.08785,0.15895,0.24905,0.30291,0.42522,0.68802,0.69312
1,Albania,,,,,,,,,,...,1.34203,1.37998,1.27761,1.30428,1.47382,1.49426,1.58449,1.81554,1.63562,1.66974
2,Algeria,,,,,,,,,,...,2.72677,3.21986,2.99727,3.19557,3.16824,3.42982,3.30686,3.30026,3.47163,3.51446
3,Andorra,,,,,,,,,,...,7.49969,7.39095,6.83994,6.62244,6.52724,6.17852,6.0921,5.70224,5.61408,5.52625
4,Angola,,,,,,,,,,...,1.08651,1.06932,1.20077,1.31098,1.29557,1.35427,1.36921,1.38263,1.47212,1.38437


In [628]:
# any missing regions?
co2_2013.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229 entries, 0 to 228
Columns: 264 entries, country to 2013
dtypes: float64(263), object(1)
memory usage: 472.4+ KB


In [629]:
# compare with previous version

co2_old = pd.read_csv('data/archive/1indicator CDIAC carbon_dioxide_emissions_per_capita.csv')
co2_old.rename(columns={'CO2 per capita': 'country'}, inplace=True)
co2_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Columns: 254 entries, country to 2012
dtypes: float64(253), object(1)
memory usage: 466.4+ KB


In [630]:
# there are more countries in the previous version
# need to compare

# first clean colums in each df
co2_old = clean_country_col(co2_old)
co2_2013 = clean_country_col(co2_2013)

In [631]:
# 2010 countries NOT in 2013 dataset
notin_2013 = co2_old.loc[~co2_old['country'].isin(co2_2013['country'])]
notin_2013

Unnamed: 0,country,1751,1755,1762,1763,1764,1765,1766,1767,1768,...,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012
0,abkhazia,,,,,,,,,,...,,,,,,,,,,
2,akrotiri and dhekelia,,,,,,,,,,...,,,,,,,,,,
5,american samoa,,,,,,,,,,...,,,,,,,,,,
41,channel islands,,,,,,,,,,...,,,,,,,,,,
44,cocos island,,,,,,,,,,...,,,,,,,,,,
63,eritrea and ethiopia,,,,,,,,,,...,,,,,,,,,,
82,guam,,,,,,,,,,...,,,,,,,,,,
84,guernsey,,,,,,,,,,...,,,,,,,,,,
89,holy see,,,,,,,,,,...,,,,,,,,,,
99,isle of man,,,,,,,,,,...,,,,,,,,,,


30 countries, but there are mostly NANs.
How many?

In [632]:
notin_2013.set_index('country', inplace=True)
notin_2013.isnull().all(1)

country
abkhazia                     True
akrotiri and dhekelia        True
american samoa               True
channel islands              True
cocos island                 True
eritrea and ethiopia         True
guam                         True
guernsey                     True
holy see                     True
isle of man                  True
jersey                       True
kosovo                       True
mayotte                      True
monaco                       True
norfolk island               True
northern cyprus              True
northern mariana islands     True
pitcairn                     True
san marino                   True
serbia excluding kosovo      True
somaliland                   True
south ossetia                True
svalbard                     True
tokelau                      True
transnistria                 True
tuvalu                       True
west bank and gaza          False
western sahara              False
virgin islands u s           True
land  

In [633]:
# drop all-NaN rows
notin_2013 = notin_2013.dropna(how='all')
notin_2013

Unnamed: 0_level_0,1751,1755,1762,1763,1764,1765,1766,1767,1768,1769,...,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
west bank and gaza,,,,,,,,,,,...,0.374235,0.535766,0.771369,0.622923,0.623526,0.536602,0.531716,0.585513,,
western sahara,,,,,,,,,,,...,0.612034,0.57268,0.54109,0.516309,0.496383,0.479618,0.464255,0.449262,,


In [634]:
# find 2013 countries NOT in 2010 df
notin2010 = co2_2013.loc[~co2_2013['country'].isin(co2_old['country'])]
notin2010

year,country,1751,1752,1753,1754,1755,1756,1757,1758,1759,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
37,central african republic,,,,,,,,,,...,0.05884,0.05786,0.06041,0.06021,0.05911,0.05801,0.05939,0.0615,0.0635,0.06305
41,christmas island,,,,,,,,,,...,,,,,,,,,,
46,cook is,,,,,,,,,,...,2.88336,3.21273,3.35724,3.3231,3.48003,3.45637,3.43389,3.41269,3.39374,3.37614
51,cura ao,,,,,,,,,,...,,,,,,,,,39.49705,34.05183
53,czech republic,,,,,,,,,,...,11.434,11.73878,11.88693,11.93474,11.22218,10.29118,10.61896,10.14792,9.58048,9.35502
54,czechoslovakia,,,,,,,,,,...,,,,,,,,,,
58,dominican republic,,,,,,,,,,...,2.01117,2.0176,2.11087,2.22754,2.22401,2.10479,2.18711,2.20209,2.23971,2.14656
68,falkland is malvinas,,,,,,,,,,...,17.38345,17.35407,19.77306,19.69341,19.60129,19.51652,18.23003,18.16981,18.11594,18.0624
107,kyrgyz republic,,,,,,,,,,...,1.15687,1.05367,1.02762,1.0848,1.44494,1.26213,1.16819,1.37851,1.79171,1.71282
108,lao,,,,,,,,,,...,0.24622,0.24444,0.26564,0.1531,0.15648,0.20439,0.2618,0.25512,0.33364,0.33045


In [None]:
notin_2010.set_index('country', inplace=True)
notin_2010.isnull().all(1)

In [635]:
# ibid
notin2010.set_index('country', inplace=True)
notin2010 = notin2010.dropna(how='all')
notin2010

year,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
central african republic,,,,,,,,,,,...,0.05884,0.05786,0.06041,0.06021,0.05911,0.05801,0.05939,0.0615,0.0635,0.06305
christmas island,,,,,,,,,,,...,,,,,,,,,,
cook is,,,,,,,,,,,...,2.88336,3.21273,3.35724,3.3231,3.48003,3.45637,3.43389,3.41269,3.39374,3.37614
cura ao,,,,,,,,,,,...,,,,,,,,,39.49705,34.05183
czech republic,,,,,,,,,,,...,11.434,11.73878,11.88693,11.93474,11.22218,10.29118,10.61896,10.14792,9.58048,9.35502
czechoslovakia,,,,,,,,,,,...,,,,,,,,,,
dominican republic,,,,,,,,,,,...,2.01117,2.0176,2.11087,2.22754,2.22401,2.10479,2.18711,2.20209,2.23971,2.14656
falkland is malvinas,,,,,,,,,,,...,17.38345,17.35407,19.77306,19.69341,19.60129,19.51652,18.23003,18.16981,18.11594,18.0624
kyrgyz republic,,,,,,,,,,,...,1.15687,1.05367,1.02762,1.0848,1.44494,1.26213,1.16819,1.37851,1.79171,1.71282
lao,,,,,,,,,,,...,0.24622,0.24444,0.26564,0.1531,0.15648,0.20439,0.2618,0.25512,0.33364,0.33045


In [636]:
# really?
notin2010.sum(1)

country
central african republic            3.64602
christmas island                 1038.49855
cook is                            80.71116
cura ao                            73.54888
czech republic                   1180.87949
czechoslovakia                    804.01304
dominican republic                 78.74027
falkland is malvinas             2076.80256
kyrgyz republic                    28.23056
lao                                 6.69821
north korea                       314.94578
north yemen former                  4.99095
palestine                           7.61111
puerto rico                         0.15930
south korea                       303.92670
south yemen former                 75.39023
st helena                          62.08906
st kitts and nevis                114.94430
st lucia                           70.48466
st vincent and the grenadines      50.74810
st pierre et miquelon             564.86305
ussr                              470.16513
united korea former     

In [637]:
# add notin_2013 rows to the 2013 dataset

co2_2013.set_index('country', inplace=True)
co2_2013 = co2_2013.append(notin_2013)

# there now should be 231 rows instead of 229
co2_2013.info()

<class 'pandas.core.frame.DataFrame'>
Index: 231 entries, 0 to western sahara
Columns: 517 entries, 1751 to country
dtypes: float64(516), object(1)
memory usage: 934.8+ KB


In [639]:
#co2_2013.reset_index().sort_values('country').reset_index(drop=True)

Unnamed: 0,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,country
0,,,,,,,,,,,...,,,,,,,,,,afghanistan
1,,,,,,,,,,,...,,,,,,,,,,albania
2,,,,,,,,,,,...,,,,,,,,,,algeria
3,,,,,,,,,,,...,,,,,,,,,,andorra
4,,,,,,,,,,,...,,,,,,,,,,angola
5,,,,,,,,,,,...,,,,,,,,,,anguilla
6,,,,,,,,,,,...,,,,,,,,,,antigua and barbuda
7,,,,,,,,,,,...,,,,,,,,,,argentina
8,,,,,,,,,,,...,,,,,,,,,,armenia
9,,,,,,,,,,,...,,,,,,,,,,aruba


In [611]:
# Hmmmm...one thing I didn't check for was all-NaN rows here! 
# only did it for the 'notin_2010/2013' DFs

#co2_2013.set_index('country', inplace=True)
co2_2013.isnull().all(1)

country
afghanistan                 False
albania                     False
algeria                     False
andorra                     False
angola                      False
anguilla                    False
antigua and barbuda         False
argentina                   False
armenia                     False
aruba                       False
australia                   False
austria                     False
azerbaijan                  False
bahamas                     False
bahrain                     False
bangladesh                  False
barbados                    False
belarus                     False
belgium                     False
belize                      False
benin                       False
bermuda                     False
bhutan                      False
bolivia                     False
bosnia and herzegovina      False
botswana                    False
brazil                      False
british virgin islands      False
brunei                      False
bulgar

In [624]:
co2_2013.loc['france']

1751   NaN
1752   NaN
1753   NaN
1754   NaN
1755   NaN
1756   NaN
1757   NaN
1758   NaN
1759   NaN
1760   NaN
1761   NaN
1762   NaN
1763   NaN
1764   NaN
1765   NaN
1766   NaN
1767   NaN
1768   NaN
1769   NaN
1770   NaN
1771   NaN
1772   NaN
1773   NaN
1774   NaN
1775   NaN
1776   NaN
1777   NaN
1778   NaN
1779   NaN
1780   NaN
        ..
1983   NaN
1984   NaN
1985   NaN
1986   NaN
1987   NaN
1988   NaN
1989   NaN
1990   NaN
1991   NaN
1992   NaN
1993   NaN
1994   NaN
1995   NaN
1996   NaN
1997   NaN
1998   NaN
1999   NaN
2000   NaN
2001   NaN
2002   NaN
2003   NaN
2004   NaN
2005   NaN
2006   NaN
2007   NaN
2008   NaN
2009   NaN
2010   NaN
2011   NaN
2012   NaN
Name: france, Length: 516, dtype: float64

In [614]:
# save as a csv so it can be 
# batched-processed with the other datasets
co2_2013.to_csv(originalpath + '!CO2_2013.csv', index=False)