In [23]:
import numpy as np
import pandas as pd
from os import walk # to get list of filenames
import sys # to print function name within function
import pprint as pp

#for country name cleanup
import unidecode
import re

# for visualizing data
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()


# Helper functions

In [2]:
# create dfs for later merging

def make_df(filename):
    name = filename.split('.')[0]
    f_type = filename.split('.')[1]
    if f_type=='xlsx':
        df = pd.read_excel(mypath + filename)
    elif f_type=='csv':
        df = pd.read_csv(mypath + filename)
    else:
        print 'Unknown file type'
    
    # make sure all DFs have a 'country' column
    # to simplify .join() later
    df.rename(columns={df.columns[0]:'country'}, inplace=True)
    
    # the .name attribute will provide be used
    # when reshaping, as the label for the value column
    df.name = name
    
    return df

def make_df_long(df):
    vals = df.columns[1:].values
    df_long = pd.melt(
        df, id_vars=['country'], value_vars=vals
    ).sort_values(
        ['country', 'variable']).rename(
        columns={'value':df.name, 'variable':'year'}
                         ).reset_index(drop=True
                                      )
    df_long.name = df.name
    df_long['year']=df_long['year'].astype('str')
    return df_long

In [3]:
# clean up country names before merging

# helper functions

# https://gist.github.com/gornostal/1f123aaf838506038710
def force_to_unicode(text):
    "If text is unicode, it is returned as is. If it's str, convert it to Unicode using UTF-8 encoding"
    return text if isinstance(text, unicode) else text.decode('utf8')

def remove_special_char(series):
    clean_series = re.sub('\W+',' ', series)
    return clean_series

def fix_country_col(df):
    country_col = str(df.columns[0])
    df.rename(columns = {country_col : 'country'}, inplace = True)
    return df

def clean_country_col(df):
    # trim, remove accents & special chars
    # make lowercase
    df['country'] = (
        df['country'].apply(force_to_unicode)
        .apply(remove_special_char)
        .str.lower().str.strip()
    )
    return df

def add_regions(df):
    regions = pd.read_csv('data/countries_with_regions.csv')
    regions = clean_country_col(regions)
    df = regions.merge(df, on='country', how='left')
    
    return df

def clean_gapminder_df(df):
    name = df.name
    df = fix_country_col(df)

    # remove special chars & accents
    df = clean_country_col(df)
    #df['country'] = df['country'].apply(force_to_unicode).apply(remove_special_char).str.lower().str.strip()
    
    # restore title case
    df['country'] = df['country'].str.title()
    
    # re-assign original df name
    df.name = name

    return df

# create regions df
#regions = pd.read_json('data/all_countries.json')
#regions = regions[['name', 'region', 'sub-region']]

#regions = clean_gapminder_df(regions)

# save as csv
#regions.to_csv('data/regions_cleaned.csv', index = False)

In [4]:
# create list of filenames

def list_files(mypath):
    files = []
    [files.extend(filenames) for (dirpath, dirnames, filenames) in walk(mypath)]
    return files    

In [5]:
# formatting stuff

def repeat_to_length(s, wanted):
    return (s * (wanted//len(s) + 1))[:wanted]

In [6]:
# inspect gapminder DF

def df_min_max(df):
    #print '\n'
    print sys._getframe().f_code.co_name
    print df.columns[1:].min()
    print df.columns[1:].max()

# any years with missing data?
def df_yrs_nan_vals(df):
    #print '\n'  
    print sys._getframe().f_code.co_name
    print df.isnull().any().sum()

# any countries that have no data at all?
def df_countries_no_data(df):
    #print '\n'   
    print sys._getframe().f_code.co_name
    print df.iloc[:,3:].isnull().all().sum()

def inspect_df(df):
    name = str.upper(df.name)
    #print '\n'  
    print 'Inspecting %s:' % name
    df_min_max(df)
    df_yrs_nan_vals(df)
    df_countries_no_data(df)
    print repeat_to_length('*', 30)
    raw_input('Press <ENTER> to continue')
    print '\n'
    print df.info()
    print repeat_to_length('*', 30)
    raw_input('Press <ENTER> to continue')
    print '\n'
    print df.head()
    raw_input('Press <ENTER> to continue')

In [7]:
def get_val_names(dfs):
    names = [df.name.split('_')[0] for df in dfs]
    return names

In [8]:
# reshape into long format for easier plotting

def reshape_for_plot(df):
    #df = df.reset_index().copy()
    #df = df.set_index(['region', 'sub-region', 'country'])
    name = df.name.split('_')[0]
    df = df.set_index(['region', 'sub-region', 'country'])
    df = df.sort_index(level = 0)

    t = df.T
    t = t.unstack(level = 1)
    df = pd.DataFrame(t)
    
    df = df.reset_index()
    df = df.rename(columns={'level_1':'year', 0: name})
    
    # make sure year column is int type
    #df['year'] = df['year'].astype('int64')
    df.name = name

    return df

# Look at source files

In [9]:
originalpath = 'data/updates_nov18/'
cleanpath = 'data/cleaned_nov18/'
mypath = originalpath

originals = list_files(mypath)
originals

['!CO2_2013.csv',
 'energy use per person.xlsx',
 'energy_production_per_person.csv',
 'hdi_human_development_index.csv',
 'income_per_person_gdppercapita_ppp_inflation_adjusted.csv',
 'inequality_index_gini.csv',
 'military_expenditure_percent_of_gdp.csv',
 'population.xlsx',
 'pump_price_for_gasoline_us_per_liter.csv']

# Convert to DF

In [10]:
# batch convert files to df's
# and save filename info in another df

# extract file extension as new column
files_df = pd.Series(originals).str.split('.', expand=True).rename(columns={0:'clean_name', 1:'type'})

# keep original filename
files_df['orig_name']=originals

# convert files to DF
dfs = files_df['orig_name'].apply(make_df)

# remove accents and special chars
dfs = dfs.apply(clean_gapminder_df)

files_df

Unnamed: 0,clean_name,type,orig_name
0,!CO2_2013,csv,!CO2_2013.csv
1,energy use per person,xlsx,energy use per person.xlsx
2,energy_production_per_person,csv,energy_production_per_person.csv
3,hdi_human_development_index,csv,hdi_human_development_index.csv
4,income_per_person_gdppercapita_ppp_inflation_a...,csv,income_per_person_gdppercapita_ppp_inflation_a...
5,inequality_index_gini,csv,inequality_index_gini.csv
6,military_expenditure_percent_of_gdp,csv,military_expenditure_percent_of_gdp.csv
7,population,xlsx,population.xlsx
8,pump_price_for_gasoline_us_per_liter,csv,pump_price_for_gasoline_us_per_liter.csv


# Clean filenames for use as labels

In [11]:
def clean_names(series):
    # replace spaces with undescores,
    # remove or shorten meaningless words
    series = series.str.upper()\
    .str.replace(' ', '_')\
    .str.replace('INDICATOR_', '')\
    .str.replace('PER_CAPITA', 'pc')\
    .str.replace('PER_PERSON', 'pc')\
    .str.lower()

    return series

In [12]:
files_df['clean_name'] = clean_names(files_df['clean_name'])

In [13]:
files_df

Unnamed: 0,clean_name,type,orig_name
0,!co2_2013,csv,!CO2_2013.csv
1,energy_use_pc,xlsx,energy use per person.xlsx
2,energy_production_pc,csv,energy_production_per_person.csv
3,hdi_human_development_index,csv,hdi_human_development_index.csv
4,income_pc_gdppercapita_ppp_inflation_adjusted,csv,income_per_person_gdppercapita_ppp_inflation_a...
5,inequality_index_gini,csv,inequality_index_gini.csv
6,military_expenditure_percent_of_gdp,csv,military_expenditure_percent_of_gdp.csv
7,population,xlsx,population.xlsx
8,pump_price_for_gasoline_us_per_liter,csv,pump_price_for_gasoline_us_per_liter.csv


In [15]:
keep_strings = [None, None, None, 0, 1, 0, 1, None, 1]
files_df['clean_name'].str.split("_")

0                                         [!co2, 2013]
1                                    [energy, use, pc]
2                             [energy, production, pc]
3                     [hdi, human, development, index]
4    [income, pc, gdppercapita, ppp, inflation, adj...
5                            [inequality, index, gini]
6            [military, expenditure, percent, of, gdp]
7                                         [population]
8         [pump, price, for, gasoline, us, per, liter]
Name: clean_name, dtype: object

In [16]:
# manually fix certain rows
fix_these_rows = [0,3,4,5,6,8]
needs_fixing = files_df.iloc[fix_these_rows]['clean_name']
needs_fixing.values

array(['!co2_2013', 'hdi_human_development_index',
       'income_pc_gdppercapita_ppp_inflation_adjusted',
       'inequality_index_gini', 'military_expenditure_percent_of_gdp',
       'pump_price_for_gasoline_us_per_liter'], dtype=object)

In [17]:
# copy-paste-modify
fixed = np.array(['CO2_pc', 'hdi',
       'income_pc',
       'gini_idx', 'military_%gdp',
       'gas_price_liter'])

In [18]:
# update rows with cleaned names
files_df['clean_name'].iloc[fix_these_rows] = fixed

In [19]:
# verify
files_df['clean_name']

0                  CO2_pc
1           energy_use_pc
2    energy_production_pc
3                     hdi
4               income_pc
5                gini_idx
6           military_%gdp
7              population
8         gas_price_liter
Name: clean_name, dtype: object

In [20]:
# assign clean name as the df.name attribute
# these will be used later in plot titles
def update_name(df, clean_name):
    df.name = clean_name

# could not figure out how to do this with pd.Series.apply
map(update_name, dfs, files_df['clean_name'].values)

[None, None, None, None, None, None, None, None, None]

In [21]:
# update files_df to list df.names
dfnames = [df.name for df in dfs]
files_df['df_name'] = dfnames
files_df

Unnamed: 0,clean_name,type,orig_name,df_name
0,CO2_pc,csv,!CO2_2013.csv,CO2_pc
1,energy_use_pc,xlsx,energy use per person.xlsx,energy_use_pc
2,energy_production_pc,csv,energy_production_per_person.csv,energy_production_pc
3,hdi,csv,hdi_human_development_index.csv,hdi
4,income_pc,csv,income_per_person_gdppercapita_ppp_inflation_a...,income_pc
5,gini_idx,csv,inequality_index_gini.csv,gini_idx
6,military_%gdp,csv,military_expenditure_percent_of_gdp.csv,military_%gdp
7,population,xlsx,population.xlsx,population
8,gas_price_liter,csv,pump_price_for_gasoline_us_per_liter.csv,gas_price_liter


In [22]:
# add value names to files_df
# for use later in long format df's
valnames = [name.split('_')[0] for name in dfnames]
files_df['val_name'] = valnames
files_df

Unnamed: 0,clean_name,type,orig_name,df_name,val_name
0,CO2_pc,csv,!CO2_2013.csv,CO2_pc,CO2
1,energy_use_pc,xlsx,energy use per person.xlsx,energy_use_pc,energy
2,energy_production_pc,csv,energy_production_per_person.csv,energy_production_pc,energy
3,hdi,csv,hdi_human_development_index.csv,hdi,hdi
4,income_pc,csv,income_per_person_gdppercapita_ppp_inflation_a...,income_pc,income
5,gini_idx,csv,inequality_index_gini.csv,gini_idx,gini
6,military_%gdp,csv,military_expenditure_percent_of_gdp.csv,military_%gdp,military
7,population,xlsx,population.xlsx,population,population
8,gas_price_liter,csv,pump_price_for_gasoline_us_per_liter.csv,gas_price_liter,gas


In [20]:
# IS THIS STEP EVEN NEEDED?
# Save all to new CSV files
#dfs.apply(lambda df: df.to_csv(cleanpath + df.name + '.' + 'csv', index = False))

# Inspect each DF

In [31]:
[inspect_df(df) for df in dfs]

Inspecting CO2_PC:
df_min_max
1751
2013
df_yrs_nan_vals
263
df_countries_no_data
0
******************************
Press <ENTER> to continue


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229 entries, 0 to 228
Columns: 264 entries, country to 2013
dtypes: float64(263), object(1)
memory usage: 472.4+ KB
None
******************************
Press <ENTER> to continue


       country  1751  1752  1753  1754  1755  1756  1757  1758  1759   ...     \
0  Afghanistan   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      
1      Albania   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      
2      Algeria   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      
3      Andorra   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      
4       Angola   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   ...      

      2004     2005     2006     2007     2008     2009     2010     2011  \
0  0.04041  0.05440  0.06552  0.08785  0.15895  0.24905  0.30291

Press <ENTER> to continue


       country   1990   1991   1992   1993   1994   1995   1996   1997   1998  \
0  Afghanistan  0.295  0.300  0.309  0.305  0.300  0.324  0.328  0.332  0.335   
1      Albania  0.635  0.618  0.603  0.608  0.616  0.628  0.637  0.636  0.646   
2      Algeria  0.577  0.581  0.587  0.591  0.595  0.600  0.609  0.617  0.627   
3      Andorra    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
4       Angola    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   

   ...     2006   2007   2008   2009   2010   2011   2012   2013   2014   2015  
0  ...    0.415  0.433  0.434  0.448  0.454  0.463  0.470  0.476  0.479  0.479  
1  ...    0.703  0.713  0.721  0.725  0.738  0.752  0.759  0.761  0.762  0.764  
2  ...    0.690  0.697  0.705  0.714  0.724  0.732  0.737  0.741  0.743  0.745  
3  ...      NaN    NaN    NaN    NaN  0.819  0.819  0.843  0.850  0.857  0.858  
4  ...    0.454  0.468  0.480  0.488  0.495  0.508  0.523  0.527  0.531  0.533  

Press <ENTER> to continue


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275 entries, 0 to 274
Data columns (total 82 columns):
country    275 non-null object
1800       229 non-null float64
1810       229 non-null float64
1820       229 non-null float64
1830       229 non-null float64
1840       229 non-null float64
1850       229 non-null float64
1860       229 non-null float64
1870       229 non-null float64
1880       229 non-null float64
1890       229 non-null float64
1900       229 non-null float64
1910       229 non-null float64
1920       229 non-null float64
1930       229 non-null float64
1940       229 non-null float64
1950       256 non-null float64
1951       256 non-null float64
1952       256 non-null float64
1953       256 non-null float64
1954       256 non-null float64
1955       256 non-null float64
1956       256 non-null float64
1957       256 non-null float64
1958       256 non-null float64
1959       256 non-null float64
1960       256 non-null float64
1961

[None, None, None, None, None, None, None, None, None]

# Create long DFs

In [812]:
dfs[0].head()

Unnamed: 0,country,1751,1752,1753,1754,1755,1756,1757,1758,1759,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,Afghanistan,,,,,,,,,,...,0.04041,0.0544,0.06552,0.08785,0.15895,0.24905,0.30291,0.42522,0.68802,0.69312
1,Albania,,,,,,,,,,...,1.34203,1.37998,1.27761,1.30428,1.47382,1.49426,1.58449,1.81554,1.63562,1.66974
2,Algeria,,,,,,,,,,...,2.72677,3.21986,2.99727,3.19557,3.16824,3.42982,3.30686,3.30026,3.47163,3.51446
3,Andorra,,,,,,,,,,...,7.49969,7.39095,6.83994,6.62244,6.52724,6.17852,6.0921,5.70224,5.61408,5.52625
4,Angola,,,,,,,,,,...,1.08651,1.06932,1.20077,1.31098,1.29557,1.35427,1.36921,1.38263,1.47212,1.38437


In [813]:
long_dfs = pd.Series(dfs).apply(make_df_long)
len(long_dfs)

9

# Join long DFs

In [242]:
# thank you http://notconfusing.com/joining-many-dataframes-at-once-in-pandas-n-ary-join/
def merge_dfs(ldf, rdf):
    right_on = list(rdf.columns[0:2])
    left_on = list(ldf.columns[0:2])
    return ldf.merge(rdf, how='left', left_on=left_on, right_on=right_on)
    

In [1042]:
final_df = reduce(merge_dfs, long_dfs) #that's the magic
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60227 entries, 0 to 60226
Data columns (total 11 columns):
country                 60227 non-null object
year                    60227 non-null object
CO2_pc                  17055 non-null float64
energy_use_pc           5366 non-null float64
energy_production_pc    6684 non-null float64
hdi                     3948 non-null float64
income_pc               40660 non-null float64
gini_idx                5670 non-null float64
military_%gdp           8910 non-null float64
population              17769 non-null float64
gas_price_liter         4094 non-null float64
dtypes: float64(9), object(2)
memory usage: 5.5+ MB


In [815]:
final_df.tail()

Unnamed: 0,country,year,CO2_pc,energy_use_pc,energy_production_pc,hdi,income_pc,gini_idx,military_%gdp,population,gas_price_liter
60222,Zimbabwe,2009,0.60741,0.754557,0.00068,0.436,1290.0,0.0,0.0,13720997.0,0.0
60223,Zimbabwe,2010,0.66307,0.764014,,0.452,1460.0,0.0,0.969,13973897.0,1.29
60224,Zimbabwe,2011,0.81561,,,0.464,1660.0,43.2,1.64,14255592.0,0.0
60225,Zimbabwe,2012,0.88788,,,0.488,1850.0,0.0,2.23,14565482.0,1.52
60226,Zimbabwe,2013,0.92491,,,0.498,1900.0,0.0,2.31,14898092.0,0.0


## Look at correlations

In [1043]:
cols = final_df.columns[2:].values
final_df.loc[:, cols].corr()

Unnamed: 0,CO2_pc,energy_use_pc,energy_production_pc,hdi,income_pc,gini_idx,military_%gdp,population,gas_price_liter
CO2_pc,1.0,0.838864,0.649464,0.562471,0.712615,0.045282,0.055104,-0.011788,-0.03856
energy_use_pc,0.838864,1.0,0.532408,0.61737,0.770349,0.006504,0.001763,-0.065478,-0.003242
energy_production_pc,0.649464,0.532408,1.0,0.261024,0.73499,0.003256,0.090581,-0.049489,-0.078085
hdi,0.562471,0.61737,0.261024,1.0,0.685285,0.223793,0.032425,-0.027315,0.088884
income_pc,0.712615,0.770349,0.73499,0.685285,1.0,0.145248,0.107434,-0.008184,0.058127
gini_idx,0.045282,0.006504,0.003256,0.223793,0.145248,1.0,-0.043002,-0.005998,0.135218
military_%gdp,0.055104,0.001763,0.090581,0.032425,0.107434,-0.043002,1.0,0.017828,-0.067794
population,-0.011788,-0.065478,-0.049489,-0.027315,-0.008184,-0.005998,0.017828,1.0,0.00364
gas_price_liter,-0.03856,-0.003242,-0.078085,0.088884,0.058127,0.135218,-0.067794,0.00364,1.0


## Add region & sub_region columns

In [1049]:
regions = pd.read_csv('data/countries_with_regions.csv')

# make 'country' lowercase in both df's
regions['country'] = regions['country'].str.lower()
final_df['country'] = final_df['country'].str.lower()

final_df = final_df.merge(regions, on='country', how='left')
final_df.head()

Unnamed: 0,country,year,CO2_pc,energy_use_pc,energy_production_pc,hdi,income_pc,gini_idx,military_%gdp,population,gas_price_liter,region,sub-region
0,afghanistan,1949,0.00182,,,,1030.0,,,,,Asia,Southern Asia
1,afghanistan,1950,0.01088,,,,1040.0,,,7752118.0,,Asia,Southern Asia
2,afghanistan,1951,0.01169,,,,1060.0,,,7839426.0,,Asia,Southern Asia
3,afghanistan,1952,0.01155,,,,1070.0,,,7934798.0,,Asia,Southern Asia
4,afghanistan,1953,0.01323,,,,1120.0,,,8038312.0,,Asia,Southern Asia


In [1050]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17055 entries, 0 to 17054
Data columns (total 13 columns):
country                 17055 non-null object
year                    17055 non-null object
CO2_pc                  17055 non-null float64
energy_use_pc           5324 non-null float64
energy_production_pc    5978 non-null float64
hdi                     3898 non-null float64
income_pc               14969 non-null float64
gini_idx                5281 non-null float64
military_%gdp           8016 non-null float64
population              12517 non-null float64
gas_price_liter         3995 non-null float64
region                  15328 non-null object
sub-region              15328 non-null object
dtypes: float64(9), object(4)
memory usage: 1.8+ MB


## Remove rows without CO2

This process can add rows without CO2 values. 

Since I'm most primarily interested by CO2, might as well remove those rows: less visual clutter, and it saves memory.

In [1051]:
# how many rows are missing CO2 emission values?,
len(final_df.loc[final_df['CO2_pc'].isnull()])

0

In [1046]:
# drop all countries missing CO2 data
no_co2 = final_df[final_df['CO2_pc'].isnull()].index
final_df.drop(no_co2, inplace=True)
final_df.reset_index(inplace=True, drop=True)

In [1052]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17055 entries, 0 to 17054
Data columns (total 13 columns):
country                 17055 non-null object
year                    17055 non-null object
CO2_pc                  17055 non-null float64
energy_use_pc           5324 non-null float64
energy_production_pc    5978 non-null float64
hdi                     3898 non-null float64
income_pc               14969 non-null float64
gini_idx                5281 non-null float64
military_%gdp           8016 non-null float64
population              12517 non-null float64
gas_price_liter         3995 non-null float64
region                  15328 non-null object
sub-region              15328 non-null object
dtypes: float64(9), object(4)
memory usage: 1.8+ MB


Looks like there are ~2000 rows without region data. Sigh.

# Fill in missing regions

This is insanely painful.

In [1053]:
# how many countries missing region data?
missing_regions = final_df.loc[final_df['region'].isnull()]

# which countries are missing regions?
print(missing_regions['country'].unique())
len(missing_regions['country'].unique())

[u'central african republic' u'christmas island' u'congo dem rep'
 u'congo rep' u'cook is' u'cura ao' u'czech republic' u'czechoslovakia'
 u'dominican republic' u'east germany' u'falkland is malvinas'
 u'kyrgyz republic' u'lao' u'liechtenstein' u'micronesia fed sts'
 u'north korea' u'north yemen former' u'palestine' u'puerto rico'
 u'serbia and montenegro' u'south korea' u'south sudan'
 u'south yemen former' u'st helena' u'st kitts and nevis' u'st lucia'
 u'st pierre et miquelon' u'st vincent and the grenadines'
 u'united korea former' u'ussr' u'west germany' u'yemen' u'yugoslavia']


33

In [1054]:
# try matching with original json regions file
regions = pd.read_csv('data/regions_cleaned.csv')

# make 'country' lowercase in this df
regions['country'] = clean_country_col(regions)

# drop region & sub-region first, since the merge will append them
missing_regions = missing_regions\
    .drop(['region', 'sub-region'], axis=1)\
    .merge(regions, on='country', how='left')

In [1055]:
# how many countries found matches?
len(missing_regions['country'].loc[~missing_regions['region'].isnull()].unique())

7

## Painful partial-string matching

It's not elegant, but it works in *most* cases.
Would love feedback on how to make this better.

### Remove 'stop' words

In [1056]:
stop_words = ['st','north','south', 'east','west', 'united']

still_missing = missing_regions['country'].loc[missing_regions['region'].isnull()]
s = pd.Series(still_missing.astype(str).unique())
co2_country = s

# remove stop words, and 
# convert country names to keywords
s.split = s.apply(str.split)
s = s.split.apply(lambda country: country[1] if country[0] in stop_words else country[0])\

match_this = pd.concat([co2_country, s], axis=1)\
    .rename(columns={0:'CO2_country', 1:'keyword'})\
    .set_index('CO2_country')
    
match_this.reset_index()

Unnamed: 0,CO2_country,keyword
0,christmas island,christmas
1,congo dem rep,congo
2,congo rep,congo
3,cook is,cook
4,cura ao,cura
5,czechoslovakia,czechoslovakia
6,east germany,germany
7,falkland is malvinas,falkland
8,kyrgyz republic,kyrgyz
9,lao,lao


In [941]:
def match_regions(kw):
    '''checks all countries in regions df
    to see if any contain the keyword.
    If so, returns the matching row as a df'''
    kw_match = regions['country'].str.contains(kw)
    if regions['country'].loc[kw_match].all():
        result = regions.loc[kw_match]
        result['keyword'] = kw
        return result
    else:
        pass

In [1057]:
# create list of matching dfs with region values
matched_regions_list = match_this['keyword'].apply(match_regions).tolist()

# combine these dfs
results = pd.concat(matched_regions_list).drop_duplicates()

# merge with df of countries still missing regions
match_this = match_this.reset_index()\
    .merge(results, on='keyword', how='outer')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [972]:
match_this

Unnamed: 0,CO2_country,keyword,country,region,sub-region
0,christmas island,christmas,christmas island,,
1,congo dem rep,congo,congo,Africa,Middle Africa
2,congo dem rep,congo,congo democratic republic of the,Africa,Middle Africa
3,congo rep,congo,congo,Africa,Middle Africa
4,congo rep,congo,congo democratic republic of the,Africa,Middle Africa
5,cook is,cook,cook islands,Oceania,Polynesia
6,cura ao,cura,curacao,Americas,Caribbean
7,czechoslovakia,czechoslovakia,,,
8,east germany,germany,germany,Europe,Western Europe
9,west germany,germany,germany,Europe,Western Europe


A challenge here was that some keywords naturally had more than one match, like 'congo' and 'germany'. So when merging the list of countries missing region data with the possible matches, it created additional rows, which I call 'virtual rows'.

In [1058]:
# drop the "virtual rows" created by merge
match_this = match_this.drop_duplicates(subset=['CO2_country'])

# remove redundant columns
match_this = match_this.drop(['keyword', 'country'], axis=1).rename(columns={'CO2_country':'country'})
match_this.reset_index(drop=True, inplace=True)

In [1059]:
# verify
match_this

Unnamed: 0,country,region,sub-region
0,christmas island,,
1,congo dem rep,Africa,Middle Africa
2,congo rep,Africa,Middle Africa
3,cook is,Oceania,Polynesia
4,cura ao,Americas,Caribbean
5,czechoslovakia,,
6,east germany,Europe,Western Europe
7,west germany,Europe,Western Europe
8,falkland is malvinas,Americas,South America
9,kyrgyz republic,Asia,Central Asia


In [1060]:
# manually add missing values
match_this.loc[[0,5,24,25], 'region'] = ['Oceania', 'Europe', 'Europe', 'Europe']
match_this.loc[[0,5,24,25], 'sub-region'] = ['Australia and New Zealand', 'Eastern Europe', 'Eastern Europe', 'Southern Europe']

In [1061]:
match_this

Unnamed: 0,country,region,sub-region
0,christmas island,Oceania,Australia and New Zealand
1,congo dem rep,Africa,Middle Africa
2,congo rep,Africa,Middle Africa
3,cook is,Oceania,Polynesia
4,cura ao,Americas,Caribbean
5,czechoslovakia,Europe,Eastern Europe
6,east germany,Europe,Western Europe
7,west germany,Europe,Western Europe
8,falkland is malvinas,Americas,South America
9,kyrgyz republic,Asia,Central Asia


In [833]:
## helper function

def update_df(ldf, rdf):
    
    # make 'country' lowercase
    try:
        ldf['country'] = ldf['country'].str.lower()
        rdf['country'] = rdf['country'].str.lower()
    except KeyError, e:
        print("The 'country' column is already indexed. Reset index & try again.")
        pass
    
    # set up common index labels in both df's
    ldf.set_index(['country', 'year'], inplace=True)
    rdf.set_index(['country', 'year'], inplace=True)
    
    ldf.update(rdf)
    
    # restore indexes
    ldf.reset_index(inplace=True)
    rdf.reset_index(inplace=True)

In [1064]:
# merge matched values with missing_regions df
update_missing = missing_regions.drop(['region', 'sub-region'], axis=1)\
    .loc[missing_regions['region'].isnull()]\
    .merge(match_this, on='country', how='left')

In [1065]:
update_df(missing_regions, update_missing)

In [1117]:
test = final_df.copy()

In [1118]:
test.set_index(['country', 'year'], inplace = True)
missing_regions.set_index(['country', 'year'], inplace = True)

In [1119]:
# replace values
test.loc[test['region'].isnull(), ['region']] = missing_regions['region']
test.loc[test['sub-region'].isnull(), ['sub-region']] = missing_regions['sub-region']

In [1120]:
test.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 17055 entries, (afghanistan, 1949) to (zimbabwe, 2013)
Data columns (total 11 columns):
CO2_pc                  17055 non-null float64
energy_use_pc           5324 non-null float64
energy_production_pc    5978 non-null float64
hdi                     3898 non-null float64
income_pc               14969 non-null float64
gini_idx                5281 non-null float64
military_%gdp           8016 non-null float64
population              12517 non-null float64
gas_price_liter         3995 non-null float64
region                  17055 non-null object
sub-region              17055 non-null object
dtypes: float64(9), object(2)
memory usage: 1.5+ MB


In [1121]:
# sanity check

test_countries = ['lao', 'congo rep', 'ussr', 'canada']
q = "country in @test_countries"
test.reset_index().query(q)

Unnamed: 0,country,year,CO2_pc,energy_use_pc,energy_production_pc,hdi,income_pc,gini_idx,military_%gdp,population,gas_price_liter,region,sub-region
2464,canada,1785,0.00677,,,,,,,,,Americas,Northern America
2465,canada,1786,0.00669,,,,,,,,,Americas,Northern America
2466,canada,1787,0.00661,,,,,,,,,Americas,Northern America
2467,canada,1788,0.00654,,,,,,,,,Americas,Northern America
2468,canada,1789,0.00646,,,,,,,,,Americas,Northern America
2469,canada,1790,0.00639,,,,,,,,,Americas,Northern America
2470,canada,1791,0.00631,,,,,,,,,Americas,Northern America
2471,canada,1792,0.00624,,,,,,,,,Americas,Northern America
2472,canada,1793,0.00617,,,,,,,,,Americas,Northern America
2473,canada,1794,0.00609,,,,,,,,,Americas,Northern America


## Rearrange columns

In [1122]:
# rearrange column order so region info beside country col
test.reset_index(inplace=True)
cols = test.columns.tolist()
cols

['country',
 'year',
 'CO2_pc',
 'energy_use_pc',
 'energy_production_pc',
 'hdi',
 'income_pc',
 'gini_idx',
 'military_%gdp',
 'population',
 'gas_price_liter',
 'region',
 'sub-region']

In [1123]:
newcols = ['country','region','sub-region','year', 'CO2_pc',\
           'energy_use_pc','energy_production_pc','hdi','income_pc','gini_idx',\
           'military_%gdp','population','gas_price_liter']
test = test.reindex(columns = newcols)
test.head()

Unnamed: 0,country,region,sub-region,year,CO2_pc,energy_use_pc,energy_production_pc,hdi,income_pc,gini_idx,military_%gdp,population,gas_price_liter
0,afghanistan,Asia,Southern Asia,1949,0.00182,,,,1030.0,,,,
1,afghanistan,Asia,Southern Asia,1950,0.01088,,,,1040.0,,,7752118.0,
2,afghanistan,Asia,Southern Asia,1951,0.01169,,,,1060.0,,,7839426.0,
3,afghanistan,Asia,Southern Asia,1952,0.01155,,,,1070.0,,,7934798.0,
4,afghanistan,Asia,Southern Asia,1953,0.01323,,,,1120.0,,,8038312.0,


In [1124]:
final_df = test

In [1128]:
# make country strings title case & save as csv
final_df['country'] = final_df['country'].str.title()
final_df.to_csv('data/final_nov18/final_df.csv', index=False)

In [1126]:
final_df.head()

Unnamed: 0,country,region,sub-region,year,CO2_pc,energy_use_pc,energy_production_pc,hdi,income_pc,gini_idx,military_%gdp,population,gas_price_liter
0,Afghanistan,Asia,Southern Asia,1949,0.00182,,,,1030.0,,,,
1,Afghanistan,Asia,Southern Asia,1950,0.01088,,,,1040.0,,,7752118.0,
2,Afghanistan,Asia,Southern Asia,1951,0.01169,,,,1060.0,,,7839426.0,
3,Afghanistan,Asia,Southern Asia,1952,0.01155,,,,1070.0,,,7934798.0,
4,Afghanistan,Asia,Southern Asia,1953,0.01323,,,,1120.0,,,8038312.0,


# Updates

The original CO2 dataset went up to 2012, but 2010 was the last year there was substantial data.
I'd seen references online to 2014 data, but finding the actual data files was a challenge. I had several false starts.

Finally, it was the [Open Numbers github account](https://github.com/open-numbers/ddf--gapminder--co2_emission) that helped me.

In [746]:
# load data
url_co2 = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--co2_emission/master/ddf--datapoints--co2_emissions_tonnes_per_person--by--country--year.csv'
url_countries = 'https://raw.githubusercontent.com/open-numbers/ddf--gapminder--co2_emission/master/ddf--entities--country.csv'

co2 = pd.read_csv(url_co2) # emissions values
countries = pd.read_csv(url_countries) # country key

df = countries.merge(co2, on='country')

# hoping this fixes concat issues when trying add rows from previous version of DF
df['year'] = df['year'].astype('str')

# put into standard wide format
df['country'] = df['name']
df = df.drop('name', axis=1).rename(columns={'co2_emissions_tonnes_per_person': 'CO2'})
df = df.pivot('country', 'year', 'CO2').reset_index()
df.name = 'CO2_2013'
df.head()

year,country,1751,1752,1753,1754,1755,1756,1757,1758,1759,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,Afghanistan,,,,,,,,,,...,0.04041,0.0544,0.06552,0.08785,0.15895,0.24905,0.30291,0.42522,0.68802,0.69312
1,Albania,,,,,,,,,,...,1.34203,1.37998,1.27761,1.30428,1.47382,1.49426,1.58449,1.81554,1.63562,1.66974
2,Algeria,,,,,,,,,,...,2.72677,3.21986,2.99727,3.19557,3.16824,3.42982,3.30686,3.30026,3.47163,3.51446
3,Andorra,,,,,,,,,,...,7.49969,7.39095,6.83994,6.62244,6.52724,6.17852,6.0921,5.70224,5.61408,5.52625
4,Angola,,,,,,,,,,...,1.08651,1.06932,1.20077,1.31098,1.29557,1.35427,1.36921,1.38263,1.47212,1.38437


In [747]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229 entries, 0 to 228
Columns: 264 entries, country to 2013
dtypes: float64(263), object(1)
memory usage: 472.4+ KB


In [748]:
# compare with previous co2 data (last complete year: 2010)
df_old = pd.read_csv('data/archive/1indicator CDIAC carbon_dioxide_emissions_per_capita.csv')
df_old.rename(columns={'CO2 per capita': 'country'}, inplace=True)
df_old.head()

Unnamed: 0,country,1751,1755,1762,1763,1764,1765,1766,1767,1768,...,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012
0,Abkhazia,,,,,,,,,,...,,,,,,,,,,
1,Afghanistan,,,,,,,,,,...,0.022704,0.027472,0.03678,0.04709,0.068312,0.131602,0.213325,0.262174,,
2,Akrotiri and Dhekelia,,,,,,,,,,...,,,,,,,,,,
3,Albania,,,,,,,,,,...,1.382066,1.332966,1.353789,1.22431,1.27942,1.297753,1.215055,1.336544,,
4,Algeria,,,,,,,,,,...,2.899236,2.76222,3.25701,3.113135,3.312875,3.328945,3.564361,3.480977,3.562504,3.785654


In [743]:
df_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Columns: 254 entries, country to 2012
dtypes: float64(253), object(1)
memory usage: 466.4+ KB


In [749]:
# df_old has 235 countries, 6 more than the current df
# But the current df has _9_ more columns (years) than df old.

# the most crucial diff is the countries in the old df that are not in the current one.

# first clean colums in each df
df_old_clean = clean_country_col(df_old)
df_clean = clean_country_col(df)

In [752]:
# 2010 countries NOT in 2013 dataset
notin_2013 = df_old.loc[~df_old['country'].isin(df['country'])]
notin_2013

Unnamed: 0,country,1751,1755,1762,1763,1764,1765,1766,1767,1768,...,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012
0,abkhazia,,,,,,,,,,...,,,,,,,,,,
2,akrotiri and dhekelia,,,,,,,,,,...,,,,,,,,,,
5,american samoa,,,,,,,,,,...,,,,,,,,,,
41,channel islands,,,,,,,,,,...,,,,,,,,,,
44,cocos island,,,,,,,,,,...,,,,,,,,,,
63,eritrea and ethiopia,,,,,,,,,,...,,,,,,,,,,
82,guam,,,,,,,,,,...,,,,,,,,,,
84,guernsey,,,,,,,,,,...,,,,,,,,,,
89,holy see,,,,,,,,,,...,,,,,,,,,,
99,isle of man,,,,,,,,,,...,,,,,,,,,,


OMG. 30 countries. But they seem be mostly NANs.
Let's verify

In [753]:
# of these countries, how many actually have data?
notin_2013.set_index('country', inplace=True)
notin_2013.isnull().all(1)

country
abkhazia                     True
akrotiri and dhekelia        True
american samoa               True
channel islands              True
cocos island                 True
eritrea and ethiopia         True
guam                         True
guernsey                     True
holy see                     True
isle of man                  True
jersey                       True
kosovo                       True
mayotte                      True
monaco                       True
norfolk island               True
northern cyprus              True
northern mariana islands     True
pitcairn                     True
san marino                   True
serbia excluding kosovo      True
somaliland                   True
south ossetia                True
svalbard                     True
tokelau                      True
transnistria                 True
tuvalu                       True
west bank and gaza          False
western sahara              False
virgin islands u s           True
land  

In [754]:
# drop all-NaN rows
notin_2013 = notin_2013.dropna(how='all')
notin_2013

Unnamed: 0_level_0,1751,1755,1762,1763,1764,1765,1766,1767,1768,1769,...,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
west bank and gaza,,,,,,,,,,,...,0.374235,0.535766,0.771369,0.622923,0.623526,0.536602,0.531716,0.585513,,
western sahara,,,,,,,,,,,...,0.612034,0.57268,0.54109,0.516309,0.496383,0.479618,0.464255,0.449262,,


In [755]:
# add notin_2013 rows to the current df

df.set_index('country', inplace=True)
df_appended = df.append(notin_2013)

# there now should be 231 rows instead of 229
df_appended.info()

<class 'pandas.core.frame.DataFrame'>
Index: 231 entries, afghanistan to western sahara
Columns: 263 entries, 1751 to 2013
dtypes: float64(263)
memory usage: 476.4+ KB


In [757]:
df_appended.tail()

Unnamed: 0_level_0,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
yugoslavia,,,,,,,,,,,...,,,,,,,,,,
zambia,,,,,,,,,,,...,0.18293,0.18998,0.16939,0.1396,0.16635,0.18703,0.1947,0.20451,0.23533,0.25084
zimbabwe,,,,,,,,,,,...,0.73716,0.82938,0.79573,0.74338,0.57572,0.60741,0.66307,0.81561,0.88788,0.92491
west bank and gaza,,,,,,,,,,,...,0.535766,0.771369,0.622923,0.623526,0.536602,0.531716,0.585513,,,
western sahara,,,,,,,,,,,...,0.57268,0.54109,0.516309,0.496383,0.479618,0.464255,0.449262,,,


In [758]:
# save as a csv so it can be 
# batched-processed with the other datasets
co2_2013.to_csv(originalpath + '!CO2_2013.csv', index=False)

# More updates

In [4]:
url = 'http://spreadsheets.google.com/pub?key%3Dpyj6tScZqmEdz8B4njtoHPA%26output%3Dxls&sa=D&ust=1542124721313000&usg=AFQjCNFKuU9qj8_w-rJBA0ZP-3yvtXISqg'
gas = pd.read_excel(url)
gas.head()

XLRDError: Unsupported format, or corrupt file: Expected BOF record; found '<HTML><H'