
# Data Prep: Energy Consumption per capita


## Table of Contents
<ul>
<li><a href="#intro">Introduction</a></li>
<li><a href="#wrangling">Data Wrangling</a></li>
<li><a href="#eda">Exploratory Data Analysis</a></li>
<li><a href="#conclusions">Conclusions</a></li>
</ul>

<a id='intro'></a>
## Introduction



In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pprint as pp
%matplotlib inline

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

<a id='wrangling'></a>
## Data Wrangling

> **Tip**: In this section of the report, you will load in the data, check for cleanliness, and then trim and clean your dataset for analysis. Make sure that you document your steps carefully and justify your cleaning decisions.

### General Properties

In [34]:
# read data

energy = pd.read_excel("data/energy use per person.xlsx")

In [35]:
# how big is the dataset?
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print only adds a stray "None" line after the summary.
energy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275 entries, 0 to 274
Data columns (total 53 columns):
Energy use, per capita (toe)    275 non-null object
1960                            25 non-null float64
1961                            25 non-null float64
1962                            25 non-null float64
1963                            25 non-null float64
1964                            25 non-null float64
1965                            26 non-null float64
1966                            26 non-null float64
1967                            26 non-null float64
1968                            26 non-null float64
1969                            26 non-null float64
1970                            26 non-null float64
1971                            109 non-null float64
1972                            109 non-null float64
1973                            109 non-null float64
1974                            109 non-null float64
1975                            109 non-null float64
1976  

In [36]:
# what does it look like?
energy.head()

Unnamed: 0,"Energy use, per capita (toe)",1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011
0,Abkhazia,,,,,,,,,,...,,,,,,,,,,
1,Afghanistan,,,,,,,,,,...,,,,,,,,,,
2,Akrotiri and Dhekelia,,,,,,,,,,...,,,,,,,,,,
3,Albania,,,,,,,,,,...,0.649229,0.645355,0.705188,0.702235,0.662679,0.646799,0.655651,0.653179,0.648291,
4,Algeria,,,,,,,,,,...,0.9161,0.962688,0.956879,0.984091,1.038732,1.086022,1.087619,1.165831,1.138239,


In [37]:
# give country col better name
energy.rename(columns = {'Energy use, per capita (toe)' : 'country'}, inplace = True)

In [38]:
energy

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011
0,Abkhazia,,,,,,,,,,...,,,,,,,,,,
1,Afghanistan,,,,,,,,,,...,,,,,,,,,,
2,Akrotiri and Dhekelia,,,,,,,,,,...,,,,,,,,,,
3,Albania,,,,,,,,,,...,0.649229,0.645355,0.705188,0.702235,0.662679,0.646799,0.655651,0.653179,0.648291,
4,Algeria,,,,,,,,,,...,0.916100,0.962688,0.956879,0.984091,1.038732,1.086022,1.087619,1.165831,1.138239,
5,American Samoa,,,,,,,,,,...,,,,,,,,,,
6,Andorra,,,,,,,,,,...,,,,,,,,,,
7,Angola,,,,,,,,,,...,0.557683,0.583848,0.611989,0.568569,0.599436,0.625843,0.657618,0.679069,0.716494,
8,Anguilla,,,,,,,,,,...,,,,,,,,,,
9,Antigua and Barbuda,,,,,,,,,,...,,,1.629687,1.632585,1.683304,1.698959,,,,


#### Observations

* Data limited: 1960 to 2011 only
* More countries than for either Regions or for CO2 data sets
* Need to clean up special characters and accents

In [39]:
# summary stats
energy.describe()

Unnamed: 0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011
count,25.0,25.0,25.0,25.0,25.0,26.0,26.0,26.0,26.0,26.0,...,135.0,135.0,167.0,168.0,168.0,168.0,136.0,136.0,136.0,34.0
mean,2.293823,2.328809,2.423456,2.555181,2.664216,2.702039,2.771418,2.842704,3.031177,3.213208,...,2.400366,2.490554,2.189576,2.183482,2.228795,2.229288,2.583925,2.480361,2.552229,4.354986
std,2.115968,2.101996,2.086214,2.10446,2.215092,2.165483,2.112981,2.106078,2.239071,2.37165,...,2.856155,2.953796,2.913443,2.81449,2.847418,2.829414,2.966619,2.882564,2.929312,2.908761
min,0.289057,0.322491,0.350101,0.367811,0.410252,0.433786,0.451751,0.47246,0.4863,0.501897,...,0.154673,0.159039,0.013679,0.015975,0.00911,0.009021,0.136886,0.142316,0.141716,1.55056
25%,1.318812,1.396466,1.41273,1.453962,1.522195,1.448053,1.515146,1.732269,1.895476,2.02635,...,0.570778,0.600391,0.501537,0.529688,0.519396,0.55198,0.645838,0.661271,0.692568,2.795252
50%,1.827976,1.894103,2.052054,2.182993,2.320013,2.326527,2.354048,2.318151,2.460666,2.654348,...,1.272482,1.326524,1.006898,1.060038,1.111236,1.104978,1.491151,1.380585,1.431341,3.6714
75%,2.698792,2.742123,2.887236,3.080414,3.212535,3.276062,3.246943,3.304896,3.544039,3.691779,...,3.353006,3.503907,2.793516,2.73813,2.877123,2.855199,3.419548,3.253202,3.260683,5.151622
max,10.523407,10.534018,10.414541,10.465813,11.15005,10.926395,10.480606,10.361973,11.114667,11.974193,...,19.375244,20.098118,23.071099,20.473801,18.749562,17.445729,16.868468,16.904904,16.882499,17.98331


In [40]:
# how many countries have data from 1960?
energy['1960'].count()

25

In [41]:
# Which country has data starting from that year?
energy.loc[pd.notnull(energy['1960'])]

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011
13,Australia,3.063554,3.115787,3.172975,3.284051,3.349414,3.463216,3.546538,3.692808,3.763558,...,5.570153,5.573862,5.514906,5.600235,5.605497,5.683007,5.778817,5.738863,5.593221,5.29594
14,Austria,1.546261,1.554035,1.675874,1.823995,1.855085,1.851844,1.902756,1.920212,2.045828,...,3.767459,3.966704,4.00078,4.103304,4.08586,4.022449,4.023801,3.796843,4.033582,3.875007
21,Belgium,2.519159,2.570532,2.809731,3.043221,3.021711,3.116016,3.051303,3.182923,3.590004,...,5.45473,5.709104,5.650965,5.600428,5.509344,5.366718,5.469582,5.288913,5.585584,5.082476
37,Canada,4.25121,4.307821,4.45156,4.69412,4.903608,5.153932,5.272665,5.557292,5.87152,...,7.914379,8.272314,8.364289,8.424041,8.23677,8.25185,7.945846,7.433974,7.379626,7.426204
59,Denmark,1.922974,2.023308,2.29629,2.502928,2.600156,2.822735,3.067428,3.06111,3.203752,...,3.535165,3.726408,3.594966,3.485343,3.724567,3.618512,3.495099,3.322982,3.470399,3.141668
75,Finland,2.196953,2.252779,2.361744,2.478912,2.678383,2.887694,3.065439,3.126426,3.342703,...,6.694238,7.047664,7.098453,6.529459,7.086904,6.957265,6.638823,6.227018,6.787212,6.358825
76,France,1.699542,1.744665,1.860525,1.982963,2.08469,2.109459,2.122623,2.247891,2.359811,...,4.224798,4.270583,4.301043,4.283718,4.191468,4.116148,4.113622,3.916691,4.030517,3.842635
82,Germany,1.952534,1.994321,2.124826,2.281288,2.341159,2.37835,2.38771,2.388411,2.561521,...,4.104246,4.135561,4.159426,4.101618,4.135783,4.02557,4.068954,3.871738,4.003264,3.75841
85,Greece,0.289057,0.322491,0.350101,0.367811,0.433559,0.478756,0.547197,0.607702,0.652082,...,2.577529,2.643513,2.685582,2.724029,2.710933,2.699663,2.70698,2.608921,2.440492,2.348195
100,Iceland,3.082712,2.916706,3.028298,3.279602,3.306816,3.444551,3.639771,3.594362,3.924134,...,11.423796,11.305961,11.531687,11.729269,13.690215,15.522149,16.868468,16.904904,16.882499,17.98331


Ok, Western Europe, Canada, US, NZ & AUS

In [42]:
# How many countries do not have data in the last year?

missing_recent = energy.loc[energy['2011'].isnull()]
len(missing_recent)

241

###  Sanity checks

Which countries have the highest cumulative energy consumption?


In [43]:
# change index for simpler aggregating
energy = energy.set_index('country')

In [44]:
# sum across columns
energy.sum(1).sort_values(ascending = False).head(10)

country
Qatar                   596.898621
Luxembourg              490.929908
Iceland                 412.410403
United States           388.084671
Canada                  370.498718
United Arab Emirates    366.442692
Bahrain                 351.186422
Kuwait                  317.606625
Brunei                  268.355709
Finland                 263.473843
dtype: float64

#### Observations

Most of the top 10 are unsurprising: oil-producing countries, and/or cold countries. 

However, once again we see Luxembourg. Perhaps this reflects data that goes back further for those countries. 

Let's compare with the means.

In [45]:
# which countries have the highest mean energy use per capita?
# (row-wise mean over the year columns, largest first)
(energy
 .mean(axis=1)
 .sort_values(ascending=False)
 .head(10))

country
Qatar                   14.922466
Luxembourg               9.440960
United Arab Emirates     9.161067
Bahrain                  8.779661
Kuwait                   8.583963
Iceland                  7.930969
United States            7.463167
Canada                   7.124975
Brunei                   6.708893
Trinidad and Tobago      6.414252
dtype: float64

In [46]:
# which countries have the lowest mean energy use per capita?
# sort_values puts NaN last, so rows with no data at all surface at the tail.
(energy
 .mean(axis=1)
 .sort_values(ascending=False)
 .tail(10))

country
Antarctica                NaN
Virgin Islands, British   NaN
Hawaiian Trade Zone       NaN
U.S. Pacific Islands      NaN
Wake Island               NaN
Bonaire                   NaN
Sark                      NaN
Chinese Taipei            NaN
Saint Eustatius           NaN
Saba                      NaN
dtype: float64

In [47]:
#How many countries are all null?
drop = list(energy.loc[energy.mean(1).isnull()].index)

In [48]:
# drop null rows
energy = energy.drop(drop)

In [49]:
len(energy)

169

#### Observations

Now the top 10 is nearly all oil-rich countries. Nonetheless, we still have Luxembourg in the top 10, along with Trinidad & Tobago.

Also...the bottom results show that we have many rows without any values.

> **Tip**: You should _not_ perform too many operations in each cell. Create cells freely to explore your data. One option that you can take with this project is to do a lot of explorations in an initial notebook. These don't have to be organized, but make sure you use enough comments to understand the purpose of each code cell. Then, after you're done with your analysis, create a duplicate notebook where you will trim the excess and organize your steps so that you have a flowing, cohesive report.

> **Tip**: Make sure that you keep your reader informed on the steps that you are taking in your investigation. Follow every code cell, or every set of related code cells, with a markdown cell to describe to the reader what was found in the preceding cell(s). Try to make it so that the reader can then understand what they will be seeing in the following cell(s).

## Data Cleaning

### Remove empty rows

In [50]:
# how many rows have no values at all?
# A row sum cannot answer this: NaNs are skipped, so an all-NaN row sums to 0
# (not NaN) on current pandas. Test the null mask directly instead.
len(energy.loc[energy.isnull().all(axis=1)])

0

## Add region & sub-region columns

Data from https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/blob/master/all/all.csv

Energy-use patterns, I suspect, vary a lot by region and not just by country. 

For my analysis, I'd like to primarily focus on Europe since that's where I'm currently living. 

In [51]:
energy.reset_index(inplace = True)

In [52]:
energy['country']

0                     Albania
1                     Algeria
2                      Angola
3         Antigua and Barbuda
4                   Argentina
5                     Armenia
6                   Australia
7                     Austria
8                  Azerbaijan
9                     Bahamas
10                    Bahrain
11                 Bangladesh
12                   Barbados
13                    Belarus
14                    Belgium
15                     Belize
16                      Benin
17                     Bhutan
18                    Bolivia
19     Bosnia and Herzegovina
20                   Botswana
21                     Brazil
22                     Brunei
23                   Bulgaria
24                   Cambodia
25                   Cameroon
26                     Canada
27                 Cape Verde
28                      Chile
29                      China
                ...          
139                     Spain
140                 Sri Lanka
141       

### Create regions df

In [53]:
# based on cleaned json file
regions = pd.read_csv('data/regions_cleaned.csv')
regions.head()

Unnamed: 0,country,region,sub-region
0,Afghanistan,Asia,Southern Asia
1,Aland Islands,Europe,Northern Europe
2,Albania,Europe,Southern Europe
3,Algeria,Africa,Northern Africa
4,American Samoa,Oceania,Polynesia


In [54]:
regions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 3 columns):
country       249 non-null object
region        240 non-null object
sub-region    240 non-null object
dtypes: object(3)
memory usage: 5.9+ KB


So the `regions` df has many more countries than the `energy` df. `energy` will be the driving (left) table for the merge, which is easier to work with than the other way round.

### Clean up 'country' column

In [55]:
# transliterate accented characters in the energy df's country names to plain ASCII
import unidecode
energy['country'] = energy['country'].map(unidecode.unidecode)

In [56]:
energy = energy.reset_index()

In [57]:
from regions_clean import remove_special_char
energy['country'] = energy['country'].apply(remove_special_char)

In [58]:
list(energy['country'])

['Albania',
 'Algeria',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo Dem Rep ',
 'Congo Rep ',
 'Costa Rica',
 'Cote d Ivoire',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea Bissau',
 'Guyana',
 'Haiti',
 'Honduras',
 'Hong Kong China',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',

## Merge `energy` with `regions`

In [59]:
# left-join region / sub-region onto the energy df by country name;
# energy rows without a counterpart in `regions` keep NaN in both new columns

energy_regions = pd.merge(energy, regions, how='left', on='country')

In [60]:
# rearrange col order
cols = energy_regions[['region', 'sub-region']]

energy_regions.drop(labels=['index','region', 'sub-region'], axis=1, inplace = True)
energy_regions.insert(1, 'region', cols['region'])
energy_regions.insert(2, 'sub-region', cols['sub-region'])

### Match countries using partial string matching

When entering search queries, spellcheck puts a lot of weight on the first character, assuming that because it's so important, it's far less likely for users to misspell it. 

I am guessing that we can match up most countries by assuming the key word is the **first** word in the `country` field.

In [65]:
def add_missing_regions(merged_df, matching_key, current_country, regions_df=None):
    """Copy region / sub-region from one row of the regions table into merged_df.

    Parameters
    ----------
    merged_df : DataFrame with 'country', 'region' and 'sub-region' columns.
        Modified in place.
    matching_key : index label of the source row in the regions table.
    current_country : value of merged_df['country'] identifying the row(s) to fill.
    regions_df : optional regions table with 'region' and 'sub-region' columns.
        Defaults to the notebook-level `regions` frame, which keeps existing
        three-argument calls working.

    Returns
    -------
    The same merged_df object, after the update.
    """
    if regions_df is None:
        regions_df = regions

    reg, sub_reg = regions_df.loc[matching_key, ['region', 'sub-region']]

    # build the row mask once and reuse it for both columns
    is_current = merged_df['country'] == current_country
    merged_df.loc[is_current, 'region'] = reg
    merged_df.loc[is_current, 'sub-region'] = sub_reg
    return merged_df

In [66]:
# create df of countries that didn't have any matches in regions df

def _find_region_candidates(country, regions_df):
    """Return {regions_df index label: country name} of partial matches, or None.

    Patterns are tried in priority order and the first one that hits wins:
    the whole name as a prefix, then the first word anywhere, then the second
    word anywhere.
    """
    names = regions_df['country']
    words = country.split()
    if not words:
        # every string "starts with" the empty string, so refuse to guess
        return None

    # regex=False: names are literal text, not patterns.
    # na=False: missing names in the regions table never count as a hit.
    hits = names.str.startswith(country, na=False)
    if not hits.any():
        hits = names.str.contains(words[0], regex=False, na=False)
    if not hits.any() and len(words) > 1:
        hits = names.str.contains(words[1], regex=False, na=False)
    if not hits.any():
        return None
    return names[hits].to_dict()


def match_countries(merged_df, regions_df, confirm=None):
    """Fill in missing region data using partial country-name matching.

    For every row of merged_df whose 'region' is null, look for candidate rows
    in regions_df. When exactly one candidate is found it is shown for
    confirmation; on 'y' (or an empty answer) that candidate's region and
    sub-region are written into merged_df.

    Parameters
    ----------
    merged_df : DataFrame with 'country', 'region', 'sub-region' columns.
        Modified in place.
    regions_df : DataFrame with 'country', 'region', 'sub-region' columns.
    confirm : optional callable taking the candidate dict and returning the
        answer string. Defaults to printing the candidate and prompting on
        stdin; pass a callable to run without user interaction.

    Returns
    -------
    (matches, merged_df) where matches is a list of dicts for the countries
    still unresolved, each with keys 'merged_country', 'regions_country'
    (dict of candidates or None) and 'matched' ('n' or 'multiple').
    """
    if confirm is None:
        try:
            read_line = raw_input   # Python 2
        except NameError:
            read_line = input       # Python 3

        def confirm(candidate):
            print(candidate)
            return read_line('Is this a match? ([y] or n) ')

    no_match = merged_df.loc[merged_df['region'].isnull(), 'country']

    matches = [
        {'merged_country': country,
         'regions_country': _find_region_candidates(country, regions_df)}
        for country in no_match
    ]

    # Collect unresolved entries in a new list. Removing confirmed entries from
    # `matches` while iterating over it would skip the entry after each removal.
    unresolved = []
    for dct in matches:
        candidates = dct['regions_country']

        if candidates is None:
            dct['matched'] = 'n'
        elif len(candidates) > 1:
            dct['matched'] = 'multiple'
        else:
            # exactly one suggestion: ask for confirmation, empty answer means yes
            answer = confirm(dct) or 'y'
            if answer == 'y':
                matching_key = next(iter(candidates))
                # applied inline so the lookup uses the caller-supplied regions_df
                reg, sub_reg = regions_df.loc[matching_key, ['region', 'sub-region']]
                is_current = merged_df['country'] == dct['merged_country']
                merged_df.loc[is_current, 'region'] = reg
                merged_df.loc[is_current, 'sub-region'] = sub_reg
                continue  # resolved: not reported back
            dct['matched'] = 'n'

        unresolved.append(dct)

    return unresolved, merged_df


In [71]:
matches, energy_regions = match_countries(energy_regions, regions)

In [72]:
energy_regions.loc[energy_regions['region'].isnull()]

Unnamed: 0,country,region,sub-region,1960,1961,1962,1963,1964,1965,1966,...,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011
32,Congo Dem Rep,,,,,,,,,,...,0.339893,0.342031,0.34492,0.347751,0.350483,0.353022,0.355989,0.356968,0.360174,
33,Congo Rep,,,,,,,,,,...,0.264049,0.296907,0.296096,0.306485,0.317979,0.324156,0.328359,0.359252,0.363457,
82,North Korea,,,,,,,,,,...,0.841002,0.8545,0.868473,0.898305,0.902083,0.762212,0.835761,0.791154,0.76116,
83,South Korea,,,,,,,,,,...,4.171743,4.235713,4.335731,4.366124,4.415774,4.571104,4.636383,4.659794,5.059912,5.174671
84,Kosovo,,,,,,,,,,...,1.063366,1.12045,1.12687,1.089584,1.090722,1.123934,1.209314,1.344213,1.372143,
125,St Kitts and Nevis,,,,,,,,,,...,,,1.565622,1.606573,1.585995,1.64503,,,,
126,St Lucia,,,,,,,,,,...,,,0.738762,0.748494,0.7152,0.760375,,,,
127,St Vincent and the Grenadines,,,,,,,,,,...,,,0.607925,0.616064,0.642685,0.641849,,,,
160,United States,,,5.641735,5.612053,5.774613,5.986795,6.136961,6.307897,6.591338,...,7.843393,7.794173,7.881754,7.846805,7.697188,7.758206,7.48793,7.057277,7.164462,7.069233
165,Vietnam,,,,,,,,,,...,0.420442,0.436329,0.478497,0.503139,0.509822,0.543534,0.575456,0.621322,0.681375,


In [73]:
# manually fix the US
energy_regions.loc[energy_regions['country']== 'United States', 'region'] = 'Americas'
energy_regions.loc[energy_regions['country']== 'United States', 'sub-region'] = 'Northern America'

# verify
energy_regions.loc[energy_regions['region'].isnull()]

Unnamed: 0,country,region,sub-region,1960,1961,1962,1963,1964,1965,1966,...,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011
32,Congo Dem Rep,,,,,,,,,,...,0.339893,0.342031,0.34492,0.347751,0.350483,0.353022,0.355989,0.356968,0.360174,
33,Congo Rep,,,,,,,,,,...,0.264049,0.296907,0.296096,0.306485,0.317979,0.324156,0.328359,0.359252,0.363457,
82,North Korea,,,,,,,,,,...,0.841002,0.8545,0.868473,0.898305,0.902083,0.762212,0.835761,0.791154,0.76116,
83,South Korea,,,,,,,,,,...,4.171743,4.235713,4.335731,4.366124,4.415774,4.571104,4.636383,4.659794,5.059912,5.174671
84,Kosovo,,,,,,,,,,...,1.063366,1.12045,1.12687,1.089584,1.090722,1.123934,1.209314,1.344213,1.372143,
125,St Kitts and Nevis,,,,,,,,,,...,,,1.565622,1.606573,1.585995,1.64503,,,,
126,St Lucia,,,,,,,,,,...,,,0.738762,0.748494,0.7152,0.760375,,,,
127,St Vincent and the Grenadines,,,,,,,,,,...,,,0.607925,0.616064,0.642685,0.641849,,,,
165,Vietnam,,,,,,,,,,...,0.420442,0.436329,0.478497,0.503139,0.509822,0.543534,0.575456,0.621322,0.681375,


In [74]:
# save to csv
energy_regions.to_csv('data/cleaned/energy_regions_matched.csv', index = False)

In [None]:
'''

### for multiple matches (stored as dict), select correct country
        #elif isinstance(dct['regions_country'], dict):
        elif  len(dct['regions_country']) > 1:
            #dct['matched'] = 'n'
            pp.pprint(dct)
            matching_key = raw_input('Enter the key number of the matching country. If none match, type \'n\'.')
            if matching_key == 'n':
                dct['regions_country'] = None
                dct['matched'] = matching_key

            elif matching_key != 'n':
                while int(matching_key) not in dct['regions_country'].keys():
                    print "Error: you entered "+ matching_key
                    print "This is not a valid key number. Please try again."
                    print "Valid keys are:"
                    print dct['regions_country'].keys()
                    matching_key = raw_input('Enter the key number of the matching country. If none match, type \'n\'.')


                print "You selected " + dct['regions_country'][int(matching_key)] + "."
                dct['regions_country'] = dct['regions_country'][int(matching_key)]
                dct['matched'] = 'y'

                # add region & sub-region values from regions df
                dct['region'] = regions.region.at[int(matching_key)]
                dct['sub-region'] = regions['sub-region'].at[int(matching_key)]

            else:
                pass
'''

In [None]:
matches_df = pd.DataFrame(matches)

### Less straightforward matching

In [None]:
# what's left?
matches_df.loc[matches_df.matched == 'n']

Since none of these countries are the focus of my analysis, I'm going to ignore them for now.

In [None]:
# add missing info for Netherland Antilles

def fill_empty_cols(regions_ix, matches_ix):
    """Fill one row of the matches table from one row of the `regions` table.

    Looks up the country name at label `regions_ix` in the notebook-level
    `regions` frame, turns it into a dict of column values with
    create_val_dict, and hands that dict plus `matches_ix` to iloc_fillna.
    Returns whatever iloc_fillna returns.

    NOTE(review): create_val_dict and iloc_fillna are not defined in the
    visible part of this notebook; presumably they come from a sibling
    notebook or module. Confirm before calling.
    """
    source_country = regions.loc[regions_ix].country
    return iloc_fillna(matches_ix, create_val_dict(source_country))


In [None]:
# verify
matches_df.loc[matches_df.matched=='n']

### Merge missing region data into main df

In [None]:
matches_df.head()

In [None]:
energy_regions.head()

In [None]:
# index matches_df by country name so it lines up with energy_regions once that
# frame is indexed by 'country'; match_countries() stores the name under the
# 'merged_country' key, so that is the column to index on
matches_df = matches_df.set_index('merged_country')

In [None]:
matches_df.loc[matches_df['region'].isnull()]

In [None]:
energy_regions.set_index('country', inplace = True)

In [None]:
# Fill missing region / sub-region values from matches_df. fillna with a Series
# aligns on index labels, so both frames need the same index for this to work.
# NOTE(review): `co2_regions` is not assigned anywhere in the visible notebook
# (the merged frame here is `energy_regions`), and the dicts built by
# match_countries() carry no 'region' / 'sub-region' keys. This cell looks
# carried over from the CO2 notebook; confirm before running.
co2_regions['region'] = co2_regions['region'].fillna(matches_df['region'])
co2_regions['sub-region'] = co2_regions['sub-region'].fillna(matches_df['sub-region'])

In [None]:
# verify the coutries with previously missing region data are now complete
co2_regions.loc[matches_df.index.values][['region', 'sub-region']]


In [None]:
# verify no more missing values
co2_regions.loc[co2_regions.region.isnull()]

In [None]:
# save as pkl file
# NOTE(review): this pickles `co2_regions`, which is not assigned in the visible
# notebook, to the CO2 file name. Presumably the intent is to persist
# `energy_regions` under its own path; confirm so the CO2 output is not overwritten.
co2_regions.to_pickle('data/co2_regions.pkl')

## Visualize

In [None]:
energy_eur = energy_regions.loc[energy_regions['region'] == 'Europe']

In [None]:
# reshape df for plotting

def prep_for_plotly(df):
    """Reshape a wide country-by-year frame into long form for plotting.

    Parameters
    ----------
    df : DataFrame holding 'region', 'sub-region' and 'country' (as columns or
        as index levels) plus one column per year.

    Returns
    -------
    DataFrame with columns ['year', 'region', 'sub-region', 'country', 'value'],
    one row per (country, year), ordered by region / sub-region / country and
    then by the original column order. Missing values are kept as NaN.
    """
    keys = ['region', 'sub-region', 'country']

    # The notebook sometimes indexes the frame by country before calling this;
    # move any key held in the index back into the columns first.
    if any(key in df.index.names for key in keys):
        df = df.reset_index()

    wide = df.set_index(keys).sort_index(level=0)

    # Name the column axis so the stacked level is called 'year' directly,
    # instead of relying on the auto-generated 'level_3' label.
    stacked = wide.rename_axis('year', axis='columns').T.unstack()

    long_df = stacked.rename('value').reset_index()
    return long_df[['year'] + keys + ['value']]

In [None]:
df = prep_for_plotly(energy_eur)

In [None]:
# make interactive
import plotly as py
import cufflinks as cf
import plotly.tools as tls
tls.embed('https://plot.ly/~cufflinks/8')

In [None]:
# preview the pivoted (year x country) frame for the first sub-region only
for sub in df['sub-region']:
    # keyword arguments: recent pandas no longer accepts pivot's arguments positionally
    sub_df = df.loc[df['sub-region'] == sub].pivot(index='year', columns='country', values='value')
    print(sub_df)
    break

In [None]:
# one interactive line chart per sub-region: years on the x-axis, one line per country
for sub in df['sub-region'].dropna().unique():
    # keyword arguments: recent pandas no longer accepts pivot's arguments positionally
    sub_df = df.loc[df['sub-region'] == sub].pivot(index='year', columns='country', values='value')
    # the source column is "Energy use, per capita (toe)", so label the axis
    # with that unit rather than the CO2 notebook's metric tons
    fig = sub_df.iplot(asFigure=True,
                       title=sub,
                       yTitle='Energy use per person (toe)',
                       theme='ggplot')
    py.offline.iplot(fig)

# CO2 kg per USD of GDP

In [None]:
# read data
datafile = 'data/co2_kg_per_USD.csv'

co2_usd = pd.read_csv(datafile)

# how big is the dataset?
# info() prints its own report and returns None, so it is not wrapped in print
co2_usd.info()

In [None]:
co2_usd.head()

In [None]:
# give better name to first columns
co2_usd.rename(columns = {'CO2 emissions (kg per 2005 PPP $ of GDP)' : 'country'}, inplace = True)

In [None]:
co2_usd['country']