In [3]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns
import math

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

%store -r years

%matplotlib inline

In [4]:
# Units are annual emissions in tonnes (in units of 1000 tonnes)
emissions_df = pd.read_csv('Data/emissions_df.csv')
emissions_df.head()

Unnamed: 0,Area Abbreviation,Area,Item,Element,Enviro Key,Total Emissions per Tonne,Y1961,Y1962,Y1963,Y1964,...,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,AFG,Afghanistan,Wheat and products,Food,Wheat & Rye (Bread),1400.0,2699200.0,2665600.0,2332400.0,2730000.0,...,4548600.0,4880400.0,5185600.0,5829600.0,5952800.0,6353200.0,6447000.0,6595400.0,6734000.0,6853000.0
1,AFG,Afghanistan,Rice (Milled Equivalent),Food,Rice,4000.0,732000.0,732000.0,728000.0,880000.0,...,1676000.0,1780000.0,2184000.0,1820000.0,1960000.0,1660000.0,1768000.0,1904000.0,1700000.0,1688000.0
2,AFG,Afghanistan,Barley and products,Feed,Barley (Beer),1100.0,83600.0,83600.0,83600.0,83600.0,...,63800.0,259600.0,288200.0,289300.0,253000.0,416900.0,346500.0,223300.0,403700.0,396000.0
3,AFG,Afghanistan,Barley and products,Food,Barley (Beer),1100.0,260700.0,260700.0,260700.0,261800.0,...,203500.0,47300.0,48400.0,52800.0,68200.0,60500.0,66000.0,79200.0,85800.0,97900.0
4,AFG,Afghanistan,Maize and products,Feed,Maize (Meal),1100.0,231000.0,231000.0,235400.0,237600.0,...,132000.0,228800.0,256300.0,273900.0,271700.0,214500.0,195800.0,210100.0,220000.0,220000.0


In [5]:
# Units are total amount of product that year (in units of 1 tonnes)
annual_product_sum_df = pd.read_csv('Data/yearly_total_product_per_country.csv')
annual_product_sum_df.head()

Unnamed: 0,Area Abbreviation,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,Y1968,Y1969,...,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,AFG,9481000.0,9414000.0,9194000.0,10170000.0,10473000.0,10169000.0,11289000.0,11508000.0,11815000.0,...,16542000.0,17658000.0,18317000.0,19248000.0,19381000.0,20661000.0,21030000.0,21100000.0,22706000,23007000
1,ALB,1706000.0,1749000.0,1767000.0,1889000.0,1884000.0,1995000.0,2046000.0,2169000.0,2230000.0,...,6637000.0,6719000.0,6911000.0,6744000.0,7168000.0,7316000.0,7907000.0,8114000.0,8221000,8271000
2,DZA,7488000.0,7235000.0,6861000.0,7255000.0,7509000.0,7536000.0,7986000.0,8839000.0,9003000.0,...,48619000.0,49562000.0,51067000.0,49933000.0,50916000.0,57505000.0,60071000.0,65852000.0,69365000,72161000
3,AGO,4834000.0,4775000.0,5240000.0,5286000.0,5527000.0,5677000.0,5833000.0,5685000.0,6219000.0,...,25541000.0,26696000.0,28247000.0,29877000.0,32053000.0,36985000.0,38400000.0,40573000.0,38064000,48639000
4,ATG,92000.0,94000.0,105000.0,95000.0,84000.0,73000.0,64000.0,59000.0,68000.0,...,92000.0,115000.0,110000.0,122000.0,115000.0,114000.0,115000.0,118000.0,113000,119000


# SK Learn

In [6]:
area = emissions_df['Area'].unique()
area_array = []
num = 1
for a in area:
    this_arr = [a, num]
    area_array.append(this_arr)
    num += 1  

In [7]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit_transform(area_array)


<174x348 sparse matrix of type '<class 'numpy.float64'>'
	with 348 stored elements in Compressed Sparse Row format>

In [8]:
enc.categories_

[array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
        'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
        'Austria', 'Azerbaijan', 'Bahamas', 'Bangladesh', 'Barbados',
        'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda',
        'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina',
        'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria',
        'Burkina Faso', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
        'Central African Republic', 'Chad', 'Chile',
        'China, Hong Kong SAR', 'China, Macao SAR',
        'China, Taiwan Province of', 'China, mainland', 'Colombia',
        'Congo', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czechia',
        "Côte d'Ivoire", "Democratic People's Republic of Korea",
        'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador',
        'Egypt', 'El Salvador', 'Estonia', 'Ethiopia', 'Fiji', 'Finland',
        'France', 'French Polynesia', 'Gabon', 'Gambia', 'Georgia',
        'Germa

In [24]:
X = emissions_df.drop(columns=['Area Abbreviation', 'Item', 'Element', 'Enviro Key'])
y = emissions_df['Area']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
regressor = LinearRegression()  
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])  
coeff_df

Unnamed: 0,Coefficient
Total Emissions per Tonne,-0.922467
Y1961,0.270063
Y1962,-0.528816
Y1963,-0.162664
Y1964,0.158573
Y1965,0.419716
Y1966,-0.269215
Y1967,-0.037609
Y1968,0.223757
Y1969,-0.251877


# Keras

In [1]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline


In [25]:
enc = LabelEncoder()
enc.fit(y)
encoded_y = enc.transform(y)
dummy_y = np_utils.to_categorical(encoded_y)

In [34]:
print(len(X.columns))
print(len(y.unique()))

55
174


In [49]:
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(110, input_dim=55, activation='relu'))
    model.add(Dense(174, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [50]:
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)

In [51]:
kfold = KFold(n_splits=10, shuffle=True)

In [52]:
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: nan% (nan%)
