# Load the Data

In [1]:
# basic package
import csv
import glob
import pandas as pd
import matplotlib as plt
from tqdm import tqdm
import numpy as np
import random
from operator import itemgetter
import matplotlib.pyplot as plt
import warnings


# ml related 
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
political = pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/political_selected.csv")
undernourish = pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/undernourish_selected.csv")

meaning_map = pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/final_new_meaning_A.csv")

# Preparatory Code

In [4]:
def map_code_to_meaning(mapping, code_no):
    return (mapping.loc[mapping['code']==code_no]).iloc[0].var_name

# print the total percetnage of missing in each dataset
def total_percentage_missing(df):
    return(np.count_nonzero(df.isna()) / df.size)

# drop the top N rows with most NAs
def drop_top_N_rows_with_most_NAs(df, N=300):
    if N/len(df)> 0.2:
      warnings.warn("Based on your speficied N, you are dropping more then 20% of the data")

    print("shape before drop", df.shape)
    dict_nas = {}
    for i in range(len(df)):
        percentage = total_percentage_missing(df.iloc[i])
        dict_nas[i] = percentage
    res = dict(sorted(dict_nas.items(), key = itemgetter(1), reverse = True)[:N])
    # print("here")
    top_NAs_rows = list(res.keys())
    # print(top_NAs_rows)
    df.drop(top_NAs_rows, axis=0, inplace=True)
    print("shape after drop ", df.shape)
    print("Missing data percentage ", total_percentage_missing(df) )
    return df

# split into two dataset by year (default=2017)
# fist one include that year, second one is year after that
def split_by_year(df, split_at = 2017):
    res1 = df.loc[df['Year']<= split_at]
    res2 = df.loc[df['Year']> split_at]
    return res1, res2

def print_all_coeff(list_coef, feature_name):
    sort_index = reversed(np.argsort(list_coef))
    list_of_lists = []

    # print(sort_index)
    for d in sort_index:
      for i in d:
          # print("[i]", feature_name[i])
          # print(i)
          # print(i)
          temp = int(feature_name[i])
          list_coef 
          # print("here", len(list_coef[0]))
          if list_coef[0][i] !=0.0:
              list_of_lists.append([round(list_coef[0][i],10), feature_name[i], map_code_to_meaning(meaning_map, temp)])
    return pd.DataFrame(list_of_lists, columns =['non_zero_coefficient', 'code', 'variable_name'])



# Ridge Pipeline Code

In [7]:
  # pipeline on returning the coefficient of lasso regression
# also returns the score of the regressions
def ridge_pipeline(df, target_name = 'political', split_year = 2017, ridge_alpha = 1.0):

    if target_name not in df.columns:
        raise ValueError("The input dataframe doesn't have the column: political")
    
    if 'Continent' in df.columns:
      df = df.drop(columns =['Continent'])

    # default split at 2017
    political_pre_2017, political_post_2017 = split_by_year(df, split_at = split_year)
    
    # Note, the variable names here is only names, y_politcal can be any dataframe
    # doesn't have to be political 
    y_political = political_pre_2017.pop(target_name)
    X_political = political_pre_2017.drop(columns = ['Year', 'Area Code'])

    y_political_test = political_post_2017.pop(target_name)
    X_political_test = political_post_2017.drop(columns = ['Year', 'Area Code'])

    
    feature_names = X_political_test.columns

    # scale the X
    scaler = StandardScaler()
    political_scaler_X = scaler.fit(X_political)
    X_political_scaled = political_scaler_X.transform(X_political)
    X_political_test_scaled = political_scaler_X.transform(X_political_test)

    # scale the y
    y_political = y_political.values.reshape(-1,1)
    y_political_test = y_political_test.values.reshape(-1,1)
    political_scaler_y = scaler.fit(y_political)
    y_political_scaled = political_scaler_y.transform(y_political)
    y_political_test_scaled = political_scaler_y.transform(y_political_test)
    
    # print shapes
    print("Training Shape:", X_political_scaled.shape)
    print("Testing Shape", X_political_test_scaled.shape)
    
    # Run LASSO
    reg = Ridge(alpha=ridge_alpha).fit(X_political_scaled, y_political_scaled)

    # evaluation
    print("score on training dataset", reg.score(X_political_scaled, y_political_scaled) )
    print("score on testing dataset", reg.score(X_political_test_scaled, y_political_test_scaled))
    y_train_pred = reg.predict(X_political_scaled) # predicting for training
    y_pred = reg.predict(X_political_test_scaled)  # predicting for testing
    print("R squared score on training", r2_score(y_political_scaled, y_train_pred))
    print("R squared score on testing", r2_score(y_political_test_scaled, y_pred))
    
    print("Mean Absolute Error on training", mean_absolute_error(y_political_scaled, y_train_pred))
    print("Mean Absolute Error on testing", mean_absolute_error(y_political_test_scaled, y_pred))
    res_df = print_all_coeff(reg.coef_, feature_names)
    return res_df

# Process Data Before Feeding in Pipeline: check missing data and fill in NAs

In [6]:
# this chunck can only be run once
political = drop_top_N_rows_with_most_NAs(df= political, N= 300)
undernourish = drop_top_N_rows_with_most_NAs(df= undernourish, N= 300)
# drinking = drop_top_N_rows_with_most_NAs(df= drinking, N= 500)



# fill NAs
political = political.fillna(0)
undernourish = undernourish.fillna(0)
# drinking = drinking.fillna(0)

shape before drop (3705, 1002)
shape after drop  (3405, 1002)
Missing data percentage  0.003907896395168547
shape before drop (3933, 1002)
shape after drop  (3633, 1002)
Missing data percentage  0.02336230374373741


# Ridge on Entire Political Dataset

In [8]:
ridge_pipeline(political, target_name = 'political', ridge_alpha=5e7)

Training Shape: (3041, 999)
Testing Shape (364, 999)
score on training dataset 0.0023974621123628026
score on testing dataset -6.742856438677597
R squared score on training 0.0023974621123628026
R squared score on testing -6.742856438677597
Mean Absolute Error on training 0.8170191163924648
Mean Absolute Error on testing 0.9187033070150712


Unnamed: 0,non_zero_coefficient,code,variable_name
0,-0.000032,220166187,"Value Added (Agriculture, Forestry and Fishing..."
1,-0.000032,220166186,"Value Added (Agriculture, Forestry and Fishing..."
2,-0.000031,220166103,"Value Added (Agriculture, Forestry and Fishing..."
3,-0.000031,220166157,"Value Added (Agriculture, Forestry and Fishing..."
4,-0.000024,2203061393,"Gross Fixed Capital Formation (Agriculture, Fo..."
...,...,...,...
969,0.000022,220306192,"Gross Fixed Capital Formation (Agriculture, Fo..."
970,0.000024,220306135,"Gross Fixed Capital Formation (Agriculture, Fo..."
971,0.000032,220086119,Gross Domestic Product Value US$ per capita
972,0.000032,220116119,Gross National Income Value US$ per capita


In [9]:
ridge_pipeline(undernourish, target_name = 'undernourish',  ridge_alpha=1e4)

Training Shape: (3055, 999)
Testing Shape (578, 999)
score on training dataset 0.5237278828367522
score on testing dataset 0.28826008090278166
R squared score on training 0.5237278828367522
R squared score on testing 0.28826008090278166
Mean Absolute Error on training 0.47777568565179196
Mean Absolute Error on testing 0.48803293758325406


Unnamed: 0,non_zero_coefficient,code,variable_name
0,-0.038479,17355419,Vegetables Primary Yield
1,-0.034616,8825420,"Milk, whole fresh cow Yield"
2,-0.033970,17205419,"Roots and Tubers, Total Yield"
3,-0.029152,17835410,Eggs Primary Yield
4,-0.028695,10625410,"Eggs, hen, in shell Yield"
...,...,...,...
985,0.024281,1882494,Agricultural Products Export Unit/Value Index ...
986,0.024774,1848494,Other food Export Unit/Value Index (2014-2016 ...
987,0.027687,220306139,"Gross Fixed Capital Formation (Agriculture, Fo..."
988,0.027725,2203061391,"Gross Fixed Capital Formation (Agriculture, Fo..."


# By Political Continent


## Asia

In [None]:
political_Asia = pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/Political_continent/political_Asia.csv")

In [None]:
political_Asia.shape

In [None]:
# do not drop top NAs because the dataset already small 
print(total_percentage_missing(political_Asia))

# fill NAs
political_Asia = political_Asia.fillna(0)

0.018762933168458014


In [None]:
lasso_pipeline(political_Asia, target_name = 'political')

Training Shape: (782, 999)
Testing Shape (138, 999)
score on training dataset 0.5954796547266706
score on testing dataset 0.6085946584993815
R squared score on training 0.5954796547266706
R squared score on testing 0.6085946584993815
Mean Absolute Error on training 0.5247211845823034
Mean Absolute Error on testing 0.5230428070350854


Unnamed: 0,non_zero_coefficient,code,variable_name
0,0.183134,68217266,Energy Emissions Share (N2O)
1,0.065124,220086185,"Gross Domestic Product Value US$ per capita, 2..."
2,0.055278,17527213,Major livestock types Livestock units per agri...
3,0.05497,220156157,Gross Fixed Capital Formation Share of GDP Loc...
4,0.036349,68197266,Other Emissions Share (N2O)
5,0.022375,68257264,All sectors with LULUCF Emissions Share (CO2)
6,0.022199,16335616,"Sawnwood, non-coniferous all Import Quantity"
7,0.017084,6825726313,All sectors with LULUCF Emissions Share (CO2eq...
8,0.016251,66467209,Forest land Share in Land area
9,0.006993,16335622,"Sawnwood, non-coniferous all Import Value"


## Africa

In [None]:
political_Africa= pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/Political_continent/political_Africa.csv")

In [None]:
political_Africa.shape

(1016, 1003)

In [None]:
# do not drop top NAs because the dataset already small 
print(total_percentage_missing(political_Africa))

# fill NAs
political_Africa = political_Africa.fillna(0)

0.01907957230670194


In [None]:
lasso_pipeline(political_Africa, target_name = 'political')

Training Shape: (908, 999)
Testing Shape (162, 999)
score on training dataset 0.5158326942687419
score on testing dataset 0.4858478192028174
R squared score on training 0.5158326942687419
R squared score on testing 0.4858478192028174
Mean Absolute Error on training 0.5674172366603345
Mean Absolute Error on testing 0.5066178683817737


Unnamed: 0,non_zero_coefficient,code,variable_name
0,0.10966,220116119,Gross National Income Value US$ per capita
1,0.094332,220156157,Gross Fixed Capital Formation Share of GDP Loc...
2,0.08883,68257264,All sectors with LULUCF Emissions Share (CO2)
3,0.05391,65157266,Food Waste Disposal Emissions Share (N2O)
4,0.046365,67937245,Closed and open shrubland Biomass burned (dry ...
5,0.014838,10577213,Chickens Livestock units per agricultural land...
6,0.010531,220086185,"Gross Domestic Product Value US$ per capita, 2..."
7,0.008764,1847464,Non-food Import Unit/Value Index (2014-2016 = ...
8,0.005504,69917266,Waste - food systems Emissions Share (N2O)
9,0.001988,6825726313,All sectors with LULUCF Emissions Share (CO2eq...


## Europe

In [None]:
political_Europe= pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/Political_continent/political_Europe.csv")

In [None]:
political_Europe.head(3)

Unnamed: 0,Year,Area Code,Continent,political,67907246,69937225,69705008,67897246,67897245,67897230,...,4725610,66555110,4635312,18675616,89864,89865,898464,898465,18675622,1719465
0,2000,1,Europe,-0.68,162.5807,0.0,32.889,141.3584,466.4827,0.0001,...,516.0,835.0,4900.0,0.0,6920.0,3952.0,93.0,54.0,0.0,8.0
1,2000,3,Europe,-0.54,4658.3563,0.0,15.4679,3824.9803,12622.4349,0.0027,...,6333.0,445.0,617.0,0.0,482.0,1635.0,57.0,187.0,52.0,120.0
2,2000,6,Europe,1.17,0.0,0.0,0.2325,0.0,0.0,0.0,...,,22.0,,80.0,,,,,23.0,


In [None]:
political_Europe.shape

(804, 1003)

In [None]:
# do not drop top NAs because the dataset already small 
print(total_percentage_missing(political_Europe))

# fill NAs
political_Europe = political_Europe.fillna(0)

0.033582089552238806


In [None]:
lasso_pipeline(political_Europe, target_name = 'political')

Training Shape: (718, 999)
Testing Shape (129, 999)
score on training dataset 0.6912999215572092
score on testing dataset 0.6927947324220374
R squared score on training 0.6912999215572092
R squared score on testing 0.6927947324220374
Mean Absolute Error on training 0.41655314486461364
Mean Absolute Error on testing 0.4138874600747007


Unnamed: 0,non_zero_coefficient,code,variable_name
0,0.2951035,220086185,"Gross Domestic Product Value US$ per capita, 2..."
1,0.134402,69795007,Terrestrial barren land Area from MODIS
2,0.02738184,10625410,"Eggs, hen, in shell Yield"
3,0.01073543,220156186,Gross Fixed Capital Formation Share of GDP Loc...
4,0.0,220156187,Gross Fixed Capital Formation Share of GDP US$...
5,-0.0,220166187,"Value Added (Agriculture, Forestry and Fishing..."
6,-0.0,220166157,"Value Added (Agriculture, Forestry and Fishing..."
7,-6.58e-08,6987723114,Solid food waste Emissions (CO2eq) (AR4)
8,-0.0005400878,65057264,Food Household Consumption Emissions Share (CO2)
9,-0.009252312,22465,Pastry Import Quantity Index (2014-2016 = 100)


##North America

In [None]:
political_NorthAmerica= pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/Political_continent/political_NorthAmerica.csv")

In [None]:
political_NorthAmerica.head(3)

Unnamed: 0,Year,Area Code,Continent,political,67907246,69937225,69705008,67897246,67897245,67897230,...,4725610,66555110,4635312,18675616,89864,89865,898464,898465,18675622,1719465
0,2000,8,North America,0.71,0.0,0.0,0.6975,0.0,0.0,0.0,...,90.0,4.0,40.0,69.0,107.0,80.0,22.0,17.0,6.0,63.0
1,2000,12,North America,1.28,11441.5951,0.0,7.8066,8938.5508,50838.3192,0.0107,...,1932.0,2.0,2000.0,900.0,150.0,601.0,81.0,284.0,254.0,414.0
2,2000,14,North America,1.15,0.0,0.0,3.5051,0.0,0.0,0.0,...,1060.0,2.0,310.0,1600.0,1859.0,3792.0,50.0,105.0,612.0,103.0


In [None]:
# do not drop top NAs because the dataset already small 
print(total_percentage_missing(political_NorthAmerica))

# fill NAs
political_NorthAmerica = political_NorthAmerica.fillna(0)

0.028166056155259276


In [None]:
lasso_pipeline(political_NorthAmerica, target_name = 'political')

Training Shape: (403, 999)
Testing Shape (72, 999)
score on training dataset 0.7273696899428135
score on testing dataset 0.7687525012674276
R squared score on training 0.7273696899428135
R squared score on testing 0.7687525012674276
Mean Absolute Error on training 0.42650202280633304
Mean Absolute Error on testing 0.3928829493208393


Unnamed: 0,non_zero_coefficient,code,variable_name
0,0.07181,220086185,"Gross Domestic Product Value US$ per capita, 2..."
1,0.062255,69805008,Permanent snow and glaciers Area from CCI_LC
2,0.051066,65057265,Food Household Consumption Emissions Share (CH4)
3,0.034537,6505726313,Food Household Consumption Emissions Share (CO...
4,0.026232,220316183,"Consumption of Fixed Capital (Agriculture, For..."
5,0.00424,18725922,Sawnwood Export Value
6,0.00353,18675616,"Industrial roundwood, non-coniferous Import Qu..."
7,-0.0,220166157,"Value Added (Agriculture, Forestry and Fishing..."
8,-0.0,65157264,Food Waste Disposal Emissions Share (CO2)
9,-0.0,220756103,Value Added (Total Manufacturing) Share of GDP...


## South America

In [None]:
political_SouthAmerica= pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/Political_continent/political_SouthAmerica.csv")

In [None]:
political_SouthAmerica.head(3)

Unnamed: 0,Year,Area Code,Continent,political,67907246,69937225,69705008,67897246,67897245,67897230,...,4725610,66555110,4635312,18675616,89864,89865,898464,898465,18675622,1719465
0,2000,9,South America,0.1,421792.1,0.0,356.5994,295232.0537,1631116.188,0.3425,...,11451.0,99870.0,53258.0,4000.0,272.0,944.0,45.0,155.0,800.0,110.0
1,2000,19,South America,-0.24,1104915.0,0.0,61.6915,317945.1242,1907280.682,0.4005,...,171.0,33831.0,4888.0,1200.0,3676.0,17791.0,59.0,300.0,33.0,4.0
2,2000,21,South America,0.19,4404393.0,0.0,1240.0209,933359.9312,5557032.985,1.167,...,6407.0,173453.5,193000.0,17240.0,194912.0,412029.0,58.0,135.0,1237.0,34.0


In [None]:
# do not drop top NAs because the dataset already small 
print(total_percentage_missing(political_SouthAmerica))

# fill NAs
political_SouthAmerica = political_SouthAmerica.fillna(0)

0.03315963042047945


In [None]:
lasso_pipeline(political_SouthAmerica, target_name = 'political')

Training Shape: (221, 999)
Testing Shape (39, 999)
score on training dataset 0.7515067863270835
score on testing dataset 0.6058596846412243
R squared score on training 0.7515067863270835
R squared score on testing 0.6058596846412243
Mean Absolute Error on training 0.38465874631348485
Mean Absolute Error on testing 0.4278209264126954


Unnamed: 0,non_zero_coefficient,code,variable_name
0,0.156366,220306135,"Gross Fixed Capital Formation (Agriculture, Fo..."
1,0.098586,1094432,"Meat indigenous, chicken Gross Production Inde..."
2,0.07812,17385419,Fruit Primary Yield
3,0.072218,6675610,Tea Import Quantity
4,0.066621,4725610,"Vegetables, preserved nes Import Quantity"
5,0.043961,220306192,"Gross Fixed Capital Formation (Agriculture, Fo..."
6,0.03929,1739432,Vegetables and Fruit Primary Gross Production ...
7,0.017148,1711726313,IPCC Agriculture Emissions Share (CO2eq) (AR5)
8,0.000247,2044432,Livestock Gross Production Index Number (2014-...
9,0.0,220306195,"Gross Fixed Capital Formation (Agriculture, Fo..."


## Oceania 

In [None]:
political_Oceania= pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/Political_continent/political_Oceania.csv")

In [None]:
political_Oceania.head(3)

Unnamed: 0,Year,Area Code,Continent,political,67907246,69937225,69705008,67897246,67897245,67897230,...,4725610,66555110,4635312,18675616,89864,89865,898464,898465,18675622,1719465
0,2000,10,Oceania,1.33,17674080.0,0.0,679.4146,5262833.997,30920120.0,6.4932,...,29405.0,431700.0,2545.0,2000.0,33986.0,33524.0,38.0,38.0,520.0,57.0
1,2000,25,Oceania,-0.74,0.0,0.0,0.7964,0.0,0.0,0.0,...,10.0,7.0,350.0,,691.0,1369.0,54.0,106.0,,20.0
2,2000,66,Oceania,0.3,13.64,0.0,1.7729,5.456,32.736,0.0,...,327.0,175.0,877.0,,8180.0,10401.0,34.0,41.0,,84.0


In [None]:
# do not drop top NAs because the dataset already small 
print(total_percentage_missing(political_Oceania))

# fill NAs
political_Oceania = political_Oceania.fillna(0)

0.11825238570004273


In [None]:
lasso_pipeline(political_Oceania, target_name = 'political')

Training Shape: (224, 999)
Testing Shape (42, 999)
score on training dataset 0.7392621272935311
score on testing dataset 0.6809267530877647
R squared score on training 0.7392621272935311
R squared score on testing 0.6809267530877647
Mean Absolute Error on training 0.39083227787443925
Mean Absolute Error on testing 0.40246683928712734


Unnamed: 0,non_zero_coefficient,code,variable_name
0,0.170949,65057264,Food Household Consumption Emissions Share (CO2)
1,0.149905,66107209,Agricultural land Share in Land area
2,0.021337,69967265,Farm-gate emissions Emissions Share (CH4)
3,0.010838,1719465,Roots and Tubers Import Quantity Index (2014-2...
4,0.010295,96072340,"Cattle, dairy Direct emissions (N2O) (Manure o..."
5,0.007305,96072366,"Cattle, dairy Indirect emissions (N2O) (Manure..."
6,0.000339,960723611,"Cattle, dairy Indirect emissions (N2O that vol..."
7,0.000278,96072341,"Cattle, dairy Direct emissions (N2O) (Manure a..."
8,0.000224,96072361,"Cattle, dairy Indirect emissions (N2O) (Manure..."
9,4.7e-05,96072360,"Cattle, dairy Indirect emissions (N2O) (Manure..."
