# Load the Data

In [1]:
# basic package
import csv
import glob
import pandas as pd
import matplotlib as plt
from tqdm import tqdm
import numpy as np
import random
from operator import itemgetter
import matplotlib.pyplot as plt
import warnings


# ml related 
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor



# neural network 
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers
# from keras.optimizers import SGD

In [2]:
# Mount Google Drive at /content/drive; the read_csv calls later in the
# notebook use paths under drive/Shareddrives/.
# NOTE(review): google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the merged per-target datasets from the shared drive.
# The paths are relative, so they presumably resolve against /content (where
# Drive is mounted) -- confirm the working directory when running elsewhere.
political = pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/political_selected.csv")
undernourish = pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/undernourish_selected.csv")
# drinking = pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/drinking_selected.csv")

# Lookup table passed to map_code_to_meaning by print_all_coeff; it is read
# through its 'code' and 'var_name' columns.
meaning_map = pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/final_new_meaning_A.csv")

# Preparatory Code

In [4]:

def map_code_to_meaning(mapping, code_no):
    """Look up the human-readable variable name for a feature code.

    Parameters
    ----------
    mapping : pandas.DataFrame
        Table with a ``code`` column and a ``var_name`` column.
    code_no
        Value compared (with ``==``) against the ``code`` column.

    Returns
    -------
    The ``var_name`` of the first row whose ``code`` equals ``code_no``.

    Raises
    ------
    IndexError
        If no row matches (``.iloc[0]`` on an empty selection).
    """
    is_match = mapping['code'] == code_no
    first_hit = mapping.loc[is_match].iloc[0]
    return first_hit.var_name

# share of missing entries in a DataFrame or Series
# (returned as a fraction between 0 and 1, not as a 0-100 percentage)
def total_percentage_missing(df):
    """Return the fraction of entries in ``df`` that are missing (NaN/None).

    Works for anything exposing ``isna()`` and ``size`` (DataFrame or Series).
    An empty input has ``size == 0`` and therefore raises ZeroDivisionError.
    """
    missing_mask = df.isna()
    n_missing = np.count_nonzero(missing_mask)
    return n_missing / df.size

# drop the top N rows with most NAs
def drop_top_N_rows_with_most_NAs(df, N=300):
    """Drop, in place, the ``N`` rows of ``df`` that contain the most missing values.

    Rows are ranked by their number of missing cells (every row has the same
    width, so this is the same ranking as the missing fraction). Ties keep
    their original row order, i.e. the earlier row is dropped first.

    The rows are identified by *position* and then translated to index labels
    before dropping, so the function also works when ``df`` does not carry the
    default 0..n-1 index (for example after an earlier drop or a filter).
    NOTE(review): the drop itself is label-based, so duplicated index labels
    would remove every row sharing a selected label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to trim. It is modified in place.
    N : int, default 300
        Number of rows to remove. A warning is issued when this is more than
        20% of the rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the drop.
    """
    n_rows = len(df)
    # Guard the ratio so an empty frame does not raise ZeroDivisionError.
    if n_rows > 0 and N / n_rows > 0.2:
        warnings.warn("Based on your speficied N, you are dropping more then 20% of the data")

    print("shape before drop", df.shape)

    # Missing cells per row, computed in one vectorized pass.
    na_counts = df.isna().to_numpy().sum(axis=1)
    # Stable sort on the negated counts = descending order, ties by position.
    worst_positions = np.argsort(-na_counts, kind="stable")[:N]
    # Translate positions to labels: DataFrame.drop works on labels.
    worst_labels = df.index[worst_positions]
    df.drop(index=worst_labels, inplace=True)

    print("shape after drop ", df.shape)
    # Same formula as total_percentage_missing, guarded for an empty result.
    if df.size:
        remaining_missing = np.count_nonzero(df.isna()) / df.size
    else:
        remaining_missing = float("nan")
    print("Missing data percentage ", remaining_missing)
    return df

# split a dataset in two by year (default split point = 2017)
# the first part includes the split year, the second part holds the years after it
def split_by_year(df, split_at = 2017):
    """Partition ``df`` on its ``Year`` column.

    Returns a tuple ``(up_to_split, after_split)`` where the first frame holds
    the rows with ``Year <= split_at`` and the second the rows with
    ``Year > split_at``. Rows whose ``Year`` is missing satisfy neither
    comparison and end up in neither part.
    """
    year = df['Year']
    up_to_split = df.loc[year <= split_at]
    after_split = df.loc[year > split_at]
    return up_to_split, after_split

def print_all_coeff(list_coef, feature_name):
    """Tabulate the non-zero coefficients together with their feature meaning.

    Parameters
    ----------
    list_coef
        2-D array-like of coefficients. Only its first row is read for the
        coefficient values; the argsort of every row drives the iteration.
    feature_name
        Sequence of feature codes, indexable by coefficient position. Each
        code must be convertible with ``int()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``non_zero_coefficient`` (rounded to 10 decimals), ``code``
        and ``variable_name`` (looked up in the module-level ``meaning_map``).

    NOTE(review): ``reversed`` only reverses the order of the argsort *rows*;
    within a row the coefficients are visited in ascending order. Confirm
    whether a descending listing was intended.
    """
    records = []
    for row_order in reversed(np.argsort(list_coef)):
        for idx in row_order:
            # Convert first, as before, so a non-numeric code fails even when
            # its coefficient is zero.
            code_as_int = int(feature_name[idx])
            coefficient = list_coef[0][idx]
            if coefficient != 0.0:
                records.append([
                    round(coefficient, 10),
                    feature_name[idx],
                    map_code_to_meaning(meaning_map, code_as_int),
                ])
    return pd.DataFrame(records, columns =['non_zero_coefficient', 'code', 'variable_name'])



# Neural Network Pipeline Code

In [6]:
# pipeline that trains a Keras feed-forward network on the years up to the split
# and prints the mean absolute error on the training and testing splits
def neural_network_pipeline(df, target_name = 'political', split_year = 2017,  a = 0.0001 , hls = (100,)):
    """Train a feed-forward Keras regressor and print its mean absolute error.

    Rows with ``Year <= split_year`` form the training set and later rows the
    test set. Features and target are standardized with statistics fitted on
    the training set only, so the printed errors are in standardized target
    units.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target_name``, ``'Year'`` and ``'Area Code'``; every
        other column is used as a feature. A ``'Continent'`` column, if
        present, is dropped first. ``df`` itself is not modified.
    target_name : str, default 'political'
        Name of the column to predict.
    split_year : int, default 2017
        Last year included in the training set.
    a, hls
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    None
        Shapes and errors are printed, nothing is returned.

    Raises
    ------
    ValueError
        If ``target_name`` is not a column of ``df``.
    """
    if target_name not in df.columns:
        # Name the column that is actually missing, not a fixed one.
        raise ValueError("The input dataframe doesn't have the column: " + str(target_name))

    if 'Continent' in df.columns:
        df = df.drop(columns=['Continent'])

    # default split at 2017
    train_df, test_df = split_by_year(df, split_at=split_year)

    # Build X / y without mutating the slices returned by split_by_year.
    non_feature_columns = [target_name, 'Year', 'Area Code']
    X_train = train_df.drop(columns=non_feature_columns)
    X_test = test_df.drop(columns=non_feature_columns)
    y_train = train_df[target_name].values.reshape(-1, 1)
    y_test = test_df[target_name].values.reshape(-1, 1)

    # Separate scaler objects for X and y: refitting a single shared instance
    # would silently overwrite the feature statistics.
    x_scaler = StandardScaler().fit(X_train)
    X_train_scaled = x_scaler.transform(X_train)
    X_test_scaled = x_scaler.transform(X_test)

    y_scaler = StandardScaler().fit(y_train)
    y_train_scaled = y_scaler.transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)

    # print shapes
    print("Training Shape:", X_train_scaled.shape)
    print("Testing Shape", X_test_scaled.shape)

    # The input width follows the data instead of a fixed 50 features.
    n_features = X_train_scaled.shape[1]

    # training and modelling
    model = Sequential()
    model.add(Dense(80, input_dim=n_features, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(160, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(320, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(160, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(80, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')

    # Fit with the library's default training settings (none are overridden).
    model.fit(X_train_scaled, y_train_scaled)

    y_train_pred = model.predict(X_train_scaled)  # predicting for training
    y_test_pred = model.predict(X_test_scaled)    # predicting for testing

    print("Mean Absolute Error on training", mean_absolute_error(y_train_scaled, y_train_pred))
    print("Mean Absolute Error on testing", mean_absolute_error(y_test_scaled, y_test_pred))
    return

# Process Data Before Feeding in Pipeline: check missing data and fill in NAs

In [7]:
# Run this cell only once per fresh load of the data: the frames are
# reassigned to their trimmed versions, so re-running it on the already
# trimmed frames does not reproduce the result shown below.
# Each call removes the 300 rows with the most missing values.
political = drop_top_N_rows_with_most_NAs(df= political, N= 300)
undernourish = drop_top_N_rows_with_most_NAs(df= undernourish, N= 300)
# drinking = drop_top_N_rows_with_most_NAs(df= drinking, N= 500)



# fill the remaining NAs with 0
political = political.fillna(0)
undernourish = undernourish.fillna(0)
# drinking = drinking.fillna(0)

shape before drop (3705, 1002)
shape after drop  (3405, 1002)
Missing data percentage  0.003907896395168547
shape before drop (3933, 1002)
shape after drop  (3633, 1002)
Missing data percentage  0.02336230374373741


# select a subset of features based on Random Forest results

In [8]:
# Columns kept for the political model: the two identifier columns
# ('Year', 'Area Code'), the target column 'political', and 50 feature codes
# (stored as strings, matching the dataframe's column names).
select_subset = ['Year', 'Area Code',	'political', '69897230',
 '6990723112',
 '17385312',
 '68177230',
 '680072184',
 '69917273',
 '69907273',
 '69745008',
 '1664',
 '69785008',
 '6990723114',
 '6990723113',
 '69917225',
 '68217266',
 '68187225',
 '69725008',
 '65157225',
 '68217225',
 '65157273',
 '19545622',
 '17355312',
 '68187266',
 '6996726313',
 '66465110',
 '69815008',
 '6818723113',
 '6821726313',
 '69835007',
 '68007273',
 '66507208',
 '188294',
 '230826110',
 '17527213',
 '2041152',
 '17525118',
 '66505110',
 '3010512',
 '66217208',
 '68177266',
 '68187265',
 '65177225',
 '65077265',
 '69705008',
 '3010511',
 '2264',
 '3010551',
 '66705110',
 '202972380',
 '4635510',
 '220086185']

In [9]:
# Columns kept for the undernourish model: 'Year', 'Area Code', the target
# column 'undernourish', and 50 feature codes (stored as strings).
# NOTE(review): presumably chosen the same way as select_subset -- confirm.
select_subset_2 = ['Year',	'Area Code',	'undernourish','17385419',
 '622465',
 '67515110',
 '17735922',
 '67607246',
 '65057273',
 '188295',
 '1802465',
 '185610',
 '180294',
 '17735910',
 '17805420',
 '1890462',
 '173464',
 '5155610',
 '25764',
 '1664',
 '17205419',
 '65664',
 '184795',
 '622462',
 '67917245',
 '220306161',
 '4635312',
 '65157230',
 '184864',
 '67917225',
 '220306192',
 '68247273',
 '4065622',
 '67937245',
 '67917230',
 '104164',
 '16305510',
 '2203061930',
 '17805318',
 '67917246',
 '65177230',
 '6575622',
 '17175419',
 '18955910',
 '12325910',
 '6824723113',
 '65057225',
 '10165111',
 '65965',
 '184794',
 '8825420',
 '4635510',
 '17355419']

In [10]:
# Preview the first rows of the undernourish frame as it stands at this point
# in the notebook.
undernourish.head()

Unnamed: 0,Year,Area Code,undernourish,18955610,18955622,17655510,19415610,19415622,19445610,19445622,...,6516723113,5063724313,5063723113,5062723113,5059724413,6824724413,5062724313,69967274,65007273,69745007
0,2002,1,26.1,2002.0,2503.0,50256.0,8687.0,4467.0,420090.0,56918.0,...,207.196,338.7942,338.7942,18.6964,28.1149,686.5898,18.6964,0.0,426.2235,275.5144
1,2002,2,47.8,4536.0,5532.0,304321.0,8972.0,6909.0,1377836.0,136375.0,...,0.0,2151.399,2151.399,313.8126,514.5073,9175.5191,313.8126,0.0,134.4448,562.0837
2,2002,3,4.9,62216.0,23493.0,71800.0,42852.0,25578.0,470073.0,56694.0,...,0.0,394.7005,394.7005,235.2513,253.88195,2856.9771,235.2513,0.0,582.999,1392.113
3,2002,4,8.0,16259.0,8490.0,549279.0,21083.0,12763.0,8610899.0,1284578.0,...,0.0,2665.4277,2665.4277,113.4703,217.9621,5218.6901,113.4703,0.0,11602.7719,585.2712
4,2002,7,67.5,303632.0,201066.0,160698.0,191346.0,60324.0,603222.0,90328.0,...,44189.8035,1863.2939,1863.2939,112.7669,155.2233,22595.405,112.7669,0.0,458.2622,79711.3947


# Neural Network on the Entire Political and Undernourish Datasets

In [11]:
# Keep only the columns listed in select_subset, then train and evaluate with
# 'political' as the target column.
neural_network_pipeline(political.filter(items = select_subset), target_name = 'political')

Training Shape: (3041, 50)
Testing Shape (364, 50)
Mean Absolute Error on training 0.5187929753265509
Mean Absolute Error on testing 0.48475009660605783


In [12]:
# Keep only the columns listed in select_subset_2, then train and evaluate
# with 'undernourish' as the target column.
neural_network_pipeline(undernourish.filter(items = select_subset_2), target_name = 'undernourish')


Training Shape: (3055, 50)
Testing Shape (578, 50)
Mean Absolute Error on training 0.49700151662044967
Mean Absolute Error on testing 0.4778541476270234
