# Load the Data

In [1]:
# basic package
import csv
import glob
import pandas as pd
import matplotlib as plt
from tqdm import tqdm
import numpy as np
import random
from operator import itemgetter
import matplotlib.pyplot as plt
import warnings
from google.colab import files


# ml related 
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
political = pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/political_selected.csv")
undernourish = pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/undernourish_selected.csv")
meaning_map = pd.read_csv("drive/Shareddrives/ML_&_Econometrics/Merged/final_new_meaning_A.csv")

In [4]:
political_country = [41, 231, 68, 102,9, 238, 110, 2, 138, 100]

In [5]:
df = political.loc[political['Area Code'].isin(political_country)]
df.shape

(171, 1002)

In [6]:
row_number_to_extract = []
for row in df.index:
    row_number_to_extract.append(row)

In [7]:
len(row_number_to_extract)

171

# Preparatory Code

In [8]:

def map_code_to_meaning(mapping, code_no):
    return (mapping.loc[mapping['code']==code_no]).iloc[0].var_name

# print the total percetnage of missing in each dataset
def total_percentage_missing(df):
    return(np.count_nonzero(df.isna()) / df.size)

# drop the top N rows with most NAs
def drop_top_N_rows_with_most_NAs(df, N=300):
    if N/len(df)> 0.2:
      warnings.warn("Based on your speficied N, you are dropping more then 20% of the data")

    print("shape before drop", df.shape)
    dict_nas = {}
    for i in range(len(df)):
        percentage = total_percentage_missing(df.iloc[i])
        dict_nas[i] = percentage
    res = dict(sorted(dict_nas.items(), key = itemgetter(1), reverse = True)[:N])
    # print("here")
    top_NAs_rows = list(res.keys())
    # print(top_NAs_rows)
    df.drop(top_NAs_rows, axis=0, inplace=True)
    print("shape after drop ", df.shape)
    print("Missing data percentage ", total_percentage_missing(df) )
    return df

# split into two dataset by year (default=2017)
# fist one include that year, second one is year after that
def split_by_year(df, split_at = 2017):
    res1 = df.loc[df['Year']< split_at]
    res2 = df.loc[df['Year']>= split_at]
    return res1, res2

def print_all_coeff(list_coef, feature_name):
    sort_index = reversed(np.argsort(list_coef))
    list_of_lists = []
    for i in sort_index:
#         print(feature_name[i])
        temp = int(feature_name[i])
        if list_coef[i] !=0.0:
            list_of_lists.append([round(list_coef[i],10), feature_name[i], map_code_to_meaning(meaning_map, temp)])
    return pd.DataFrame(list_of_lists, columns =['non_zero_coefficient', 'code', 'variable_name'])



# Random Forest Regression 

In [9]:
  # pipeline on returning the coefficient of lasso regression
# also returns the score of the regressions
def random_forest_regression_for_ten(df, country_list, target_name = 'political', split_year = 2017 ,n_estimator = 10):

    if target_name not in df.columns:
        raise ValueError("The input dataframe doesn't have the column: political")
    
    if 'Continent' in df.columns:
      df = df.drop(columns =['Continent'])
    all_true = df.filter(items = [target_name, 'Year' , 'Area Code'])
    all_true = all_true.loc[all_true['Area Code'].isin(country_list)]

    # default split at 2017
    political_pre_2017, political_post_2017 = split_by_year(df, split_at = split_year)
    

    # Note, the variable names here is only names, y_politcal can be any dataframe
    # doesn't have to be political 
    y_political = political_pre_2017.pop(target_name)
    X_political = political_pre_2017
    # X_political = political_pre_2017.drop(columns = ['Year', 'Area Code'])

    y_political_test = political_post_2017.pop(target_name)
    X_political_test = political_post_2017
    # X_political_test = political_post_2017.drop(columns = ['Year', 'Area Code'])

    # X_selected_train = X_political.loc[X_political['Area Code'].isin(country_list)]
    X_selected_test   = X_political_test.loc[X_political_test['Area Code'].isin(country_list)]
    
    # dop the colum names 
    X_political = political_pre_2017.drop(columns = ['Year', 'Area Code'])
    X_political_test = political_post_2017.drop(columns = ['Year', 'Area Code'])

    # X_selected_train = X_selected_train.drop(columns = ['Year', 'Area Code'])
    X_selected_test_marker = X_selected_test[['Year', 'Area Code']]
    X_selected_test  = X_selected_test.drop(columns =  ['Year', 'Area Code'])
    
    feature_names = X_political_test.columns

    # scale the X
    scaler = StandardScaler()
    political_scaler_X = scaler.fit(X_political)
    X_political_scaled = political_scaler_X.transform(X_political)
    X_political_test_scaled = political_scaler_X.transform(X_political_test)
    X_selected_test_scaled = political_scaler_X.transform(X_selected_test)

    # scale the y
    y_political = y_political.values.reshape(-1,1)
    y_political_test = y_political_test.values.reshape(-1,1)
    political_scaler_y = scaler.fit(y_political)
    y_political_scaled = political_scaler_y.transform(y_political)
    y_political_test_scaled = political_scaler_y.transform(y_political_test)
    
    # print shapes
    print("Training Shape:", X_political_scaled.shape)
    print("Testing Shape", X_political_test_scaled.shape)
    
    # Run random foorest
    random.seed(1234)
    reg = RandomForestRegressor(n_estimators=n_estimator).fit(X_political_scaled, y_political_scaled.ravel())
    # evaluation

    y_train_pred = reg.predict(X_political_scaled) # predicting for training
    y_pred = reg.predict(X_political_test_scaled)  # predicting for testing
    y_selected = reg.predict(X_selected_test_scaled)
    #    

    # transform back y_selected 
    # print("hi")
    y_selected = political_scaler_y.inverse_transform(y_selected.reshape(-1, 1))
    # print("makr", X_selected_test_marker.shape)
    # print(X_selected_test_marker)
    # print("y" , np.array(y_selected).shape)
    # print(np.array(y_selected))

    res_ret = pd.DataFrame(X_selected_test_marker)
    res_ret['predicted'] = y_selected

    print("Mean Absolute Error on training", mean_absolute_error(y_political_scaled, y_train_pred))
    print("Mean Absolute Error on testing", mean_absolute_error(y_political_test_scaled, y_pred))
    # res_df = print_all_coeff(reg.coef_, feature_names)
    return all_true , res_ret

# Process Data Before Feeding in Pipeline: check missing data and fill in NAs

In [10]:
# this chunck can only be run once
political = drop_top_N_rows_with_most_NAs(df= political, N= 300)
undernourish = drop_top_N_rows_with_most_NAs(df= undernourish, N= 300)
# drinking = drop_top_N_rows_with_most_NAs(df= drinking, N= 500)



# fill NAs
political = political.fillna(0)
undernourish = undernourish.fillna(0)
# drinking = drinking.fillna(0)

shape before drop (3705, 1002)
shape after drop  (3405, 1002)
Missing data percentage  0.003907896395168547
shape before drop (3933, 1002)
shape after drop  (3633, 1002)
Missing data percentage  0.02336230374373741


# Random Forest  Regression for political 

In [11]:
all_selected, selected_predict= random_forest_regression_for_ten(political, political_country, target_name = 'political', n_estimator = 80)

Training Shape: (2859, 999)
Testing Shape (546, 999)
Mean Absolute Error on training 0.06336423823043894
Mean Absolute Error on testing 0.18821726987362544


In [12]:
all_selected.loc[all_selected['Year']== 2017]

Unnamed: 0,political,Year,Area Code
3115,-2.8,2017,2
3122,0.17,2017,9
3166,0.28,2017,68
3187,-0.76,2017,100
3189,-0.93,2017,102
3197,1.11,2017,110
3222,-0.72,2017,138
3293,0.34,2017,231
3299,-1.68,2017,238


In [13]:
selected_predict

Unnamed: 0,Year,Area Code,predicted
3115,2017,2,-2.436625
3122,2017,9,-0.040125
3166,2017,68,0.39825
3187,2017,100,-1.08375
3189,2017,102,-0.752625
3197,2017,110,0.70075
3222,2017,138,-0.71075
3293,2017,231,0.27425
3299,2017,238,-1.557375
3312,2018,2,-2.37775


## Allen Look here

In [26]:
# all_selected is the true political instability for all years across the time 
all_selected
all_selected.to_csv('political_true.csv')
files.download('political_true.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Allen Look Here! Selected_predict is the predicted poltical instability for the last three years 

In [27]:
selected_predict
selected_predict.to_csv('political_preds.csv')
files.download('political_preds.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Undernourish 

In [16]:
all_selected_2, selected_predict_2 = random_forest_regression_for_ten(undernourish, political_country, target_name = 'undernourish', n_estimator = 100)

Training Shape: (2862, 999)
Testing Shape (771, 999)
Mean Absolute Error on training 0.04136794081245503
Mean Absolute Error on testing 0.23102756642856967


In [28]:
all_selected_2[all_selected_2['Year'] == 2017]
all_selected_2.to_csv('undernourish_true.csv')
files.download('undernourish_true.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
selected_predict_2
selected_predict_2.to_csv('undernourish_preds.csv')
files.download('undernourish_preds.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>