**SYPA: Fundamental Analysis of Foreign Direct Investment** <br>
*5b_Predictions_Linear_and_RF* <br>
Harvard SYPA <br>
User: Jake Schneider <br>
Date Created: March 4, 2020 <br>
Date Updated: March 5, 2020

----

**Note: This notebook must be run from the `tfcs109a` Conda environment.**

**Load Packages**

In [1]:
#Import libraries
import sys
import pandas as pd
from datetime import date, datetime, time, timedelta
import pendulum
import json
import requests
import numpy as np
import math

from scitime import Estimator 

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style='ticks', context='talk')

from matplotlib.offsetbox import AnchoredText
from sklearn import metrics
from sklearn import linear_model
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor 
from sklearn.tree import export_graphviz
import pydot

import pickle

import statsmodels.formula.api as sm
from pprint import pprint

import warnings
import itertools

import missingno as msno

from flask import jsonify, make_response

from varname import varname

from matplotlib.backends.backend_pdf import PdfPages
from fpdf import FPDF
from PIL import Image, ImageDraw, ImageFont

----

**Load Data**

In [2]:
# Load Data
# Read the full dataset and discard the "Unnamed: 0" column
# (presumably the index that was saved along with the CSV).

final_df = (
    pd.read_csv('../../2_Inputs/Final/final_df.csv')
    .drop(columns=["Unnamed: 0"])
)
final_df.head()

Unnamed: 0,country,date,code,iso2Code,region,adminregion,incomeLevel,lendingType,capitalCity,longitude,...,Ratio.of.female.to.male.labor.force.participation.rate......modeled.ILO.estimate.,Unemployment..total....of.total.labor.force...modeled.ILO.estimate.,Net.migration,Prevalence.of.undernourishment....of.population.,Life.expectancy.at.birth..total..years.,Fertility.rate..total..births.per.woman.,Population.ages.65.and.above....of.total.population.,Unmet.need.for.contraception....of.married.women.ages.15.49.,Voice.and.Accountability..Estimate.y,year
0,Afghanistan,1960.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,74.856875,4.1602,276087.2,13.9,32.446,7.45,2.798308,15.06,-0.199767,1960.0
1,Afghanistan,1961.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,74.856875,7.6538,195999.6,13.9,32.962,7.45,2.808131,15.06,-0.199767,1961.0
2,Afghanistan,1962.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,71.94173,5.2414,-20000.0,13.9,33.471,7.45,2.804113,15.06,-0.199767,1962.0
3,Afghanistan,1963.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,54.693568,7.0762,278921.8,13.9,33.971,7.45,2.786171,15.06,-0.193007,1963.0
4,Afghanistan,1964.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,59.874157,6.8932,195999.6,13.9,34.463,7.45,2.754223,15.06,-0.193007,1964.0


In [3]:
# Load In Sample
# Read the in-sample dataset and discard the "Unnamed: 0" column
# (presumably the index that was saved along with the CSV).

in_sample = (
    pd.read_csv('../../2_Inputs/Final/in_sample.csv')
    .drop(columns=["Unnamed: 0"])
)
in_sample.head()

Unnamed: 0,country,date,code,iso2Code,region,adminregion,incomeLevel,lendingType,capitalCity,longitude,...,Ratio.of.female.to.male.labor.force.participation.rate......modeled.ILO.estimate.,Unemployment..total....of.total.labor.force...modeled.ILO.estimate.,Net.migration,Prevalence.of.undernourishment....of.population.,Life.expectancy.at.birth..total..years.,Fertility.rate..total..births.per.woman.,Population.ages.65.and.above....of.total.population.,Unmet.need.for.contraception....of.married.women.ages.15.49.,Voice.and.Accountability..Estimate.y,year
0,Afghanistan,1970.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,78.0033,8.3546,-23616.0,43.52,37.409,7.45,2.631613,29.48,-0.979531,1970.0
1,Afghanistan,1971.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,78.201934,8.5824,-22903.2,43.88,37.93,7.45,2.635235,29.68,-0.956633,1971.0
2,Afghanistan,1972.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,72.199611,8.7142,-20000.0,26.32,38.461,7.45,2.627456,26.7,-1.04392,1972.0
3,Afghanistan,1973.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,69.824106,6.7992,-14881.2,35.16,39.003,7.45,2.609505,27.0,-0.79202,1973.0
4,Afghanistan,1976.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,77.27871,4.2436,-261777.8,38.1,40.715,7.45,2.558353,28.06,-0.555133,1976.0


In [4]:
# Load Lock Box
# Read the lock-box dataset, discard the "Unnamed: 0" column, then report
# the resulting dimensions before previewing the first rows.

lock_box = (
    pd.read_csv('../../2_Inputs/Final/lock_box.csv')
    .drop(columns=["Unnamed: 0"])
)
print(lock_box.shape)
lock_box.head()

(757, 2316)


Unnamed: 0,country,date,code,iso2Code,region,adminregion,incomeLevel,lendingType,capitalCity,longitude,...,Ratio.of.female.to.male.labor.force.participation.rate......modeled.ILO.estimate.,Unemployment..total....of.total.labor.force...modeled.ILO.estimate.,Net.migration,Prevalence.of.undernourishment....of.population.,Life.expectancy.at.birth..total..years.,Fertility.rate..total..births.per.woman.,Population.ages.65.and.above....of.total.population.,Unmet.need.for.contraception....of.married.women.ages.15.49.,Voice.and.Accountability..Estimate.y,year
0,Afghanistan,2015.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,57.110746,1.679,-16498.2,28.6,63.377,4.976,2.478662,24.5,-1.117563,2015.0
1,Afghanistan,2016.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,58.104539,1.634,-16498.2,29.5,63.763,4.8,2.519923,25.64,-1.039843,2016.0
2,Afghanistan,2017.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,59.008129,1.559,-314602.0,29.8,64.13,4.633,2.554251,22.98,-0.99247,2017.0
3,Afghanistan,2018.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,59.244647,1.542,-32328.2,27.96,63.3522,4.9862,2.584927,25.86,-0.99472,2018.0
4,Albania,2015.0,ALB,AL,Europe & Central Asia,Europe & Central Asia (excluding high income),Upper middle income,IBRD,Tirane,19.8172,...,73.166459,17.08,-131596.4,6.0,78.025,1.677,12.626548,17.16,0.157149,2015.0


In [5]:
# Lock Box Shape
# Display the (rows, columns) tuple of the lock-box frame as the cell output.

lock_box.shape

(757, 2316)

In [6]:
# Split Lock Box
# Separate the lock-box frame into a feature matrix (lock_box_x) and the
# target series (lock_box_y). The descriptive columns listed below and the
# target itself are removed from the features; every other column is kept.
#
# The labels are handed to drop() as a plain list through `columns=`.
# Passing a sub-DataFrame (lock_box[[...]]) as the labels only works because
# iterating a DataFrame yields its column names, and it copies those columns
# for no reason.

lock_box_target_col = 'Foreign direct investment, net inflows (% of GDP)'
lock_box_non_feature_cols = [
    'country', 'date', 'code', 'iso2Code', 'region', 'adminregion',
    'incomeLevel', 'lendingType', 'capitalCity',
]

lock_box_x = lock_box.drop(columns=lock_box_non_feature_cols + [lock_box_target_col])
lock_box_y = lock_box[lock_box_target_col]

print(lock_box_x.shape)
print(lock_box_y.shape)

(757, 2306)
(757,)


In [7]:
# Load Validation Table
# Read the model-comparison table and discard the "Unnamed: 0" column
# (presumably the index that was saved along with the CSV).

results = (
    pd.read_csv('../../3_Outputs/Model Selection/Validation Table/Validation Table 3.csv')
    .drop(columns=["Unnamed: 0"])
)
results.head()

Unnamed: 0,Model Name,Model Type,Minimum RMSE,Variables
0,Linear Regression,Linear Model,20021.89,2313
1,Ridge Regression,Linear Model,12131.99,2310
2,Lasso Regression,Linear Model,462.4,986
3,Random Forest,Tree-Based,143.22,2310
4,Deep Neural Networks (DNN),Neural Networks,91.71,2310


---

**Import Second Best non-NN Models: Lasso and Random Forest** <br>

*Lasso*

In [8]:
# Load the pickled Lasso model object from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that were produced by this project.
# The `with` block closes the file handle once the object has been read.
with open('../../3_Outputs/Model Selection/Lasso Regression/lasso_optimum_model.sav', 'rb') as model_file:
    lasso_optimum = pickle.load(model_file)
# The message names the Lasso model (it previously said "rf" by copy-paste).
print("Load lasso optimum from disk")

Load rf optimum from disk


*Random Forest*

In [9]:
# Load the pickled Random Forest model object from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that were produced by this project.
# The `with` block closes the file handle once the object has been read.
with open('../../3_Outputs/Model Selection/Random Forest/rf_optimum_model.sav', 'rb') as model_file:
    rf_optimum = pickle.load(model_file)
print("Load rf optimum from disk")

Load rf optimum from disk


----

**Predict Values Using Lasso**

*Predict Lasso*

In [10]:
# Create Predictions
# Run the loaded Lasso model on the lock-box feature matrix; the result is
# kept unchanged in `prediction_lasso` and inspected in the cells that follow.

prediction_lasso = lasso_optimum.predict(lock_box_x)

In [11]:
# View Predictions
# Alias the Lasso predictions, then print how many there are followed by
# the first ten values.

y_pred_lasso = prediction_lasso
print(
    len(y_pred_lasso),
    'Prediction with scaling - {}'.format(y_pred_lasso[:10]),
    sep='\n',
)

757
Prediction with scaling - [ 19.33158518  13.73869089  26.25710913  38.68882972 -11.28288851
  -6.94807892  -3.56554522   0.23250231  -5.59801667  -9.22454087]


In [12]:
# Wrap the Lasso predictions in a DataFrame and preview up to 25 rows.
y_pred_lasso_df = pd.DataFrame(y_pred_lasso)
y_pred_lasso_df.head(n=25)

Unnamed: 0,0
0,19.331585
1,13.738691
2,26.257109
3,38.68883
4,-11.282889
5,-6.948079
6,-3.565545
7,0.232502
8,-5.598017
9,-9.224541


----

**Predict Values Using Random Forest**

*Predict Random Forest*

In [13]:
# Create Predictions
# Run the loaded Random Forest model on the lock-box feature matrix; the
# result is kept unchanged in `prediction_rf` and inspected in the cells
# that follow.

prediction_rf = rf_optimum.predict(lock_box_x)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 800 out of 800 | elapsed:    0.1s finished


In [14]:
# View Predictions
# Alias the Random Forest predictions, then print how many there are
# followed by the first ten values.

y_pred_rf = prediction_rf
print(
    len(y_pred_rf),
    'Prediction with scaling - {}'.format(y_pred_rf[:10]),
    sep='\n',
)

757
Prediction with scaling - [3.02757827 3.26208424 3.38053037 3.16497445 5.59971484 5.66581733
 5.76952183 5.06197014 3.1209371  3.26449848]


In [15]:
# Wrap the Random Forest predictions in a DataFrame and preview up to 25 rows.
y_pred_rf_df = pd.DataFrame(y_pred_rf)
y_pred_rf_df.head(n=25)

Unnamed: 0,0
0,3.027578
1,3.262084
2,3.38053
3,3.164974
4,5.599715
5,5.665817
6,5.769522
7,5.06197
8,3.120937
9,3.264498


In [16]:
# Show the first 25 actual target values from the lock box
# (presumably to eyeball them against the model predictions displayed above).
lock_box_y.head(25)

0     0.849679
1     0.483360
2     0.255222
3     0.718898
4     8.690477
5     8.804978
6     7.852228
7     7.992357
8    -0.324014
9     1.023704
10    0.717464
11    0.866905
12    8.630605
13   -0.177523
14   -6.057209
15   -5.420745
16    8.039283
17    5.613403
18    7.693360
19    7.233055
20    1.977135
21    0.584750
22    1.791961
23    2.283806
24    1.744737
Name: Foreign direct investment, net inflows (% of GDP), dtype: float64