**SYPA: Fundamental Analysis of Foreign Direct Investment** <br>
*5b_Predictions_Linear_and_RF* <br>
Harvard SYPA <br>
User: Jake Schneider <br>
Date Created: March 4, 2020 <br>
Date Updated: March 5, 2020

----

**Note: This needs to be run from the tfcs109a Conda Environment**

**Load Packages**

In [1]:
#Import libraries
import sys
import pandas as pd
from datetime import date, datetime, time, timedelta
import pendulum
import json
import requests
import numpy as np
import math

from scitime import Estimator 

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style='ticks', context='talk')

from matplotlib.offsetbox import AnchoredText
from sklearn import metrics
from sklearn import linear_model
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor 
from sklearn.tree import export_graphviz
import pydot

import pickle

import statsmodels.formula.api as sm
from pprint import pprint

import warnings
import itertools

import missingno as msno

from flask import jsonify, make_response

from varname import varname

from matplotlib.backends.backend_pdf import PdfPages
from fpdf import FPDF
from PIL import Image, ImageDraw, ImageFont

----

**Load Data**

In [2]:
# Read the full dataset and discard the "Unnamed: 0" column that comes in with the CSV
# (presumably a row index written out when the file was saved).
final_df = (
    pd.read_csv('../../2_Inputs/Final/final_df.csv')
    .drop(columns=["Unnamed: 0"])
)
final_df.head()

Unnamed: 0,country,date,code,iso2Code,region,adminregion,incomeLevel,lendingType,capitalCity,longitude,...,Labor.force.participation.rate..total....of.total.population.ages.15.64...modeled.ILO.estimate.,Ratio.of.female.to.male.labor.force.participation.rate......modeled.ILO.estimate.,Unemployment..total....of.total.labor.force...modeled.ILO.estimate.,Net.migration,Prevalence.of.undernourishment....of.population.,Life.expectancy.at.birth..total..years.,Fertility.rate..total..births.per.woman.,Population.ages.65.and.above....of.total.population.,Unmet.need.for.contraception....of.married.women.ages.15.49.,Voice.and.Accountability..Estimate.y
0,Afghanistan,1960.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,61.736799,74.856875,4.1602,276087.2,13.9,32.446,7.45,2.798308,15.06,-0.199767
1,Afghanistan,1961.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,61.736799,74.856875,7.6538,195999.6,13.9,32.962,7.45,2.808131,15.06,-0.199767
2,Afghanistan,1962.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,69.058401,71.94173,5.2414,-20000.0,13.9,33.471,7.45,2.804113,15.06,-0.199767
3,Afghanistan,1963.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,63.8738,54.693568,7.0762,278921.8,13.9,33.971,7.45,2.786171,15.06,-0.193007
4,Afghanistan,1964.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,66.003,59.874157,6.8932,195999.6,13.9,34.463,7.45,2.754223,15.06,-0.193007


In [3]:
# Read the in-sample dataset and discard the "Unnamed: 0" column that comes in with the CSV
# (presumably a row index written out when the file was saved).
in_sample = (
    pd.read_csv('../../2_Inputs/Final/in_sample.csv')
    .drop(columns=["Unnamed: 0"])
)
in_sample.head()

Unnamed: 0,country,date,code,iso2Code,region,adminregion,incomeLevel,lendingType,capitalCity,longitude,...,Labor.force.participation.rate..total....of.total.population.ages.15.64...modeled.ILO.estimate.,Ratio.of.female.to.male.labor.force.participation.rate......modeled.ILO.estimate.,Unemployment..total....of.total.labor.force...modeled.ILO.estimate.,Net.migration,Prevalence.of.undernourishment....of.population.,Life.expectancy.at.birth..total..years.,Fertility.rate..total..births.per.woman.,Population.ages.65.and.above....of.total.population.,Unmet.need.for.contraception....of.married.women.ages.15.49.,Voice.and.Accountability..Estimate.y
0,Afghanistan,1960.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,61.736799,74.856875,4.1602,276087.2,13.9,32.446,7.45,2.798308,15.06,-0.199767
1,Afghanistan,1961.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,61.736799,74.856875,7.6538,195999.6,13.9,32.962,7.45,2.808131,15.06,-0.199767
2,Afghanistan,1962.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,69.058401,71.94173,5.2414,-20000.0,13.9,33.471,7.45,2.804113,15.06,-0.199767
3,Afghanistan,1963.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,63.8738,54.693568,7.0762,278921.8,13.9,33.971,7.45,2.786171,15.06,-0.193007
4,Afghanistan,1964.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,66.003,59.874157,6.8932,195999.6,13.9,34.463,7.45,2.754223,15.06,-0.193007


In [4]:
# Read the lock-box dataset and discard the "Unnamed: 0" column that comes in with the CSV
# (presumably a row index written out when the file was saved).
lock_box = (
    pd.read_csv('../../2_Inputs/Final/lock_box.csv')
    .drop(columns=["Unnamed: 0"])
)
lock_box.head()

Unnamed: 0,country,date,code,iso2Code,region,adminregion,incomeLevel,lendingType,capitalCity,longitude,...,Labor.force.participation.rate..total....of.total.population.ages.15.64...modeled.ILO.estimate.,Ratio.of.female.to.male.labor.force.participation.rate......modeled.ILO.estimate.,Unemployment..total....of.total.labor.force...modeled.ILO.estimate.,Net.migration,Prevalence.of.undernourishment....of.population.,Life.expectancy.at.birth..total..years.,Fertility.rate..total..births.per.woman.,Population.ages.65.and.above....of.total.population.,Unmet.need.for.contraception....of.married.women.ages.15.49.,Voice.and.Accountability..Estimate.y
0,Afghanistan,2015.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,66.914001,57.110746,1.679,52807.2,28.6,63.377,4.976,2.478662,24.5,-1.117563
1,Afghanistan,2016.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,67.268997,58.104539,1.634,18138.6,29.5,63.763,4.8,2.519923,15.06,-1.039843
2,Afghanistan,2017.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,67.606003,59.008129,1.559,-314602.0,29.8,64.13,4.633,2.554251,15.06,-0.99247
3,Afghanistan,2018.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,67.686996,59.244647,1.542,52807.2,27.86,69.075083,2.18,2.584927,15.06,-0.99472
4,Afghanistan,2019.0,AFG,AF,South Asia,South Asia,Low income,IDA,Kabul,69.1761,...,67.772003,59.479106,1.519,26108.8,3.68,66.4434,3.9094,8.794946,14.66,0.853073


In [5]:
# Separate the lock-box data into predictors (lock_box_x) and the FDI target (lock_box_y).

# Keep only the columns from 'longitude' onward; everything to its left is dropped.
lock_box = lock_box.loc[:, 'longitude':]

# The target column name is held once so the two selections below cannot drift apart.
fdi_column = 'Foreign direct investment, net inflows (% of GDP)'
lock_box_y = lock_box[fdi_column]
lock_box_x = lock_box.drop(columns=[fdi_column])

# Print one shape per line: predictors first, then target.
for part in (lock_box_x, lock_box_y):
    print(part.shape)

(1085, 2313)
(1085,)


In [6]:
# Read the validation table of model results and discard the "Unnamed: 0" column that
# comes in with the CSV (presumably a row index written out when the file was saved).
results = (
    pd.read_csv('../../3_Outputs/Model Selection/Validation Table/Validation Table 3.csv')
    .drop(columns=["Unnamed: 0"])
)
results.head()

Unnamed: 0,Model Name,Model Type,Minimum RMSE,Variables
0,Linear Regression,Linear Model,20021.89,2313
1,Ridge Regression,Linear Model,12131.99,2310
2,Lasso Regression,Linear Model,462.4,986
3,Random Forest,Tree-Based,143.22,2310
4,Deep Neural Networks (DNN),Neural Networks,91.71,2310


---

**Import the Next-Best (Non-Neural-Network) Models: Lasso and Random Forest** <br>

*Lasso*

In [7]:
# Load the pickled Lasso model from the Model Selection outputs folder.
# NOTE(review): pickle.load executes arbitrary code embedded in the file -- only load
# model files that come from a trusted source.
# The context manager closes the file handle as soon as the model has been read.
with open('../../3_Outputs/Model Selection/Lasso Regression/lasso_optimum_model.sav', 'rb') as model_file:
    lasso_optimum = pickle.load(model_file)
# The message names the Lasso model (it previously said "rf", copied from the Random Forest cell).
print("Load lasso optimum from disk")

Load rf optimum from disk


*Random Forest*

In [8]:
# Load the pickled Random Forest model from the Model Selection outputs folder.
# NOTE(review): pickle.load executes arbitrary code embedded in the file -- only load
# model files that come from a trusted source.
# The context manager closes the file handle as soon as the model has been read.
with open('../../3_Outputs/Model Selection/Random Forest/rf_optimum_model.sav', 'rb') as model_file:
    rf_optimum = pickle.load(model_file)
print("Load rf optimum from disk")

Load rf optimum from disk


----

**Predict Values Using Lasso**

*Predict Lasso*

In [9]:
# Create Predictions
# Score the lock-box features (lock_box_x) with the Lasso model loaded from disk.
# NOTE(review): no scaling or other preprocessing of lock_box_x is visible in this
# notebook -- confirm the features match what the model was fitted on.

prediction_lasso = lasso_optimum.predict(lock_box_x)

In [10]:
# Inspect the Lasso predictions: how many there are, then the first ten values.
y_pred_lasso = prediction_lasso
n_lasso_predictions = len(y_pred_lasso)
first_ten_lasso = y_pred_lasso[:10]
print(n_lasso_predictions)
print('Prediction with scaling - {}'.format(first_ten_lasso))

1085
Prediction with scaling - [ 36.05068266  32.95242445 -46.72317981  16.63894249  96.17860397
 -10.09723226 -30.16369756 -19.67548537 -22.76787551  68.19122888]


In [11]:
# Wrap the Lasso predictions in a single-column DataFrame for tabular display.
y_pred_lasso_df = pd.DataFrame(y_pred_lasso)
y_pred_lasso_df.head(n=25)

Unnamed: 0,0
0,36.050683
1,32.952424
2,-46.72318
3,16.638942
4,96.178604
5,-10.097232
6,-30.163698
7,-19.675485
8,-22.767876
9,68.191229


----

**Predict Values Using Random Forest**

*Predict Random Forest*

In [8]:
# Create Predictions
# Score the lock-box features (lock_box_x) with the Random Forest model loaded from disk.
# NOTE(review): no scaling or other preprocessing of lock_box_x is visible in this
# notebook -- confirm the features match what the model was fitted on.

prediction_rf = rf_optimum.predict(lock_box_x)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 800 out of 800 | elapsed:    0.4s finished


In [9]:
# Inspect the Random Forest predictions: how many there are, then the first ten values.
y_pred_rf = prediction_rf
n_rf_predictions = len(y_pred_rf)
first_ten_rf = y_pred_rf[:10]
print(n_rf_predictions)
print('Prediction with scaling - {}'.format(first_ten_rf))

1085
Prediction with scaling - [ 4.77967361 18.55214114 18.62580537 18.43299079 58.02412093  7.93022825
  9.17577838  9.39707285 14.67699516 63.46917604]


In [10]:
# Wrap the Random Forest predictions in a single-column DataFrame for tabular display.
y_pred_rf_df = pd.DataFrame(y_pred_rf)
y_pred_rf_df.head(n=25)

Unnamed: 0,0
0,4.779674
1,18.552141
2,18.625805
3,18.432991
4,58.024121
5,7.930228
6,9.175778
7,9.397073
8,14.676995
9,63.469176


In [11]:
# Show the first 25 actual FDI (% of GDP) values from the lock box, for comparison
# with the model predictions displayed in the preceding cells.
lock_box_y.head(25)

0      0.849679
1      0.483360
2      0.255222
3      0.718898
4      0.377893
5      8.690477
6      8.804978
7      7.852228
8      7.992357
9      3.820657
10    -0.324014
11     1.023704
12     0.717464
13     0.866905
14     0.173327
15    -2.339329
16     1.700898
17     1.700898
18    -2.166225
19     2.551114
20    11.554708
21    13.754450
22    13.096499
23     9.877110
24     2.024756
Name: Foreign direct investment, net inflows (% of GDP), dtype: float64