In [157]:
import numpy as np
import pandas
import statsmodels.api as sm
from plotnine import ggplot, aes, geom_point, geom_line
from plotnine.themes import theme_minimal
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [158]:
# Load the refugee model dataset from its CSV file into a DataFrame.
data = pandas.read_csv(
    'refugee_data/refugee_Model_Data.csv'
)

In [159]:
# Preview the first rows to sanity-check the columns that were loaded.
data.head()

Unnamed: 0,country,percent_Refugees_of_Conflict,gdp_per_cap,normalized_pop,normalized_refugee,normalized_gdp,normalized_qrdp,normalized_lib,normalized_migr_ratio,normalized_bilateral_migr,...,Afghanistan,Burundi,Central African Republic,Democratic Republic of the Congo,Nigeria,Somalia,South Sudan,Syria,Ukraine,Venezuela
0,Angola,0.037758,0.00226,0.145513,-0.367685,0.019443,-1.330296,-1.234557,-0.279793,1.174665,...,0,0,0,1,0,0,0,0,0,0
1,Argentina,0.03447,0.009981,-0.033002,-0.475331,0.073612,0.432706,-0.934349,0.490914,-0.116315,...,0,0,0,0,0,0,0,0,0,1
2,Brazil,0.063906,0.007738,2.526772,-0.22742,2.265861,-1.28622,-0.874756,-1.121042,-0.73268,...,0,0,0,0,0,0,0,0,0,1
3,Burundi,0.086505,0.000269,-0.758244,0.025586,-0.538995,-0.135406,-0.185223,0.105496,0.446657,...,0,0,0,1,0,0,0,0,0,0
4,Cameroon,0.471566,0.001688,-0.315123,1.481046,0.664062,0.744556,0.822715,-0.061142,1.111459,...,0,0,1,0,0,0,0,0,0,0


In [160]:
# Keep every row whose "Ukraine" flag is not 1; this subset is what the
# first model is fitted on.
is_not_ukraine = data["Ukraine"] != 1
withoutUkrainData = data[is_not_ukraine]


In [161]:
# Preview the subset without Ukraine rows. Only the first rows are shown
# instead of dumping the entire frame into the notebook output.
withoutUkrainData.head(10)

Unnamed: 0,country,percent_Refugees_of_Conflict,gdp_per_cap,normalized_pop,normalized_refugee,normalized_gdp,normalized_qrdp,normalized_lib,normalized_migr_ratio,normalized_bilateral_migr,...,Afghanistan,Burundi,Central African Republic,Democratic Republic of the Congo,Nigeria,Somalia,South Sudan,Syria,Ukraine,Venezuela
0,Angola,0.037758,0.00226,0.145513,-0.367685,0.019443,-1.330296,-1.234557,-0.279793,1.174665,...,0,0,0,1,0,0,0,0,0,0
1,Argentina,0.03447,0.009981,-0.033002,-0.475331,0.073612,0.432706,-0.934349,0.490914,-0.116315,...,0,0,0,0,0,0,0,0,0,1
2,Brazil,0.063906,0.007738,2.526772,-0.22742,2.265861,-1.28622,-0.874756,-1.121042,-0.73268,...,0,0,0,0,0,0,0,0,0,1
3,Burundi,0.086505,0.000269,-0.758244,0.025586,-0.538995,-0.135406,-0.185223,0.105496,0.446657,...,0,0,0,1,0,0,0,0,0,0
4,Cameroon,0.471566,0.001688,-0.315123,1.481046,0.664062,0.744556,0.822715,-0.061142,1.111459,...,0,0,1,0,0,0,0,0,0,0
5,Cameroon,0.384641,0.001688,0.788363,0.202705,1.14979,-0.028621,-0.466841,-0.173448,0.722482,...,0,0,0,0,1,0,0,0,0,0
6,Central African Republic,0.005806,0.000554,-1.096039,-0.625463,-0.544035,0.13616,0.23451,-0.389875,-0.746655,...,0,0,0,1,0,0,0,0,0,0
7,Chad,0.164883,0.00076,-0.616784,-0.191518,-1.036376,-0.281511,-0.891275,1.161694,-0.383378,...,0,0,1,0,0,0,0,0,0,0
8,Chad,0.058494,0.00076,-1.124841,-1.085823,-0.66702,-0.985382,-0.681207,1.075378,-1.141315,...,0,0,0,0,1,0,0,0,0,0
9,Chile,0.090591,0.018849,-0.462462,-0.002681,-0.154553,0.40653,1.985718,1.33719,-0.541533,...,0,0,0,0,0,0,0,0,0,1


In [139]:
# Train the model without the Ukraine rows, so the cells below use withoutUkrainData.


In [162]:
# Target variable the model is trying to predict, taken from the subset
# that excludes the Ukraine rows.
y = withoutUkrainData.loc[:, 'percent_Refugees_of_Conflict']

In [163]:
# Independent variables: one indicator column per conflict (Ukraine is not
# in this list) followed by three normalized covariates.
feature_columns = [
    "Nigeria",
    "Afghanistan",
    "Burundi",
    "Central African Republic",
    "Democratic Republic of the Congo",
    "Somalia",
    "South Sudan",
    "Syria",
    "Venezuela",
    "normalized_lib",
    "normalized_bilateral_migr",
    "normalized_remittances",
]
features_normalized = withoutUkrainData[feature_columns]

###   We also tried to use these additional features in the model; however, none of them had a low enough p-value (i.e. none was statistically significant) to be included in the final model:
 1. "normalized_qrdp"
 2. "same_language"
 3. "touching"
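 4. "normalized_lib" (note: this feature is still part of the final feature list used to fit the model above — confirm whether it was meant to be dropped)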
 5. "normalized_pop"
 6. "normalized_migr_ratio"


In [164]:
# Quick look at the first rows of the selected feature matrix.
features_normalized.head()

Unnamed: 0,Nigeria,Afghanistan,Burundi,Central African Republic,Democratic Republic of the Congo,Somalia,South Sudan,Syria,Venezuela,normalized_lib,normalized_bilateral_migr,normalized_remittances
0,0,0,0,0,1,0,0,0,0,-1.234557,1.174665,-0.271065
1,0,0,0,0,0,0,0,0,1,-0.934349,-0.116315,-0.333506
2,0,0,0,0,0,0,0,0,1,-0.874756,-0.73268,-0.329553
3,0,0,0,0,1,0,0,0,0,-0.185223,0.446657,-0.271065
4,0,0,0,1,0,0,0,0,0,0.822715,1.111459,1.788854


In [165]:
# Quick look at the first values of the target series.
y.head()

0    0.037758
1    0.034470
2    0.063906
3    0.086505
4    0.471566
Name: percent_Refugees_of_Conflict, dtype: float64

In [144]:
# Run the linear regression.
# `statsmodels.api` (as `sm`) is imported in the imports cell at the top of the
# notebook, so all of the notebook's dependencies are declared in one place.

In [166]:
# Fit ordinary least squares of the target on the selected features.
# sm.OLS does not add an intercept by itself, and none is added here.
results = sm.OLS(endog=y, exog=features_normalized).fit()

In [167]:
# Show the regression summary table for the model fitted without Ukraine rows.
results.summary()

0,1,2,3
Dep. Variable:,percent_Refugees_of_Conflict,R-squared:,0.729
Model:,OLS,Adj. R-squared:,0.657
Method:,Least Squares,F-statistic:,10.05
Date:,"Tue, 12 Apr 2022",Prob (F-statistic):,1.61e-08
Time:,15:42:30,Log-Likelihood:,49.75
No. Observations:,53,AIC:,-75.5
Df Residuals:,41,BIC:,-51.86
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Nigeria,0.3333,0.062,5.365,0.000,0.208,0.459
Afghanistan,0.3333,0.062,5.365,0.000,0.208,0.459
Burundi,0.2500,0.054,4.646,0.000,0.141,0.359
Central African Republic,0.2000,0.048,4.156,0.000,0.103,0.297
Democratic Republic of the Congo,0.0833,0.031,2.683,0.010,0.021,0.146
Somalia,0.2000,0.048,4.156,0.000,0.103,0.297
South Sudan,0.2000,0.048,4.156,0.000,0.103,0.297
Syria,0.2000,0.048,4.156,0.000,0.103,0.297
Venezuela,0.0909,0.032,2.802,0.008,0.025,0.156

0,1,2,3
Omnibus:,5.506,Durbin-Watson:,2.335
Prob(Omnibus):,0.064,Jarque-Bera (JB):,4.759
Skew:,0.725,Prob(JB):,0.0926
Kurtosis:,3.233,Cond. No.,4.86


In [169]:
# Score every row of `data` with the fitted model. The feature columns are read
# from the fitted model itself, so they always match the columns (and column
# order) the model was trained on instead of relying on a hand-copied list.
features_to_predict = data[results.model.exog_names]
shares = results.predict(features_to_predict)
data['predicted_shares'] = shares

# Keep only the Ukraine rows and put the observed share next to the prediction.
# NOTE(review): OLS predictions are unbounded, and the recorded output of this
# notebook contains negative predicted shares - confirm whether that is acceptable.
is_ukraine = data['Ukraine'] == 1
ukr_results = data.loc[
    is_ukraine,
    ['country', 'percent_Refugees_of_Conflict', 'predicted_shares'],
]

# NOTE(review): the retrain section further down writes to this same file name,
# so on a full run this export is overwritten.
ukr_results.to_csv('ukraine_model_results.csv', index=False)

In [170]:
# Display the Ukraine rows: observed share next to the model's predicted share.
ukr_results

Unnamed: 0,country,percent_Refugees_of_Conflict,predicted_shares
22,Hungary,0.094222,0.11639
32,Moldova,0.093682,0.137529
37,Poland,0.58641,0.029058
40,Romania,0.154561,-0.177177
43,Slovakia,0.071125,-0.105799


# Retrain model with Ukraine Conflict data included

In [156]:
# Rebuild the target and the feature matrix from the full dataset, i.e. this
# time the Ukraine rows are part of the training data.
y = data.loc[:, 'percent_Refugees_of_Conflict']

# Same columns, in the same order, as the first model; there is still no
# "Ukraine" indicator column among them.
feature_columns = [
    "Nigeria",
    "Afghanistan",
    "Burundi",
    "Central African Republic",
    "Democratic Republic of the Congo",
    "Somalia",
    "South Sudan",
    "Syria",
    "Venezuela",
    "normalized_lib",
    "normalized_bilateral_migr",
    "normalized_remittances",
]
features_normalized = data[feature_columns]

In [150]:
# Refit ordinary least squares on the full dataset (Ukraine rows included).
# sm.OLS does not add an intercept by itself, and none is added here.
results = sm.OLS(endog=y, exog=features_normalized).fit()

In [151]:
# Show the regression summary table for the retrained model.
results.summary()

0,1,2,3
Dep. Variable:,percent_IndividualPerCountry_of_recorded,R-squared (uncentered):,0.75
Model:,OLS,Adj. R-squared (uncentered):,0.685
Method:,Least Squares,F-statistic:,11.5
Date:,"Tue, 12 Apr 2022",Prob (F-statistic):,3.53e-10
Time:,15:26:16,Log-Likelihood:,37.936
No. Observations:,58,AIC:,-51.87
Df Residuals:,46,BIC:,-27.15
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Nigeria,0.3333,0.082,4.087,0.000,0.169,0.498
Afghanistan,0.3333,0.082,4.087,0.000,0.169,0.498
Burundi,0.2500,0.071,3.539,0.001,0.108,0.392
Central African Republic,0.2000,0.063,3.166,0.003,0.073,0.327
Democratic Republic of the Congo,0.0833,0.041,2.043,0.047,0.001,0.165
Somalia,0.2000,0.063,3.166,0.003,0.073,0.327
South Sudan,0.2000,0.063,3.166,0.003,0.073,0.327
Syria,0.2000,0.063,3.166,0.003,0.073,0.327
Venezuela,0.0909,0.043,2.134,0.038,0.005,0.177

0,1,2,3
Omnibus:,31.156,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,67.839
Skew:,1.674,Prob(JB):,1.86e-15
Kurtosis:,7.106,Cond. No.,5.09


In [152]:
# Score every row of `data` with the retrained model. The feature columns are
# read from the fitted model itself, so they always match the columns (and
# column order) the model was trained on instead of a hand-copied list.
features_to_predict = data[results.model.exog_names]
shares = results.predict(features_to_predict)
data['predicted_shares'] = shares

# Keep only the Ukraine rows and put the observed share next to the prediction.
# The observed-share column is 'percent_Refugees_of_Conflict', the same column
# this section uses as the regression target; the previous name
# 'percent_IndividualPerCountry_of_recorded' was a stale leftover.
is_ukraine = data['Ukraine'] == 1
ukr_results = data.loc[
    is_ukraine,
    ['country', 'percent_Refugees_of_Conflict', 'predicted_shares'],
]

# NOTE(review): this writes to the same file name as the earlier export of the
# model trained without Ukraine, so that earlier file is overwritten.
ukr_results.to_csv('ukraine_model_results.csv', index=False)

In [153]:
# Display the Ukraine rows with the retrained model's predicted shares.
ukr_results

Unnamed: 0,country,percent_IndividualPerCountry_of_recorded,predicted_shares
22,Hungary,0.094222,0.096681
32,Moldova,0.093682,0.137262
37,Poland,0.58641,0.041586
40,Romania,0.154561,-0.161515
43,Slovakia,0.071125,-0.114013
