In [1]:
import os

import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the refugee dataset and leave out every row whose country is the
# Russian Federation.
data = (
    pd.read_csv('refugee_data/refugee_data_final.csv')
    .loc[lambda frame: frame['country'] != 'Russian Federation']
)

In [3]:
# Recompute each row's share of its conflict's total now that Russia has been
# dropped: the row's count divided by the sum of counts for the same conflict.
data['pct_tot'] = data['individualPerCountry'].div(
    data.groupby('conflict')['individualPerCountry'].transform('sum')
)

In [4]:
# Bilateral migration expressed per head of population.
data['bilateral_migration_percap'] = data['bilateral_migration'].div(data['population'])

In [5]:
# GDP per capita: scale the `gdp_millions` column by one million, then divide
# by population.
data['gdp_per_cap'] = data['gdp_millions'].mul(1_000_000).div(data['population'])

In [6]:
# Recorded migrants per head of population.
data['migrants_per_cap'] = data['total_recored_migrants'].div(data['population'])

In [7]:
# Columns that get a min-max scaled `<name>_norm` counterpart.
cols_to_scale = [
    'bilateral_migration',
    'gdp_millions',
    'population',
    'remittances',
]

In [8]:
# Min-max scale every column in `cols_to_scale` *within* each conflict and store
# the result in a new `<col>_norm` column.
#
# Lay the rows out once, up front: drop rows with no conflict label, order by
# conflict (stable, so the order inside a conflict is unchanged) and renumber
# the index.  The earlier append-and-merge implementation produced this same
# layout as a side effect (as long as each (country, conflict) pair was unique),
# so downstream row labels stay the same.
data = (
    data.dropna(subset=['conflict'])
    .sort_values('conflict', kind='stable')
    .reset_index(drop=True)
)

scaler = MinMaxScaler()
for col in cols_to_scale:
    print(col)
    # transform() hands one conflict's values at a time to the scaler and writes
    # the scaled values straight back onto the matching rows.  This avoids
    # DataFrame.append (removed in pandas 2.0), avoids growing a frame inside a
    # loop, and avoids re-merging on (country, conflict), which would multiply
    # rows if that pair were ever duplicated.  Re-running the cell simply
    # overwrites the `<col>_norm` column.
    data[f"{col}_norm"] = data.groupby('conflict')[col].transform(
        lambda values: scaler.fit_transform(values.to_numpy().reshape(-1, 1)).ravel()
    )

bilateral_migration
gdp_millions
population
remittances


In [9]:
# Rows belonging to the Ukraine conflict.
ukr = data.loc[data['conflict'].eq('Ukraine')]

In [10]:
# Training set: rows not flagged as Ukraine (Ukraine != 1) that also have
# touching == 1.  The model is fitted on these rows only.
is_not_ukraine = data["Ukraine"].ne(1)
is_touching = data['touching'].eq(1)
withoutUkrainData = data[is_not_ukraine & is_touching]

In [11]:
# Target variable for the regression: each training row's share of its conflict.
y = withoutUkrainData.loc[:, 'pct_tot']

In [12]:
# Independent variables fed to the regression.
# Candidate columns that are currently switched off:
#   bilateral_migration_norm, gdp_per_cap, migrants_per_cap, population_norm,
#   v2xeg_eqdr, same_language, migrant_ratio, remittances_norm
features_cols = ['gdp_millions_norm', 'v2x_libdem']
features_normalized = withoutUkrainData.loc[:, features_cols]

In [13]:
# Show the selected feature values for the Ukraine rows, labelled by country.
ukr[['country', *features_cols]]

Unnamed: 0,country,gdp_millions_norm,v2x_libdem
45,Hungary,0.262177,0.362
46,Moldova,0.0,0.615
47,Poland,1.0,0.413
48,Romania,0.427543,0.644
49,Slovakia,0.162305,0.769


###   We also tried these additional features in the model; however, none of them had a low enough p-value (i.e. none was statistically significant) to be included in the final model:
 1. "normalized_qrdp"
 2. "same_language"
 3. "touching"
 4. "normalized_lib"
 5. "normalized_pop"
 6. "normalized_migr_ratio"
 7. "normalized_gdp"
 8. "normalized_export_trade"
 9. "normalized_remittances"
 

In [14]:
# Run the linear regression. 
import statsmodels.api as sm

In [15]:
# Ordinary least squares of the target on the selected features.  The feature
# frame is passed as-is; no constant column is added here.
results = sm.OLS(endog=y, exog=features_normalized).fit()

In [16]:
# Display the fitted model's summary (fit statistics and the coefficient table).
results.summary()

0,1,2,3
Dep. Variable:,pct_tot,R-squared (uncentered):,0.552
Model:,OLS,Adj. R-squared (uncentered):,0.528
Method:,Least Squares,F-statistic:,22.83
Date:,"Fri, 15 Apr 2022",Prob (F-statistic):,3.49e-07
Time:,00:33:13,Log-Likelihood:,9.79
No. Observations:,39,AIC:,-15.58
Df Residuals:,37,BIC:,-12.25
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
gdp_millions_norm,0.2191,0.075,2.909,0.006,0.067,0.372
v2x_libdem,0.4174,0.150,2.781,0.008,0.113,0.722

0,1,2,3
Omnibus:,4.794,Durbin-Watson:,2.381
Prob(Omnibus):,0.091,Jarque-Bera (JB):,4.579
Skew:,0.279,Prob(JB):,0.101
Kurtosis:,4.583,Cond. No.,2.92


In [17]:
# Score every row of `data` with the fitted model and keep the prediction
# alongside the observed share.
features_to_predict = data[features_cols]
shares = results.predict(features_to_predict)
data['predicted_shares'] = shares

# Pull the Ukraine rows with a single .loc selection (row mask + column list)
# rather than chained indexing.
ukr_results = data.loc[data['Ukraine'] == 1, ['country', 'pct_tot', 'predicted_shares']]

# Make sure the output folder exists so the export also works on a fresh checkout.
os.makedirs('outputs', exist_ok=True)
ukr_results.to_csv('outputs/ukraine_model_results.csv', index=False)

In [18]:
# Display the Ukraine rows: observed share (pct_tot) next to the model's predicted share.
ukr_results

Unnamed: 0,country,pct_tot,predicted_shares
45,Hungary,0.094222,0.208548
46,Moldova,0.093682,0.256713
47,Poland,0.58641,0.39149
48,Romania,0.154561,0.362491
49,Slovakia,0.071125,0.356556
