In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the refugee dataset and keep every row whose country is not the
# Russian Federation.
data = pd.read_csv('refugee_data/refugee_data_final.csv').loc[
    lambda frame: frame['country'].ne('Russian Federation')
]

In [3]:
# Recompute each row's share of its conflict's total individualPerCountry,
# so the shares reflect the frame after Russia has been dropped.
data['pct_tot'] = data['individualPerCountry'].div(
    data.groupby('conflict')['individualPerCountry'].transform('sum')
)

In [4]:
# Bilateral migration divided by population (per-capita figure).
data['bilateral_migration_percap'] = data['bilateral_migration'].div(data['population'])

In [5]:
# GDP per person: gdp_millions is scaled by 1,000,000 before dividing by
# population.
data['gdp_per_cap'] = data['gdp_millions'].mul(1000000).div(data['population'])

In [6]:
# Recorded migrants divided by population.
# NOTE(review): 'total_recored_migrants' is presumably spelled this way in the
# CSV header -- confirm before renaming.
data['migrants_per_cap'] = data['total_recored_migrants'].div(data['population'])

In [7]:
# Columns that get a min-max scaled "<name>_norm" companion column.
cols_to_scale = [
    'bilateral_migration',
    'gdp_millions',
    'population',
    'remittances',
]

In [8]:
# Min-max scale every column in cols_to_scale to [0, 1] separately within each
# conflict and store the result as "<col>_norm".
scaler = MinMaxScaler()

# Reproduce the row layout the previous per-group rebuild produced: rows with
# no conflict label are dropped, rows are ordered by conflict (stable sort, so
# the original order is kept inside each conflict) and the index is renumbered.
data = (
    data.dropna(subset=['conflict'])
        .sort_values('conflict', kind='mergesort')
        .reset_index(drop=True)
)

for col in cols_to_scale:
    print(col)
    # transform() returns values aligned to data's own rows, so there is no
    # need for DataFrame.append (removed in pandas 2.0) nor for a merge on
    # (country, conflict); that merge multiplied rows whenever a
    # (country, conflict) pair occurred more than once.
    data[f"{col}_norm"] = data.groupby('conflict')[col].transform(
        lambda group_values: scaler.fit_transform(
            group_values.to_numpy().reshape(-1, 1)
        ).ravel()
    )

bilateral_migration
gdp_millions
population
remittances


In [9]:
# Rows belonging to the Ukraine conflict.
ukr = data.loc[data['conflict'].eq('Ukraine')]

In [10]:
# Training set: exclude rows flagged Ukraine == 1 and keep only rows with
# touching == 1, so the model is fitted without any Ukraine observations.
withoutUkrainData = data.loc[data["Ukraine"].ne(1) & data['touching'].eq(1)]

In [11]:
# Target variable: the share of the conflict total (pct_tot) on the training rows.
y = withoutUkrainData.loc[:, 'pct_tot']

In [12]:
# Independent variables (regressors) used by the model.
features_cols = ['gdp_millions_norm', 'v2x_libdem']
# Regressor matrix restricted to the training rows.
features_normalized = withoutUkrainData.loc[:, features_cols]

In [13]:
# Show the model inputs for the Ukraine rows next to the country name.
ukr[['country', *features_cols]]

Unnamed: 0,country,gdp_millions_norm,v2x_libdem
46,Hungary,0.262177,0.362
47,Moldova,0.0,0.615
48,Poland,1.0,0.413
49,Romania,0.427543,0.644
50,Slovakia,0.162305,0.769


In [14]:
# Run the linear regression. 
import statsmodels.api as sm

In [15]:
# Fit ordinary least squares of y on the selected features. No constant column
# is part of features_normalized, so the model is fitted without an intercept.
results = sm.OLS(endog=y, exog=features_normalized).fit()

In [16]:
# Display the fitted model's summary tables (coefficients and fit statistics).
results.summary()

0,1,2,3
Dep. Variable:,pct_tot,R-squared (uncentered):,0.568
Model:,OLS,Adj. R-squared (uncentered):,0.545
Method:,Least Squares,F-statistic:,24.96
Date:,"Fri, 15 Apr 2022",Prob (F-statistic):,1.2e-07
Time:,09:49:50,Log-Likelihood:,11.048
No. Observations:,40,AIC:,-18.1
Df Residuals:,38,BIC:,-14.72
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
gdp_millions_norm,0.2344,0.073,3.194,0.003,0.086,0.383
v2x_libdem,0.4026,0.146,2.753,0.009,0.107,0.699

0,1,2,3
Omnibus:,4.58,Durbin-Watson:,2.316
Prob(Omnibus):,0.101,Jarque-Bera (JB):,4.491
Skew:,0.215,Prob(JB):,0.106
Kurtosis:,4.584,Cond. No.,2.92


In [17]:
# Score every row of data with the fitted model and keep the prediction
# alongside the observed share.
features_to_predict = data.loc[:, features_cols]
shares = results.predict(features_to_predict)
data['predicted_shares'] = shares

# Extract the rows flagged Ukraine == 1 (never used for fitting) and export
# observed versus predicted shares.
ukr_results = data.loc[data['Ukraine'].eq(1), ['country', 'pct_tot', 'predicted_shares']]
ukr_results.to_csv('outputs/ukraine_model_results.csv', index=False)

In [18]:
# Display observed (pct_tot) versus predicted shares for the Ukraine rows.
ukr_results

Unnamed: 0,country,pct_tot,predicted_shares
46,Hungary,0.094222,0.2072
47,Moldova,0.093682,0.247616
48,Poland,0.58641,0.400666
49,Romania,0.154561,0.3595
50,Slovakia,0.071125,0.347662
