In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the prepared refugee dataset and exclude the Russian Federation rows.
data = pd.read_csv('refugee_data/refugee_data_final.csv')
# .copy() gives an independent frame: later cells add columns to `data`, and
# assigning into a boolean-filtered selection triggers SettingWithCopyWarning.
data = data[data['country'] != 'Russian Federation'].copy()

In [None]:
# Recompute each row's share of its conflict's total individuals; the totals
# are taken over the rows currently in `data` (i.e. after the Russia filter).
conflict_totals = data.groupby('conflict')['individualPerCountry'].transform('sum')
data['pct_tot'] = data['individualPerCountry'] / conflict_totals

In [None]:
# Bilateral migration divided by population (per-capita value).
data['bilateral_migration_percap'] = data['bilateral_migration'].div(data['population'])

In [None]:
# GDP is stored in millions: multiply by 1,000,000 before dividing by population.
gdp_absolute = data['gdp_millions'] * 1_000_000
data['gdp_per_cap'] = gdp_absolute / data['population']

In [None]:
# Recorded migrants divided by population (per-capita value).
# NOTE(review): the column name is spelled 'total_recored_migrants' — presumably
# this matches the CSV header; confirm before renaming anything.
data['migrants_per_cap'] = data['total_recored_migrants'].div(data['population'])

In [None]:
# Columns that get a per-conflict min-max scaled "<name>_norm" companion column.
cols_to_scale = [
    'historic_gdp_millions',
    'population',
    'remittances',
]

In [None]:
# Min-max scale each column in `cols_to_scale` separately within every conflict
# and store the result next to the raw column as "<col>_norm".
#
# The values are written straight back onto `data` with groupby().transform()
# instead of building a side table with DataFrame.append (removed in pandas 2.0)
# and merging it back on (country, conflict). Compared with the merge, this
# cannot duplicate rows when a (country, conflict) pair repeats, keeps the
# existing row order and index of `data`, and can be re-run without producing
# "_x"/"_y" suffixed columns.
scaler = MinMaxScaler()


def _minmax_within_group(values):
    """Return one conflict group's values min-max scaled, aligned to its index.

    The shared scaler is re-fitted on every call, so each group is scaled
    using only its own minimum and maximum.
    """
    scaled = scaler.fit_transform(values.to_numpy().reshape(-1, 1))
    return pd.Series(scaled.ravel(), index=values.index)


for col in cols_to_scale:
    print(col)
    data[f"{col}_norm"] = data.groupby('conflict')[col].transform(_minmax_within_group)

In [None]:
# Rows whose 'conflict' value is 'Ukraine'.
ukr = data.loc[data['conflict'].eq('Ukraine')]

In [None]:
# Training subset: the model is fitted without Ukraine, so keep only rows where
# the 'Ukraine' flag is not 1 and 'touching' equals 1.
not_ukraine = data["Ukraine"] != 1
is_touching = data['touching'] == 1
withoutUkrainData = data[not_ukraine & is_touching]

In [None]:
# Regression target: share of the conflict total ('pct_tot') on the training rows.
y = withoutUkrainData.loc[:, 'pct_tot']

In [None]:
# Independent variables used by the regression.
# Candidate columns that are currently switched off:
#   gdp_per_cap_historic_norm, gdp_millions_norm, bilateral_migration_norm,
#   gdp_per_cap, migrants_per_cap, population_norm, v2xeg_eqdr,
#   same_language, migrant_ratio, remittances_norm
features_cols = ["historic_gdp_millions_norm", "v2x_libdem"]
# Design matrix: the selected feature columns of the training rows.
features_normalized = withoutUkrainData.loc[:, features_cols]

In [None]:
# Show the country plus the model features for the Ukraine rows.
ukr[['country', *features_cols]]

### We also tried the following additional features in the model; however, none of them was statistically significant (their p-values were too high), so they were left out of the final model:
 1. "normalized_qrdp"
 2. "same_language"
 3. "touching"
 4. "normalized_lib"
 5. "normalized_pop"
 6. "normalized_migr_ratio"
 7. "normalized_gdp"
 8. "normalized_export_trade"
 9. "normalized_remittances"
 

In [None]:
# Run the linear regression. 
import statsmodels.api as sm

In [None]:
# Fit ordinary least squares of the target on the selected features.
# No constant column is added here, so the model is fitted without an intercept.
results = sm.OLS(endog=y, exog=features_normalized).fit()

In [None]:
# Display the fitted model's summary as this cell's output.
results.summary()

In [None]:
# Predict shares for every row of `data` with the fitted model and store them
# in a new 'predicted_shares' column.
features_to_predict = data[features_cols]
shares = results.predict(features_to_predict)
data['predicted_shares'] = shares

# Keep actual vs. predicted share for the rows flagged Ukraine == 1.
ukr_results = data[data['Ukraine']==1][['country','pct_tot','predicted_shares']]

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (otherwise a fresh checkout fails here).
output_path = Path('outputs/ukraine_model_results.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
ukr_results.to_csv(output_path, index=False)

In [None]:
ukr_results