In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sb
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
# from sklearn.kernel_ridge import KernelRidge
from sklearn import metrics
%matplotlib inline

In [2]:
origins = ["DR_Congo", "Afghanistan", "Syria", "Myanmar", "Sudan"]
destinations = ["USA", "UK", "France", "Canada", "Italy", "Germany"]

org_codes = ["COD", "AFG", "SYR", "MMR", "SDN"]
dest_codes = ["USA", "GBR", "FRA", "CAN", "ITA","DEU"]

years = [2000 + i for i in range(19)]

features = [
 'applied',
 'accepted',
 'Rejected',
 'decisions',
 'month_number',
 'month_id',
 'deaths',
 'last_month',
 'two_months_ago',
 'distance']

responses = ['Value', 'next_month', 'two_months_later']

all_variables = features + responses

months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"]

# combos = []
# for year in years:
#     for month in months:
#         combos.append((month, year))

In [3]:
coordinates = {}
coordinates["Syria"] = (34.8, 38.997)
coordinates["DR_Congo"] = (-4.0383, 21.759)
coordinates["Afghanistan"] = (33.939, 67.71)
coordinates["Sudan"] = (12.863, 30.218)
coordinates["Myanmar"] = (21.9162, 95.956)

coordinates["USA"] = (37.09, -95.713)
coordinates["UK"] = (55.378, -3.436)
coordinates["France"] = (46.228, 2.214)
coordinates["Germany"] = (51.166, 10.4515)
coordinates["Italy"] = (41.872, 12.5674)
coordinates["Canada"] = (56.1304, -106.3468)


dest_longs = [coordinates[dest][1] for dest in destinations]
dest_lats = [coordinates[dest][0] for dest in destinations]

org_longs = [coordinates[org][1] for org in origins]
org_lats = [coordinates[org][0] for org in origins]

In [4]:
def get_metrics(y_test, y_pred):

    """Gets the metrics of the fit"""

    if min(y_pred) < 0:
#         print(min(y_pred))
        y_pred += abs(min(y_pred))
    r2 = metrics.r2_score(y_test, y_pred)
    r2 = r2.__round__(4)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    rmse = rmse.__round__(4)

    return r2, rmse

In [5]:
def make_new_features(df):
    
    '''Makes new features for grouped dataframe'''
    
    for month in months:
        df.ix[df.Month.str.contains(month), ['Month']] = month
    month_ids = []
    d_lats = []
    d_longs = []
    o_lats = []
    o_longs = []
    for index, row in df.iterrows():
        month = row.Month
        month_ids.append(months.index(month) + 1)
        d_lats.append(coordinates[row.destination][0])
        o_lats.append(coordinates[row.Origin][0])
        d_longs.append(coordinates[row.destination][1])
        o_longs.append(coordinates[row.Origin][1])
    df['month_id'] = month_ids
    df['month_number'] =12*(df['Year']-2000) + df['month_id']
    df['dest_lat'] = d_lats
    df['dest_long'] = d_longs
    df['org_lat'] = o_lats
    df['org_long'] = o_longs
    df = df.sort_values(by='month_number')
    
    return df

In [6]:
def get_distance(o_coords, d_coords, length):
    '''Gets the distance between two countries using Havrsine formula'''
    # Earth's radius in km
    R = 6371.
    
    d_lat, d_long = np.deg2rad(d_coords[0]), np.deg2rad(d_coords[1])
    o_lat, o_long = np.deg2rad(o_coords[0]), np.deg2rad(o_coords[1])
    
    a = (np.sin((d_lat - o_lat)/2))**2 + \
        np.cos(d_lat)*np.cos(o_lat)*(np.sin((d_long - o_long)/2))**2
        
    c = 2*np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = R*c
    
    return [d for i in range(length)]

### Get data

In [7]:
pairs = []
for origin in origins:
    for destination in destinations:
        pairs.append((origin, destination))

triples = []
for pair in pairs:
    for response in responses:
        triples.append((pair[0], pair[1], response))
        
# Read in data
groups = {}
for pair in pairs:
    groups[pair] = pd.read_csv('./data/grouped_' + str(pair) + '.csv', skipinitialspace=True)
    groups[pair] = make_new_features(groups[pair])
    groups[pair]['distance'] = get_distance(coordinates[pair[1]], coordinates[pair[0]], groups[pair].shape[0])
    groups[pair] = groups[pair][all_variables]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


### Set up models and metrics

In [8]:
models = {'LinearRegression': LinearRegression, 'BayesianRidge': BayesianRidge, \
          'RandomForestRegressor': RandomForestRegressor, 'KNeighborsRegressor': KNeighborsRegressor}#, 'MLPRegressor': MLPRegressor}
r2s = {}
rmses = {}

# fit intercept, n_interations, n_estimators, num_neighbors, architecture
arguments = {'LinearRegression': True, 'BayesianRidge': 300, 'RandomForestRegressor': 10,'KNeighborsRegressor': 6}
            # , 'MLPRegressor': (50, 40, 35)}

# Make metric dictionaries
r2s = {}
rmses = {}
predictions = {}
trues = {}
results = {}
for y in responses:
    r2s[y] = {}
    rmses[y] = {}
    predictions[y] = {}
    trues[y] = {}
    results[y] = {}
    for pair in pairs:
        r2s[y][pair] = {}
        rmses[y][pair] = {}
        predictions[y][pair] = {}
        trues[y][pair] = {}
        results[y][pair] = {}

average_rmses = {}
for model in models:
    average_rmses[model] = {}
    for response in responses:
        average_rmses[model][response] = 0

### Run models

In [32]:
for pair in pairs:
    for model in models:
        global this_model
        this_model = models[model]
        X = groups[pair][features]
        X_train = X.iloc[:int(0.8*X.shape[0]), :]
        X_test = X.iloc[int(0.8*X.shape[0]):, :]
        for response in responses:
            y = np.log10(groups[pair][response])
            y_train = y.iloc[:int(0.8*y.shape[0])]
            y_test = y.iloc[int(0.8*y.shape[0]):]
            regressor = globals()[str(model)](arguments[model])
            fit = regressor.fit(X_train, y_train)
            y_pred = fit.predict(X_test)
            r2, rmse = get_metrics(y_test, y_pred)
            r2s[response][pair][model] = r2
            rmses[response][pair][model] = rmse
            predictions[response][pair][model] = y_pred
            trues[response][pair][model] = y_test
            average_rmses[model][response] += rmse
            results[response][pair][model] = [y_pred, y_test]

In [10]:
for pair in pairs:
    for response in r2s:
        for model in r2s[response][pair]:
#             print('model = %s, response = %s, pair = %s, R^2 = %.4f' % (model, key, str(pair), r2s[response][pair][model]))
            pass

In [11]:
for pair in pairs:
    for response in rmses:
        for model in rmses[response][pair]:
#             print('model = %s, response = %s, pair = %s, RMSE = %.4f' % (model, key, str(pair), rmses[response][pair][model]))
            pass

In [33]:
min_avg = 1e6
min_model = ''
for model in average_rmses:
    for response in responses:
        average_rmses[model][response] = np.mean(average_rmses[model][response])
        print('Average RMSE for %s for response %s is %.4f' %(model, response, average_rmses[model][response]))
        if average_rmses[model][response] < min_avg:
            min_avg = average_rmses[model][response]
            min_model = model
print('Minimum RMSE average is %.4f using %s' %(min_avg, min_model))

Average RMSE for LinearRegression for response Value is 652.7686
Average RMSE for LinearRegression for response next_month is 602.5893
Average RMSE for LinearRegression for response two_months_later is 560.5713
Average RMSE for BayesianRidge for response Value is 588.6495
Average RMSE for BayesianRidge for response next_month is 492.6234
Average RMSE for BayesianRidge for response two_months_later is 458.3799
Average RMSE for RandomForestRegressor for response Value is 114.2121
Average RMSE for RandomForestRegressor for response next_month is 105.8065
Average RMSE for RandomForestRegressor for response two_months_later is 102.4441
Average RMSE for KNeighborsRegressor for response Value is 180.6672
Average RMSE for KNeighborsRegressor for response next_month is 179.4996
Average RMSE for KNeighborsRegressor for response two_months_later is 178.8261
Minimum RMSE average is 102.4441 using RandomForestRegressor


### Visualize results

In [25]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.graph_objs as go

import geopandas
from mpl_toolkits.basemap import Basemap
import plotly
import dash
import plotly.offline as ply
from plotly.graph_objs import *
from scipy.io import netcdf
import scipy.stats as stat

In [34]:
quads = []
for pair in pairs:
    for model in models:
        for response in responses:
            quads.append((pair[0], pair[1], response, model, 0))
            quads.append((pair[0], pair[1], response, model, 1))

TypeError: unhashable type: 'slice'

In [38]:
quads[:5]

[('DR_Congo', 'USA', 'Value', 'LinearRegression', 0),
 ('DR_Congo', 'USA', 'Value', 'LinearRegression', 1),
 ('DR_Congo', 'USA', 'next_month', 'LinearRegression', 0),
 ('DR_Congo', 'USA', 'next_month', 'LinearRegression', 1),
 ('DR_Congo', 'USA', 'two_months_later', 'LinearRegression', 0)]

In [35]:
fig = go.FigureWidget(
    data=[dict(
        x = groups[(quad[0], quad[1])]['month_number'][int(0.8*groups[(quad[0], quad[1])].shape[0]):,],
        y = results[quad[2]][(quad[0], quad[1])][quad[3]][quad[4]],
        name = str(quad), 
        mode = 'markers',
        opacity = 0.6
    ) for quad in quads
    ]
)
years = [2000 + i for i in range(18)]
marker_places= [12*i for i in range(18)]
tick_labs = [str(year) for year in years]
fig.layout.hovermode = 'closest'
fig.layout.yaxis.title = 'Refugees Taken In'
# fig.layout.xaxis.title = 'Month'
fig.layout.xaxis.ticktext = tick_labs
fig.layout.xaxis.tickvals = marker_places

In [36]:
from ipywidgets import interact

@interact(dest = destinations, org = origins, time = responses, model = list(models.keys()), data_type = ['Predicted', 'True']) # 0 = pred, 1 = true
def update_arguments(dest=['Canada'], org = ["Syria"], time = ['Value'], model = ["LinearRegression"], data_type = ['Predicted']):
    
    if data_type == 'Predicted':
        data_index = 0
    else:
        data_index = 1
    
    for i in range(len(fig.data)):
        this_pair = (org, dest)
        fig.data[i].x = groups[this_pair]['month_number'][int(0.8*groups[this_pair].shape[0]):,],
        fig.data[i].y = results[time][this_pair][model][data_index]\
                         + \
                        np.random.rand(len(results[time][this_pair][model][data_index]))
        if time[0] == 'Value':
            time_frame = 'This month'
        elif time[0] == 'next_month':
            time_frame = 'Next Month'
        else:
            time_frame = 'Two Months From Now'
    
    fig.layout.title = 'Refugees taken in by %s from %s predicting %s, %s values ' %(dest, org, time_frame, data_type)
fig

interactive(children=(Dropdown(description='dest', options=('USA', 'UK', 'France', 'Canada', 'Italy', 'Germany…

FigureWidget({
    'data': [{'mode': 'markers',
              'name': "('DR_Congo', 'USA', 'Value', 'LinearReg…

In [19]:
for x,y in [1,2,3], [4,5]:
    print(x + y)

ValueError: too many values to unpack (expected 2)

In [None]:
# results['Value'][pair][model][0]
this_pair = pair
time= 'Value'
len(results[time][this_pair][model][0])
data_index = 0

In [None]:
len(groups[this_pair]['month_number'][int(0.8*groups[this_pair].shape[0]):,])

In [None]:
len(groups[this_pair]['month_number'][int(0.8*groups[this_pair].shape[0]):,])