In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sb
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
# from sklearn.kernel_ridge import KernelRidge
from sklearn import metrics
%matplotlib inline

In [2]:
origins = ["DR_Congo", "Afghanistan", "Syria", "Myanmar", "Sudan"]
destinations = ["USA", "UK", "France", "Canada", "Italy", "Germany"]

org_codes = ["COD", "AFG", "SYR", "MMR", "SDN"]
dest_codes = ["USA", "GBR", "FRA", "CAN", "ITA","DEU"]

years = [2000 + i for i in range(19)]

features = [
 'applied',
 'accepted',
 'Rejected',
 'decisions',
 'month_number',
 'month_id',
 'deaths',
 'last_month',
 'two_months_ago',
 'distance']

responses = ['Value', 'next_month', 'two_months_later']

all_variables = features + responses

months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"]

combos = []
for year in years:
    for month in months:
        combos.append((month, year))

In [3]:
coordinates = {}
coordinates["Syria"] = (34.8, 38.997)
coordinates["DR_Congo"] = (-4.0383, 21.759)
coordinates["Afghanistan"] = (33.939, 67.71)
coordinates["Sudan"] = (12.863, 30.218)
coordinates["Myanmar"] = (21.9162, 95.956)

coordinates["USA"] = (37.09, -95.713)
coordinates["UK"] = (55.378, -3.436)
coordinates["France"] = (46.228, 2.214)
coordinates["Germany"] = (51.166, 10.4515)
coordinates["Italy"] = (41.872, 12.5674)
coordinates["Canada"] = (56.1304, -106.3468)


dest_longs = [coordinates[dest][1] for dest in destinations]
dest_lats = [coordinates[dest][0] for dest in destinations]

org_longs = [coordinates[org][1] for org in origins]
org_lats = [coordinates[org][0] for org in origins]

In [4]:
def get_metrics(y_test, y_pred):

    """Gets the metrics of the fit"""

    if min(y_pred) < 0:
#         print(min(y_pred))
        y_pred += abs(min(y_pred))
    r2 = metrics.r2_score(y_test, y_pred)
    r2 = r2.__round__(4)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    rmse = rmse.__round__(4)

    return r2, rmse

In [5]:
def make_new_features(df):
    
    '''Makes new features for grouped dataframe'''
    
    for month in months:
        df.ix[df.Month.str.contains(month), ['Month']] = month
    month_ids = []
    d_lats = []
    d_longs = []
    o_lats = []
    o_longs = []
    for index, row in df.iterrows():
        month = row.Month
        month_ids.append(months.index(month) + 1)
        d_lats.append(coordinates[row.destination][0])
        o_lats.append(coordinates[row.Origin][0])
        d_longs.append(coordinates[row.destination][1])
        o_longs.append(coordinates[row.Origin][1])
    df['month_id'] = month_ids
    df['month_number'] =12*(df['Year']-2000) + df['month_id']
    df['dest_lat'] = d_lats
    df['dest_long'] = d_longs
    df['org_lat'] = o_lats
    df['org_long'] = o_longs
    df = df.sort_values(by='month_number')
    
    return df

In [6]:
def get_distance(o_coords, d_coords, length):
    '''Gets the distance between two countries using Havrsine formula'''
    # Earth's radius in km
    R = 6371.
    
    d_lat, d_long = np.deg2rad(d_coords[0]), np.deg2rad(d_coords[1])
    o_lat, o_long = np.deg2rad(o_coords[0]), np.deg2rad(o_coords[1])
    
    a = (np.sin((d_lat - o_lat)/2))**2 + \
        np.cos(d_lat)*np.cos(o_lat)*(np.sin((d_long - o_long)/2))**2
        
    c = 2*np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = R*c
    
    return [d for i in range(length)]

### Get data

In [7]:
pairs = []
for origin in origins:
    for destination in destinations:
        pairs.append((origin, destination))

# Read in data
groups = {}
for pair in pairs:
    groups[pair] = pd.read_csv('./data/grouped_' + str(pair) + '.csv', skipinitialspace=True)
    groups[pair] = make_new_features(groups[pair])
    groups[pair]['distance'] = get_distance(coordinates[pair[1]], coordinates[pair[0]], groups[pair].shape[0])
    groups[pair] = groups[pair][all_variables]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


### Set up models and metrics

In [8]:
models = {'LinearRegression': LinearRegression, 'BayesianRidge': BayesianRidge, \
          'RandomForestRegressor': RandomForestRegressor, 'KNeighborsRegressor': KNeighborsRegressor}#, 'MLPRegressor': MLPRegressor}
r2s = {}
rmses = {}

# fit intercept, n_interations, n_estimators, num_neighbors, architecture
arguments = {'LinearRegression': True, 'BayesianRidge': 300, 'RandomForestRegressor': 10,'KNeighborsRegressor': 6}
            # , 'MLPRegressor': (50, 40, 35)}

# Make metric dictionaries
r2s = {}
rmses = {}
for y in responses:
    r2s[y] = {}
    rmses[y] = {}

### Run models

In [14]:
for pair in pairs:
    for model in models:
        global this_model
        this_model = models[model]
        X = groups[pair][features]
        X_train = X.iloc[:int(0.8*X.shape[0]), :]
        X_test = X.iloc[int(0.8*X.shape[0]):, :]
        for response in responses:
            y = np.log10(groups[pair][response])
            y_train = y.iloc[:int(0.8*y.shape[0])]
            y_test = y.iloc[int(0.8*y.shape[0]):]
            regressor = globals()[str(model)](arguments[model])
            fit = regressor.fit(X_train, y_train)
            y_pred = fit.predict(X_test)
            r2, rmse = get_metrics(y_test, y_pred)
            r2s[response][model] = r2
            rmses[response][model] = rmse

In [19]:
for key in r2s:
    for model in r2s[key]:
        print('model = %s, response = %s, R^2 = %.4f' % (model, key, r2s[key][model]))

model = LinearRegression, response = Value, R^2 = -65.6149
model = BayesianRidge, response = Value, R^2 = -43.2771
model = RandomForestRegressor, response = Value, R^2 = -1.2760
model = KNeighborsRegressor, response = Value, R^2 = -3.6101
model = LinearRegression, response = next_month, R^2 = -24.9477
model = BayesianRidge, response = next_month, R^2 = -16.3529
model = RandomForestRegressor, response = next_month, R^2 = -2.4440
model = KNeighborsRegressor, response = next_month, R^2 = -2.7374
model = LinearRegression, response = two_months_later, R^2 = -30.0853
model = BayesianRidge, response = two_months_later, R^2 = -26.6572
model = RandomForestRegressor, response = two_months_later, R^2 = -1.3227
model = KNeighborsRegressor, response = two_months_later, R^2 = -1.9690


In [20]:
for key in rmses:
    for model in rmses[key]:
        print('model = %s, response = %s, R^2 = %.4f' % (model, key, rmses[key][model]))

model = LinearRegression, response = Value, R^2 = 2.5261
model = BayesianRidge, response = Value, R^2 = 2.0595
model = RandomForestRegressor, response = Value, R^2 = 0.4669
model = KNeighborsRegressor, response = Value, R^2 = 0.6645
model = LinearRegression, response = next_month, R^2 = 1.5999
model = BayesianRidge, response = next_month, R^2 = 1.3084
model = RandomForestRegressor, response = next_month, R^2 = 0.5829
model = KNeighborsRegressor, response = next_month, R^2 = 0.6072
model = LinearRegression, response = two_months_later, R^2 = 1.8377
model = BayesianRidge, response = two_months_later, R^2 = 1.7334
model = RandomForestRegressor, response = two_months_later, R^2 = 0.5023
model = KNeighborsRegressor, response = two_months_later, R^2 = 0.5679
