In [1]:
import pandas as pd
import numpy as np
from astropy.time import Time
from scipy import interpolate
from scipy.stats import spearmanr
import matplotlib.pyplot as plot
from geopy.distance import distance as gdistance
from sklearn.preprocessing import StandardScaler as SCALER
import itertools
import seaborn as sns
import immigration_data
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate,GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, roc_auc_score
import xgboost as xgb
import get_best_xgb_model

In [2]:
import immigration_data
import importlib
importlib.reload(immigration_data)
imm = immigration_data.immigration_data(nyears_lookback = 1)
X,y,d = imm.get_training_data()
year_pred = imm.year_pred


We can not use standard cross validation as this is temporally ordered data. Below is a walk-forward cross validation scheme.

In [3]:
def CV_immigration_data(clf, X, y, year_pred):

    score = []
    year = 2004 + np.arange(14)
    for ye in year:
        ind_test = year_pred == ye
        ind_train = (year_pred < ye) 
        Xtrain = X[ind_train, :]
        ytrain = y[ind_train]
        Xtest = X[ind_test, :]
        ytest = y[ind_test]
        clf.fit(Xtrain, ytrain)
        ypred = clf.predict(Xtest)
        rr = roc_auc_score(ytest, ypred)
        score.append(rr)
    

    score_out = np.mean(score)
    
    return score_out


Here is the hyper parameter optimization code. Additional hyper parameters may be added to the space if desired. The results of this run are hard-coded into get_best_xgb_model.py

In [6]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval

def objective(para):
    # print(para['max_depth'],para['learning_rate'])
    clf = xgb.XGBClassifier(**para, objective='binary:logistic')

    testScore = CV_immigration_data(clf, X, y, year_pred)
    #print(testScore)
    return {'loss': -1 * testScore, 'status': STATUS_OK}
                                

trials = Trials()
space = {
            'learning_rate':    hp.choice('learning_rate',    np.arange(0.01, 0.21, 0.02)),
            'max_depth':        hp.choice('max_depth',        np.arange(3, 16, 1, dtype=int)),
            'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
            'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
            'subsample':        hp.uniform('subsample', 0.8, 1),
            'n_estimators':     hp.choice('n_estimators', np.arange(1,100,1)),
            'gamma' :           hp.choice('gamma', [0,0.5, 1,2.5, 5, 10]), 
            'alpha' :           hp.choice('alpha', [0, 1e-5, 1e-3, 1e-1, 1, 5, 10])}
result = fmin(fn = objective, space = space, algo = tpe.suggest,
                      trials=trials, max_evals=500)

100%|██████████| 500/500 [8:53:54<00:00, 76.81s/it, best loss: -0.6268685984779628]    


Below are the best parameters which are hard-coded into get_best_xgb_model.py:

In [4]:
        para = {'alpha': 5, 
                'colsample_bytree': 0.4, 
                'gamma': 10, 
                'learning_rate': 0.12999999999999998, 
                'max_depth': 7, 
                'min_child_weight': 5, 
                'n_estimators': 46, 
                'subsample': 0.8779937359366555}