In [105]:
import numpy as np
import csv
import math
import progressbar
import xgboostlib
import random

from sklearn.model_selection import cross_val_score

In [106]:
# load csv file
data_pathname = "./dataset/ASR2016-clean.csv"
data_file = open(data_pathname, 'r')
reader = csv.reader(data_file)
headers = next(reader, None) # skip headers in csv

# northeast, south, midwest, west
X = [[], [], [], []] 
y = [[], [], [], []]

# load regional data and labels
for row in reader:
    row = [float(item) if item !=' ' else -1.0 for item in row]
    
    region = int(row[0] - 1)
    
    label = row[-1] == float('1') # is employed
    features = row[1:-1] # 32 features
    
    X[region].append(features)
    y[region].append(label)

In [107]:
# split regions into train-test-validation sets
NUM_REGIONS = 4
PERCENT_TRAIN = 0.8

X_train = [[], [], [], []] 
X_test = [[], [], [], []] 
y_train = [[], [], [], []]
y_test = [[], [], [], []]

for region in range(NUM_REGIONS):
    num_training = int(PERCENT_TRAIN*len(y[region]))
    
    
    X_train[region] = np.asarray(X[region])[:num_training]
    X_test[region] = np.asarray(X[region])[num_training + 1:]
    
    y_train[region] = np.asarray(y[region])[:num_training]
    y_test[region] = np.asarray(y[region])[num_training + 1:]

In [None]:
# XGBoost Classifier (from https://github.com/eriklindernoren/ML-From-Scratch)
# train XGBoost neighbors classification on each region 
import matplotlib.pyplot as plt

NUM_SEARCH_ITERS = 10

for region in range(2, NUM_REGIONS):
    
    print("===============REGION: " + str(region) + "===============")
    best_accuracy = 0
    best_params = [None, None]
    
    for iter in range(NUM_SEARCH_ITERS):
        
        #hyperparams: random search
        learn_rate = random.uniform(0.01, 0.1)
        max_depth = random.randint(2,10) #default is 6
        #min_splt_loss = random.randint(0,5) #default is 0

        print("Iteration "+ str(iter))
        print("Hyperparameters: eta=" + str(learn_rate) + " max_depth="+str(max_depth))

        clfs = [[], [], [], []]
        
        #cross validation: 
        FOLDS = 5
        X_sec = np.array_split(X_train[region], FOLDS)
        y_sec = np.array_split(y_train[region], FOLDS) # breaks if percent train <= 0.5?

        #do cross validaion FOLDS=5 times. print accuracy for each. 
        total_accuracy = 0
        print("Starting Cross-Validation for region "+str(region))
        for i in range(FOLDS):
            #format the data
            cv_y_train = np.array(y_sec[:i] + y_sec[i+1:]).reshape(-1)       
            cv_X_train = np.array(X_sec[:i] + X_sec[i+1:])
            cv_X_train = cv_X_train.reshape(-1, cv_X_train.shape[-1])

            cv_y_test = np.array(y_sec[i])
            cv_X_test = np.array(X_sec[i])

            #training  
            cv_y_train = cv_y_train.astype(int)
            clfs[region] = xgboostlib.XGBoost(\
                learning_rate = learn_rate, max_depth = max_depth)#todo: add more params
            clfs[region].fit(cv_X_train, cv_y_train)

            #testing
            def accuracy_score(y_true, y_pred):
                accuracy = np.sum(y_true == y_pred, axis=0) / len(y_true)
                return accuracy

            y_logits = [[], [], [], []]
            y_pred = clfs[region].predict(cv_X_test)

            y_logits[region] = clfs[region].logits[:, 1]

            accuracy = accuracy_score(cv_y_test, y_pred)
            total_accuracy += accuracy

            print("Score for fold "+ str(i) + ": " + str(accuracy))

        avg_accuracy = total_accuracy / FOLDS
        print("Average accuracy for region "+ str(region) + ": " + str(avg_accuracy))

        if avg_accuracy > best_accuracy:
            best_accuracy = avg_accuracy
            best_params = [learn_rate, max_depth]

    print("Region: " + str(region) + ", Best accuracy: " + str(best_accuracy)  + ", with learning rate: " \
      + str(best_params[0])  + ", and max depth: " + str(best_params[1]))


Training:   0% [                                               ] ETA:  --:--:--

Iteration 0
Hyperparameters: eta=0.03307843342795409 max_depth=10
Starting Cross-Validation for region 2


Training: 100% [------------------------------------------------] Time: 0:07:13
Training:   0% [                                               ] ETA:  --:--:--

Score for fold 0: 0.9230769230769231


Training: 100% [------------------------------------------------] Time: 0:06:57
Training:   0% [                                               ] ETA:  --:--:--

Score for fold 1: 0.9321266968325792


Training: 100% [------------------------------------------------] Time: 0:07:13
Training:   0% [                                               ] ETA:  --:--:--

Score for fold 2: 0.9366515837104072


Training:  12% [------                                          ] ETA:  0:06:01

In [127]:

# for each region, use the best hyperparameters
# plot loss over training iterations (not num iters)

# train again, using the best hyperparamters 
# keep the logits --> so we can input into matching algorithm, 
# for now we can simply just pick the highest guess across each region

Region: 0, Best accuracy: 0.9109243697478991, with learning rate: 0.06838765161547133, and max depth: 2


In [None]:
#