In [1]:
import csv
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_validate

In [2]:
# Load the training features and targets, skipping each file's header row.
# The paths are handed straight to np.loadtxt so NumPy opens and closes the
# files itself; wrapping them in a bare open() left both handles unclosed.
X = np.loadtxt('data/X_train.csv', delimiter=',', skiprows=1)
y = np.loadtxt('data/y_train.csv', delimiter=',', skiprows=1)

# Number of datapoints N, input features D
N, D = X.shape

In [3]:
# One-hot encode the entering_host id (raw column 16) into 81 indicator columns.
# Ids receive slots in order of first appearance; `entering_host_ids` records
# that order, i.e. slot k belongs to entering_host_ids[k].
entering_host_ids = []
entering_host_vectors = np.zeros((N, 81))

for row, host_id in enumerate(X[:N, 16]):
    if host_id in entering_host_ids:
        slot = entering_host_ids.index(host_id)
    else:
        # First time this id is seen: it takes the next free slot.
        slot = len(entering_host_ids)
        entering_host_ids.append(host_id)
    entering_host_vectors[row, slot] = 1

In [4]:
# One-hot encode the summoning_host id (raw column 25) into 85 indicator columns.
# Ids receive slots in order of first appearance; `summoning_host_ids` records
# that order, i.e. slot k belongs to summoning_host_ids[k].
summoning_host_ids = []
summoning_host_vectors = np.zeros((N, 85))

for row, host_id in enumerate(X[:N, 25]):
    if host_id in summoning_host_ids:
        slot = summoning_host_ids.index(host_id)
    else:
        # First time this id is seen: it takes the next free slot.
        slot = len(summoning_host_ids)
        summoning_host_ids.append(host_id)
    summoning_host_vectors[row, slot] = 1

In [5]:
# Drop the columns that are not used as-is (same vals, missing vals, or already
# one-hot encoded above), then append both one-hot blocks on the right.
# NOTE(review): this rebinds X from itself, so the cell is not safe to re-run
# without reloading X first.
dropped_cols = [0, 1, 2, 5, 6, 14, 16, 23, 25]
X = np.concatenate(
    (np.delete(X, dropped_cols, axis=1), entering_host_vectors, summoning_host_vectors),
    axis=1,
)

In [6]:
# Replace missing values with medians

def replace_missing(col_idx):
    """Fill the -1 entries of column `col_idx` of the global X, in place.

    The fill value is the median of that column's entries that are not -1.
    """
    column = X[:, col_idx]
    is_missing = column == -1
    # The median is computed before any entry is overwritten.
    fill_value = np.median(column[~is_missing])
    X[is_missing, col_idx] = fill_value


for col_with_gaps in (7, 15):
    replace_missing(col_with_gaps)

In [25]:
# Hold out 20% of the rows for validation. The split is seeded so that the
# validation scores below can be reproduced on a fresh kernel.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

# Normalize y_train for each queue type
#
# Columns 2..6 of the feature matrix are treated as queue-type indicator flags.
# A row belongs to the first flag that equals 1; rows with no flag set are left
# unscaled. This cell deliberately does not rebind the global N, which earlier
# cells use as the length of the full training set.
queue_flags = X_train[:, 2:7] == 1
has_queue = queue_flags.any(axis=1)
queue_of_row = queue_flags.argmax(axis=1)  # index of the first set flag

# Defaults describe the identity transform, so a queue type with no training
# rows neither produces NaN statistics nor corrupts predictions for that queue
# when they are scaled back later.
means = [0.0 for _ in range(5)]
stds = [1.0 for _ in range(5)]

for q in range(5):
    in_queue = has_queue & (queue_of_row == q)
    if not in_queue.any():
        continue
    means[q] = np.mean(y_train[in_queue])
    spread = np.std(y_train[in_queue])
    # A constant-valued group has zero spread; dividing by it would put inf/NaN
    # into the targets, so such a group is only centered.
    stds[q] = spread if spread > 0 else 1.0
    y_train[in_queue] = (y_train[in_queue] - means[q]) / stds[q]

In [7]:
def _fit_and_predict(model, X_train, y_train, X_test):
    """Fit `model` on (X_train, y_train) and return its predictions for X_test."""
    model.fit(X_train, y_train)
    return model.predict(X_test)


def fit_predict(X_train, y_train, X_test, random_state=None):
    """Train a GradientBoostingRegressor and predict targets for X_test.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features passed to ``predict``.
    random_state : optional seed forwarded to the estimator. The default
        (None) keeps the estimator unseeded, as before.

    Returns
    -------
    The array returned by the fitted estimator's ``predict``.
    """
    gbr = GradientBoostingRegressor(random_state=random_state)
    return _fit_and_predict(gbr, X_train, y_train, X_test)


def fit_predict_rf(X_train, y_train, X_test, random_state=None):
    """Train a RandomForestRegressor and predict targets for X_test.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features passed to ``predict``.
    random_state : optional seed forwarded to the estimator. The default
        (None) keeps the estimator unseeded, as before.

    Returns
    -------
    The array returned by the fitted estimator's ``predict``.
    """
    rf = RandomForestRegressor(random_state=random_state)
    return _fit_and_predict(rf, X_train, y_train, X_test)

In [27]:
# Fit both models on the training split and predict the held-out split:
# gradient boosting first, then the random forest.
y_pred, y_pred_rf = (
    fit_predict(X_train, y_train, X_test),
    fit_predict_rf(X_train, y_train, X_test),
)



In [28]:
# Unnormalize y_pred (scale back to original)
#
# Each prediction is rescaled in place with the mean/std of the first queue
# flag (columns 2..6 of X_test) that equals 1; rows with no flag set are left
# untouched.
# NOTE(review): the update is in place, so running this cell twice rescales twice.

N = len(y_pred)

test_flags = X_test[:N, 2:7] == 1
test_has_queue = test_flags.any(axis=1)
test_queue = test_flags.argmax(axis=1)  # index of the first set flag

for q in range(5):
    rows = np.flatnonzero(test_has_queue & (test_queue == q))
    y_pred[rows] = y_pred[rows] * stds[q] + means[q]
    y_pred_rf[rows] = y_pred_rf[rows] * stds[q] + means[q]

In [29]:
# Validation MSE of the gradient-boosting predictions (original target scale)
mean_squared_error(y_true=y_test, y_pred=y_pred)

1562.3317933434073

In [30]:
# Validation MSE of the random-forest predictions (original target scale)
mean_squared_error(y_true=y_test, y_pred=y_pred_rf)

1195.0462731428913

In [31]:
# Blend the two models (70% gradient boosting, 30% random forest) and score it.
# NOTE(review): in the recorded run the random forest alone scored a lower MSE
# than this blend - the weights may be worth revisiting.
gbr_part = 0.7 * np.asarray(y_pred)
rf_part = 0.3 * np.asarray(y_pred_rf)
y_pred_avg = gbr_part + rf_part
mean_squared_error(y_true=y_test, y_pred=y_pred_avg)

1301.6930683794478

In [8]:
# For submission: retrain on every labelled row and predict the real test file.
#
# This cell does not modify `y`, the training one-hot arrays, `N`, or
# `replace_missing`, so it can be re-run (and earlier cells can be re-run after
# it) without corrupting their inputs.

# --- Normalize y for each queue type ---------------------------------------
# Columns 2..6 of X are treated as queue-type indicator flags. A row belongs to
# the first flag that equals 1; rows with no flag set are left unscaled.
train_flags = X[:, 2:7] == 1
train_has_queue = train_flags.any(axis=1)
train_queue = train_flags.argmax(axis=1)  # index of the first set flag

# Defaults describe the identity transform, so a queue type with no training
# rows does not yield NaN statistics.
means = [0.0 for _ in range(5)]
stds = [1.0 for _ in range(5)]

# Normalized targets live in their own array; `y` keeps the raw values.
y_norm = np.array(y, dtype=float)

for q in range(5):
    in_queue = train_has_queue & (train_queue == q)
    if not in_queue.any():
        continue
    means[q] = np.mean(y[in_queue])
    spread = np.std(y[in_queue])
    # A constant-valued group has zero spread; only center it instead of
    # dividing by zero.
    stds[q] = spread if spread > 0 else 1.0
    y_norm[in_queue] = (y[in_queue] - means[q]) / stds[q]


# --- Load and encode the test features -------------------------------------
# The path is passed directly so np.loadtxt opens and closes the file itself.
X_test_real = np.loadtxt('data/X_test.csv', delimiter=',', skiprows=1)
n_test = len(X_test_real)

# Slot lookup tables built from the id order learned on the training data.
entering_slot_of = {host_id: slot for slot, host_id in enumerate(entering_host_ids)}
summoning_slot_of = {host_id: slot for slot, host_id in enumerate(summoning_host_ids)}

# entering_host one-hot encoding vector of length 81
entering_host_vectors_test = np.zeros(shape=(n_test, 81))

for i, host_id in enumerate(X_test_real[:, 16]):
    slot = entering_slot_of.get(host_id)
    if slot is None:
        # Host never seen in training: leave only THIS row all-zero and keep
        # going. (Stopping the loop here would silently zero every later row.)
        continue
    entering_host_vectors_test[i, slot] = 1


# summoning_host one-hot encoding vector of length 85
summoning_host_vectors_test = np.zeros(shape=(n_test, 85))

for i, host_id in enumerate(X_test_real[:, 25]):
    slot = summoning_slot_of.get(host_id)
    if slot is None:
        # Same rule as above: skip the unseen id, do not abort the loop.
        continue
    summoning_host_vectors_test[i, slot] = 1


# Delete unnecessary cols (same vals, missing vals, one-hot encoded)
X_test_real = np.delete(X_test_real, [0, 1, 2, 5, 6, 14, 16, 23, 25], axis=1)
X_test_real = np.concatenate(
    (X_test_real, entering_host_vectors_test, summoning_host_vectors_test), axis=1
)


# Replace missing values (-1) with the median of the column's non-missing
# entries. Done inline so the training-data `replace_missing` is not shadowed.
# NOTE(review): the medians come from the test file itself, as before; consider
# reusing the training medians instead.
for col_idx in (7, 15):
    column = X_test_real[:, col_idx]
    is_missing = column == -1
    X_test_real[is_missing, col_idx] = np.median(column[~is_missing])


# --- Train and predict -------------------------------------------------------
y_pred = fit_predict(X, y_norm, X_test_real)
y_pred_rf = fit_predict_rf(X, y_norm, X_test_real)


# --- Unnormalize y_pred (scale back to original) -----------------------------
test_flags = X_test_real[:, 2:7] == 1
test_has_queue = test_flags.any(axis=1)
test_queue = test_flags.argmax(axis=1)  # index of the first set flag

for q in range(5):
    rows = np.flatnonzero(test_has_queue & (test_queue == q))
    y_pred[rows] = y_pred[rows] * stds[q] + means[q]
    y_pred_rf[rows] = y_pred_rf[rows] * stds[q] + means[q]



In [9]:
# Blend the submission predictions: 70% gradient boosting, 30% random forest
gbr_part = 0.7 * np.asarray(y_pred)
rf_part = 0.3 * np.asarray(y_pred_rf)
y_pred_avg = gbr_part + rf_part

In [13]:
# Write the blended predictions as (row index, prediction) pairs.
# newline='' is what the csv module requires of files handed to csv.writer;
# without it, platforms that translate "\n" emit a blank line after every row.
with open('submissions/sub5.csv', 'w', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerow(['id', 'actual_wait div 60000'])
    writer.writerows(enumerate(y_pred_avg))

In [10]:
# Display the blended submission predictions as a quick sanity check
y_pred_avg

array([33.5134111 , 20.3797707 , 40.80784683, ..., 18.4664022 ,
       18.5564022 , 17.87935424])