In [26]:
import csv
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [27]:
# Hand the paths straight to np.loadtxt so NumPy opens *and closes* the files
# itself; passing an `open(...)` handle left both files unclosed.
# skiprows=1 skips the first line of each file (presumably a header row).
X = np.loadtxt('data/X_train.csv', delimiter=',', skiprows=1)
y = np.loadtxt('data/y_train.csv', delimiter=',', skiprows=1)

# Number of datapoints N, input features D
N = len(X)
D = len(X[0])

In [28]:
# One-hot encode column 16 (entering_host) into an (N, 81) indicator matrix
entering_host_vectors = np.zeros((N, 81))

# entering_host_ids[k] is the host id assigned to one-hot slot k (first-seen order)
entering_host_ids = []

for row_idx in range(N):
    current_id = X[row_idx, 16]
    try:
        slot = entering_host_ids.index(current_id)
    except ValueError:
        # First time this id shows up: hand it the next free slot
        slot = len(entering_host_ids)
        entering_host_ids.append(current_id)
    entering_host_vectors[row_idx, slot] = 1

In [29]:
# One-hot encode column 25 (summoning_host) into an (N, 85) indicator matrix
summoning_host_vectors = np.zeros((N, 85))

# summoning_host_ids[k] is the host id assigned to one-hot slot k (first-seen order)
summoning_host_ids = []

for row_idx in range(N):
    current_id = X[row_idx, 25]
    try:
        slot = summoning_host_ids.index(current_id)
    except ValueError:
        # First time this id shows up: hand it the next free slot
        slot = len(summoning_host_ids)
        summoning_host_ids.append(current_id)
    summoning_host_vectors[row_idx, slot] = 1

In [30]:
# Drop unnecessary cols (same vals, missing vals, one-hot encoded), then glue
# the two one-hot matrices onto the right-hand side of what is left.
# NOTE: X is rebound from itself, so running this cell twice drops more columns.
columns_to_drop = [0, 1, 2, 5, 6, 14, 16, 23, 25]
X = np.concatenate(
    (np.delete(X, columns_to_drop, axis=1), entering_host_vectors, summoning_host_vectors),
    axis=1,
)

In [31]:
# Split different queue types
# Columns 2..6 are scanned in order (presumably one-hot queue-type flags — TODO
# confirm); a row goes to the bucket of the first column equal to 1. Rows with
# no such column are left out of every bucket.

X_split = [[] for _ in range(5)]
y_split = [[] for _ in range(5)]

for row_idx in range(N):
    row = X[row_idx]
    queue_col = next((c for c in range(2, 7) if row[c] == 1), None)
    if queue_col is not None:
        X_split[queue_col - 2].append(row)
        y_split[queue_col - 2].append(y[row_idx])

# Freeze each bucket into an ndarray
X_split = [np.array(rows) for rows in X_split]
y_split = [np.array(targets) for targets in y_split]

In [32]:
# Replace missing values with medians

def replace_missing(q, col_idx, X_parts=None):
    """Impute the -1 sentinel in one column of one queue subset, in place.

    Args:
        q: queue type idx into ``X_parts``.
        col_idx: index of the column whose -1 entries are replaced.
        X_parts: sequence of 2-D arrays, one per queue type. Defaults to the
            notebook-level ``X_split``, so existing two-argument calls behave
            as before.

    Entries equal to -1 are overwritten with the median of the column's other
    entries. If the subset is empty, or the column holds nothing but -1, the
    data is left untouched: the median of an empty selection is NaN and would
    otherwise be written over the whole column.
    """
    if X_parts is None:
        X_parts = X_split
    X_q = X_parts[q]
    if X_q.size == 0:
        # Nothing to impute (and a 1-D empty array cannot be column-indexed)
        return
    col = X_q[:, col_idx]
    missing = col == -1
    if missing.all():
        # No observed value to take a median from; keep the sentinel
        return
    X_q[:, col_idx] = np.where(missing, np.median(col[~missing]), col)
    
# Impute columns 7 and 15 of each of the 5 queue subsets (7 first, then 15)
for queue_idx in range(5):
    for column in (7, 15):
        replace_missing(queue_idx, column)

In [33]:
def fit_predict(X_train, y_train, X_test):
    """Fit a default-configured GradientBoostingRegressor on (X_train, y_train)
    and return its predictions for X_test."""
    model = GradientBoostingRegressor().fit(X_train, y_train)
    return model.predict(X_test)

def fit_predict_rf(X_train, y_train, X_test):
    """Fit a default-configured RandomForestRegressor on (X_train, y_train)
    and return its predictions for X_test."""
    model = RandomForestRegressor().fit(X_train, y_train)
    return model.predict(X_test)

In [34]:
# Hold-out evaluation: each queue subset gets its own 70/30 split and its own
# pair of models; truths and predictions are pooled across the subsets.
# No random_state is passed anywhere, so the pooled values differ between runs.
y_test_combined, y_pred_combined, y_pred_rf_combined = [], [], []

for X_q, y_q in zip(X_split, y_split):
    X_train, X_test, y_train, y_test = train_test_split(X_q, y_q, test_size=0.3)
    y_test_combined.extend(y_test)
    y_pred_combined.extend(fit_predict(X_train, y_train, X_test))
    y_pred_rf_combined.extend(fit_predict_rf(X_train, y_train, X_test))



In [35]:
# Pooled hold-out MSE of the gradient-boosting predictions
mean_squared_error(y_true=y_test_combined, y_pred=y_pred_combined)

1430.530954022257

In [36]:
# Pooled hold-out MSE of the random-forest predictions
mean_squared_error(y_true=y_test_combined, y_pred=y_pred_rf_combined)

1195.7574806224711

In [47]:
# Blend the two models (70% gradient boosting + 30% random forest) and score it
gbr_part = 0.7 * np.array(y_pred_combined)
rf_part = 0.3 * np.array(y_pred_rf_combined)
y_pred_avg = gbr_part + rf_part
mean_squared_error(y_true=y_test_combined, y_pred=y_pred_avg)

1223.5650710027032

In [9]:
# For submission

# Pass the path itself so np.loadtxt opens and closes the file; the previous
# `open(...)` handle was never closed. skiprows=1 skips the file's first line.
X_test_real = np.loadtxt('data/X_test.csv', delimiter=',', skiprows=1)
# NOTE: this rebinds the notebook-level N to the number of test rows
N = len(X_test_real)

# entering_host one-hot encoding vector of length 81
# Slots come from entering_host_ids, the id -> slot list built from the training data.
entering_host_vectors = np.zeros(shape=(N, 81))

for i in range(len(X_test_real)):
    host_id = X_test_real[i][16]
    if host_id not in entering_host_ids:
        # Id not present in entering_host_ids: leave this row all-zero and
        # move on. (`break` here also abandoned every row after it.)
        continue
    entering_host_vectors[i][entering_host_ids.index(host_id)] = 1

    
# summoning_host one-hot encoding vector of length 85
# Slots come from summoning_host_ids, the id -> slot list built from the training data.
summoning_host_vectors = np.zeros(shape=(N, 85))

for i in range(len(X_test_real)):
    host_id = X_test_real[i][25]
    if host_id not in summoning_host_ids:
        # Id not present in summoning_host_ids: leave this row all-zero and
        # move on. (`break` here also abandoned every row after it.)
        continue
    summoning_host_vectors[i][summoning_host_ids.index(host_id)] = 1
    
    
# Drop unnecessary cols (same vals, missing vals, one-hot encoded), then glue
# the two one-hot matrices onto the right-hand side of what is left.
columns_to_drop = [0, 1, 2, 5, 6, 14, 16, 23, 25]
X_test_real = np.concatenate(
    (np.delete(X_test_real, columns_to_drop, axis=1), entering_host_vectors, summoning_host_vectors),
    axis=1,
)


# Split different queue types
# A row goes to the bucket of the first column in 2..6 that equals 1, and its
# position in X_test_real is recorded alongside so predictions can be put back
# in order later. Rows with no such column are left out of every bucket.
X_test_split = [[] for _ in range(5)]
X_ids_split = [[] for _ in range(5)]

for row_id, row in enumerate(X_test_real):
    queue_col = next((c for c in range(2, 7) if row[c] == 1), None)
    if queue_col is not None:
        X_test_split[queue_col - 2].append(row)
        X_ids_split[queue_col - 2].append(row_id)

# Freeze each feature bucket into an ndarray (the id buckets stay plain lists)
X_test_split = [np.array(rows) for rows in X_test_split]

    
# Replace missing values with medians

# NOTE: this rebinds the name `replace_missing` defined in an earlier cell;
# once this cell has run, the default target is X_test_split.
def replace_missing(q, col_idx, X_parts=None):
    """Impute the -1 sentinel in one column of one queue subset, in place.

    Args:
        q: queue type idx into ``X_parts``.
        col_idx: index of the column whose -1 entries are replaced.
        X_parts: sequence of 2-D arrays, one per queue type. Defaults to the
            notebook-level ``X_test_split``, so existing two-argument calls
            behave as before.

    Entries equal to -1 are overwritten with the median of the column's other
    entries. The median is taken from the array being imputed itself. If the
    subset is empty, or the column holds nothing but -1, the data is left
    untouched: the median of an empty selection is NaN and would otherwise be
    written over the whole column.
    """
    if X_parts is None:
        X_parts = X_test_split
    X_q = X_parts[q]
    if X_q.size == 0:
        # Nothing to impute (and a 1-D empty array cannot be column-indexed)
        return
    col = X_q[:, col_idx]
    missing = col == -1
    if missing.all():
        # No observed value to take a median from; keep the sentinel
        return
    X_q[:, col_idx] = np.where(missing, np.median(col[~missing]), col)
    
# Impute columns 7 and 15 of each of the 5 queue subsets (7 first, then 15)
for queue_idx in range(5):
    for column in (7, 15):
        replace_missing(queue_idx, column)
    
    
# Train and predict
# One gradient-boosting and one random-forest model per queue type, each fitted
# on that queue's full training subset and applied to the matching test subset.

y_pred_combined = []
y_pred_rf_combined = []

for X_train, y_train, X_test in zip(X_split, y_split, X_test_split):
    y_pred_combined.append(fit_predict(X_train, y_train, X_test))
    y_pred_rf_combined.append(fit_predict_rf(X_train, y_train, X_test))
    
# Sort by id
# Scatter the per-queue predictions back to each row's original position.
# Rows that were never placed in a queue bucket keep the placeholder 0.
y_pred_sorted = [0] * N
y_pred_rf_sorted = [0] * N

for ids_q, gbr_q, rf_q in zip(X_ids_split, y_pred_combined, y_pred_rf_combined):
    for k in range(len(gbr_q)):
        row_id = ids_q[k]
        y_pred_sorted[row_id] = gbr_q[k]
        y_pred_rf_sorted[row_id] = rf_q[k]



In [44]:
# Submission blend: 70% gradient boosting + 30% random forest
gbr_weighted = 0.7 * np.array(y_pred_sorted)
rf_weighted = 0.3 * np.array(y_pred_rf_sorted)
y_pred_avg = gbr_weighted + rf_weighted

In [46]:
# Write one (row index, blended prediction) line per test row.
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write produce a blank line after every row.
with open('submissions/sub4.csv', 'w', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerow(['id', 'actual_wait div 60000'])
    for i in range(len(y_pred_sorted)):
        writer.writerow([i, y_pred_avg[i]])

In [45]:
# Show the blended predictions as the cell output
y_pred_avg

array([41.63134534, 16.61670116, 55.50389858, ..., 23.06341763,
       23.34102291, 24.6447005 ])