In [1]:
import csv
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the training features and targets. skiprows=1 skips the first line of
# each CSV (presumably a header row).
# The path is passed directly (instead of an open() handle that was never
# closed) so that NumPy opens and closes the file itself.
X = np.loadtxt('data/X_train.csv', delimiter=',', skiprows=1)
y = np.loadtxt('data/y_train.csv', delimiter=',', skiprows=1)

# Number of datapoints N, input features D
N = len(X)
D = len(X[0])

In [3]:
def clean_data(X, y):
    """Prepare a raw feature matrix and split it by queue type.

    Steps, in order:

    1. One-hot encode raw column 16 (entering_host, 81 slots) and raw
       column 25 (summoning_host, 85 slots).
    2. Drop raw columns 0, 1, 2, 5, 6, 14, 16, 23 and 25 and append the two
       one-hot blocks to the right of the remaining columns.
    3. Split the rows into 5 groups according to which of columns 2..6 (after
       the drop) is the first to equal 1. Rows with no such column are
       discarded.
    4. Within each group, replace the missing-value sentinel -1 in columns 7
       and 15 with the median of that group's non-missing values.

    Parameters
    ----------
    X : array-like of shape (N, D)
        Raw features; must have at least 26 columns.
    y : array-like of length N
        Per-row values that are split in parallel with ``X``.

    Returns
    -------
    X_split : list of 5 ndarrays
        Cleaned feature rows per queue type. A queue type with no rows yields
        an array with 0 rows (and the full number of columns).
    y_split : list of 5 ndarrays
        The entries of ``y`` matching the rows of ``X_split``.

    Raises
    ------
    IndexError
        If a host column holds more distinct ids than its one-hot block has
        slots (81 / 85).

    Notes
    -----
    The id -> one-hot slot mapping is built from the order in which ids first
    appear in ``X``, so two separate calls (e.g. train and test data) are not
    guaranteed to assign the same slot to the same id.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    N = len(X)

    def one_hot(ids, width):
        # Assign each distinct id the next free slot, in order of first
        # appearance; the dict gives O(1) lookups instead of list scans.
        vectors = np.zeros(shape=(N, width))
        slot_of = {}
        for i, host_id in enumerate(ids):
            vectors[i, slot_of.setdefault(host_id, len(slot_of))] = 1
        return vectors

    # entering_host one-hot encoding vector of length 81
    entering_host_vectors = one_hot(X[:, 16], 81)
    # summoning_host one-hot encoding vector of length 85
    summoning_host_vectors = one_hot(X[:, 25], 85)

    # Delete unnecessary cols (same vals, missing vals, one-hot encoded)
    X = np.delete(X, [0, 1, 2, 5, 6, 14, 16, 23, 25], axis=1)
    X = np.concatenate([X, entering_host_vectors, summoning_host_vectors], axis=1)

    # Split different queue types. A row goes to the first queue column that
    # equals 1; boolean masks keep every split 2-D even when it has no rows.
    X_split = []
    y_split = []
    unassigned = np.ones(N, dtype=bool)
    for j in range(2, 7):
        in_queue = unassigned & (X[:, j] == 1)
        X_split.append(X[in_queue])  # boolean indexing returns a copy
        y_split.append(y[in_queue])
        unassigned &= ~in_queue

    # Replace missing values (-1) with the per-queue median of observed values
    for X_q in X_split:
        for col_idx in (7, 15):
            missing = X_q[:, col_idx] == -1
            observed = X_q[~missing, col_idx]
            if observed.size == 0:
                # Empty split, or every value missing: there is no median to
                # impute, so leave the column as it is instead of writing NaN.
                continue
            X_q[missing, col_idx] = np.median(observed)

    return X_split, y_split

In [4]:
def fit_predict(X_train, y_train, X_test, random_state=None):
    """Fit a GradientBoostingRegressor and return its predictions for X_test.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test
        Features passed to ``predict``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to the estimator. The default leaves the estimator
        unseeded, as before; pass an int for reproducible results.

    Returns
    -------
    The predictions returned by ``predict(X_test)``.
    """
    gbr = GradientBoostingRegressor(random_state=random_state)
    gbr.fit(X_train, y_train)
    return gbr.predict(X_test)

def fit_predict_rf(X_train, y_train, X_test, random_state=None):
    """Fit a RandomForestRegressor and return its predictions for X_test.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test
        Features passed to ``predict``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to the estimator. The default leaves the estimator
        unseeded, as before; pass an int for reproducible results.

    Returns
    -------
    The predictions returned by ``predict(X_test)``.
    """
    rf = RandomForestRegressor(random_state=random_state)
    rf.fit(X_train, y_train)
    return rf.predict(X_test)

In [None]:
# For submission:
# Fit both models per queue type on the full training data and predict the
# real test set.

X_split, y_split = clean_data(X, y)

# Pass the path so NumPy opens and closes the file itself.
X_test_real = np.loadtxt('data/X_test.csv', delimiter=',', skiprows=1)
# clean_data splits its second argument in parallel with the rows, so handing
# it the row indices records which row of X_test.csv each prediction is for.
X_test_real_split, test_rows_split = clean_data(X_test_real, np.arange(len(X_test_real)))

# NOTE(review): clean_data assigns one-hot slots to host ids in order of first
# appearance within the array it is given, so the host columns of the train
# and test matrices are not guaranteed to line up -- confirm before submitting.

y_test_combined = []  # stays empty: there is no ground truth for the real test set
y_pred_combined = []
y_pred_rf_combined = []
# Row index in X_test.csv for each entry of the prediction lists. Predictions
# are grouped by queue type, not file order, and rows without a queue flag
# are dropped by clean_data.
test_rows_combined = []

for q in range(5):
    X_train, y_train = X_split[q], y_split[q]
    X_test = X_test_real_split[q]
    if len(X_test) == 0:
        # Nothing to predict for this queue type.
        continue
    # The previous train_test_split(X_q, y_q, ...) call here used undefined
    # names and would have replaced the real test rows with a hold-out of the
    # training data, so it is gone.
    y_pred = fit_predict(X_train, y_train, X_test)
    y_pred_rf = fit_predict_rf(X_train, y_train, X_test)
    y_pred_combined += list(y_pred)
    y_pred_rf_combined += list(y_pred_rf)
    test_rows_combined += list(test_rows_split[q])