In [1]:
# Imports
import os
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
FOLDS = 5
N_JOBS = 6
SEED = 2016

DATA = "Base"
#DATA = "Time"


In [2]:
# Get data
# One feature table per target (train1..train5) for the selected DATA variant,
# followed by the matching test table.
train1, train2, train3, train4, train5 = [
    pd.read_csv("clean_data/train" + str(k) + "_" + DATA + ".csv") for k in range(1, 6)
]
test = pd.read_csv("clean_data/test_" + DATA + ".csv")

# Target tables: their file names do not depend on the DATA variant.
y_1, y_2, y_3, y_4, y_5 = [
    pd.read_csv("clean_data/y_" + str(k) + ".csv") for k in range(1, 6)
]

test_ids = test.Id

# Shapes are printed pairwise: a feature table, then its target table.
for features, targets in zip((train1, train2, train3, train4, train5),
                             (y_1, y_2, y_3, y_4, y_5)):
    print(features.shape)
    print(targets.shape)

(11708, 8)
(11708, 2)
(8781, 8)
(8781, 2)
(2927, 8)
(2927, 2)
(8781, 8)
(8781, 2)
(11708, 8)
(11708, 2)


In [3]:
# Standardize numerical features
num_features = ["Sensor1", "Sensor2", "Sensor3", "Weather1", "Weather2"]
stdSc = StandardScaler()

# The one scaler object is re-fitted on each training table in turn, so every
# table is standardized with its own statistics (columns are replaced in place).
for frame in (train1, train2, train3, train4, train5):
    frame[num_features] = stdSc.fit_transform(frame[num_features])

# NOTE(review): stdSc now holds the statistics of the last fit (train5), so the
# test table is scaled with train5's mean/std only. Confirm this is intended
# for the models that are trained on train1..train4.
test[num_features] = stdSc.transform(test[num_features])

In [4]:
# Split data for final performance check
# Each (features, targets) pair is split 75% / 25% with the same seed; the 25%
# part is the hold-out used for the final check and for ensembling.
X_train1, X_test1, y_train1, y_test1 = train_test_split(train1, y_1, test_size = 0.25, random_state = SEED)
X_train2, X_test2, y_train2, y_test2 = train_test_split(train2, y_2, test_size = 0.25, random_state = SEED)
X_train3, X_test3, y_train3, y_test3 = train_test_split(train3, y_3, test_size = 0.25, random_state = SEED)
X_train4, X_test4, y_train4, y_test4 = train_test_split(train4, y_4, test_size = 0.25, random_state = SEED)
X_train5, X_test5, y_train5, y_test5 = train_test_split(train5, y_5, test_size = 0.25, random_state = SEED)

# Copy the Id_bat values of the hold-out rows onto the hold-out target tables.
# Plain attribute assignment (y_test1.Id_bat = ...) only sets a Python
# attribute and does not create a DataFrame column; assign() adds a real
# column (aligned on the shared row index) and returns a new frame instead of
# writing into the object returned by train_test_split.
y_test1 = y_test1.assign(Id_bat = X_test1.Id_bat)
y_test2 = y_test2.assign(Id_bat = X_test2.Id_bat)
y_test3 = y_test3.assign(Id_bat = X_test3.Id_bat)
y_test4 = y_test4.assign(Id_bat = X_test4.Id_bat)
y_test5 = y_test5.assign(Id_bat = X_test5.Id_bat)

In [5]:
# Save y_test for later ensembling
name1 = "clean_data/y_test1.csv"
name2 = "clean_data/y_test2.csv"
name3 = "clean_data/y_test3.csv"
name4 = "clean_data/y_test4.csv"
name5 = "clean_data/y_test5.csv"

# One CSV per target with the columns Id, Id_bat and the target itself,
# written without the row index.
for out_path, target_col, holdout in (
    (name1, "y1", y_test1),
    (name2, "y2", y_test2),
    (name3, "y3", y_test3),
    (name4, "y4", y_test4),
    (name5, "y5", y_test5),
):
    out_frame = pd.DataFrame(
        {"Id": holdout.Id, "Id_bat": holdout.Id_bat, target_col: holdout[target_col]},
        columns = ["Id", "Id_bat", target_col],
    )
    out_frame.to_csv(out_path, index = None)


In [8]:
# Definitions for Ridge

def run_ridge(X_train, y_train, train_index, val_index, fold = None) :
    """Fit a RidgeCV model on one cross-validation fold.

    The regularisation strength is chosen in two passes: a coarse grid from
    0.1 to 60, then a finer grid of multiples (0.7x .. 1.3x) of the coarse
    winner.

    Parameters
    ----------
    X_train, y_train : objects supporting positional ``.iloc`` indexing
        Full training features and targets; the fold is sliced from them.
    train_index, val_index : positional row indices
        Rows used for fitting and rows returned as the validation slice.
    fold : int, optional
        Zero-based fold number used for the "FOLD : n" banner. When omitted,
        the notebook-level loop counter ``i`` is used if it exists, which
        keeps existing call sites working; if neither is available the banner
        is skipped instead of raising NameError.

    Returns
    -------
    tuple
        ``(X_val_fold, ri)``: the validation rows of ``X_train`` and the
        fitted RidgeCV model from the second (refined) pass.
    """
    if fold is None :
        # Backward compatibility: older call sites rely on the global counter.
        fold = globals().get("i")
    if fold is not None :
        print("FOLD : " + str(fold + 1))

    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold = y_train.iloc[train_index]

    # Pass 1: coarse search over a wide range of alphas
    ri = RidgeCV(alphas = [0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])
    ri.fit(X_train_fold, y_train_fold)
    alpha = ri.alpha_

    # Pass 2: finer search around the coarse winner
    # (the grid has no 1.2x step; kept as-is so selected alphas do not change)
    ri = RidgeCV(alphas = [alpha * .7, alpha * .75, alpha * .8, alpha * .85, alpha * .9, alpha * .95, alpha, 
                           alpha * 1.05, alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3])
    ri.fit(X_train_fold, y_train_fold)
    print("RIDGE Best alpha :", ri.alpha_)
    
    return (X_val_fold, ri)


In [9]:
# Generate y_test predictions to be able to optimize ensemble weights

# Out-of-fold predictions on the training part: one slot per training row,
# filled by the fold in which that row was held out.
preds_y_train1 = np.zeros((X_train1.shape[0],))
preds_y_train2 = np.zeros((X_train2.shape[0],))
preds_y_train3 = np.zeros((X_train3.shape[0],))
preds_y_train4 = np.zeros((X_train4.shape[0],))
preds_y_train5 = np.zeros((X_train5.shape[0],))

# Hold-out predictions: one row per fold model, averaged over folds below.
preds_y_testX1 = np.empty((FOLDS, X_test1.shape[0]))
preds_y_testX2 = np.empty((FOLDS, X_test2.shape[0]))
preds_y_testX3 = np.empty((FOLDS, X_test3.shape[0]))
preds_y_testX4 = np.empty((FOLDS, X_test4.shape[0]))
preds_y_testX5 = np.empty((FOLDS, X_test5.shape[0]))

kf = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)

# (banner, training features, training target, hold-out features,
#  out-of-fold buffer, per-fold hold-out buffer) for each of the five targets.
oof_jobs = [
    ("y1 : ", X_train1, y_train1.y1, X_test1, preds_y_train1, preds_y_testX1),
    ("y2 : ", X_train2, y_train2.y2, X_test2, preds_y_train2, preds_y_testX2),
    ("y3 : ", X_train3, y_train3.y3, X_test3, preds_y_train3, preds_y_testX3),
    ("y4 : ", X_train4, y_train4.y4, X_test4, preds_y_train4, preds_y_testX4),
    ("y5 : ", X_train5, y_train5.y5, X_test5, preds_y_train5, preds_y_testX5),
]

for banner, X_tr, y_tr, X_holdout, oof_preds, holdout_preds in oof_jobs:
    print(banner)
    # `i` has to remain a notebook-level name: run_ridge reads it for its
    # "FOLD : n" banner.
    for i, (train_index, val_index) in enumerate(kf.split(X_tr)):
        X_val_fold, ri = run_ridge(X_tr, y_tr, train_index, val_index)
        # The buffers alias the preds_y_* arrays above, so these writes fill them.
        oof_preds[val_index] = ri.predict(X_val_fold)
        holdout_preds[i, :] = ri.predict(X_holdout)
        print("-----")

# Average predictions on test set
preds_y_test1 = preds_y_testX1.mean(axis = 0)
preds_y_test2 = preds_y_testX2.mean(axis = 0)
preds_y_test3 = preds_y_testX3.mean(axis = 0)
preds_y_test4 = preds_y_testX4.mean(axis = 0)
preds_y_test5 = preds_y_testX5.mean(axis = 0)


y1 : 
FOLD : 1
RIDGE Best alpha : 24.0
-----
FOLD : 2
RIDGE Best alpha : 24.0
-----
FOLD : 3
RIDGE Best alpha : 22.5
-----
FOLD : 4
RIDGE Best alpha : 24.0
-----
FOLD : 5
RIDGE Best alpha : 22.5
-----
y2 : 
FOLD : 1
RIDGE Best alpha : 9.5
-----
FOLD : 2
RIDGE Best alpha : 10.0
-----
FOLD : 3
RIDGE Best alpha : 11.0
-----
FOLD : 4
RIDGE Best alpha : 10.5
-----
FOLD : 5
RIDGE Best alpha : 10.5
-----
y3 : 
FOLD : 1
RIDGE Best alpha : 24.0
-----
FOLD : 2
RIDGE Best alpha : 24.0
-----
FOLD : 3
RIDGE Best alpha : 21.0
-----
FOLD : 4
RIDGE Best alpha : 22.5
-----
FOLD : 5
RIDGE Best alpha : 25.5
-----
y4 : 
FOLD : 1
RIDGE Best alpha : 27.0
-----
FOLD : 2
RIDGE Best alpha : 25.5
-----
FOLD : 3
RIDGE Best alpha : 21.0
-----
FOLD : 4
RIDGE Best alpha : 28.5
-----
FOLD : 5
RIDGE Best alpha : 27.0
-----
y5 : 
FOLD : 1
RIDGE Best alpha : 8.5
-----
FOLD : 2
RIDGE Best alpha : 8.5
-----
FOLD : 3
RIDGE Best alpha : 8.0
-----
FOLD : 4
RIDGE Best alpha : 8.5
-----
FOLD : 5
RIDGE Best alpha : 8.0
-----


In [10]:
# Show MSE on y_train using KFold CV
print("Average Ridge MSE using OOF predictions for y1 : " + str(mean_squared_error(y_train1.y1, preds_y_train1)))
print("Average Ridge MSE using OOF predictions for y2 : " + str(mean_squared_error(y_train2.y2, preds_y_train2)))
print("Average Ridge MSE using OOF predictions for y3 : " + str(mean_squared_error(y_train3.y3, preds_y_train3)))
print("Average Ridge MSE using OOF predictions for y4 : " + str(mean_squared_error(y_train4.y4, preds_y_train4)))
print("Average Ridge MSE using OOF predictions for y5 : " + str(mean_squared_error(y_train5.y5, preds_y_train5)))

# Show MSE on y_test
print("Average Ridge MSE on last hold-out fold for y1 : " + str(mean_squared_error(y_test1.y1, preds_y_test1)))
print("Average Ridge MSE on last hold-out fold for y2 : " + str(mean_squared_error(y_test2.y2, preds_y_test2)))
print("Average Ridge MSE on last hold-out fold for y3 : " + str(mean_squared_error(y_test3.y3, preds_y_test3)))
print("Average Ridge MSE on last hold-out fold for y4 : " + str(mean_squared_error(y_test4.y4, preds_y_test4)))
print("Average Ridge MSE on last hold-out fold for y5 : " + str(mean_squared_error(y_test5.y5, preds_y_test5)))


Average Ridge MSE using OOF predictions for y1 : 14611.0285661
Average Ridge MSE using OOF predictions for y2 : 9078.42237235
Average Ridge MSE using OOF predictions for y3 : 90.1025632361
Average Ridge MSE using OOF predictions for y4 : 3688.0244059
Average Ridge MSE using OOF predictions for y5 : 11825.4573435
Average Ridge MSE on last hold-out fold for y1 : 14762.9883232
Average Ridge MSE on last hold-out fold for y2 : 9594.58624716
Average Ridge MSE on last hold-out fold for y3 : 113.682498951
Average Ridge MSE on last hold-out fold for y4 : 3519.97116331
Average Ridge MSE on last hold-out fold for y5 : 12140.6752811


In [11]:
# Now generate predictions on whole test set, to be used in ensemble for submissions
print("DATA : " + DATA)

# One row of test-set predictions per fold model, for each target.
preds_testX1 = np.empty((FOLDS, test.shape[0]))
preds_testX2 = np.empty((FOLDS, test.shape[0]))
preds_testX3 = np.empty((FOLDS, test.shape[0]))
preds_testX4 = np.empty((FOLDS, test.shape[0]))
preds_testX5 = np.empty((FOLDS, test.shape[0]))

kf = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)

# (banner, full training features, full training target, per-fold buffer)
# for each of the five targets. Here the models are fitted on folds of the
# complete training tables rather than on the 75% split.
submission_jobs = [
    ("y1 : ", train1, y_1.y1, preds_testX1),
    ("y2 : ", train2, y_2.y2, preds_testX2),
    ("y3 : ", train3, y_3.y3, preds_testX3),
    ("y4 : ", train4, y_4.y4, preds_testX4),
    ("y5 : ", train5, y_5.y5, preds_testX5),
]

for banner, X_full, y_full, fold_preds in submission_jobs:
    print(banner)
    # `i` has to remain a notebook-level name: run_ridge reads it for its
    # "FOLD : n" banner.
    for i, (train_index, val_index) in enumerate(kf.split(X_full)):
        X_val_fold, ri = run_ridge(X_full, y_full, train_index, val_index)
        # fold_preds aliases the matching preds_testX* array above.
        fold_preds[i, :] = ri.predict(test)
        print("-----")

# Average predictions on test set
preds_test1 = preds_testX1.mean(axis = 0)
preds_test2 = preds_testX2.mean(axis = 0)
preds_test3 = preds_testX3.mean(axis = 0)
preds_test4 = preds_testX4.mean(axis = 0)
preds_test5 = preds_testX5.mean(axis = 0)


DATA : Base
y1 : 
FOLD : 1
RIDGE Best alpha : 24.0
-----
FOLD : 2
RIDGE Best alpha : 22.5
-----
FOLD : 3
RIDGE Best alpha : 22.5
-----
FOLD : 4
RIDGE Best alpha : 22.5
-----
FOLD : 5
RIDGE Best alpha : 21.0
-----
y2 : 
FOLD : 1
RIDGE Best alpha : 10.0
-----
FOLD : 2
RIDGE Best alpha : 10.5
-----
FOLD : 3
RIDGE Best alpha : 9.5
-----
FOLD : 4
RIDGE Best alpha : 9.5
-----
FOLD : 5
RIDGE Best alpha : 11.0
-----
y3 : 
FOLD : 1
RIDGE Best alpha : 21.0
-----
FOLD : 2
RIDGE Best alpha : 24.0
-----
FOLD : 3
RIDGE Best alpha : 21.0
-----
FOLD : 4
RIDGE Best alpha : 25.5
-----
FOLD : 5
RIDGE Best alpha : 21.0
-----
y4 : 
FOLD : 1
RIDGE Best alpha : 28.5
-----
FOLD : 2
RIDGE Best alpha : 25.5
-----
FOLD : 3
RIDGE Best alpha : 25.5
-----
FOLD : 4
RIDGE Best alpha : 27.0
-----
FOLD : 5
RIDGE Best alpha : 27.0
-----
y5 : 
FOLD : 1
RIDGE Best alpha : 8.5
-----
FOLD : 2
RIDGE Best alpha : 8.0
-----
FOLD : 3
RIDGE Best alpha : 8.5
-----
FOLD : 4
RIDGE Best alpha : 8.5
-----
FOLD : 5
RIDGE Best alpha : 

In [12]:
# Correct predictions for special outputs
# The prediction arrays are plain NumPy arrays ordered like the rows of `test`,
# so they are masked positionally. Indexing them with DataFrame index labels
# (test[...].index) is only correct while `test` has a default 0..n-1 index.
is_bat3 = (test.Id_bat == 3).values

# y2 and y4 are forced to 0 for rows with Id_bat == 3; y3 is forced to 0 for
# every other row.
preds_test2[is_bat3] = 0
preds_test3[~is_bat3] = 0
preds_test4[is_bat3] = 0

In [14]:
# Save predictions
y_test_name1 = "y_test_preds/Ridge1_" + DATA + ".csv"
y_test_name2 = "y_test_preds/Ridge2_" + DATA + ".csv"
y_test_name3 = "y_test_preds/Ridge3_" + DATA + ".csv"
y_test_name4 = "y_test_preds/Ridge4_" + DATA + ".csv"
y_test_name5 = "y_test_preds/Ridge5_" + DATA + ".csv"
test_name = "test_preds/Ridge_" + DATA + ".csv"

# Hold-out predictions: one ";"-separated file per target with the columns
# Id and the averaged prediction, written without the row index.
for out_path, target_col, holdout, holdout_preds in (
    (y_test_name1, "y1", y_test1, preds_y_test1),
    (y_test_name2, "y2", y_test2, preds_y_test2),
    (y_test_name3, "y3", y_test3, preds_y_test3),
    (y_test_name4, "y4", y_test4, preds_y_test4),
    (y_test_name5, "y5", y_test5, preds_y_test5),
):
    holdout_frame = pd.DataFrame({"Id": holdout.Id, target_col: holdout_preds},
                                 columns = ["Id", target_col])
    holdout_frame.to_csv(out_path, index = None, sep = ";")

# Test-set predictions: a single file holding all five targets side by side.
submission = pd.DataFrame(
    {"Id": test.Id, "y1": preds_test1, "y2": preds_test2, "y3": preds_test3, "y4": preds_test4, "y5": preds_test5},
    columns = ["Id", "y1", "y2", "y3", "y4", "y5"],
)
submission.to_csv(test_name, index = None, sep = ";")