Importing the data from csv files. We store the data in different ways so that we can grab them immediately for different situation. Also we're transforming them into torch dataloaders for further uses.

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import optuna
from optuna.trial import TrialState
from optuna.integration import LightGBMPruningCallback
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from utils import index_til_exceed
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import warnings

In [3]:
dfX = pd.read_csv("./train_X_origin.csv").set_index('ID')
dfY = pd.read_csv("./train_Y.csv").set_index("ID")
dfTest = pd.read_csv("./test_X.csv", sep=";")
dfFormat = pd.read_csv("./format.csv").set_index("ID")
trainX_df = dfX.loc[dfX["DATASET"] < 4]
valX_df = dfX.loc[dfX["DATASET"] == 4]
trainY_df = dfY.loc[dfY["DATASET"] < 4]
valY_df = dfY.loc[dfY["DATASET"] == 4]
dfData = pd.read_csv("./train_X.csv", sep=";")
data = dfData.to_numpy()[:, 2:]
data_test = dfTest.to_numpy()[:, 2:]

Define some useful functions

In [4]:
from utils import prediction_reshape
def metric(target, pred):
    target_means = np.array([row.mean() for row in target.reshape(-1, 16)])
    pred_means, pred_variances = prediction_reshape(pred.reshape(-1, 16))
    pred_means_np = np.array(pred_means)
    pred_variances_np = np.array(pred_variances)
    nline = len(pred_means)

    chi2 = (pred_means_np-target_means)**2
        
    R2 = np.sum(chi2)/np.sum((target_means)**2)
    RELIABILITY = np.sqrt(np.mean(chi2/(pred_variances_np+10-9)))

    score=-(np.log(R2)+abs(np.log(RELIABILITY)))
    is_higher_better = True
    return "climate_metric", score, is_higher_better


from utils import prediction_reshape
from climate_challenge_custom_metric import climate_metric_function
def score_pred(aa, label):
    means, variances = prediction_reshape(aa.reshape(-1, 16))
    pred = pd.DataFrame({"MEAN":means, "VARIANCE":variances})
    score_pred = climate_metric_function(label, pred)
    return score_pred

#print(score_pred(predY, valY_df))
#testY = gbm.predict(testX_np_transformed, num_iterations = gbm.best_iteration)
def test_submit(aa, name):
    means, variances = prediction_reshape(aa.reshape(-1, 16))
    df = dfFormat
    df["MEAN"] = means
    df["VARIANCE"] = variances
    df.to_csv(f'./submissions/pred_{name}.csv', index = True)
    print("Prediction saved...")

def index_from_MY(M, Y):
    if M > 0:
        return (M-1)*11+Y+10
    else:
        return Y

Data Shuffling

In [5]:
data_to_shuffle = data.reshape(-1, 16, 255)
perm = np.random.permutation(len(data_to_shuffle[:,0,0]))
# print(perm[:10])
data_shuffled = data_to_shuffle[perm, :, :].reshape(-1, 255)
trainX_np = data[:12288, :254]
valX_np = data[12288:, :254]
allX_np = data[:, :254]
trainY_np = data[:12288, -1]
valY_np = data[12288:, -1]
allY_np = data[:, -1]
testX_np = data_test[:, :254]
trainX_np_shuffled = data_shuffled[:12288, :254]
valX_np_shuffled = data_shuffled[12288:, :254]
allX_np_shuffled = data_shuffled[:, :254]
trainY_np_shuffled = data_shuffled[:12288, -1]
valY_np_shuffled = data_shuffled[12288:, -1]
allY_np_shuffled = data_shuffled[:, -1]
trainY_df_shuffled, _ = prediction_reshape(trainY_np_shuffled.reshape(-1, 16))
trainY_df_shuffled = pd.DataFrame({"MEAN":trainY_df_shuffled, "VARIANCE": [np.nan]*len(trainY_df_shuffled)})
valY_df_shuffled, _ = prediction_reshape(valY_np_shuffled.reshape(-1, 16))
valY_df_shuffled = pd.DataFrame({"MEAN":valY_df_shuffled, "VARIANCE": [np.nan]*len(valY_df_shuffled)})


Data Preprocessing by normalizing.

In [6]:
sc = StandardScaler().fit(trainX_np)
trainX_np_normalized = sc.transform(trainX_np)
valX_np_normalized = sc.transform(valX_np)
testX_np_normalized = sc.transform(testX_np)

sc_shuffled = StandardScaler().fit(trainX_np_shuffled)
trainX_np_shuffled_normalized = sc_shuffled.transform(trainX_np_shuffled)
valX_np_shuffled_normalized = sc_shuffled.transform(valX_np_shuffled)
testX_np_shuffled_normalized = sc_shuffled.transform(testX_np)

sc_all = StandardScaler().fit(allX_np)
allX_np_normalized = sc_all.transform(allX_np)
allX_np_shuffled_normalized = sc_all.transform(allX_np)



Tensors for DL

In [7]:
trainX = torch.from_numpy(trainX_np_shuffled_normalized).to("cuda").float()
valX = torch.from_numpy(valX_np_shuffled_normalized).to("cuda").float()
valY = torch.from_numpy(valY_np_shuffled).to("cuda").float().unsqueeze(1)
trainY = torch.from_numpy(trainY_np_shuffled).to("cuda").float().unsqueeze(1)
testX = torch.from_numpy(testX_np_shuffled_normalized).to("cuda").float()

train_data= TensorDataset(trainX, trainY)
val_data = TensorDataset(valX, valY)
train_data_recent = TensorDataset(trainX[:, :10], trainY)
val_data_recent = TensorDataset(trainX[:, :10], trainY)
train_loader = DataLoader(train_data, batch_size = 64, shuffle = True)
val_loader = DataLoader(val_data, batch_size = 64, shuffle = True)
train_loader_recent = DataLoader(train_data_recent, batch_size = 64, shuffle = True)
val_loader_recent = DataLoader(val_data_recent, batch_size=64, shuffle=True)

Benchmarks

In [8]:
pred_train_benchmark = trainX_np[:, 9]
pred_val_benchmark = valX_np[:, 9]
pred_train_shuffle_benchmark = trainX_np_shuffled[:, 9]
pred_val_shuffle_benchmark = valX_np_shuffled[:, 9]
print("Training Benchmark: ", score_pred(pred_train_benchmark, trainY_df))
print("Validation Benchmark: ", score_pred(pred_val_benchmark, valY_df))
print("Training Benchmark with shuffling: ", score_pred(pred_train_shuffle_benchmark, trainY_df_shuffled))
print("Validation Benchmark with shuffling: ", score_pred(pred_val_shuffle_benchmark, valY_df_shuffled))

Training Benchmark:  -1.3277852269792563
Validation Benchmark:  -1.5995618725175895
Training Benchmark with shuffling:  -1.2794413320272424
Validation Benchmark with shuffling:  -1.6573494562835753


Simple calculation of the means to see how the variables behave.

In [9]:
res_train = [np.array([0.0]*12288) for k in range(23)]
res_train_shuffle = [np.array([0.0]*12288) for k in range(23)]
res_val = [np.array([0.0]*3072) for k in range(23)]
res_val_shuffle = [np.array([0.0]*3072) for k in range(23)]
index = 0
for k in range(23):
    if k == 0:
        j = 10
    else:
        j = 11
    for i in range(j):
        res_train[k]+=trainX_np[:, index]/10
        res_val[k]+=valX_np[:, index]/10
        res_train_shuffle[k]+=trainX_np_shuffled[:, index]/10
        res_val_shuffle[k]+=valX_np_shuffled[:, index]/10
        index+=1
best_model_train = np.argsort([score_pred(res_train[k], trainY_df) for k in range(23)])[-9:]
best_model_val = np.argsort([score_pred(res_val[k], valY_df) for k in range(23)])[-9:]
best_model_train_shuffle = np.argsort([score_pred(res_train_shuffle[k], trainY_df_shuffled) for k in range(23)])[-9:]
best_model_val_shuffle= np.argsort([score_pred(res_val_shuffle[k], valY_df_shuffled) for k in range(23)])[-9:]
print(f"Model with best performance in training: {best_model_train},",
 f"with score: {[score_pred(res_train[k], trainY_df) for k in best_model_train]}")
print(f"Model with best performance in validation: {best_model_val},",
 f"with score: {[score_pred(res_val[k], valY_df) for k in best_model_val]}")
print(f"Model with best performance in training with shuffling: {best_model_train_shuffle},", 
f" with score: {[score_pred(res_train_shuffle[k], trainY_df_shuffled) for k in best_model_train_shuffle]}")
print(f"Model with best performance in validation with shuffling: {best_model_val_shuffle},",
 f" with score: {[score_pred(res_val_shuffle[k], valY_df_shuffled) for k in best_model_val_shuffle]}")
res_test = np.array([0.0]*3072*2)
for k in range(11):
    if k<10:
        res_test+=(testX_np[:, index_from_MY(8,k)]+testX_np[:, index_from_MY(5, k)]+testX_np[:, index_from_MY(0, k)])/32
    else:
        res_test+=(testX_np[:, index_from_MY(8, k)]+testX_np[:, index_from_MY(5, k)])/32
test_submit(res_test, "MeansOverBestModelAndObservations")

Model with best performance in training: [18 22  1  5  4 19 17  0  8], with score: [-1.3341052649073424, -1.3209518134161182, -1.2844178515599305, -1.2810905181103258, -1.2771765719246786, -1.2696471507895244, -1.2420247202966577, -1.2151217334725608, -1.1699814856878423]
Model with best performance in validation: [16  0 15  2  7  1  6 12  5], with score: [-1.6252185680986158, -1.59131339975462, -1.5809363122508637, -1.5499614770633192, -1.5357665697752658, -1.5126760479533694, -1.4594159032882577, -1.4445212854216318, -1.4351372473711352]
Model with best performance in training with shuffling: [19  2 22  6 17  1  5  0  8],  with score: [-1.3702341304678156, -1.3701694375292188, -1.3607430436644454, -1.35832331017058, -1.3072426748035908, -1.2959473042146146, -1.295082118020785, -1.262094917723523, -1.2558747813228073]
Model with best performance in validation with shuffling: [19 22  1  4 17  5 18  0  8],  with score: [-1.4343846782025356, -1.429951151917416, -1.4275019155405473, -1.41

Best Models?

In [57]:
round = 10
def test_for_best_model(round):
    ll = np.array([0]*23)
    for k in range(round):
        perm = np.random.permutation(len(data_to_shuffle[:,0,0]))
        data_shuffled = data_to_shuffle[perm, :, :].reshape(-1, 255)
        trainX_np_shuffled_testing = data_shuffled[:12288, :254]
        valX_np_shuffled_testing = data_shuffled[12288:, :254]
        trainY_np_shuffled_testing = data_shuffled[:12288, -1]
        valY_np_shuffled_testing = data_shuffled[12288:, -1]
        trainY_df_shuffled_testing, _ = prediction_reshape(trainY_np_shuffled_testing.reshape(-1, 16))
        trainY_df_shuffled_testing = pd.DataFrame({"MEAN":trainY_df_shuffled_testing, "VARIANCE": [np.nan]*len(trainY_df_shuffled_testing)})
        valY_df_shuffled_testing, _ = prediction_reshape(valY_np_shuffled_testing.reshape(-1, 16))
        valY_df_shuffled_testing = pd.DataFrame({"MEAN":valY_df_shuffled_testing, "VARIANCE": [np.nan]*len(valY_df_shuffled_testing)})
        res_train = [np.array([0.0]*12288) for k in range(23)]
        res_val = [np.array([0.0]*3072) for k in range(23)]
        index = 0
        for k in range(23):
            if k == 0:
                j = 10
            else:
                j = 11
            for i in range(j):
                res_train[k]+=trainX_np_shuffled_testing[:, index]/10
                res_val[k]+=valX_np_shuffled_testing[:, index]/10
                index+=1
        model_ranking_train_list = np.argsort([score_pred(res_train[k], trainY_df_shuffled_testing) for k in range(23)])
        model_ranking_val_list = np.argsort([score_pred(res_val[k], valY_df_shuffled_testing) for k in range(23)])
        for k in range(len(model_ranking_train_list)):
            ll[model_ranking_train_list[k]]+=k
        # for k in range(len(model_ranking_val_list)):
        #     ll[model_ranking_val_list[k]]+=k
    indices = np.argsort(ll)[-9:]
    # plt.title('Best Model Ranking')
    # plt.barh(range(len(indices)), ll[indices], color='b', align='center')
    # plt.yticks(range(len(indices)), [i for i in indices])
    # plt.xlabel('Ranking')
    # plt.show()
    return ll
for k in range(10):
    print(f"Round {k}")
    print(np.argsort(test_for_best_model(round))[-9:])

Round 0
[18 19  6 22 17  1  5  0  8]
Round 1
[ 6 18 19 22 17  1  5  0  8]
Round 2
[ 4 18 19 22 17  1  5  0  8]
Round 3
[ 6  4 22 18 17  1  5  0  8]
Round 4
[19  4 18 22 17  1  5  0  8]
Round 5
[ 4 22 18 19 17  1  5  0  8]
Round 6
[18  6 19 22 17  1  5  0  8]
Round 7
[ 4  6 19 22  1 17  5  0  8]
Round 8
[18 19  6 22 17  1  5  0  8]
Round 9
[18  4 22 19 17  1  5  8  0]


In [11]:
res_train_mean = np.array([0.0]*12288)
res_train_mean_shuffle = np.array([0.0]*12288)
res_val_mean = np.array([0.0]*3072)
res_val_mean_shuffle = np.array([0.0]*3072)
res_test_mean = np.array([0.0]*3072*2)
best_models_indices = [0, 8, 5, 1, 17, 22]
for k in range(11):
    if k<10:
        for i in best_models_indices:
            res_train_mean += trainX_np[:, index_from_MY(i, k)]/(len(best_models_indices)*11-1)
            res_train_mean_shuffle += trainX_np_shuffled[:, index_from_MY(i, k)]/(len(best_models_indices)*11-1)
            res_val_mean += valX_np[:, index_from_MY(i, k)]/(len(best_models_indices)*11-1)
            res_val_mean_shuffle += valX_np_shuffled[:, index_from_MY(i, k)]/(len(best_models_indices)*11-1)
            res_test_mean += testX_np[:, index_from_MY(i, k)]/(len(best_models_indices)*11-1)
    else:
        for i in best_models_indices:
            if i > 0:
                res_train_mean += trainX_np[:, index_from_MY(i, k)]/(len(best_models_indices)*11-1)
                res_train_mean_shuffle += trainX_np_shuffled[:, index_from_MY(i, k)]/(len(best_models_indices)*11-1)
                res_val_mean += valX_np[:, index_from_MY(i, k)]/(len(best_models_indices)*11-1)
                res_val_mean_shuffle += valX_np_shuffled[:, index_from_MY(i, k)]/(len(best_models_indices)*11-1)
                res_test_mean += testX_np[:, index_from_MY(i, k)]/(len(best_models_indices)*11-1)
print("Training score: ", score_pred(res_train_mean, trainY_df))
print("Validation score: ", score_pred(res_val_mean, valY_df))
print("Training score with shuffling: ", score_pred(res_train_mean_shuffle, trainY_df_shuffled))
print("Validation score with shuffling: ", score_pred(res_val_mean_shuffle, valY_df_shuffled))
test_submit(res_test_mean, "MEANWithBestModels")

Training score:  -1.04054592581554
Validation score:  -1.414768156451494
Training score with shuffling:  -1.085766734500794
Validation score with shuffling:  -1.1572345529948167
Prediction saved...


Simple calculation of the mean of the predictions of all the models (should do more or less well).

In [9]:
mean_train_shuffled_predicted = np.array([0.0]*12288)
mean_val_shuffled_predicted = np.array([0.0]*3072)
mean_train_predicted = np.array([0.0]*12288)
mean_val_predicted = np.array([0.0]*3072)
mean_test_predicted = np.array([0.0]*6144)
for k in range(1, 23):
    mean_train_shuffled_predicted += trainX_np_shuffled[:, 9+11*k]/22
    mean_val_shuffled_predicted += valX_np_shuffled[:, 9+11*k]/22
    mean_train_predicted += trainX_np[:, 9+11*k]/22
    mean_val_predicted += valX_np[:, 9+11*k]/22
    mean_test_predicted+= testX_np[:, 9+11*k]/22
print("Training score: ", score_pred(mean_train_predicted, trainY_df))
print("Valid score: ", score_pred(mean_val_predicted, valY_df))
print("Training score with shuffling: ", score_pred(mean_train_shuffled_predicted, trainY_df_shuffled))
print("Valid score with shuffling: ", score_pred(mean_val_shuffled_predicted, valY_df_shuffled))
test_submit(mean_test_predicted, "MeansOfModelPrediction")


Training score:  -1.197763000299892
Valid score:  -1.460989971094912
Training score with shuffling:  -1.2336097107202142
Valid score with shuffling:  -1.2690486284708065
Prediction saved...


Plotting the dependancy of the variables and the target.

In [None]:
means_target_train, _ = prediction_reshape(trainY_np.reshape(-1, 16))
means_target_train = np.array(means_target_train)
for k in range(23):
    means_train, variances_train = prediction_reshape(res_train[k].reshape(-1, 16))
    means_train = np.array(means_train)
    diff = np.abs(means_train-means_target_train)
    plt.hist(x=diff, bins='auto', alpha=0.7, rwidth=0.9)
    plt.savefig(f"./plots/hist_model_mean{k}.png")
    print(f"hist_model_mean{k} saved")
    plt.clf()
    plt.scatter(means_train, means_target_train, alpha = 0.1)
    plt.savefig(f"./plots/scatter_model_mean{k}.png")
    print(f"scatter_model_mean{k} saved")
    plt.clf()

<Figure size 640x480 with 0 Axes>

After plotting the dependency between the variables and the target, we do some feature selection and dimensionality reduction.

First we restrain ourself to the models that perform better with only the means.

In [12]:
best_indices = []
for m in sorted(best_models_indices[:3]):
    if m > 0:
        j = 11
    else:
        j = 10
    for k in range(j):
        best_indices.append(index_from_MY(m, k))
print(best_indices)
trainX_np_normalized_best = trainX_np_normalized[:, best_indices]
valX_np_normalized_best = valX_np_normalized[:, best_indices]
trainX_np_shuffled_normalized_best = trainX_np_shuffled_normalized[:, best_indices]
valX_np_shuffled_normalized_best = valX_np_shuffled_normalized[:, best_indices]
allX_np_normalized_best = allX_np_normalized[:, best_indices]
allX_np_shuffled_normalized_best = allX_np_shuffled_normalized[:, best_indices]
testX_np_normalized_best = testX_np_normalized[:, best_indices]
testX_np_shuffled_normalized_best = testX_np_shuffled_normalized[:, best_indices]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97]


Feature Selection

RFE

In [20]:
selector_RFE = RFE(
    LinearRegression(),
    n_features_to_select=10,
    verbose=1,
)
selector_RFE.fit(trainX_np_shuffled_normalized, trainY_np_shuffled)

Fitting estimator with 254 features.
Fitting estimator with 253 features.
Fitting estimator with 252 features.
Fitting estimator with 251 features.
Fitting estimator with 250 features.
Fitting estimator with 249 features.
Fitting estimator with 248 features.
Fitting estimator with 247 features.
Fitting estimator with 246 features.
Fitting estimator with 245 features.
Fitting estimator with 244 features.
Fitting estimator with 243 features.
Fitting estimator with 242 features.
Fitting estimator with 241 features.
Fitting estimator with 240 features.
Fitting estimator with 239 features.
Fitting estimator with 238 features.
Fitting estimator with 237 features.
Fitting estimator with 236 features.
Fitting estimator with 235 features.
Fitting estimator with 234 features.
Fitting estimator with 233 features.
Fitting estimator with 232 features.
Fitting estimator with 231 features.
Fitting estimator with 230 features.
Fitting estimator with 229 features.
Fitting estimator with 228 features.
F

In [22]:
print(selector_RFE.support_)
trainX_np_shuffled_selected = selector_RFE.transform(trainX_np_shuffled_normalized)
valX_np_shuffled_selected = selector_RFE.transform(valX_np_shuffled_normalized)
testX_np_selected = selector_RFE.transform(testX_np_normalized)


[False False False False  True  True False False False  True False False
 False False False  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True  True False  True  True False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
  True  True False False False False False False Fa

Random Forests

In [15]:
model_rf = RandomForestRegressor(random_state = 1, max_depth=10)
model_rf.fit(trainX_np_normalized, trainY_np)

In [50]:
importances = model_rf.feature_importances_
print(importances)
indices = np.argsort(importances)[-9:]
features = dfData.drop(["POS", "DATA", "Y"], axis = "columns").columns
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

NameError: name 'model_rf' is not defined

Dimensionaly reduction with PCA

In [13]:
pca = PCA(n_components=254)
pca.fit(trainX_np_normalized)
#num_components = index_til_exceed(pca.explained_variance_ratio_, 0.81)+1
num_components = 10
print(num_components)
pca = PCA(n_components = num_components)
pca.fit(trainX_np_normalized)
print(pca.explained_variance_ratio_[:10])

trainX_np_transformed = pca.transform(trainX_np_normalized)
testX_np_transformed = pca.transform(testX_np_normalized)
valX_np_transformed = pca.transform(valX_np_normalized)

pca_shuffle = PCA(n_components=254)
pca.fit(trainX_np_shuffled_normalized)
#num_components_shuffle = index_til_exceed(pca.explained_variance_ratio_, 0.81)+1
num_components_shuffle = 10
print(num_components_shuffle)
pca_shuffle = PCA(n_components = num_components_shuffle)
pca.fit(trainX_np_shuffled_normalized)
trainX_np_shuffled_transformed = pca.transform(trainX_np_shuffled_normalized)
testX_np_shuffled_transformed = pca.transform(testX_np_shuffled_normalized)
valX_np_shuffled_transformed = pca.transform(valX_np_shuffled_normalized)

pca_all = PCA(n_components=20)
pca_all.fit(allX_np_normalized)
allX_np_transformed = pca_all.transform(allX_np_normalized)

pca_best = PCA()
pca_best.fit(trainX_np_normalized_best)
num_components_best = index_til_exceed(pca_best.explained_variance_ratio_, 0.81)+1
print(len(pca_best.explained_variance_ratio_), num_components_best)
pca_best = PCA(n_components=num_components_best)
pca_best.fit(trainX_np_normalized_best)
trainX_np_transformed_best = pca_best.transform(trainX_np_normalized_best)
testX_np_transformed_best = pca_best.transform(testX_np_normalized_best)
valX_np_transformed_best = pca_best.transform(valX_np_normalized_best)

pca_best_shuffle = PCA()
pca_best_shuffle.fit(trainX_np_shuffled_normalized_best)
num_components_best_shuffle = index_til_exceed(pca_best_shuffle.explained_variance_ratio_, 0.81)+1
print(len(pca_best_shuffle.explained_variance_ratio_), num_components_best_shuffle)
pca_best_shuffle = PCA(n_components=num_components_best_shuffle)
pca_best_shuffle.fit(trainX_np_shuffled_normalized_best)
trainX_np_shuffled_transformed_best = pca_best_shuffle.transform(trainX_np_shuffled_normalized_best)
testX_np_shuffled_transformed_best = pca_best_shuffle.transform(testX_np_shuffled_normalized_best)
valX_np_shuffled_transformed_best = pca_best_shuffle.transform(valX_np_shuffled_normalized_best)


10
[0.09128919 0.0598804  0.04586726 0.04156713 0.03532029 0.03212401
 0.03002951 0.02733157 0.02509556 0.02368212]
10
32 7
32 8


Now we start the part of using different models in ML to get a better result than using just the means of the data.

LR

With observed data.

In [13]:
reg = LinearRegression().fit(trainX_np[:, :10], trainY_np)
pred_train_LR = reg.predict(trainX_np[:, :10])
print(score_pred(pred_train_LR, trainY_df))
pred_val_LR = reg.predict(valX_np[:, :10])
print(score_pred(pred_val_LR, valY_df))
reg_shuffle = LinearRegression().fit(trainX_np_shuffled[:, :10], trainY_np_shuffled)
pred_train_shuffle_LR = reg_shuffle.predict(trainX_np_shuffled[:, :10])
print(score_pred(pred_train_shuffle_LR, trainY_df_shuffled))
pred_val_shuffle_LR = reg_shuffle.predict(valX_np_shuffled[:, :10])
print(score_pred(pred_val_shuffle_LR, valY_df_shuffled))

pred_test_LR = reg.predict(testX_np[:, :10])
test_submit(pred_test_LR, "LRAllObservation")
pred_test_shuffle_LR = reg_shuffle.predict(testX_np[:, :10])
test_submit(pred_test_shuffle_LR, "LRAllObservationShuffled")

-1.0016202538913725
-1.5278940367416713
-1.0804195095703328
-1.0848235088516134
Prediction saved...
Prediction saved...


With best model.

In [20]:
model_index = 9+(best_model_train-1)*11
reg_best_model = LinearRegression().fit(trainX_np[:, (model_index+1):(model_index+12)], trainY_np)
pred_train_LR_best_model = reg_best_model.predict(trainX_np[:, (model_index+1):(model_index+12)])
print("Training score: ", score_pred(pred_train_LR_best_model, trainY_df))
pred_val_LR_best_model = reg_best_model.predict(valX_np[:, (model_index+1):(model_index+12)])
print("Validation score: ", score_pred(pred_val_LR_best_model, valY_df))


model_index_shuffle = 9 +(best_model_train_shuffle-1)*11
reg_best_model_shuffle = LinearRegression().fit(trainX_np_shuffled[:, (model_index_shuffle+1):(model_index_shuffle+12)], trainY_np_shuffled)
pred_train_shuffle_LR_best_model = reg_best_model_shuffle.predict(trainX_np_shuffled[:, (model_index_shuffle+1):(model_index_shuffle+12)])
print("Training score with shuffling: ", score_pred(pred_train_shuffle_LR_best_model, trainY_df_shuffled))
pred_val_shuffle_LR_best_model = reg_best_model_shuffle.predict(valX_np_shuffled[:, (model_index_shuffle+1):(model_index_shuffle+12)])
print("Validation score with shuffling: ", score_pred(pred_val_shuffle_LR_best_model, valY_df_shuffled))


pred_test_LR_best_model = reg_best_model.predict(testX_np[:, (model_index+1):(model_index+12)])
pred_test_shuffle_LR_best_model = reg_best_model_shuffle.predict(testX_np[:, (model_index_shuffle+1):(model_index_shuffle+12)])
test_submit(pred_test_LR_best_model, "LRWithBestModelInMeans")
test_submit(pred_test_shuffle_LR_best_model, "LRWithBestModelInMeansShuffled")

Training score:  -1.020952329667872
Validation score:  -1.5450669182148626
Training score with shuffling:  -1.1037777708851229
Validation score with shuffling:  -1.1143276339536863
Prediction saved...
Prediction saved...


With Last Observation

In [62]:
reg2 = LinearRegression().fit(trainX_np[:, 9].reshape(-1, 1), trainY_np)
print()
pred_val_LR2 = reg2.predict(valX_np[:, 9].reshape(-1, 1))
print(score_pred(pred_val_LR2, valY_df))
pred_test_LR2 = reg2.predict(testX_np[:, 9].reshape(-1,1))
test_submit(pred_test_LR2, "LRLastObservation")

-1.5205237305160313
Prediction saved...


With RFE

In [23]:
lr_RFE = LinearRegression().fit(trainX_np_shuffled_selected, trainY_np_shuffled)
pred_train_LR2 = lr_RFE.predict(trainX_np_shuffled_selected)
print(score_pred(pred_train_LR2, trainY_df_shuffled))
pred_val_LR2 = lr_RFE.predict(valX_np_shuffled_selected)
print(score_pred(pred_val_LR2, valY_df_shuffled))
pred_test_LR2 = lr_RFE.predict(testX_np_selected)
test_submit(pred_test_LR2, "LR10coodSelectedByNFE")

-1.053835342596043
-1.0094850741415642
Prediction saved...


With PCA

In [30]:
lr_PCA = LinearRegression().fit(trainX_np_shuffled_transformed, trainY_np_shuffled)
pred_train_LR_PCA = lr_PCA.predict(trainX_np_shuffled_transformed)
print(score_pred(pred_train_LR_PCA, trainY_df))
pred_val_LR_PCA = lr_PCA.predict(valX_np_shuffled_transformed)
print(score_pred(pred_val_LR_PCA, valY_df))


-1.199635749297854
-1.631707772407675


With best models and PCA

In [14]:
lr_PCA_best = LinearRegression().fit(trainX_np_transformed_best, trainY_np)
pred_train_LR_PCA_best = lr_PCA_best.predict(trainX_np_transformed_best)
print("Training score: ", score_pred(pred_train_LR_PCA_best, trainY_df))
pred_val_LR_PCA_best = lr_PCA_best.predict(valX_np_transformed_best)
print("Validation score: ", score_pred(pred_val_LR_PCA_best, valY_df))

lr_PCA_best_shuffle = LinearRegression().fit(trainX_np_shuffled_transformed_best, trainY_np_shuffled)
pred_train_LR_PCA_best_shuffle=  lr_PCA_best_shuffle.predict(trainX_np_shuffled_transformed_best)
print("Training score with shuffling: ", score_pred(pred_train_LR_PCA_best_shuffle, trainY_df_shuffled))
pred_val_LR_PCA_best_shuffle=  lr_PCA_best_shuffle.predict(valX_np_shuffled_transformed_best)
print("Training score with shuffling: ", score_pred(pred_val_LR_PCA_best_shuffle, valY_df_shuffled))

Training score:  -0.9657615423778054
Validation score:  -1.4926287339354798
Training score with shuffling:  -1.0402980769626395
Training score with shuffling:  -1.0460009898659692


In [15]:
pred_test_LR_PCA_best = lr_PCA_best.predict(testX_np_transformed_best)
pred_test_LR_PCA_best_shuffle = lr_PCA_best_shuffle.predict(testX_np_shuffled_transformed_best)
test_submit(pred_test_LR_PCA_best, "LRBestModelsWithPCA")
test_submit(pred_test_LR_PCA_best_shuffle, "LRBestModelsWithPCAWithShuffling")

Prediction saved...
Prediction saved...


First we try with Light GBM (faster version of Random Forest)

Using Optuna for optimizing the hyperparameters in the LightGBM.

In [27]:
lgb_train = lgb.Dataset(trainX_np_shuffled_normalized_best, trainY_np_shuffled)
lgb_eval = lgb.Dataset(valX_np_shuffled_normalized_best, valY_np_shuffled)

In [29]:
warnings.filterwarnings("ignore")

def objective_gbm(trial):
    param = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step = 20),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
    }
    
    gbm = lgb.train(param, lgb_train)
    preds = gbm.predict(valX_np_shuffled_normalized_best)
    return score_pred(preds, valY_df_shuffled)

study = optuna.create_study(direction="maximize")
study.optimize(objective_gbm, n_trials=100)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


[32m[I 2022-11-23 18:59:13,227][0m A new study created in memory with name: no-name-a795ae6c-a603-4827-886e-b1ecd6470285[0m
[32m[I 2022-11-23 18:59:17,035][0m Trial 0 finished with value: -0.7608422777867556 and parameters: {'lambda_l1': 4.3739968499729146e-08, 'lambda_l2': 4.319102014393532e-06, 'num_leaves': 2880, 'feature_fraction': 0.601270304584327, 'bagging_fraction': 0.7915044657531347, 'bagging_freq': 3, 'learning_rate': 0.1887549522894483}. Best is trial 0 with value: -0.7608422777867556.[0m
[32m[I 2022-11-23 18:59:20,549][0m Trial 1 finished with value: -0.8940476658510765 and parameters: {'lambda_l1': 0.08075410227583978, 'lambda_l2': 1.1336773292068506e-06, 'num_leaves': 1380, 'feature_fraction': 0.6869870785874264, 'bagging_fraction': 0.8481839527716919, 'bagging_freq': 1, 'learning_rate': 0.015713628421081496}. Best is trial 0 with value: -0.7608422777867556.[0m
[32m[I 2022-11-23 18:59:24,771][0m Trial 2 finished with value: -0.784690788269442 and parameters: {

Number of finished trials: 100
Best trial:
  Value: -0.7311851574903518
  Params: 
    lambda_l1: 0.0001441040170678068
    lambda_l2: 3.6498482941844166e-08
    num_leaves: 2600
    feature_fraction: 0.5952838188136965
    bagging_fraction: 0.6805593472334575
    bagging_freq: 4
    learning_rate: 0.23040343049150605


'lambda_l1': 0.0023077719434512387, 'lambda_l2': 1.7480233081481513e-07, 'num_leaves': 3000, 'feature_fraction': 0.7886485535894862, 'bagging_fraction': 0.46644991789330914, 'bagging_freq': 1, 'learning_rate': 0.2023565800661427

In [30]:
param = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': "rmse",
    'lambda_l1': 0.0001441040170678068,
    'lambda_l2': 3.6498482941844166e-08, 
    'num_leaves': 2600, 
    'feature_fraction': 0.5952838188136965, 
    'bagging_fraction': 0.6805593472334575, 
    'bagging_freq': 4, 
    'learning_rate': 0.23040343049150605
}

gbm = lgb.train(param, lgb_train)
preds = gbm.predict(valX_np_shuffled_normalized_best)
print(score_pred(preds, valY_df_shuffled))
pred_test_gbm = gbm.predict(testX_np_shuffled_normalized_best)
test_submit(pred_test_gbm, "GBMBestModels")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 12288, number of used features: 32
[LightGBM] [Info] Start training from score 0.069410
-0.7311851574903518
Prediction saved...


Deep learning with linear fully connected models.

In [18]:

class LinearModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.fc1 = nn.Linear(input_size, 5)
        self.fc3 = nn.Linear(5, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc3(x)
        return x


In [34]:
def train_loop(model, optimizer, loss_fn, train_loader, val_loader, epochs=100, device="cuda"):
    tr_losses = []
    val_losses = []
    for epoch in range(1, epochs+1):
        training_loss = 0.0
        valid_loss = 0.0
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            inputs, targets = batch
            inputs = inputs.to(device)
            targets = targets.to(device)
            output = model(inputs)
            loss = loss_fn(output, targets)
            loss.backward()
            optimizer.step()
            training_loss += loss.data.item() * inputs.size(0)
        training_loss /= len(train_loader.dataset)
        
        model.eval()
        for batch in val_loader:
            inputs, targets = batch
            inputs = inputs.to(device)
            output = model(inputs)
            targets = targets.to(device)
            loss = loss_fn(output,targets) 
            valid_loss += loss.data.item() * inputs.size(0)
        valid_loss /= len(val_loader.dataset)

        print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}'.format(epoch, training_loss,
        valid_loss))
        tr_losses.append(training_loss)
        val_losses.append(valid_loss)
    

    

In [48]:
model1 = LinearModel(10, 1).to("cuda")
optimizer = optim.Adam(model1.parameters(), lr = 0.001)
loss_fn = nn.MSELoss()

train_loop(
    model = model1,
    optimizer = optimizer,
    loss_fn=loss_fn,
    epochs = 10,
    train_loader=train_loader_recent,
    val_loader=val_loader_recent,
)
torch.save(model1.state_dict(), './model_state/model1.pt')


Epoch: 1, Training Loss: 0.15, Validation Loss: 0.09
Epoch: 2, Training Loss: 0.08, Validation Loss: 0.07
Epoch: 3, Training Loss: 0.07, Validation Loss: 0.07
Epoch: 4, Training Loss: 0.07, Validation Loss: 0.07
Epoch: 5, Training Loss: 0.07, Validation Loss: 0.07
Epoch: 6, Training Loss: 0.07, Validation Loss: 0.07
Epoch: 7, Training Loss: 0.07, Validation Loss: 0.07
Epoch: 8, Training Loss: 0.07, Validation Loss: 0.07
Epoch: 9, Training Loss: 0.07, Validation Loss: 0.07
Epoch: 10, Training Loss: 0.07, Validation Loss: 0.07


In [54]:
pred_val_DL_Observed= model1(valX[:, :10]).cpu().detach().numpy()
print(score_pred(pred_val_DL_Observed, valY_df))
pred_test_DL_Observed = model1(testX[:, :10]).cpu().detach().numpy()
test_submit(pred_test_DL_Observed, "DLObserved")

-1.495178138635222
Prediction saved...


Using Optuna optimizing hyperparemeters in Nearal Networks.

In [32]:
DEVICE = torch.device("cuda")
BATCHSIZE = 128
CLASSES = 1
EPOCHS = 10
N_TRAIN_EXAMPLES = BATCHSIZE*30
N_VALID_EXAMPLES = BATCHSIZE*10
trainX_best = torch.from_numpy(trainX_np_shuffled_normalized_best).to("cuda").float()
valX_best = torch.from_numpy(valX_np_shuffled_normalized_best).to("cuda").float()

train_data_best = TensorDataset(trainX_best, trainY)
val_data_best = TensorDataset(valX_best, valY)

train_loader_best = DataLoader(train_data_best, batch_size = 64, shuffle = True)
val_loader_best = DataLoader(val_data_best, batch_size=64, shuffle=True)
def define_model(trial):
    n_layers = trial.suggest_int("n_layers", 1, 5)
    layers = []

    in_features = 32
    for i in range(n_layers):
        out_features = trial.suggest_int("n_units_l{}".format(i), 2, 128)
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
        in_features = out_features
    
    layers.append(nn.Linear(in_features, CLASSES))

    return nn.Sequential(*layers)

def objective(trial):
    loss_fn = nn.MSELoss()
    train_loader = train_loader_best
    val_loader = val_loader_best
    model = define_model(trial).to(DEVICE)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log = True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)


    for epoch in range(EPOCHS):
        training_loss = 0.0
        valid_loss = 0.0
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            inputs, targets = batch
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)
            output = model(inputs)
            loss = loss_fn(output, targets)
            loss.backward()
            optimizer.step()
            training_loss += loss.data.item() * inputs.size(0)
        training_loss /= len(train_loader.dataset)
        
        model.eval()
        for batch in val_loader:
            inputs, targets = batch
            inputs = inputs.to(DEVICE)
            output = model(inputs)
            targets = targets.to(DEVICE)
            loss = loss_fn(output,targets) 
            valid_loss += loss.data.item() * inputs.size(0)
        valid_loss /= len(val_loader.dataset)
        
        trial.report(valid_loss, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return valid_loss

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50, timeout=600)
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-11-23 19:11:34,039][0m A new study created in memory with name: no-name-d1d31692-315c-4169-b7fb-79a8e3b16229[0m
[32m[I 2022-11-23 19:11:46,037][0m Trial 0 finished with value: 0.057472936382206775 and parameters: {'n_layers': 5, 'n_units_l0': 121, 'n_units_l1': 86, 'n_units_l2': 124, 'n_units_l3': 12, 'n_units_l4': 90, 'optimizer': 'RMSprop', 'lr': 1.13299743555736e-05}. Best is trial 0 with value: 0.057472936382206775.[0m
[32m[I 2022-11-23 19:11:50,990][0m Trial 1 finished with value: 0.0767719130186985 and parameters: {'n_layers': 1, 'n_units_l0': 52, 'optimizer': 'SGD', 'lr': 9.474337067007181e-05}. Best is trial 0 with value: 0.057472936382206775.[0m
[32m[I 2022-11-23 19:12:03,532][0m Trial 2 finished with value: 0.047331373944568135 and parameters: {'n_layers': 3, 'n_units_l0': 40, 'n_units_l1': 54, 'n_units_l2': 87, 'optimizer': 'Adam', 'lr': 0.0010888945686495194}. Best is trial 2 with value: 0.047331373944568135.[0m
[32m[I 2022-11-23 19:12:15,025][0m T

Study statistics: 
  Number of finished trials:  50
  Number of pruned trials:  25
  Number of complete trials:  25
Best trial:
  Value:  0.04088609882940849
  Params: 
    n_layers: 3
    n_units_l0: 128
    n_units_l1: 86
    n_units_l2: 29
    optimizer: RMSprop
    lr: 0.0003557540548570621


Best trial:
  Value:  0.04088609882940849
  Params: 
    n_layers: 3
    n_units_l0: 128
    n_units_l1: 86
    n_units_l2: 29
    optimizer: RMSprop
    lr: 0.0003557540548570621


In [35]:
class LinearOptimized(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearOptimized, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128 , 86)
        self.fc3 = nn.Linear(86, 29)
        self.fc4 = nn.Linear(29, 1)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x
model2 = LinearOptimized(32, 1).to(DEVICE)
optimizer2 = optim.RMSprop(model2.parameters(), lr = 0.0003557540548570621)
loss_fn = nn.MSELoss()
train_loop(
    model = model2,
    optimizer = optimizer2,
    loss_fn=loss_fn,
    epochs = 10,
    train_loader=train_loader_best,
    val_loader=val_loader_best,
)
torch.save(model2.state_dict(), './model_state/model2.pt')

    

Epoch: 1, Training Loss: 0.06, Validation Loss: 0.05
Epoch: 2, Training Loss: 0.04, Validation Loss: 0.06
Epoch: 3, Training Loss: 0.04, Validation Loss: 0.04
Epoch: 4, Training Loss: 0.03, Validation Loss: 0.05
Epoch: 5, Training Loss: 0.03, Validation Loss: 0.04
Epoch: 6, Training Loss: 0.03, Validation Loss: 0.04
Epoch: 7, Training Loss: 0.03, Validation Loss: 0.04
Epoch: 8, Training Loss: 0.02, Validation Loss: 0.04
Epoch: 9, Training Loss: 0.02, Validation Loss: 0.05
Epoch: 10, Training Loss: 0.02, Validation Loss: 0.04


In [37]:
pred_val_DL_optimized_best = model2(valX_best).cpu().detach().numpy()
print(score_pred(pred_val_DL_optimized_best, valY_df_shuffled))

testX_best = torch.from_numpy(testX_np_shuffled_normalized_best).to("cuda").float()
pred_test_DL_optimized_best = model2(testX_best).cpu().detach().numpy()
test_submit(pred_test_DL_optimized_best, "DLObservedOptimizedWithBestModels")

-0.7889092497712737
Prediction saved...
