In [1]:
import matplotlib.pyplot as plt 
plt.style.use("ggplot") 
import numpy as np 
import pandas as pd  
import seaborn as sns 
import xgboost as xgb 
from scipy.stats import spearmanr 
from sklearn.impute import SimpleImputer 
from sklearn.model_selection import KFold  
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm


In [2]:
# Load the feature table and the target table from disk, then join them
# row-by-row on their shared "ID" column (inner join, the pandas default).
X = pd.read_csv("data/X_train_NHkHMNU.csv")
y = pd.read_csv("data/y_train_ZAN5mwg.csv")
data = X.merge(y, on="ID")
data

Unnamed: 0,ID,DAY_ID,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,DE_FR_EXCHANGE,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_NET_IMPORT,...,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET,TARGET
0,1054,206,FR,0.210099,-0.427458,-0.606523,0.606523,,0.692860,,...,-0.172680,-0.556356,-0.790823,-0.283160,-1.069070,-0.063404,0.339041,0.124552,-0.002445,0.028313
1,2049,501,FR,-0.022399,-1.003452,-0.022063,0.022063,-0.573520,-1.130838,0.573520,...,-1.240300,-0.770457,1.522331,0.828412,0.437419,1.831241,-0.659091,0.047114,-0.490365,-0.112516
2,1924,687,FR,1.395035,1.978665,1.021305,-1.021305,-0.622021,-1.682587,0.622021,...,-0.480700,-0.313338,0.431134,0.487608,0.684884,0.114836,0.535974,0.743338,0.204952,-0.180840
3,297,720,DE,-0.983324,-0.849198,-0.839586,0.839586,-0.270870,0.563230,0.270870,...,-1.114838,-0.507570,-0.499409,-0.236249,0.350938,-0.417514,0.911652,-0.296168,1.073948,-0.260356
4,1101,818,FR,0.143807,-0.617038,-0.924990,0.924990,,0.990324,,...,-0.541465,-0.424550,-1.088158,-1.011560,0.614338,0.729495,0.245109,1.526606,2.614378,-0.071733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1489,459,809,DE,1.529204,1.106682,-1.855327,1.855327,-0.218658,1.450426,0.218658,...,,,,,,,0.876984,0.819520,1.320373,-0.172597
1490,1674,887,FR,1.618582,1.752840,0.611392,-0.611392,0.449153,-0.152146,-0.449153,...,,,,,,,0.932633,-0.085690,0.356356,-0.063546
1491,748,1083,DE,0.856399,0.489199,-0.255778,0.255778,-1.531544,-0.829568,1.531544,...,0.207905,0.404763,-0.594595,0.894011,0.256338,0.402316,-1.112899,-0.237835,0.067152,0.151797
1492,1454,1133,FR,0.560689,-0.343777,-0.830239,0.830239,-0.304856,1.210230,0.304856,...,-0.682815,-0.390304,-0.972088,-1.501930,1.215528,1.338708,0.962812,-5.392852,-0.843812,-0.640917


In [3]:
def preprocess(X, imputer_func, scaler_func=None):
    """One-hot encode, impute and optionally scale a feature frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw features; categorical columns are expanded with ``pd.get_dummies``.
    imputer_func : callable
        Called once with the *whole* encoded frame and expected to return an
        array-like of the same shape with missing values filled in (for
        example ``imputer.fit_transform`` on training data and
        ``imputer.transform`` on held-out data).
    scaler_func : callable, optional
        If given, it is applied to the imputed frame and its return value is
        returned unchanged.

    Returns
    -------
    The imputed DataFrame when ``scaler_func`` is None, otherwise whatever
    ``scaler_func`` returns.

    Raises
    ------
    ValueError
        If ``imputer_func`` returns something whose shape differs from the
        encoded frame (e.g. because it dropped a column).
    """
    X = pd.get_dummies(X)

    # Always hand the imputer every column. Selecting only the columns that
    # happen to contain NaN in *this* frame would give the imputer a different
    # column set at fit time and at transform time whenever the two frames
    # differ in which columns have gaps (and zero columns when nothing is
    # missing).
    imputed = np.asarray(imputer_func(X))
    if imputed.shape != X.shape:
        raise ValueError(
            f"imputer returned shape {imputed.shape}, expected {X.shape}"
        )

    # Only write back into the columns that actually had gaps so the remaining
    # columns keep their original values and dtypes.
    has_nan = X.isna().any().to_numpy()
    if has_nan.any():
        X.loc[:, has_nan] = imputed[:, has_nan]

    if scaler_func is not None:
        X = scaler_func(X)
    return X

def train(model, data, n_splits, imputer=SimpleImputer(), scaler=StandardScaler()):
    """Cross-validate ``model`` on ``data`` and return one score per fold.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fit again
        on every fold.
    data : pandas.DataFrame
        Feature columns plus a ``"TARGET"`` column.
    n_splits : int
        Number of folds passed to ``KFold``.
    imputer, scaler : objects exposing ``fit_transform`` and ``transform``
        Fit on the training part of each fold and only applied to the held-out
        part. The default instances are created once, when the function is
        defined, and are re-fit on every fold.

    Returns
    -------
    list
        The value returned by ``evaluate`` for each fold, in fold order.
    """
    # Encode categoricals once, before splitting, so every fold sees the same
    # columns even if a category is absent from one of the folds.
    features = pd.get_dummies(data.drop("TARGET", axis=1))
    target = data["TARGET"]

    kf = KFold(n_splits=n_splits)
    results = []
    for train_idx, test_idx in tqdm(kf.split(features), total=n_splits):
        # KFold yields positional indices, so select rows by position; label
        # based lookup is only equivalent for a default 0..n-1 index.
        X_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
        X_test, y_test = features.iloc[test_idx], target.iloc[test_idx]

        X_train = preprocess(X_train, imputer.fit_transform, scaler.fit_transform)
        X_test = preprocess(X_test, imputer.transform, scaler.transform)

        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        results.append(evaluate(preds, y_test))
    return results

def evaluate(preds, y_test):
    """Return the Spearman rank correlation between predictions and targets.

    ``preds`` is any sequence accepted by ``scipy.stats.spearmanr``;
    ``y_test`` must expose ``.values`` (e.g. a pandas Series).
    """
    truth = y_test.values
    outcome = spearmanr(preds, truth)
    return outcome.correlation


In [6]:
from sklearn.svm import SVR, LinearSVR

# Polynomial-kernel SVR is constructed here but not evaluated in this cell.
svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)
# Fix the solver's random state so the fold scores below are reproducible
# when the notebook is re-run.
svm_linear_reg = LinearSVR(epsilon=1.5, random_state=0)
# 5-fold cross-validation; one Spearman correlation per fold.
results = train(svm_linear_reg, data, 5)
results

5it [00:01,  3.41it/s]


[0.23247629960767444,
 0.19808623824381047,
 0.18802235102349812,
 0.20706810251937266,
 0.18536354544524952]