In [134]:
# import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error

In [135]:
# load data set
df_train = pd.read_csv('train.csv')
y = pd.DataFrame(df_train['y'])
X = pd.DataFrame(df_train.drop(df_train.columns[[0,1]],axis=1))

In [136]:
# feature transformation

# quadratic x6-x10
X_squared = X**2
X_squared.columns=['x6', 'x7','x8', 'x9', 'x10']
X_trafo = pd.concat([X, X_squared],axis=1)

# exponential x11-x15
X_exp = np.exp(X)
X_exp.columns=['x11', 'x12','x13', 'x14', 'x15']
X_trafo = pd.concat([X_trafo, X_exp],axis=1)

# cosine x16-x20
X_cos = np.cos(X)
X_cos.columns=['x16', 'x17','x18', 'x19', 'x20']
X_trafo = pd.concat([X_trafo, X_cos],axis=1)

# constant x21
X_trafo['x21'] = np.ones_like(700)


In [137]:
# find optimal alpha for ridge regression (from machinelearningmastery.com)

model = Ridge()

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

grid = dict()
grid['alpha'] = [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 5, 10, 50, 100, 250, 500, 1000, 2000]

search = GridSearchCV(model, grid, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)

results = search.fit(X, y)

print('MSE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

MSE: -1.955
Config: {'alpha': 1}


In [138]:
# refine search

model = Ridge()

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

grid = dict()
grid['alpha'] = np.arange(0.5, 5, 0.1)
search = GridSearchCV(model, grid, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)

results = search.fit(X, y)

print('Neg. RMSE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

Neg. RMSE: -1.955
Config: {'alpha': 1.8999999999999997}


In [139]:
# split train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_trafo, y, test_size=0.2, random_state=42)

# fit model and predict
ridge_regr = Ridge(alpha=results.best_score_).fit(X_train, y_train)
y_pred = ridge_regr.predict(X_test)

print(ridge_regr.score(X_train,y_train))

# write the predictions to the submission file

weights_df=pd.DataFrame(data=np.transpose(ridge_regr.coef_))
weights_df.to_csv('submission.csv',index=False, header=False)

0.01600956907090545
