# Introduction to Machine Learning: Task 1a

__Author__: Jannick Sicher

### Initial Configurations and Packages

In [1]:
# Load packages
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import KFold

### Data Preparation

In [2]:
# Read the training set from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer="train.csv")
df.head(n=5)

Unnamed: 0,Id,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,0,508.45197,0.886865,0.400531,1.103694,0.135455,8.148069,9.787555,79.749674,0.045166,1.859346,1.0
1,1,3972.980713,0.303604,-1.661598,0.772695,1.696182,0.45555,86.413653,39.365705,-0.014858,0.821406,1.0
2,2,-1.084332,1.065927,-1.60732,0.063896,-0.724311,-0.088216,0.132828,-0.011718,-0.773193,-1.489933,1.0
3,3,-340.149697,0.04953,-0.78082,-1.653181,0.663369,-2.492097,-1.243722,3.099476,-1.037881,0.346979,1.0
4,4,572.648651,0.417725,-1.150429,0.258384,0.045522,5.676019,-0.470274,-2.669283,0.632083,-0.510847,1.0


In [3]:
# Predictors: every column of the training frame except the row id and the target
Xs = df.drop(columns=['Id', 'y'])
# Target (dependent variable): the 'y' column
y = df.loc[:, "y"]

### Grid Search Cross Validation

In [4]:
# Candidate regularization strengths, keyed by the Ridge parameter they tune
parameters = {
    'alpha': [0.1, 1.0, 10.0, 100.0, 1000.0],
}

# Base estimator: ridge regression fitted without an intercept term
ridge = Ridge(fit_intercept=False)

# Exhaustive search over the alpha grid, scored by negative MSE with cv=10
ridge_regressor = GridSearchCV(
    estimator=ridge,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=10,
)

# Fit on the full training data; the fitted search object is the cell's output
ridge_regressor.fit(Xs, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [5]:
# Turn the cross-validated scores into one RMSE value per alpha.
# The search was scored with 'neg_mean_squared_error', so the magnitude of each
# mean test score is a mean squared error; its square root is reported as the RMSE.
# NOTE(review): this is the square root of the fold-averaged MSE, which is not the
# same quantity as the mean of per-fold RMSEs computed in the K-Fold section of
# this notebook — confirm which one the submission is supposed to contain.
means = ridge_regressor.cv_results_["mean_test_score"]
mse_per_alpha = np.abs(means)
root_mean_squared_error = np.sqrt(mse_per_alpha)

In [6]:
# Wrap the per-alpha RMSE values in a single-column frame (default column labels)
# and preview it
submission = pd.DataFrame(root_mean_squared_error)
submission.head(n=5)

Unnamed: 0,0
0,1.010268
1,1.007159
2,1.023893
3,3.306179
4,31.690946


In [7]:
# Write the RMSE values to disk as bare numbers: no index column, no header row
submission.to_csv(path_or_buf='submission.csv', header=False, index=False)

### K-Fold Cross Validation

In [8]:
# Re-derive the modelling inputs from the training frame for this section.
# Target (dependent variable): the 'y' column
y = df.loc[:, "y"]
# Predictors: all remaining columns once the row id and the target are removed
Xs = df.drop(labels=['Id', 'y'], axis='columns')

In [9]:
# Number of cross-validation folds
k_folds = 10

# Unshuffled K-fold splitter: folds are consecutive blocks of rows.
# No random_state is passed because it has no effect when shuffle=False, and
# newer scikit-learn releases raise a ValueError for that combination.
kf = KFold(n_splits=k_folds, shuffle=False)

# Display the number of splitting iterations as the cell output
kf.get_n_splits(Xs)

10

In [10]:
# Regularization strengths (lambda) to evaluate
lambdas = [0.1, 1.0, 10.0, 100.0, 1000.0]

# Average RMSE per lambda, collected in the same order as `lambdas`
average_rmses = []

# For every lambda: fit a ridge model on each training split, score it on the
# held-out split, and average the per-fold RMSE values.
# NOTE(review): Ridge is created with its default intercept setting here, whereas
# the grid-search section of this notebook passes fit_intercept=False — confirm
# which of the two is intended.
for lambdavalue in lambdas:
    fold_rmses = []
    for train_index, test_index in kf.split(Xs):
        X_train, X_test = Xs.iloc[train_index], Xs.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        ridge = Ridge(alpha=lambdavalue).fit(X_train, y_train)
        y_pred = ridge.predict(X_test)
        # RMSE of this fold = square root of the fold's mean squared error
        fold_rmse = mean_squared_error(y_test, y_pred) ** 0.5
        fold_rmses.append(fold_rmse)
    # Divide by the number of folds actually iterated over
    Average_RMSE = sum(fold_rmses) / len(fold_rmses)
    average_rmses.append(Average_RMSE)
    print("Average RMSE: " + str(Average_RMSE))

# Keep the per-lambda averages available to later cells instead of discarding them
RMSE = np.array(average_rmses)

Average RMSE: 1.005938404435637
Average RMSE: 1.0026002531475364
Average RMSE: 1.0197494069487274
Average RMSE: 2.5762847054281655
Average RMSE: 21.894475155996453
