# Introduction to Machine Learning: Task 1b

__Author__: Jannick Sicher

### Initial Configurations and Packages

In [2]:
# Load packages
import numpy as np
import matplotlib.pyplot as plot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import datasets, linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

In [3]:
# Load data
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,Id,y,x1,x2,x3,x4,x5
0,0,-5.522114,1.764052,0.400157,0.978738,2.240893,1.867558
1,1,-21.78998,-0.977278,0.950088,-0.151357,-0.103219,0.410599
2,2,-7.911497,0.144044,1.454274,0.761038,0.121675,0.443863
3,3,-3.698062,0.333674,1.494079,-0.205158,0.313068,-0.854096
4,4,-16.001833,-2.55299,0.653619,0.864436,-0.742165,2.269755


### Define Target and Predictor Variables

In [4]:
# Define Target Variable
y = df["y"] # define the target variable (dependent variable) as y

# Define Predictor Variables
x1 = df["x1"]
x2 = df["x2"]
x3 = df["x3"]
x4 = df["x4"]
x5 = df["x5"]

# Define all Variable Transformations
Xs = np.column_stack((x1, x2, x3, x4, x5, x1**2, x2**2, x3**2, x4**2, x5**2,
                       np.exp(x1),np.exp(x2),np.exp(x3),np.exp(x4),np.exp(x5), 
                        np.cos(x1),np.cos(x2),np.cos(x3),np.cos(x4),np.cos(x5),
                        np.ones(df.shape[0])))

###  Linear Regression

In [5]:
# Fit linear regression model
model = linear_model.LinearRegression(fit_intercept = False, normalize = False)
parameters = {'fit_intercept':[False], 'normalize':[False], 'copy_X':[False]}

# Build Linear Model with GridsearchCV
grid = GridSearchCV(model, parameters, cv=50)
grid.fit(Xs, y)
print("r2 / variance : ", grid.best_score_)
print("Residual sum of squares: %.2f" % np.mean((grid.predict(Xs) - y) ** 2))

## Access and store the best estimators
coefficients = grid.best_estimator_.coef_
submission = pd.DataFrame(data = coefficients, columns = None)
submission.to_csv('submission_Linear.csv', header = False, index = False)
submission

r2 / variance :  0.0275043979998894
Residual sum of squares: 95.10


Unnamed: 0,0
0,0.621003
1,-1.880502
2,0.384095
3,-0.441386
4,0.386888
5,-0.361083
6,0.391405
7,0.175731
8,-2.719074
9,2.234341


###  Ridge Regression

In [8]:
# Define parameters for Ridge Regression
parameters = [0.1, 1.0, 5.0, 10.0, 100.0, 150.0, 200.0, 250.0, 500.0, 750.0, 1000.0]

# Define Target and Predictor Variables
# Define Target Variable
y = df["y"] # define the target variable (dependent variable) as y

# Define Predictor Variables
x1 = df["x1"]
x2 = df["x2"]
x3 = df["x3"]
x4 = df["x4"]
x5 = df["x5"]

# Define all Variable Transformations
X = np.column_stack((x1, x2, x3, x4, x5, x1**2, x2**2, x3**2, x4**2, x5**2,
                       np.exp(x1),np.exp(x2),np.exp(x3),np.exp(x4),np.exp(x5), 
                        np.cos(x1),np.cos(x2),np.cos(x3),np.cos(x4),np.cos(x5),
                        np.ones(df.shape[0])))

In [18]:
# K-Fold Cross Validation
kf = KFold(n_splits= 10, random_state = 42)
kf.get_n_splits(X)

# For loop for Ridge Regression
for parameter in parameters:
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        ridge = Ridge(alpha = parameter, fit_intercept = False).fit(X_train,y_train)
        y_pred = ridge.predict(X_test)
        RMSE = mean_squared_error(y_pred, y_test)**0.5
    ridge = Ridge(alpha = parameter, fit_intercept = False).fit(X,y)
    print ('coefficients', ridge.coef_)
    print('RMSE', RMSE)

coefficients [ 0.61762984 -1.88229179  0.3813551  -0.43683927  0.38161901 -0.39712944
  0.35504523  0.16427795 -2.7104222   2.2092493   1.18116461  0.40509532
 -1.26237762  0.61234526 -0.33617798  3.00338141  1.73453392  0.93355747
 -6.07605683  3.89894139 -6.04314073]
RMSE 9.53880779422458
coefficients [ 0.59919306 -1.88485659  0.36124235 -0.37954175  0.34206535 -0.62446289
  0.13906265  0.08915228 -2.58182476  2.03360456  1.19655256  0.40485332
 -1.24957355  0.57501686 -0.31134547  2.34473675  1.09565677  0.72006588
 -5.76898595  3.39294339 -4.4652457 ]
RMSE 9.542495135303842
coefficients [ 0.59579305 -1.81038073  0.30815317 -0.14034298  0.23752641 -0.98979187
 -0.12701751 -0.05980985 -1.99475698  1.62407009  1.20712014  0.35053798
 -1.21634849  0.41857975 -0.24629334  1.24792607  0.20144866  0.32785049
 -4.34186724  2.23398022 -2.4021687 ]
RMSE 9.554121972560278
coefficients [ 0.62508804 -1.69299116  0.26960425  0.02930066  0.16841252 -1.12429614
 -0.16878671 -0.14334656 -1.56613319