###  1: Linear Models

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

x_train = pd.read_csv("x_train.csv")
x_train_scaled = pd.read_csv("x_train_scaled.csv")
y_train = pd.read_csv("y_train.csv")

x_test = pd.read_csv("x_test.csv")
x_test_scaled = pd.read_csv("x_test_scale.csv")
y_test = pd.read_csv("y_test.csv")


In [8]:
x_train.shape


(3701, 1445)

In [9]:
y_train.shape

(3701, 1)

In [10]:
x_test.shape

(1216, 1043)

In [11]:
y_test.shape

(1216, 1)

In [12]:
additional_test_columns = [x for x in x_test.columns if x not in x_train.columns]
additional_training_columns = test_columns = [x for x in x_train.columns if x not in x_test.columns]

for column in additional_training_columns:
    x_test[column] = 0
for column in additional_test_columns:
    x_train[column] = 0


In [13]:
x_train.shape


(3701, 1584)

In [14]:
x_test.shape

(1216, 1584)

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet

### Ridge

In [16]:
#Scaling does not affect the accuracy of Linear Regression

param_grid_ridge = {'alpha': np.logspace(6,8,10)}
print(param_grid_ridge)

{'alpha': array([  1.00000000e+06,   1.66810054e+06,   2.78255940e+06,
         4.64158883e+06,   7.74263683e+06,   1.29154967e+07,
         2.15443469e+07,   3.59381366e+07,   5.99484250e+07,
         1.00000000e+08])}


In [17]:
ridge = GridSearchCV(Ridge(), param_grid_ridge, return_train_score=True, iid=False)
ridge.fit(x_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=False, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+06,   1.66810e+06,   2.78256e+06,   4.64159e+06,
         7.74264e+06,   1.29155e+07,   2.15443e+07,   3.59381e+07,
         5.99484e+07,   1.00000e+08])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [18]:
print(ridge.best_params_)
print(ridge.best_score_)

{'alpha': 1000000.0}
0.816193830078


In [19]:
ridge.score(x_test, y_test)

0.76814159850602481

### 2 : Feature Engineering

In [35]:
x_train_num = x_train.iloc[:, 0:10]
x_test_num = x_test.iloc[:, 0:10]

In [36]:
#x_train_num.info()
#x_test_num.info()

In [37]:
x_train_categorical = x_train.iloc[:, 10:]
x_test_categorical = x_test.iloc[:, 10:]

#x_train_categorical.info()
#x_test_categorical.info()

In [38]:
from sklearn.preprocessing import PolynomialFeatures

In [39]:
poly = PolynomialFeatures()
x_train_num_poly = poly.fit_transform(x_train_num)
x_train_num_poly = pd.DataFrame(x_train_num_poly)
x_train_num_poly.shape


(3701, 66)

In [40]:
poly = PolynomialFeatures()
x_test_num_poly = poly.fit_transform(x_test_num)
x_test_num_poly = pd.DataFrame(x_test_num_poly)
x_test_num_poly.shape

(1216, 66)

In [41]:
x_train_poly = pd.concat([x_train_num_poly,x_train_categorical ], axis=1)
x_train_poly.shape

(3701, 1640)

In [42]:
x_test_poly = pd.concat([x_test_num_poly,x_test_categorical ], axis=1)
x_test_poly.shape

(1216, 1640)

### Ridge on Polynomial features

In [43]:
param_grid_ridge = {'alpha': np.logspace(6,8,10)}
print(param_grid_ridge)

{'alpha': array([  1.00000000e+06,   1.66810054e+06,   2.78255940e+06,
         4.64158883e+06,   7.74263683e+06,   1.29154967e+07,
         2.15443469e+07,   3.59381366e+07,   5.99484250e+07,
         1.00000000e+08])}


In [44]:
ridge = GridSearchCV(Ridge(), param_grid_ridge, return_train_score=True, iid=False)
ridge.fit(x_train_poly, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=False, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+06,   1.66810e+06,   2.78256e+06,   4.64159e+06,
         7.74264e+06,   1.29155e+07,   2.15443e+07,   3.59381e+07,
         5.99484e+07,   1.00000e+08])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [45]:
print(ridge.best_params_)
print(ridge.best_score_)

{'alpha': 1000000.0}
0.929178931816


In [46]:
ridge.score(x_test_poly, y_test)

0.89509570142264938