In [38]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [39]:
# Load the preprocessed training data. The column that arrives labelled
# 'Unnamed: 0' is relabelled 'Id' so it can be referred to by name later.
traindf = (
    pd.read_csv('../data/preprocessed/train_preprocessed.csv')
    .rename(columns={'Unnamed: 0': 'Id'})
)
traindf  # bare last expression -> rich preview of the frame


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,0.073375,-0.045532,-0.226101,-0.207142,0.064238,0.750731,0.314667,-0.02618,0.604670,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777,0.313867,0.208502,0.347273
1,2,-0.872563,-0.045532,0.455190,-0.091886,0.064238,0.750731,0.314667,-0.02618,-0.628316,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.489110,-0.614439,0.313867,0.208502,0.007288
2,3,0.073375,-0.045532,-0.089843,0.073480,0.064238,-1.378933,0.314667,-0.02618,0.604670,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777,0.313867,0.208502,0.536154
3,4,0.309859,-0.045532,-0.453198,-0.096897,0.064238,-1.378933,0.314667,-0.02618,-1.861302,...,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655,0.313867,-3.426284,-0.515281
4,5,0.073375,-0.045532,0.636868,0.375148,0.064238,-1.378933,0.314667,-0.02618,-0.628316,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777,0.313867,0.208502,0.869843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,0.073375,-0.045532,-0.362359,-0.260560,0.064238,0.750731,0.314667,-0.02618,0.604670,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.620891,-0.614439,0.313867,0.208502,-0.074560
1456,1457,-0.872563,-0.045532,0.682287,0.266407,0.064238,0.750731,0.314667,-0.02618,0.604670,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,1.645210,0.313867,0.208502,0.366161
1457,1458,0.309859,-0.045532,-0.180681,-0.147810,0.064238,0.750731,0.314667,-0.02618,0.604670,...,-0.359325,-0.116339,-0.270208,-0.068692,4.953112,-0.489110,1.645210,0.313867,0.208502,1.077611
1458,1459,-0.872563,-0.045532,-0.089843,-0.080160,0.064238,0.750731,0.314667,-0.02618,0.604670,...,1.473789,-0.116339,-0.270208,-0.068692,-0.087688,-0.859110,1.645210,0.313867,0.208502,-0.488523


In [40]:
# Separate the regression target from the predictors: 'SalePrice' becomes the
# label vector, and both 'Id' and 'SalePrice' are removed from the inputs.
labels = traindf['SalePrice']
features = traindf.drop(['Id', 'SalePrice'], axis=1)
features  # preview the predictor matrix

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.073375,-0.045532,-0.226101,-0.207142,0.064238,0.750731,0.314667,-0.02618,0.604670,-0.225716,...,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777,0.313867,0.208502
1,-0.872563,-0.045532,0.455190,-0.091886,0.064238,0.750731,0.314667,-0.02618,-0.628316,-0.225716,...,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.489110,-0.614439,0.313867,0.208502
2,0.073375,-0.045532,-0.089843,0.073480,0.064238,-1.378933,0.314667,-0.02618,0.604670,-0.225716,...,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777,0.313867,0.208502
3,0.309859,-0.045532,-0.453198,-0.096897,0.064238,-1.378933,0.314667,-0.02618,-1.861302,-0.225716,...,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655,0.313867,-3.426284
4,0.073375,-0.045532,0.636868,0.375148,0.064238,-1.378933,0.314667,-0.02618,-0.628316,-0.225716,...,0.563760,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777,0.313867,0.208502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.073375,-0.045532,-0.362359,-0.260560,0.064238,0.750731,0.314667,-0.02618,0.604670,-0.225716,...,-0.100558,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.620891,-0.614439,0.313867,0.208502
1456,-0.872563,-0.045532,0.682287,0.266407,0.064238,0.750731,0.314667,-0.02618,0.604670,-0.225716,...,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,1.645210,0.313867,0.208502
1457,0.309859,-0.045532,-0.180681,-0.147810,0.064238,0.750731,0.314667,-0.02618,0.604670,-0.225716,...,0.201405,-0.359325,-0.116339,-0.270208,-0.068692,4.953112,-0.489110,1.645210,0.313867,0.208502
1458,-0.872563,-0.045532,-0.089843,-0.080160,0.064238,0.750731,0.314667,-0.02618,0.604670,-0.225716,...,-0.704483,1.473789,-0.116339,-0.270208,-0.068692,-0.087688,-0.859110,1.645210,0.313867,0.208502


In [41]:
# Read the preprocessed test set. `testdf` keeps every column as loaded;
# `test` is the same frame minus 'Id' and is what the models predict on.
testdf = pd.read_csv('../data/preprocessed/test_preprocessed.csv')
test = testdf.drop('Id', axis=1)

## Linear Regression

In [42]:
from sklearn.linear_model import LinearRegression

In [43]:
# Plain linear regression fitted on every training row (no hold-out split
# is made in this cell).
LinearRegr = LinearRegression()
LinearRegr.fit(X=features, y=labels)

In [44]:
# Predictions for each test row, in the same units as `labels`.
# NOTE(review): the result is only displayed, not stored — confirm that is intended.
LinearRegr.predict(X=test)

array([-0.94749952, -0.33022001, -0.18269322, ..., -0.39958445,
       -0.73403882,  0.79649017])

## Ridge Regression

In [45]:
from sklearn.linear_model import Ridge

In [46]:
# Ridge regression fitted on every training row; alpha=10 sets the
# regularisation strength.
# NOTE(review): alpha is hard-coded — no tuning is visible in this notebook.
RidgeRegr = Ridge(alpha=10)
RidgeRegr.fit(X=features, y=labels)

In [47]:
# Ridge predictions for each test row, in the same units as `labels`.
# NOTE(review): the result is only displayed, not stored — confirm that is intended.
RidgeRegr.predict(X=test)

array([-0.94199083, -0.33407856, -0.17822429, ..., -0.39077502,
       -0.74097702,  0.79412926])

## Lasso Regression

In [48]:
from sklearn.linear_model import Lasso

In [49]:
# Lasso regression fitted on every training row; alpha=0.001 sets the
# regularisation strength.
# NOTE(review): alpha is hard-coded — no tuning is visible in this notebook.
LassoRegr = Lasso(alpha=0.001)
LassoRegr.fit(X=features, y=labels)

In [50]:
# Lasso predictions for each test row, in the same units as `labels`.
# NOTE(review): the result is only displayed, not stored — confirm that is intended.
LassoRegr.predict(X=test)

array([-0.93203818, -0.30556796, -0.17875908, ..., -0.39477137,
       -0.77037206,  0.78456689])