## Pre-processing
- One-hot encode categorical variables.
- Train/test split your data.
- Scale your data.
- Consider using automated feature selection.

In [1]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pylab as plt
import pandas as pd

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge, RidgeCV ,Lasso, LassoCV, ElasticNet, ElasticNetCV

%matplotlib inline

  import pandas.util.testing as tm


In [2]:
train_df = pd.read_csv('../datasets/train_df.csv', index_col=0)
test_df = pd.read_csv('../datasets/test_df.csv', index_col=0)

In [3]:
train_df.shape

(2049, 81)

In [4]:
test_df.shape

(878, 80)

In [5]:
train_df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [6]:
test_df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2049 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2049 non-null   int64  
 1   PID              2049 non-null   int64  
 2   MS SubClass      2049 non-null   int64  
 3   MS Zoning        2049 non-null   object 
 4   Lot Frontage     1719 non-null   float64
 5   Lot Area         2049 non-null   int64  
 6   Street           2049 non-null   object 
 7   Alley            140 non-null    object 
 8   Lot Shape        2049 non-null   object 
 9   Land Contour     2049 non-null   object 
 10  Utilities        2049 non-null   object 
 11  Lot Config       2049 non-null   object 
 12  Land Slope       2049 non-null   object 
 13  Neighborhood     2049 non-null   object 
 14  Condition 1      2049 non-null   object 
 15  Condition 2      2049 non-null   object 
 16  Bldg Type        2049 non-null   object 
 17  House Style   

Running the first Linear Regression model with only those that have shown greater than 0.5 correlation to target 'SalesPrice'

In [8]:
X = train_df[['Gr Liv Area', 'Garage Area', 'Total Bsmt SF', '1st Flr SF']]
y = train_df['SalePrice']

In [9]:
X.shape

(2049, 4)

In [10]:
y.shape

(2049,)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y)
X_train.shape

(1536, 4)

In [12]:
ss = StandardScaler()

In [13]:
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [14]:
lr = LinearRegression()
lasso = LassoCV(n_alphas=200)

In [15]:
lr.scores = cross_val_score(lr, X_train_sc, y_train, cv=5)
lr.scores

array([0.77694597, 0.6032808 , 0.73222686, 0.66936746, 0.70067381])

In [16]:
lr.scores.mean()

0.6964989811008853

In [17]:
lasso_scores = cross_val_score(lasso, X_train_sc, y_train, cv=5)
lasso_scores

array([0.77644228, 0.60445761, 0.73447832, 0.66944124, 0.70017315])

In [18]:
lasso_scores.mean()

0.6969985188370786

In [19]:
lasso.fit(X_train_sc, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=200, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [20]:
lasso.score(X_train_sc, y_train)

0.7005131000431377

In [21]:
lasso.score(X_test_sc, y_test)

0.5758161445496195

In [22]:
X = train_df[['Gr Liv Area', 'Garage Area', 'Total Bsmt SF']]
y = train_df['SalePrice']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [24]:
ss = StandardScaler()

In [25]:
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [26]:
lr = LinearRegression()
lasso = LassoCV(n_alphas=200)

In [27]:
lr.scores = cross_val_score(lr, X_train_sc, y_train, cv=5)
lr.scores

array([0.67523789, 0.73466324, 0.43632809, 0.59459974, 0.70279738])

In [28]:
lr.scores.mean()

0.6287252671662982

In [29]:
lasso_scores = cross_val_score(lasso, X_train_sc, y_train, cv=5)
lasso_scores

array([0.67328854, 0.7334135 , 0.44158153, 0.59769072, 0.69781865])

In [30]:
lasso_scores.mean()

0.6287585899582712

In [31]:
lasso.fit(X_train_sc, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=200, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [32]:
lasso.score(X_train_sc, y_train)

0.6486081796305901

In [33]:
lasso.score(X_test_sc, y_test)

0.7122486608841978

In [34]:
y_pred = lasso.predict(X_test_sc)

In [35]:
test_df = pd.read_csv('../datasets/test_df.csv')

In [36]:
X_test = test_df[['Gr Liv Area', 'Garage Area', 'Total Bsmt SF']]

In [37]:
predictions = lasso.predict(X_test)

In [38]:
print(predictions)

[9.24355761e+07 1.15959672e+08 7.09854255e+07 6.17107887e+07
 8.46416838e+07 5.26569249e+07 5.96854641e+07 7.19631103e+07
 8.43775629e+07 7.60411055e+07 7.47425796e+07 6.17074954e+07
 8.20195150e+07 1.31694824e+08 8.10822040e+07 4.88138987e+07
 8.30952899e+07 6.38902506e+07 9.28570408e+07 8.40419968e+07
 6.15765648e+07 6.22044050e+07 9.64126659e+07 6.31507980e+07
 7.69476010e+07 5.33113275e+07 7.85703097e+07 7.43721363e+07
 6.38114388e+07 2.89729481e+07 6.07693056e+07 6.69732254e+07
 1.29481116e+08 7.16228552e+07 9.19336853e+07 6.95056641e+07
 7.98351178e+07 4.27742420e+07 4.09981985e+07 8.49875370e+07
 6.38547539e+07 8.90793699e+07 7.84368336e+07 7.06179238e+07
 9.01546581e+07 5.40007798e+07 8.98316855e+07 5.45931571e+07
 5.60227631e+07 6.14266613e+07 6.20281778e+07 9.40599025e+07
 1.04315960e+08 6.66190444e+07 5.34820168e+07 6.37883878e+07
 8.44309568e+07 7.49199894e+07 7.36692420e+07 9.92021527e+07
 1.25882730e+08 7.19865689e+07 6.80150694e+07 8.08556685e+07
 7.79025699e+07 1.256649

In [42]:
my_submission = pd.DataFrame({'Id': test_df.Id, 'SalePrice': predictions})
my_submission.to_csv('../datasets/submission.csv')