In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression, LassoCV,RidgeCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [2]:
naive_df=pd.read_csv('../data/naive_model_data.csv')

In [3]:
naive_df.head()

Unnamed: 0,lot_area,overall_qual,totrms_abvgrd,saleprice
0,13517,6,6,130500
1,11492,7,8,220000
2,7922,5,5,109000
3,9802,5,7,174000
4,14235,6,6,138500


### PolynomialFeatures

In [4]:
features = ['lot_area','overall_qual','totrms_abvgrd']
X=naive_df[features]
X.head()

Unnamed: 0,lot_area,overall_qual,totrms_abvgrd
0,13517,6,6
1,11492,7,8
2,7922,5,5
3,9802,5,7
4,14235,6,6


In [5]:
poly=PolynomialFeatures(include_bias=False)

In [6]:
poly

PolynomialFeatures(include_bias=False)

In [7]:
X_ploy = poly.fit_transform(X)

In [8]:
poly.get_feature_names(features)

['lot_area',
 'overall_qual',
 'totrms_abvgrd',
 'lot_area^2',
 'lot_area overall_qual',
 'lot_area totrms_abvgrd',
 'overall_qual^2',
 'overall_qual totrms_abvgrd',
 'totrms_abvgrd^2']

In [9]:
X_ploy_df=pd.DataFrame(X_ploy,columns=poly.get_feature_names(features))

In [10]:
X_ploy_df

Unnamed: 0,lot_area,overall_qual,totrms_abvgrd,lot_area^2,lot_area overall_qual,lot_area totrms_abvgrd,overall_qual^2,overall_qual totrms_abvgrd,totrms_abvgrd^2
0,13517.0,6.0,6.0,182709289.0,81102.0,81102.0,36.0,36.0,36.0
1,11492.0,7.0,8.0,132066064.0,80444.0,91936.0,49.0,56.0,64.0
2,7922.0,5.0,5.0,62758084.0,39610.0,39610.0,25.0,25.0,25.0
3,9802.0,5.0,7.0,96079204.0,49010.0,68614.0,25.0,35.0,49.0
4,14235.0,6.0,6.0,202635225.0,85410.0,85410.0,36.0,36.0,36.0
...,...,...,...,...,...,...,...,...,...
2040,11449.0,8.0,7.0,131079601.0,91592.0,80143.0,64.0,56.0,49.0
2041,12342.0,4.0,4.0,152324964.0,49368.0,49368.0,16.0,16.0,16.0
2042,7558.0,6.0,9.0,57123364.0,45348.0,68022.0,36.0,54.0,81.0
2043,10400.0,4.0,6.0,108160000.0,41600.0,62400.0,16.0,24.0,36.0


### Create X and y variables

In [11]:
X=X_ploy_df
y=naive_df['saleprice']

In [12]:
X.shape

(2045, 9)

In [13]:
y.shape

(2045,)

### Train/test Split

In [14]:
X_train, X_test, y_train, y_test=train_test_split(X, y)

In [15]:
X_train.shape

(1533, 9)

### Scaling

In [16]:
sc=StandardScaler()
Z_train=sc.fit_transform(X_train)
Z_test=sc.transform(X_test)

### Model Prep: Instantiate our models

In [17]:
lr=LinearRegression()
lasso=LassoCV()
ridge=RidgeCV()

### Cross validation

lr_scores=cross_val_score(lr,Z_train,y_train).mean()
lr_scores

lasso_scores=cross_val_score(lasso,Z_train,y_train).mean()
lasso_scores

ridge_scores=cross_val_score(ridge, Z_train,y_train).mean()
ridge_scores

> All 3 of those models scored simillar, so I am going to just use the linear regression to predict y

### lr

In [18]:
lr.fit(Z_train,y_train)

LinearRegression()

In [19]:
lr.score(Z_train, y_train)

0.7916335118517642

In [20]:
lr.score(Z_test, y_test) # overfit

0.7624404945048686

## get the actual test data set

In [21]:
test_pred_df=pd.read_csv('../data/naive_test_data.csv')

In [22]:
test_pred_df.head(2)

Unnamed: 0,id,lot_area,overall_qual,totrms_abvgrd
0,2658,9142,6,9
1,2718,9662,5,10


In [23]:
X_test_pred=test_pred_df[['lot_area','overall_qual','totrms_abvgrd']]

In [24]:
X_test_pred_poly=poly.fit_transform(X_test_pred)

In [25]:
poly.get_feature_names(features)

['lot_area',
 'overall_qual',
 'totrms_abvgrd',
 'lot_area^2',
 'lot_area overall_qual',
 'lot_area totrms_abvgrd',
 'overall_qual^2',
 'overall_qual totrms_abvgrd',
 'totrms_abvgrd^2']

In [26]:
X_test_pred_poly_df=pd.DataFrame(X_test_pred_poly,columns=poly.get_feature_names(features))

In [27]:
X_test_pred_poly_df.shape

(878, 9)

In [28]:
#Scal the test set
Z_test_pred_poly_df=sc.fit_transform(X_test_pred_poly_df)
Z_test_pred_poly_df

array([[-0.11653854, -0.03662533,  1.59523869, ..., -0.14335041,
         0.89122699,  1.65044166],
       [-0.06452281, -0.76746722,  2.22191411, ..., -0.78018493,
         0.63903223,  2.5027143 ],
       [ 0.67990225,  0.69421656,  0.34188784, ...,  0.60927221,
         0.57598354,  0.2150351 ],
       ...,
       [-0.20576551, -0.76746722, -0.911463  , ..., -0.78018493,
        -0.93718503, -0.86151982],
       [-0.13074283, -1.49830911, -0.911463  , ..., -1.30123136,
        -1.25242848, -0.86151982],
       [-0.19076098, -0.76746722, -0.911463  , ..., -0.78018493,
        -0.93718503, -0.86151982]])

# Predict

In [29]:
test_pred_df['SalePrice'] = lr.predict(Z_test_pred_poly_df)

In [30]:
naive_model_submission= test_pred_df[['id','SalePrice']].copy()

In [31]:
naive_model_submission.head()

Unnamed: 0,id,SalePrice
0,2658,192467.14132
1,2718,161995.648396
2,2414,240229.628016
3,1989,124432.123086
4,625,165544.074792


In [32]:
naive_model_submission.to_csv('../data/naive_model_submission.csv', index=False)