In [74]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats

from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import load_breast_cancer, load_digits, load_iris
from sklearn.metrics import mean_squared_error

from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder

from category_encoders import OneHotEncoder

from sklearn.pipeline import make_pipeline
import sklearn
sklearn.set_config(display = 'diagram')
from sklearn.feature_selection import RFE

In [75]:
df = pd.read_csv('../datasets/train.csv')
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [50]:
df.corr()['SalePrice'].sort_values(ascending=False)

SalePrice          1.000000
Overall Qual       0.800207
Gr Liv Area        0.697038
Garage Area        0.650270
Garage Cars        0.648220
Total Bsmt SF      0.628925
1st Flr SF         0.618486
Year Built         0.571849
Year Remod/Add     0.550370
Full Bath          0.537969
Garage Yr Blt      0.533922
Mas Vnr Area       0.512230
TotRms AbvGrd      0.504014
Fireplaces         0.471093
BsmtFin SF 1       0.423519
Lot Frontage       0.341842
Open Porch SF      0.333476
Wood Deck SF       0.326490
Lot Area           0.296566
Bsmt Full Bath     0.283662
Half Bath          0.283001
2nd Flr SF         0.248452
Bsmt Unf SF        0.190210
Bedroom AbvGr      0.137067
Screen Porch       0.134581
3Ssn Porch         0.048732
Mo Sold            0.032735
Pool Area          0.023106
BsmtFin SF 2       0.016255
Misc Val          -0.007375
Yr Sold           -0.015203
Low Qual Fin SF   -0.041594
Bsmt Half Bath    -0.045328
Id                -0.051398
MS SubClass       -0.087335
Overall Cond      -0

In [51]:
df.describe()

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
count,2051.0,2051.0,2051.0,1721.0,2051.0,2051.0,2051.0,2051.0,2051.0,2029.0,...,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0
mean,1474.033642,713590000.0,57.008776,69.0552,10065.208191,6.11214,5.562165,1971.708922,1984.190151,99.695909,...,93.83374,47.556802,22.571916,2.591419,16.511458,2.397855,51.574354,6.219893,2007.775719,181469.701609
std,843.980841,188691800.0,42.824223,23.260653,6742.488909,1.426271,1.104497,30.177889,21.03625,174.963129,...,128.549416,66.747241,59.84511,25.229615,57.374204,37.78257,573.393985,2.744736,1.312014,79258.659352
min,1.0,526301100.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,12789.0
25%,753.5,528458100.0,20.0,58.0,7500.0,5.0,5.0,1953.5,1964.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,129825.0
50%,1486.0,535453200.0,50.0,68.0,9430.0,6.0,5.0,1974.0,1993.0,0.0,...,0.0,27.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,162500.0
75%,2198.0,907180100.0,70.0,80.0,11513.5,7.0,6.0,2001.0,2004.0,161.0,...,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,2930.0,924152000.0,190.0,313.0,159000.0,10.0,9.0,2010.0,2010.0,1600.0,...,1424.0,547.0,432.0,508.0,490.0,800.0,17000.0,12.0,2010.0,611657.0


In [52]:
pd.options.display.max_rows = None

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2051 non-null   int64  
 1   PID              2051 non-null   int64  
 2   MS SubClass      2051 non-null   int64  
 3   MS Zoning        2051 non-null   object 
 4   Lot Frontage     1721 non-null   float64
 5   Lot Area         2051 non-null   int64  
 6   Street           2051 non-null   object 
 7   Alley            140 non-null    object 
 8   Lot Shape        2051 non-null   object 
 9   Land Contour     2051 non-null   object 
 10  Utilities        2051 non-null   object 
 11  Lot Config       2051 non-null   object 
 12  Land Slope       2051 non-null   object 
 13  Neighborhood     2051 non-null   object 
 14  Condition 1      2051 non-null   object 
 15  Condition 2      2051 non-null   object 
 16  Bldg Type        2051 non-null   object 
 17  House Style   

In [None]:
df['MS SubClass']

In [53]:
df['Bsmt Qual'].value_counts()

TA    887
Gd    864
Ex    184
Fa     60
Po      1
Name: Bsmt Qual, dtype: int64

In [54]:
df['Bsmt Qual'] = df['Bsmt Qual'].map({np.nan: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})

In [55]:
df['Bsmt Qual'].value_counts()

3    887
4    864
5    184
2     60
0     55
1      1
Name: Bsmt Qual, dtype: int64

In [56]:
df = df[df['Total Bsmt SF'].notna()]
df = df[df['Garage Area'].notna()]

In [57]:
X = df[['Overall Qual', 'Total Bsmt SF', 'Gr Liv Area', 'Year Built', 'Garage Area', 'Roof Matl', 'Land Contour', 'Neighborhood', 'Bsmt Qual']]

In [58]:
y = df['SalePrice']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [60]:
pipe = make_pipeline(OneHotEncoder(use_cat_names = True), PolynomialFeatures(), StandardScaler(), Ridge())
pipe

In [28]:
params = {
    'polynomialfeatures__interaction_only': [True, False],
    'ridge__alpha': [.1, 1, 10, 100]   
}

In [29]:
gs = GridSearchCV(pipe, params, n_jobs = -1) 

In [30]:
gs.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


In [31]:
gs.best_params_

{'polynomialfeatures__interaction_only': True, 'ridge__alpha': 100}

In [61]:
pipe = make_pipeline(OneHotEncoder(use_cat_names = True), PolynomialFeatures(interaction_only = True), StandardScaler(), Ridge(alpha = 100))
pipe

In [62]:
pipe.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


In [63]:
pipe.score(X_train, y_train)

0.9135497608912738

In [64]:
pipe.score(X_test, y_test)

0.8716664082945531

In [65]:
preds = pipe.predict(X_test)

In [66]:
mean_squared_error(preds, y_test, squared = False)

30210.216525435288

In [67]:
pipe.fit(X, y)

  elif pd.api.types.is_categorical(cols):


In [68]:
pipe.score(X, y)

0.9118044732863719

In [69]:
preds = pipe.predict(X)

In [70]:
mean_squared_error(preds, y, squared = False)

23533.304918400438

Model 3 = 24087 on X, y<br>
26961 on X_train, y_train

In [76]:
df_test = pd.read_csv('../datasets/test.csv')
df_test['Bsmt Qual'] = df_test['Bsmt Qual'].map({np.nan: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
df_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [77]:
X_test = df_test[['Overall Qual', 'Total Bsmt SF', 'Gr Liv Area', 'Year Built', 'Garage Area', 'Roof Matl', 'Land Contour', 'Neighborhood', 'Bsmt Qual']]
X_test.shape

(878, 9)

In [78]:
predictions = pipe.predict(X_test)
predictions.shape

(878,)

In [79]:
df_sales = pd.DataFrame(predictions, columns = ['SalePrice'])
df_sales.shape

(878, 1)

In [82]:
df_model4 = pd.concat([df_test['Id'], df_sales], axis = 1)
df_model4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         878 non-null    int64  
 1   SalePrice  878 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 13.8 KB


In [83]:
df_model4.set_index('Id', inplace = True)

In [84]:
df_model4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 878 entries, 2658 to 1939
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SalePrice  878 non-null    float64
dtypes: float64(1)
memory usage: 13.7 KB


In [86]:
df_model4.to_csv('../submissions/model4_submission.csv')

Used: 'Overall Qual', 'Total Bsmt SF', 'Gr Liv Area', 'Year Built', 'Garage Area', 'Roof Matl', 'Land Contour', 'Neighborhood', 'Bsmt Qual'<br>
Converted 'Bsmt Qual' to 0-5

pipe = make_pipeline(OneHotEncoder(use_cat_names = True), PolynomialFeatures(interaction_only = True), StandardScaler(), Ridge(alpha = 100))