In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats

from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import load_breast_cancer, load_digits, load_iris
from sklearn.metrics import mean_squared_error

from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder

from category_encoders import OneHotEncoder

from sklearn.pipeline import make_pipeline
import sklearn
sklearn.set_config(display = 'diagram')
from sklearn.feature_selection import RFE

In [244]:
# Load the training CSV into a DataFrame (path is relative to the notebook's folder).
df = pd.read_csv('../datasets/train.csv')

In [166]:
# Rank the numeric features by their Pearson correlation with 'Overall Qual'.
# Selecting numeric/bool columns first keeps this working on pandas >= 2.0, where
# DataFrame.corr() no longer silently drops object columns and raises instead.
numeric_df = df.select_dtypes(include=['number', 'bool'])
numeric_df.corr()['Overall Qual'].sort_values(ascending=False)

Overall Qual       1.000000
SalePrice          0.800207
Year Built         0.602964
Garage Cars        0.587423
Year Remod/Add     0.584654
Garage Yr Blt      0.574553
Gr Liv Area        0.566701
Garage Area        0.563814
Total Bsmt SF      0.548742
Full Bath          0.515080
1st Flr SF         0.477136
Mas Vnr Area       0.438685
Fireplaces         0.388920
TotRms AbvGrd      0.382025
Open Porch SF      0.308855
BsmtFin SF 1       0.278742
Bsmt Unf SF        0.275773
Half Bath          0.274859
Wood Deck SF       0.257081
2nd Flr SF         0.228152
Lot Frontage       0.194808
Bsmt Full Bath     0.175171
Lot Area           0.105824
Bedroom AbvGr      0.053373
Screen Porch       0.048752
MS SubClass        0.035763
3Ssn Porch         0.031938
Misc Val           0.022099
Mo Sold            0.019242
Pool Area          0.006558
Yr Sold           -0.011578
BsmtFin SF 2      -0.028199
Bsmt Half Bath    -0.047318
Low Qual Fin SF   -0.052338
Id                -0.061483
Overall Cond      -0

In [135]:
# Lift pandas' cap on displayed rows (this stays in effect for every later output
# in the session), then print the per-column summary of the training frame.
pd.set_option('display.max_rows', None)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2051 non-null   int64  
 1   PID              2051 non-null   int64  
 2   MS SubClass      2051 non-null   int64  
 3   MS Zoning        2051 non-null   object 
 4   Lot Frontage     1721 non-null   float64
 5   Lot Area         2051 non-null   int64  
 6   Street           2051 non-null   object 
 7   Alley            140 non-null    object 
 8   Lot Shape        2051 non-null   object 
 9   Land Contour     2051 non-null   object 
 10  Utilities        2051 non-null   object 
 11  Lot Config       2051 non-null   object 
 12  Land Slope       2051 non-null   object 
 13  Neighborhood     2051 non-null   object 
 14  Condition 1      2051 non-null   object 
 15  Condition 2      2051 non-null   object 
 16  Bldg Type        2051 non-null   object 
 17  House Style   

In [167]:
# Distribution of the raw 'Bsmt Qual' grades; value_counts() leaves out missing values by default.
df['Bsmt Qual'].value_counts()

TA    887
Gd    864
Ex    184
Fa     60
Po      1
Name: Bsmt Qual, dtype: int64

In [245]:
# Encode the basement-quality grades on an ordinal 0-5 scale (missing -> 0, 'Po' -> 1 ... 'Ex' -> 5).
# Series.map turns every value that is not a key of the mapping into NaN, so running the
# unguarded assignment a second time would wipe out the already-encoded column. The dtype
# guard makes a re-run of this cell a no-op.
if not pd.api.types.is_numeric_dtype(df['Bsmt Qual']):
    df['Bsmt Qual'] = df['Bsmt Qual'].map({np.nan: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})

In [170]:
# Re-check the distribution after encoding; the 0 bucket holds the rows that were missing before.
df['Bsmt Qual'].value_counts()

3    887
4    864
5    184
2     60
0     55
1      1
Name: Bsmt Qual, dtype: int64

In [246]:
# Keep only the rows where both the basement area and the garage area are present.
df = df.dropna(subset = ['Total Bsmt SF', 'Garage Area'])

In [247]:
# Predictor columns for this model: a mix of numeric columns and two categorical
# ones ('Roof Matl', 'Land Contour') that the pipeline's encoder handles later.
feature_cols = [
    'Overall Qual',
    'Total Bsmt SF',
    'Gr Liv Area',
    'Year Built',
    'Garage Area',
    'Roof Matl',
    'Land Contour',
    'Bsmt Qual',
    'Misc Val',
    'Lot Area',
    'Overall Cond',
    'Screen Porch',
]
X = df[feature_cols]

In [248]:
# Regression target, taken from the same filtered frame as X so the rows stay aligned.
y = df['SalePrice']

In [197]:
# category_encoders' OneHotEncoder; use_cat_names=True puts the category value in each new column name.
ohe = OneHotEncoder(use_cat_names = True)

In [198]:
# Fit the encoder on the full feature frame and keep the encoded copy
# (only referenced by the commented-out statsmodels OLS cells below).
X_encoded = ohe.fit_transform(X)

  elif pd.api.types.is_categorical(cols):


In [251]:
# model = sm.OLS(y, X_encoded).fit()

In [252]:
# model.summary()

In [249]:
# Hold out scikit-learn's default 25% of rows for evaluation. Fixing random_state makes the
# split, and every score derived from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [250]:
# Number of rows in the hold-out target.
y_test.shape

(513,)

In [226]:
# Base pipeline for the grid search: encode categoricals, expand the features with
# PolynomialFeatures (default settings), standardise, then fit ordinary least squares.
pipe = make_pipeline(
    OneHotEncoder(use_cat_names = True),
    PolynomialFeatures(),
    StandardScaler(),
    LinearRegression(),
)
pipe

In [227]:
# Hyperparameter grid for the pipeline search. The only active entry toggles whether
# PolynomialFeatures emits interaction terms alone or also the pure powers.
params = {
    'polynomialfeatures__interaction_only': [True, False],
    # Ridge alpha grid kept for reference only: it applies when the pipeline's final step
    # is Ridge, whereas the pipeline defined above ends in LinearRegression.
#     'ridge__alpha': [.1, 1, 10, 100]   
}

In [228]:
# Cross-validated search over `params` with GridSearchCV's default CV and the pipeline's
# own score; n_jobs=-1 runs the fits on all available cores.
gs = GridSearchCV(pipe, params, n_jobs = -1) 

In [229]:
# Run the search on the training split only, so the hold-out rows stay unseen.
gs.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


In [230]:
# Parameter combination with the best mean cross-validation score.
gs.best_params_

{'polynomialfeatures__interaction_only': True}

In [207]:
# Full cross-validation report, best-ranked parameter combination first.
cv_table = pd.DataFrame(gs.cv_results_)
cv_table.sort_values(by = 'rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_polynomialfeatures__interaction_only,param_ridge__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.157014,0.015065,0.034256,0.005207,True,100.0,"{'polynomialfeatures__interaction_only': True,...",0.386158,0.902308,0.751149,0.872135,0.899821,0.762314,0.196054,1
2,0.192347,0.043748,0.033643,0.000592,True,10.0,"{'polynomialfeatures__interaction_only': True,...",0.124379,0.900929,0.899046,0.884033,0.899359,0.741549,0.308646,2
7,0.153646,0.023668,0.033604,0.004099,False,100.0,{'polynomialfeatures__interaction_only': False...,0.311222,0.90433,0.698041,0.872513,0.902539,0.737729,0.226525,3
6,0.149417,0.01372,0.035787,0.006179,False,10.0,{'polynomialfeatures__interaction_only': False...,0.103334,0.89239,0.906196,0.88168,0.900756,0.736871,0.316877,4
5,0.149478,0.017744,0.037584,0.012458,False,1.0,{'polynomialfeatures__interaction_only': False...,-0.236186,0.805794,0.482078,0.880117,0.885651,0.563491,0.426348,5
1,0.197852,0.046357,0.039016,0.003954,True,1.0,"{'polynomialfeatures__interaction_only': True,...",-0.267015,0.891059,0.224433,0.883623,0.883529,0.523126,0.470906,6
4,0.188422,0.033427,0.033161,0.012054,False,0.1,{'polynomialfeatures__interaction_only': False...,-1.330316,0.387504,0.29401,0.87137,0.881003,0.220714,0.812185,7
0,0.204033,0.027924,0.043185,0.011931,True,0.1,"{'polynomialfeatures__interaction_only': True,...",-1.677341,0.88204,-0.05861,0.873929,0.887535,0.18151,0.998163,8


In [235]:
# Final pipeline with the setting chosen by the grid search: PolynomialFeatures is
# limited to interaction terms (interaction_only=True) before scaling and the OLS fit.
pipe = make_pipeline(
    OneHotEncoder(use_cat_names = True),
    PolynomialFeatures(interaction_only=True),
    StandardScaler(),
    LinearRegression(),
)
pipe

In [236]:
# Fit the chosen pipeline on the training split.
pipe.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


In [237]:
# Training-set score (R^2 for this regressor); compare with the hold-out score to gauge overfitting.
pipe.score(X_train, y_train)

0.9211907995348058

In [234]:
# Hold-out score for the fitted pipeline.
# NOTE(review): the recorded ValueError (513 vs 878 samples) appears to come from running this
# cell after X_test had been rebound to the submission features further down the notebook,
# while y_test still held the split target. Run the cells top to bottom so that X_test here
# is the train_test_split hold-out.
pipe.score(X_test, y_test)

ValueError: Found input variables with inconsistent numbers of samples: [513, 878]

In [None]:
# Hold-out predictions, used for the RMSE in the next cell.
preds = pipe.predict(X_test)

In [None]:
# RMSE on the hold-out set. Arguments follow scikit-learn's (y_true, y_pred) order, and the
# square root is taken explicitly because the `squared` keyword was deprecated in
# scikit-learn 1.4 and is no longer accepted by later releases.
np.sqrt(mean_squared_error(y_test, preds))

In [214]:
# Refit the same pipeline on all labelled rows before predicting for the submission.
pipe.fit(X, y)

  elif pd.api.types.is_categorical(cols):


In [215]:
# In-sample score on the full data: optimistic, because the model was fitted on these same rows.
pipe.score(X, y)

0.9102600634651007

In [216]:
# In-sample predictions on the full data (overwrites the hold-out `preds` from above).
preds = pipe.predict(X)

In [217]:
# In-sample RMSE on the full data. Arguments follow scikit-learn's (y_true, y_pred) order, and
# the square root is taken explicitly because the `squared` keyword was deprecated in
# scikit-learn 1.4 and is no longer accepted by later releases.
np.sqrt(mean_squared_error(y, preds))

23738.45893911903

Model 3 = 24087 on X, y<br>
26961 on X_train, y_train

In [218]:
# Load the submission data and apply the same 0-5 'Bsmt Qual' encoding used on the
# training frame (missing -> 0, 'Po' -> 1 ... 'Ex' -> 5), then preview the first rows.
bsmt_qual_codes = {np.nan: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

df_test = pd.read_csv('../datasets/test.csv')
df_test['Bsmt Qual'] = df_test['Bsmt Qual'].map(bsmt_qual_codes)
df_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [219]:
# Select the same feature columns used for training from the submission data.
# NOTE(review): this rebinds X_test, replacing the hold-out split created by train_test_split.
# Any earlier cell that pairs X_test with y_test fails if it is re-run after this point
# (513 target rows vs 878 feature rows). A distinct name would avoid that, but the predict
# cell that follows reads X_test, so both would have to change together.
X_test = df_test[['Overall Qual', 'Total Bsmt SF', 'Gr Liv Area', 'Year Built', 'Garage Area', 'Roof Matl', 'Land Contour', 'Bsmt Qual', 'Misc Val', 'Lot Area', 'Overall Cond',  'Screen Porch']]
X_test.shape

(878, 12)

In [220]:
# Predict sale prices for the submission rows; the shape check confirms one prediction per row.
predictions = pipe.predict(X_test)
predictions.shape

(878,)

In [221]:
# Wrap the predictions in a one-column frame. Reusing df_test's index (instead of a fresh
# 0..n-1 index) keeps every prediction aligned with its source row when this frame is later
# combined with df_test['Id'], even if df_test no longer has a default index.
df_sales = pd.DataFrame(predictions, columns = ['SalePrice'], index = df_test.index)
df_sales.shape

(878, 1)

In [222]:
# Put each house Id next to its predicted price (pieces are aligned on the index),
# then check that neither column picked up missing values.
submission_parts = [df_test['Id'], df_sales]
df_model6 = pd.concat(submission_parts, axis = 'columns')
df_model6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         878 non-null    int64  
 1   SalePrice  878 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 13.8 KB


In [223]:
# Use the house Id as the index so the CSV is written with the columns Id, SalePrice.
# Reassigning instead of inplace=True, and checking that 'Id' is still a column,
# makes this cell safe to re-run (the in-place version raises KeyError the second time).
if 'Id' in df_model6.columns:
    df_model6 = df_model6.set_index('Id')

In [224]:
# Confirm that Id is now the index and SalePrice the only remaining column.
df_model6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 878 entries, 2658 to 1939
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SalePrice  878 non-null    float64
dtypes: float64(1)
memory usage: 13.7 KB


In [225]:
# Write the submission file; the Id index is emitted as the first CSV column.
df_model6.to_csv('../submissions/model6_submission.csv')

Features used: 'Overall Qual', 'Total Bsmt SF', 'Gr Liv Area', 'Year Built', 'Garage Area', 'Roof Matl', 'Land Contour', 'Neighborhood', 'Bsmt Qual', 'Misc Val'<br>
Converted 'Bsmt Qual' to an ordinal 0-5 scale (missing = 0, 'Po' = 1, 'Fa' = 2, 'TA' = 3, 'Gd' = 4, 'Ex' = 5)

pipe = make_pipeline(OneHotEncoder(use_cat_names = True), PolynomialFeatures(interaction_only = True), StandardScaler(), Ridge(alpha = 100))