In [173]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats

from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import load_breast_cancer, load_digits, load_iris
from sklearn.metrics import mean_squared_error

from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

from category_encoders import OneHotEncoder

from sklearn.pipeline import make_pipeline
import sklearn
sklearn.set_config(display = 'diagram')
from sklearn.feature_selection import RFE

In [5]:
df = pd.read_csv('../datasets/train.csv')
df

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,20,RL,79.0,11449,Pave,,IR1,HLS,...,0,0,,,,0,1,2008,WD,298751
2047,785,905377130,30,RL,,12342,Pave,,IR1,Lvl,...,0,0,,,,0,3,2009,WD,82500
2048,916,909253010,50,RL,57.0,7558,Pave,,Reg,Bnk,...,0,0,,,,0,3,2009,WD,177000
2049,639,535179160,20,RL,80.0,10400,Pave,,Reg,Lvl,...,0,0,,,,0,11,2009,WD,144000


In [12]:
df.corr()['SalePrice'].sort_values(ascending=False)

SalePrice          1.000000
Overall Qual       0.800207
Gr Liv Area        0.697038
Garage Area        0.650270
Garage Cars        0.648220
Total Bsmt SF      0.628925
1st Flr SF         0.618486
Year Built         0.571849
Year Remod/Add     0.550370
Full Bath          0.537969
Garage Yr Blt      0.533922
Mas Vnr Area       0.512230
TotRms AbvGrd      0.504014
Fireplaces         0.471093
BsmtFin SF 1       0.423519
Lot Frontage       0.341842
Open Porch SF      0.333476
Wood Deck SF       0.326490
Lot Area           0.296566
Bsmt Full Bath     0.283662
Half Bath          0.283001
2nd Flr SF         0.248452
Bsmt Unf SF        0.190210
Bedroom AbvGr      0.137067
Screen Porch       0.134581
3Ssn Porch         0.048732
Mo Sold            0.032735
Pool Area          0.023106
BsmtFin SF 2       0.016255
Misc Val          -0.007375
Yr Sold           -0.015203
Low Qual Fin SF   -0.041594
Bsmt Half Bath    -0.045328
Id                -0.051398
MS SubClass       -0.087335
Overall Cond      -0

In [5]:
df.describe()

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
count,2051.0,2051.0,2051.0,1721.0,2051.0,2051.0,2051.0,2051.0,2051.0,2029.0,...,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0
mean,1474.033642,713590000.0,57.008776,69.0552,10065.208191,6.11214,5.562165,1971.708922,1984.190151,99.695909,...,93.83374,47.556802,22.571916,2.591419,16.511458,2.397855,51.574354,6.219893,2007.775719,181469.701609
std,843.980841,188691800.0,42.824223,23.260653,6742.488909,1.426271,1.104497,30.177889,21.03625,174.963129,...,128.549416,66.747241,59.84511,25.229615,57.374204,37.78257,573.393985,2.744736,1.312014,79258.659352
min,1.0,526301100.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,12789.0
25%,753.5,528458100.0,20.0,58.0,7500.0,5.0,5.0,1953.5,1964.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,129825.0
50%,1486.0,535453200.0,50.0,68.0,9430.0,6.0,5.0,1974.0,1993.0,0.0,...,0.0,27.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,162500.0
75%,2198.0,907180100.0,70.0,80.0,11513.5,7.0,6.0,2001.0,2004.0,161.0,...,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,2930.0,924152000.0,190.0,313.0,159000.0,10.0,9.0,2010.0,2010.0,1600.0,...,1424.0,547.0,432.0,508.0,490.0,800.0,17000.0,12.0,2010.0,611657.0


In [6]:
pd.DataFrame(df.isna().sum()).T

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,0,0,0,0,330,0,0,1911,0,0,...,0,0,2042,1651,1986,0,0,0,0,0


In [247]:
df_cleaned = df.drop(['Alley', 'Pool QC', 'Fence', 'Misc Feature', 'Lot Frontage', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Qual', 'Garage Cond'], axis = 1)

In [248]:
nan_vals = pd.DataFrame(df_cleaned.isna().sum())
pd.set_option("display.max_rows", None)
print(nan_vals)

                  0
Id                0
PID               0
MS SubClass       0
MS Zoning         0
Lot Area          0
Street            0
Lot Shape         0
Land Contour      0
Utilities         0
Lot Config        0
Land Slope        0
Neighborhood      0
Condition 1       0
Condition 2       0
Bldg Type         0
House Style       0
Overall Qual      0
Overall Cond      0
Year Built        0
Year Remod/Add    0
Roof Style        0
Roof Matl         0
Exterior 1st      0
Exterior 2nd      0
Mas Vnr Type     22
Mas Vnr Area     22
Exter Qual        0
Exter Cond        0
Foundation        0
Bsmt Qual        54
Bsmt Cond        54
Bsmt Exposure    57
BsmtFin Type 1   54
BsmtFin SF 1      0
BsmtFin Type 2   55
BsmtFin SF 2      0
Bsmt Unf SF       0
Total Bsmt SF     0
Heating           0
Heating QC        0
Central Air       0
Electrical        0
1st Flr SF        0
2nd Flr SF        0
Low Qual Fin SF   0
Gr Liv Area       0
Bsmt Full Bath    1
Bsmt Half Bath    1
Full Bath         0


In [251]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2049 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2049 non-null   int64  
 1   PID              2049 non-null   int64  
 2   MS SubClass      2049 non-null   int64  
 3   MS Zoning        2049 non-null   object 
 4   Lot Frontage     1719 non-null   float64
 5   Lot Area         2049 non-null   int64  
 6   Street           2049 non-null   object 
 7   Alley            140 non-null    object 
 8   Lot Shape        2049 non-null   object 
 9   Land Contour     2049 non-null   object 
 10  Utilities        2049 non-null   object 
 11  Lot Config       2049 non-null   object 
 12  Land Slope       2049 non-null   object 
 13  Neighborhood     2049 non-null   object 
 14  Condition 1      2049 non-null   object 
 15  Condition 2      2049 non-null   object 
 16  Bldg Type        2049 non-null   object 
 17  House Style   

In [257]:
df.groupby('Lot Config')['Roof Style'].count()

Lot Config
Corner      348
CulDSac     131
FR2          60
FR3           8
Inside     1502
Name: Roof Style, dtype: int64

In [185]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [186]:
X_train.shape

(1536, 5)

In [187]:
X_test.shape

(513, 5)

In [231]:
params = {'alpha_list': [.01, .1, 1, 10, 100, 1000]}
params

{'alpha_list': [0.01, 0.1, 1, 10, 100, 1000]}

In [232]:
gs = GridSearchCV(Lasso(), params, n_jobs = -1)
gs

In [233]:
gs.fit(X_train, y_train)

ValueError: Invalid parameter alpha_list for estimator Lasso(). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
pd.DataFrame(gs.cv_results_)

In [191]:
gs.best_params_

{'ridge__alpha': 10}

In [None]:
RFE = 

In [223]:
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(), RFE(estimator = LinearRegression()),  Ridge(alpha = 10))

pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
pipe.score(X_train, y_train)

In [208]:
pipe.score(X_test, y_test)

0.881720421689027

In [209]:
preds = pipe.predict(X_test)

In [210]:
mean_squared_error(preds, y_test, squared = False)

28761.209236731884

In [212]:
df_test = pd.read_csv('../datasets/test.csv')
df_test

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1662,527377110,60,RL,80.0,8000,Pave,,Reg,Lvl,...,0,0,0,,,,0,11,2007,WD
874,1234,535126140,60,RL,90.0,14670,Pave,,Reg,Lvl,...,0,0,0,,MnPrv,,0,8,2008,WD
875,1373,904100040,20,RL,55.0,8250,Pave,,Reg,Lvl,...,0,0,0,,,,0,8,2008,WD
876,1672,527425140,20,RL,60.0,9000,Pave,,Reg,Lvl,...,0,0,0,,GdWo,,0,5,2007,WD


In [214]:
X_test = df_test[['Overall Qual', 'Total Bsmt SF', 'Gr Liv Area', 'Year Built', 'Garage Area']]

In [215]:
predictions = pipe.predict(X_test)
predictions.shape

(878,)

In [216]:
df_sales = pd.DataFrame(predictions, columns = ['SalePrice'])
df_sales

Unnamed: 0,SalePrice
0,168183.562566
1,162828.643491
2,172725.344703
3,118853.542962
4,177446.456735
...,...
873,197293.095383
874,199920.633835
875,133857.731127
876,114888.912087


In [217]:
df_model2 = pd.concat([df_test['Id'], df_sales], axis = 1)
df_model2

Unnamed: 0,Id,SalePrice
0,2658,168183.562566
1,2718,162828.643491
2,2414,172725.344703
3,1989,118853.542962
4,625,177446.456735
...,...,...
873,1662,197293.095383
874,1234,199920.633835
875,1373,133857.731127
876,1672,114888.912087


In [218]:
df_model2.set_index('Id', inplace = True)

In [219]:
df_model2

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,168183.562566
2718,162828.643491
2414,172725.344703
1989,118853.542962
625,177446.456735
...,...
1662,197293.095383
1234,199920.633835
1373,133857.731127
1672,114888.912087


In [220]:
df_model2.to_csv('../submissions/model2_submission.csv')