In [1]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
print("Setup Complete")

Setup Complete


In [2]:
df=pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
##Adding  feature and labels

labels=df[['SalePrice']]
features=df[['YearBuilt','OverallQual','GrLivArea','LotArea','YearRemodAdd','LotFrontage','Foundation','LandSlope','LandContour','BldgType']]

In [4]:
features.head()

Unnamed: 0,YearBuilt,OverallQual,GrLivArea,LotArea,YearRemodAdd,LotFrontage,Foundation,LandSlope,LandContour,BldgType
0,2003,7,1710,8450,2003,65.0,PConc,Gtl,Lvl,1Fam
1,1976,6,1262,9600,1976,80.0,CBlock,Gtl,Lvl,1Fam
2,2001,7,1786,11250,2002,68.0,PConc,Gtl,Lvl,1Fam
3,1915,7,1717,9550,1970,60.0,BrkTil,Gtl,Lvl,1Fam
4,2000,8,2198,14260,2000,84.0,PConc,Gtl,Lvl,1Fam


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

## 1. Full Pipeline

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder

# Pipeline for numeric attributes
numeric_pipeline = Pipeline([
    ('median_imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

# Pipeline for categorical attributes
categorical_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder())
])

In [7]:
from sklearn import set_config
set_config(display='diagram')

numeric_pipeline

In [8]:
from sklearn.compose import ColumnTransformer

numeric_attributes = [
    'YearBuilt','OverallQual','GrLivArea','LotArea','YearRemodAdd','LotFrontage'
]

categotrical_attributes = [
    'Foundation','LandSlope','LandContour','BldgType'
]

pipeline = ColumnTransformer([
    ('numerical_transformers', numeric_pipeline, numeric_attributes),
    ('categorical_transformers', categorical_pipeline, categotrical_attributes),
], remainder='passthrough')

In [9]:
pipeline

In [10]:
pipeline.fit_transform(X_train)[0]

array([-0.45546896, -0.82044456, -0.40709315, -0.21289571, -1.34606303,
       -0.01246836,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ])

In [11]:
from sklearn.linear_model import LinearRegression

model_pipeline = Pipeline([
    ('preprocessing', pipeline),
    ('model', LinearRegression())
])

model_pipeline

In [12]:
model_pipeline.fit(X_train, y_train)

In [13]:

from sklearn.metrics import mean_squared_error, r2_score

predictions = model_pipeline.predict(X_test)
rmse = mean_squared_error(y_test, predictions, squared=False)
r2 = r2_score(y_test, predictions)
print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')

RMSE: 42792.874977698164
R-squared: 0.7612578527441198


## 2. Random Forest

In [14]:
from sklearn.ensemble import RandomForestRegressor
forest_pipeline = Pipeline([
    ('preprocessing', pipeline),
    ('model', RandomForestRegressor())
])

forest_pipeline

In [15]:
%%time
forest_pipeline.fit(X_train, y_train.values.ravel())
predictions = forest_pipeline.predict(X_test)
rmse = mean_squared_error(y_test, predictions, squared=False)
r2 = r2_score(y_test, predictions)
print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')

RMSE: 32674.56412404812
R-squared: 0.8608107254035477
Wall time: 2.82 s


## 3. Support Vector Machine

In [23]:
from sklearn.svm import SVR

SVR_pipeline = Pipeline([
    ('preprocessing', pipeline),
    ('model', SVR(kernel='linear'))
])

SVR_pipeline


In [24]:
%%time
SVR_pipeline.fit(X_train, y_train.values.ravel())
predictions = SVR_pipeline.predict(X_test)
rmse = mean_squared_error(y_test, predictions, squared=False)
r2 = r2_score(y_test, predictions)
print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')

RMSE: 86632.78065121344
R-squared: 0.02152288173496142
Wall time: 171 ms


In [25]:
from sklearn.svm import SVR

SVR_pipeline = Pipeline([
    ('preprocessing', pipeline),
    ('model', SVR(kernel='rbf'))
])

SVR_pipeline

In [26]:
%%time
SVR_pipeline.fit(X_train, y_train.values.ravel())
predictions = SVR_pipeline.predict(X_test)
rmse = mean_squared_error(y_test, predictions, squared=False)
r2 = r2_score(y_test, predictions)
print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')

RMSE: 88578.11356142895
R-squared: -0.02291375753349234
Wall time: 228 ms


In [27]:
from sklearn.svm import SVR

SVR_pipeline = Pipeline([
    ('preprocessing', pipeline),
    ('model', SVR(kernel='poly'))
])

SVR_pipeline

In [28]:
%%time
SVR_pipeline.fit(X_train, y_train.values.ravel())
predictions = SVR_pipeline.predict(X_test)
rmse = mean_squared_error(y_test, predictions, squared=False)
r2 = r2_score(y_test, predictions)
print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')

RMSE: 88476.66196998786
R-squared: -0.020571942040289892
Wall time: 175 ms


## 4. (a) GridSearchCV & RandomizedSearchCV

In [None]:
forest_pipeline['model'].get_params()

In [None]:
from sklearn.model_selection import GridSearchCV

hyperparameters = dict(
    model__max_features=['auto', None],
    model__n_estimators=[300, 500]
)

gridsearch = GridSearchCV(forest_pipeline, hyperparameters, verbose=3, cv=3)

gridsearch.fit(X_train, y_train.values.ravel())

In [None]:
gridsearch.best_params_

In [None]:

predictions = gridsearch.best_estimator_.predict(X_test)
rmse = mean_squared_error(y_test, predictions, squared=False)
r2 = r2_score(y_test, predictions)
print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')

In [None]:
from sklearn.model_selection import RandomizedSearchCV

hyperparameters = dict(
    model__max_features=['auto', None],
    model__n_estimators=[100, 500]
)

# randomsearch = RandomizedSearchCV(forest_pipeline, hyperparameters, verbose=3, cv=3)

randomsearch = RandomizedSearchCV(forest_pipeline, hyperparameters, random_state=0)

randomsearch.fit(X_train, y_train.values.ravel())



In [None]:
randomsearch.best_params_

In [None]:
predictions = randomsearch.best_estimator_.predict(X_test)
rmse = mean_squared_error(y_test, predictions, squared=False)
r2 = r2_score(y_test, predictions)
print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')
# 0.8654347643080489

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 

param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

grid = GridSearchCV(SVR_pipeline,param_grid,refit=True,verbose=2)
# grid.fit(X_train,y_train)

In [54]:
param = {'kernel' : ('linear', 'poly', 'rbf', 'sigmoid'),'C' : [1,5,10],'degree' : [3,8],'coef0' : [0.01,10,0.5],'gamma' : ('auto','scale')},

grids = GridSearchCV(SVR_pipeline,param,refit=True,cv=5)

grids.fit(X_train,y_train)

ValueError: Invalid parameter logisticregression for estimator Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numerical_transformers',
                                                  Pipeline(steps=[('median_imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['YearBuilt', 'OverallQual',
                                                   'GrLivArea', 'LotArea',
                                                   'YearRemodAdd',
                                                   'LotFrontage']),
                                                 ('categorical_transformers',
                                                  Pipeline(steps=[('one_hot_encoder',
                                                                   OneHotEncoder())]),
                                                  ['Foundation', 'LandSlope',
                                                   'LandContour',
                                                   'BldgType'])])),
                ('model', SVR(kernel='poly'))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [47]:
from sklearn.model_selection import GridSearchCV 
  
# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  
  
grid = GridSearchCV(SVR_pipeline, param_grid, refit = True, verbose = 3) 
  
# fitting the model for grid search 
grid.fit(X_train, y_train) 

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ValueError: Invalid parameter C for estimator Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numerical_transformers',
                                                  Pipeline(steps=[('median_imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['YearBuilt', 'OverallQual',
                                                   'GrLivArea', 'LotArea',
                                                   'YearRemodAdd',
                                                   'LotFrontage']),
                                                 ('categorical_transformers',
                                                  Pipeline(steps=[('one_hot_encoder',
                                                                   OneHotEncoder())]),
                                                  ['Foundation', 'LandSlope',
                                                   'LandContour',
                                                   'BldgType'])])),
                ('model', SVR(kernel='poly'))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [36]:
.best_params_

TypeError: get_params() missing 1 required positional argument: 'self'