In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
houseData = pd.read_csv("./data/train.csv")

features = houseData[["GrLivArea", "YearBuilt"]]
target = houseData["SalePrice"]
features = StandardScaler().fit_transform(features, target)
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

In [8]:
houseData.corr(numeric_only=True)["SalePrice"]

Id              -0.021917
MSSubClass      -0.084284
LotFrontage      0.351799
LotArea          0.263843
OverallQual      0.790982
OverallCond     -0.077856
YearBuilt        0.522897
YearRemodAdd     0.507101
MasVnrArea       0.477493
BsmtFinSF1       0.386420
BsmtFinSF2      -0.011378
BsmtUnfSF        0.214479
TotalBsmtSF      0.613581
1stFlrSF         0.605852
2ndFlrSF         0.319334
LowQualFinSF    -0.025606
GrLivArea        0.708624
BsmtFullBath     0.227122
BsmtHalfBath    -0.016844
FullBath         0.560664
HalfBath         0.284108
BedroomAbvGr     0.168213
KitchenAbvGr    -0.135907
TotRmsAbvGrd     0.533723
Fireplaces       0.466929
GarageYrBlt      0.486362
GarageCars       0.640409
GarageArea       0.623431
WoodDeckSF       0.324413
OpenPorchSF      0.315856
EnclosedPorch   -0.128578
3SsnPorch        0.044584
ScreenPorch      0.111447
PoolArea         0.092404
MiscVal         -0.021190
MoSold           0.046432
YrSold          -0.028923
SalePrice        1.000000
Name: SalePr

### Blending

#### Regression analysis using blending

In [10]:

model1, model2, model3 = LinearRegression(), SVR(kernel="linear", C=0.8), DecisionTreeRegressor(criterion="squared_error", max_depth=5)

models = [model1, model2, model3]

for model in models:
    model.fit(x_train, y_train)

pred1 = model1.predict(x_test)
pred2 = model2.predict(x_test)
pred3 = model3.predict(x_test)

x_merge = np.concatenate([pred1.reshape(-1, 1), pred2.reshape(-1, 1), pred3.reshape(-1, 1)], axis=1)


model4 = VotingRegressor([("Linear Regression", model1), ("SVR", model2), ("DT", model3)])
pred4 = model4.fit(x_merge, y_test).predict(x_merge)


for model in models:
    print(f"{model.__class__.__name__}: {model.score(x_test, y_test)}")

print(f"{model4.__class__.__name__}: {model4.score(x_merge, y_test)}")



LinearRegression: 0.6859287978121427
SVR: -0.0034355265560199744
DecisionTreeRegressor: 0.7573190738165709
VotingRegressor: 0.8127830990058371


#### Classification analysis using blending

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.datasets import load_iris

In [18]:
features= load_iris(as_frame=True).data
target = load_iris(as_frame=True).target

data = pd.concat([features, target], axis=1)

data = data[data["target"] != 2]

features = data[["sepal length (cm)", "petal length (cm)"]]
target = data["target"]

features = StandardScaler().fit_transform(features, target)
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

In [19]:

model1, model2, model3 = LogisticRegression(), SVC(kernel="linear", C=0.8, probability=True), DecisionTreeClassifier(criterion="gini", max_depth=5)

models = [model1, model2, model3]

for model in models:
    model.fit(x_train, y_train)

pred1 = model1.predict_proba(x_test)[:, 0]
pred2 = model2.predict_proba(x_test)[:, 0]
pred3 = model3.predict_proba(x_test)[:, 0]

x_merge = np.concatenate([pred1.reshape(-1, 1), pred2.reshape(-1, 1), pred3.reshape(-1, 1)], axis=1)

model4 = VotingClassifier([("Linear Regression", model1), ("SVR", model2), ("DT", model3)], voting="hard")
pred4 = model4.fit(x_merge, y_test).predict(x_merge)


for model in models:
    print(f"{model.__class__.__name__}: {model.score(x_test, y_test)}")

print(f"{model4.__class__.__name__}: {model4.score(x_merge, y_test)}")

LogisticRegression: 1.0
SVC: 1.0
DecisionTreeClassifier: 1.0
VotingClassifier: 1.0


### Bagging

In [20]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

dtc = DecisionTreeClassifier(criterion="entropy")
model = BaggingClassifier(dtc, 20).fit(x_train, y_train)
knn = KNeighborsClassifier(n_neighbors= 7).fit(x_train, y_train)

print(f"{model.__class__.__name__}: {model.score(x_test, y_test)}")
print(f"{knn.__class__.__name__}: {knn.score(x_test, y_test)}")

BaggingClassifier: 1.0
KNeighborsClassifier: 1.0


In [21]:
confusion_matrix(knn.predict(x_test), y_test)

array([[14,  0],
       [ 0, 16]], dtype=int64)

### Stacking

In [29]:
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score

svc = SVC(C=0.6, kernel="poly")
tree = DecisionTreeClassifier(criterion="log_loss")
knn = KNeighborsClassifier(n_neighbors=7)


svc.fit(x_train, y_train)
tree.fit(x_train, y_train)
knn.fit(x_train, y_train)

svc_pred = svc.predict(x_test)
tree_pred = tree.predict(x_test)
knn_pred = knn.predict(x_test)

meta_train = np.column_stack([svc_pred, tree_pred, knn_pred])

meta = LogisticRegression().fit(meta_train, y_test)
meta_new = meta.predict(meta_train)


print(f"Scratch stacked accuracy: {accuracy_score(y_test, meta_new)}")


Scratch stacked accuracy: 1.0


In [30]:
svc = SVC(C=0.6, kernel="poly")
tree = DecisionTreeClassifier(criterion="log_loss")
knn = KNeighborsClassifier(n_neighbors=7)

mod = StackingClassifier([("svc", svc), ("tree", tree), ("knn", knn)], final_estimator=LogisticRegression())

mod.fit(x_train, y_train)

score = mod.score(x_test, y_test)

print(f"Sklearn stacked accuracy: {score}")

Sklearn stacked accuracy: 1.0
