In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Modelling
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, which
# also hides any warnings scikit-learn emits during the grid search below.
# Consider narrowing the filter to specific warning categories.
warnings.filterwarnings('ignore') 

## Load the dataset:

In [2]:
# Read the raw CSV. A forward slash keeps the relative path portable across
# operating systems and avoids the invalid "\H" escape sequence that a
# backslash-separated literal would contain.
data = pd.read_csv("data/House Price Prediction Dataset.csv")
data.head()

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,2,4272,5,4,3,1958,Downtown,Excellent,No,424998
2,3,3592,2,2,3,1938,Downtown,Good,No,266746
3,4,966,4,2,2,1902,Suburban,Fair,Yes,244020
4,5,4926,1,4,2,1975,Downtown,Fair,Yes,636056


## Remove unnecessary columns:

In [3]:
# Drop the 'Id' column (it appears to be a plain row identifier) before modelling.
data = data.drop(labels='Id', axis='columns')
data.head()

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,4272,5,4,3,1958,Downtown,Excellent,No,424998
2,3592,2,2,3,1938,Downtown,Good,No,266746
3,966,4,2,2,1902,Suburban,Fair,Yes,244020
4,4926,1,4,2,1975,Downtown,Fair,Yes,636056


## Define numerical & categorical columns:

In [4]:
# Partition the columns by dtype: object ('O') columns are treated as
# categorical, every other column as numerical.
numeric_features = []
categorical_features = []
for column in data.columns:
    if data[column].dtype == 'O':
        categorical_features.append(column)
    else:
        numeric_features.append(column)

# Report both groups
numeric_message = 'We have {} numerical features : {}'
categorical_message = '\nWe have {} categorical features : {}'
print(numeric_message.format(len(numeric_features), numeric_features))
print(categorical_message.format(len(categorical_features), categorical_features))

We have 6 numerical features : ['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt', 'Price']

We have 3 categorical features : ['Location', 'Condition', 'Garage']


## Categories of each column:

In [5]:
# List the distinct values of each low-cardinality column. A single loop
# replaces the copy-pasted print pairs and keeps the labels consistently formatted.
for column in ['Location', 'Condition', 'Garage', 'Floors', 'Bedrooms', 'Bathrooms']:
    print(f"Categories in '{column}' variable: {data[column].unique()}")

Categories in 'Location' variable:      ['Downtown' 'Suburban' 'Urban' 'Rural']
Categories in 'Condition' variable:   ['Excellent' 'Good' 'Fair' 'Poor']
Categories in'Garage' variable: ['No' 'Yes']
Categories in 'Floors' variable:      [3 2 1]
Categories in 'Bedrooms' variable:      [5 2 4 1 3]
Categories in 'Bathrooms' variable:      [4 2 1 3]


## Preparing the features and the target columns:

In [6]:
# the features:
X = data.drop(columns=['Price'],axis=1)
print(f"The shape of the features is : {X.shape}")

The shape of the features is : (2000, 8)


In [7]:
# the target :
Y = data['Price']
print(f"The shape of the target is {Y.shape}")

The shape of the target is (2000,)


## Data Transformation:

In [8]:
# Build a ColumnTransformer with two transformers: one-hot encoding for the
# object-dtype columns and standard scaling for all remaining columns.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [9]:
# Fit the preprocessor on the feature frame and replace X with the transformed matrix.
# NOTE(review): the encoder and scaler are fitted on all rows before the
# train/test split further down, so test rows influence the fitted statistics.
# Consider splitting first and fitting on the training rows only.
# NOTE(review): X is overwritten here, so this cell cannot be re-run on its own
# without first re-running the cell that builds X from `data`.
X = preprocessor.fit_transform(X)
X.shape

(2000, 15)

In [10]:
# Reshape the target into a single-column 2-D array and standardize it by
# fitting the numeric_transformer instance on it.
Y = numeric_transformer.fit_transform(np.array(Y).reshape(-1, 1))

# Shape of the standardized target
Y.shape



(2000, 1)

## Split the data into Train and Test:

In [11]:
# Separate the dataset into train and test sets (80/20) with a fixed seed so the
# split is reproducible. `train_test_split` is imported in the top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
print(f'Train: {X_train.shape} \n Test: {X_test.shape}')

Train: (1600, 15) 
 Test: (400, 15)


## An evaluation function:

In [12]:
def evaluate_model(true, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R2 score.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

## Model Training:

In [13]:
# Hyper-parameter grids for GridSearchCV, keyed by the model names used in `models`.
# Only combinations that can actually be fitted are listed:
#   * 'poisson' is left out of the tree criteria because it rejects negative
#     targets, and the target is standardized (so it contains negative values);
#   * 'lbfgs' is left out of the Ridge solvers because it is only supported
#     together with positive=True.
params={
                "Decision Tree": {
                    'criterion':['squared_error', 'friedman_mse', 'absolute_error'],
                    'splitter':['best','random'],
                    'max_features':['sqrt','log2'],
                },
                "Random Forest Regressor":{
                    'criterion':['squared_error', 'friedman_mse', 'absolute_error'],

                    'max_features':['sqrt','log2',None],
                    'n_estimators': [8,16,32,64,128,256]
                },
                # No hyper-parameters to tune: the search fits the default model once.
                "Linear Regression":{},
                "Lasso": { 'alpha': [0.001, 0.01, 0.1, 1, 10],
                          'max_iter': [1000, 5000, 10000],
                          'tol': [1e-4, 1e-3, 1e-2],
                          'selection': ['cyclic', 'random'] },
                "Ridge": { 'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
                          'max_iter': [1000, 5000, 10000],
                          'tol': [1e-4, 1e-3, 1e-2],
                          'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
                          },
                "K-Neighbors Regressor":{ 'n_neighbors': [3, 5, 7, 10, 15],
                       'weights': ['uniform', 'distance'],
                       #'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                       'leaf_size': [10, 30, 50],
                       'p': [1, 2] },
                "AdaBoost Regressor":{
                    'learning_rate':[.1,.01,0.5,.001],
                    'loss':['linear','square','exponential'],
                    'n_estimators': [8,16,32,64,128,256]
                }}

In [14]:
# Candidate regressors, keyed by the same names used in `params`.
# Fixed seeds make the stochastic models reproducible across runs.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}
model_list = []
r2_list =[]

# Pass the target as a 1-D array; np.ravel flattens a single-column 2-D target
# and leaves an already 1-D target unchanged.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

for name, model in list(models.items()):
    # Exhaustive 3-fold cross-validated search over this model's grid.
    gs = GridSearchCV(model, params[name], cv=3)
    gs.fit(X_train, y_train_flat)

    # With the default refit=True the search has already refitted the best
    # configuration on the whole training set, so no second fit is needed.
    model = gs.best_estimator_
    models[name] = model  # keep the fitted estimator available under its name

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train_flat, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test_flat, y_test_pred)

    print(name)
    model_list.append(name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    # The test-set R2 is what the comparison table is built from.
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')
 


Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.9927
- Mean Absolute Error: 0.8580
- R2 Score: 0.0099
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.0127
- Mean Absolute Error: 0.8802
- R2 Score: -0.0067


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.9976
- Mean Absolute Error: 0.8619
- R2 Score: 0.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.0096
- Mean Absolute Error: 0.8774
- R2 Score: -0.0007


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.9928
- Mean Absolute Error: 0.8582
- R2 Score: 0.0097
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.0125
- Mean Absolute Error: 0.8805
- R2 Score: -0.0064


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
--------------------

## Evaluating the models:

In [15]:
# Rank the models by their test-set R2 score, best first.
score_rows = list(zip(model_list, r2_list))
score_table = pd.DataFrame(score_rows, columns=['Model Name', 'R2_Score'])
score_table.sort_values(by="R2_Score", ascending=False)

Unnamed: 0,Model Name,R2_Score
1,Lasso,-0.000716
2,Ridge,-0.006389
0,Linear Regression,-0.006718
6,AdaBoost Regressor,-0.011412
5,Random Forest Regressor,-0.09395
3,K-Neighbors Regressor,-0.12825
4,Decision Tree,-1.209779


## Ensemble Learning:

In [21]:
# Stack three base regressors and combine their predictions with a Lasso
# meta-learner. `StackingRegressor` is imported in the top import cell.
# Each label now matches the estimator it names.
estimators = [
    ('knn', KNeighborsRegressor()),
    ('ab', AdaBoostRegressor()),
    ('rf', RandomForestRegressor())
]

# NOTE(review): the recorded score equals the standalone Lasso's test R2, which
# suggests the default-alpha Lasso meta-learner shrinks every base-model weight
# to zero. Confirm, and consider a weaker penalty or a different final estimator.
stack = StackingRegressor(estimators=estimators, final_estimator=Lasso())
# The estimators expect a 1-D target; np.ravel flattens a single-column 2-D one.
stack.fit(X_train, np.ravel(y_train))
stack.score(X_test, np.ravel(y_test))


-0.0007164349053965235