# Regression after EDA  


### Import CSV and Required Packages

In [9]:
# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Warnings
import warnings
warnings.filterwarnings('ignore')


# Create Dataframe
df = pd.read_csv("./EDA_data/EDA_mumbai-house-price-data-cleaned.csv")

# Print shape of dataset
print(df.shape)

(71936, 10)


In [10]:
df.head()

Unnamed: 0,price,locality,city,property_type,bedroom_num,bathroom_num,balcony_num,furnished,latitude,longitude
0,6600283,138,1,0,2,2,0,2,19.24441,73.123253
1,6169841,138,1,0,2,2,0,2,19.257294,73.148872
2,4599936,85,1,0,1,1,0,2,19.209026,73.081276
3,51980000,400,1,0,3,3,0,2,19.097841,72.851158
4,3915000,224,1,0,1,1,0,2,19.420601,72.809319


**Split X and y**
- Why do we split our data?
> Training Dataset is the part of Original Dataset that we use to train our ML model. The model learns on this data by running the algorithm and maps a function F(x) where “x” in the independent variable (inputs) for “y” where “y” is the dependent variable(output).

In [11]:
X = df.drop("price", axis=1) #dropping the target column which is 'cluster'
y = df["price"]

## Select the best model
- so here we have some list of the best classification algorithms we imported. Now we will compare each model's score and see which model is performing better than rest of the others

In [12]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Regression Models Dictionary

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "XGB Regressor": XGBRegressor(),
    "CatBoost Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

In [5]:
# Create a function which can evaluate models and return a report 
def evaluate_models(X, y, models):
    '''
    This function takes in X and y and models dictionary as input
    It splits the data into Train Test split
    Iterates through the given model dictionary and evaluates the metrics
    Returns: Dataframe which contains report of all models metrics with cost
    '''
    # separate dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
    

    models_list = []
    scores = []
    
    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train) # Train model

        # Make predictions
        y_pred = model.predict(X_test)

        score = r2_score(y_test,y_pred)
        
        model_name = list(models.keys())[i]
        print(f'---- score for --- {model_name} ----')
        print(f"{score}")
        models_list.append(model_name)
        scores.append(score)
    
    print()
    
    report = pd.DataFrame()
    report['Model_name'] = models_list
    report['Score'] = scores        
    return report

### Let's check the report


In [19]:
report = evaluate_models(X, y, models)

---- score for --- Linear Regression ----
0.3244711405572446
---- score for --- Random Forest ----
0.7921020007105873
---- score for --- Decision Tree ----
0.685228251549211
---- score for --- Gradient Boosting ----
0.7399615445230281
---- score for --- Ridge Regression ----
0.32447043370388273
---- score for --- Lasso Regression ----
0.32447114144675626
---- score for --- K-Neighbors Regressor ----
0.7125017463659669
---- score for --- XGB Regressor ----
0.7979398965835571
---- score for --- CatBoost Regressor ----
0.8045909247263949
---- score for --- AdaBoost Regressor ----
0.5595957611862004



In [7]:
report.sort_values('Score')

Unnamed: 0,Model_name,Score
9,AdaBoost Regressor,0.317364
4,Ridge Regression,0.32126
5,Lasso Regression,0.321261
0,Linear Regression,0.321261
3,Gradient Boosting,0.576105
6,K-Neighbors Regressor,0.678312
7,XGB Regressor,0.694632
8,CatBoost Regressor,0.73597
2,Decision Tree,0.7419
1,Random Forest,0.749589


### Let's do hyperparameter tuning


In [23]:
from catboost import CatBoostRegressor

best_cb_model =CatBoostRegressor()

In [24]:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

best_cb_model = best_cb_model.fit(X_train,y_train)
# Predict on test set
y_pred = best_cb_model.predict(X_test)

# mean_absolute_error
print(mean_absolute_error(y_test, y_pred))

#  r2_scoren matrix
print( r2_score(y_test, y_pred))

Learning rate set to 0.077672
0:	learn: 29443505.8102954	total: 11.5ms	remaining: 11.5s
1:	learn: 28154657.2136206	total: 21.9ms	remaining: 11s
2:	learn: 26980284.6200215	total: 33ms	remaining: 11s
3:	learn: 25917919.6302684	total: 42.2ms	remaining: 10.5s
4:	learn: 24975045.1788939	total: 51.1ms	remaining: 10.2s
5:	learn: 24121501.6561716	total: 61.7ms	remaining: 10.2s
6:	learn: 23335523.4387167	total: 70.5ms	remaining: 10s
7:	learn: 22612231.5407406	total: 81ms	remaining: 10s
8:	learn: 21992256.0625865	total: 90.4ms	remaining: 9.95s
9:	learn: 21383860.4608062	total: 98.9ms	remaining: 9.79s
10:	learn: 20864274.9806168	total: 109ms	remaining: 9.79s
11:	learn: 20388680.2512976	total: 117ms	remaining: 9.66s
12:	learn: 19994329.0897817	total: 128ms	remaining: 9.69s
13:	learn: 19583434.5088057	total: 137ms	remaining: 9.63s
14:	learn: 19171176.7028096	total: 147ms	remaining: 9.65s
15:	learn: 18832888.9921903	total: 156ms	remaining: 9.61s
16:	learn: 18485945.1175031	total: 167ms	remaining: 9.

## Confusion matrix of the model
- **What is confusion matrix ?**
> The confusion matrix is a matrix used to determine the performance of the classification models for a given set of test data. It can only be determined if the true values for test data are known. The matrix itself can be easily understood, but the related terminologies may be confusing.