In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score,cross_val_predict,GridSearchCV,RandomizedSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso


from sklearn.ensemble import RandomForestRegressor

from sklearn.svm import LinearSVR,SVR,NuSVR

from sklearn.metrics import mean_squared_error,mean_absolute_error

import joblib

In [2]:
# Read the pre-split training and test tables from CSV files in the working directory.
train_set = pd.read_csv(filepath_or_buffer='train_set.csv')
test_set = pd.read_csv(filepath_or_buffer='test_set.csv')

In [3]:
# Separate the regression target (`selling_price`) from the predictor columns
# for both the training and the test table.
y_train = train_set['selling_price']
X_train = train_set.drop(columns='selling_price')

y_test = test_set['selling_price']
X_test = test_set.drop(columns='selling_price')

In [4]:
# Standardize the training features and wrap the resulting array back into a
# DataFrame that carries the original column labels.
X_train_scaled = pd.DataFrame(
    data=StandardScaler().fit_transform(X_train),
    columns=X_train.columns,
)

In [5]:
# Scale the test features with statistics learned from the TRAINING features only.
# Fitting a separate scaler on the test set would standardize it against its own
# mean/variance, so the test rows would live in a different feature space from
# the one the models are trained in (and test-set statistics would leak into
# the evaluation).
# NOTE(review): assumes X_test has the same columns, in the same order, as X_train.
X_test_scaled = pd.DataFrame(
    StandardScaler().fit(X_train).transform(X_test),
    columns=X_test.columns,
)

In [6]:
## Linear Regression

In [7]:
# Baseline: ordinary least squares, scored with 3-fold cross-validation using
# negative RMSE (values closer to zero are better).
lin_reg = LinearRegression()
lin_reg_scores = cross_val_score(
    estimator=lin_reg,
    X=X_train_scaled,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
print(lin_reg_scores.mean())

-466362.02991007944


In [8]:
## SVR

In [9]:
# Support vector regression with default hyper-parameters, scored with the same
# 3-fold negative-RMSE protocol as the other models.
svr = SVR()
svr_scores = cross_val_score(
    estimator=svr,
    X=X_train_scaled,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
svr_scores.mean()

-835069.7228353387

In [10]:
## Ridge Regression

In [11]:
# Ridge regression with the default regularization strength, scored with
# 3-fold cross-validated negative RMSE.
ridge = Ridge()
ridge_score = cross_val_score(
    estimator=ridge,
    X=X_train_scaled,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
ridge_score.mean()

-465988.1718875472

In [12]:
## Lasso Regression

In [13]:
# Lasso regression with the default regularization strength, scored with
# 3-fold cross-validated negative RMSE.
lasso = Lasso()
lasso_score = cross_val_score(
    estimator=lasso,
    X=X_train_scaled,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
lasso_score.mean()

-465988.4503259791

In [14]:
## Random Forest Regressor

In [15]:
# Random forest baseline, scored with the same 3-fold negative-RMSE protocol as
# the linear models. A random forest is stochastic (bootstrap samples, random
# feature subsets), so a fixed random_state is set to make the reported score
# reproducible across kernel restarts.
random_for = RandomForestRegressor(random_state=42)
random_for_score = cross_val_score(
    random_for,
    X_train_scaled,
    y_train,
    cv=3,
    scoring='neg_root_mean_squared_error',
)
random_for_score.mean()

-164654.06356239272

In [16]:
## From the scores above, Random Forest Regressor performs best on this dataset (lowest cross-validated RMSE).

In [17]:
## From here on we will focus on RandomForestRegressor.
## But first, let's store the model.

In [18]:
# Persist a *fitted* baseline model. cross_val_score only fits internal clones
# of the estimator, so `random_for` itself is still unfitted at this point and
# a pickle of it could not be used for prediction. Fit it on the full training
# set first, then write it to disk.
random_for.fit(X_train_scaled, y_train)
joblib.dump(random_for, 'RandomForestRegressor.pkl')

['RandomForestRegressor.pkl']

In [19]:
## Now we will use Grid Search to tune the model

In [21]:
# Search space for the first grid search: a list of two sub-grids, each of which
# is expanded into its own set of parameter combinations.
param_grid = [
    # Sub-grid 1: vary forest size, bootstrapping and max_features together.
    dict(
        n_estimators=[3, 10, 30],
        bootstrap=[True, False],
        max_features=[2, 4, 6, 8, 12, 14],
    ),
    # Sub-grid 2: same forest sizes and bootstrapping, with max_features left
    # unspecified so the estimator's own setting is used.
    dict(
        bootstrap=[True, False],
        n_estimators=[3, 10, 30],
    ),
]


In [22]:
# Base estimator handed to the grid searches below. A fixed random_state keeps
# every candidate's forest deterministic, so differences between parameter
# combinations are not confounded with seed-to-seed noise and the search result
# is reproducible on re-run.
random_reg_grid = RandomForestRegressor(random_state=42)

In [24]:
# Exhaustive 5-fold search over `param_grid`, ranked by negative MSE; train-fold
# scores are kept as well (return_train_score=True).
# n_jobs=-1 runs the candidate/fold fits in parallel on all available cores --
# the recorded run of this search was interrupted before finishing, and the
# parallel run produces the same results in less wall-clock time.
grid_search = GridSearchCV(
    random_reg_grid,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
)

In [25]:
# Run the grid search on the standardized training features.
grid_search.fit(X=X_train_scaled, y=y_train)

KeyboardInterrupt: 

In [None]:
# Display the estimator that the grid search refit with its best-scoring
# parameter combination (only available once `grid_search.fit` has completed).
grid_search.best_estimator_

In [None]:
## As you can see, the best n_estimators is 30. Since 30 is the largest value we provided, we will extend the search to larger n_estimators values.

In [26]:
# Search space for the second grid search: larger forests than in the first
# round, again expressed as two independent sub-grids.
param_grid = [
    # Sub-grid 1: vary forest size, bootstrapping and max_features together.
    dict(
        n_estimators=[30, 50, 70, 90],
        bootstrap=[True, False],
        max_features=[2, 4, 6, 8, 12, 14],
    ),
    # Sub-grid 2: a wider range of forest sizes with max_features left
    # unspecified so the estimator's own setting is used.
    dict(
        bootstrap=[True, False],
        n_estimators=[3, 10, 30, 50, 70, 90],
    ),
]

In [28]:
# Second exhaustive 5-fold search, over the enlarged `param_grid`, ranked by
# negative MSE with train-fold scores retained.
# n_jobs=-1 runs the candidate/fold fits in parallel on all available cores --
# the recorded run of this search was interrupted before finishing, and the
# parallel run produces the same results in less wall-clock time.
grid_search = GridSearchCV(
    random_reg_grid,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
)

In [29]:
# Run the second grid search on the standardized training features.
grid_search.fit(X=X_train_scaled, y=y_train)

KeyboardInterrupt: 

In [None]:
# Display the estimator that the grid search refit with its best-scoring
# parameter combination (only available once `grid_search.fit` has completed).
grid_search.best_estimator_

In [30]:
## As you can see, the best n_estimators is 90. Since 90 is the largest value we provided, we will extend the search to larger n_estimators values.

In [31]:
# Search space for the third grid search: a single sub-grid that pushes
# n_estimators further up while still varying bootstrapping and max_features.
param_grid = [
    dict(
        n_estimators=[90, 140, 180, 220, 240],
        bootstrap=[True, False],
        max_features=[2, 4, 6, 8, 12, 14],
    ),
]

In [33]:
# Third exhaustive 5-fold search, over the latest `param_grid`, ranked by
# negative MSE with train-fold scores retained.
# n_jobs=-1 runs the candidate/fold fits in parallel on all available cores --
# the recorded run of this search was interrupted before finishing, and the
# parallel run produces the same results in less wall-clock time.
grid_search = GridSearchCV(
    random_reg_grid,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
)

In [34]:
# Run the third grid search on the standardized training features.
grid_search.fit(X=X_train_scaled, y=y_train)

KeyboardInterrupt: 

In [None]:
# Display the estimator that the grid search refit with its best-scoring
# parameter combination (only available once `grid_search.fit` has completed).
grid_search.best_estimator_

In [None]:
## Finally, we found our best n_estimators value (180), so we can stop the iterative grid search.

In [None]:
## Let's check the CV score.

In [None]:
# Forest configured with the hyper-parameters selected by the grid search.
# random_state is fixed so that the cross-validated score computed for this
# model is reproducible and its comparison against the default forest is not
# dominated by seed-to-seed variation.
random_for_final = RandomForestRegressor(
    bootstrap=False,
    max_features=8,
    n_estimators=180,
    random_state=42,
)

In [None]:
# Mean 3-fold cross-validated negative RMSE of the tuned forest, using the same
# protocol as the baseline models so the numbers are directly comparable.
cross_val_score(
    estimator=random_for_final,
    X=X_train_scaled,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=3,
).mean()

In [None]:
## Wow, the tuned model performs worse than the default one (sometimes the results can be quite unexpected).

In [None]:
## Next, use RandomizedSearchCV.

In [None]:
# Base estimator for the randomized search. A fixed random_state keeps each
# candidate's forest deterministic so the search outcome is reproducible.
random_search_forest = RandomForestRegressor(random_state=42)

In [None]:
# Candidate values from which RandomizedSearchCV samples parameter settings.
param_grid_randomized = {
    'n_estimators': [100, 110, 120, 130, 140, 150, 160, 170, 180, 190],
    'bootstrap': [True, False],
    'max_features': [2, 4, 6, 8, 12, 14],
    # min_samples_split must be an int >= 2 (or a float fraction in (0, 1]).
    # The value 1 is rejected by RandomForestRegressor, so every sampled
    # candidate containing it would fail to fit and waste a search iteration;
    # it is therefore excluded from the candidates.
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}
    


In [None]:
# Randomized search over `param_grid_randomized`, ranked by negative RMSE with
# train-fold scores retained. The number of sampled candidates and the CV
# splitting are left at the library defaults.
# random_state fixes which parameter settings get sampled, so a re-run explores
# the same candidates; n_jobs=-1 fits candidates/folds in parallel on all cores.
random_search = RandomizedSearchCV(
    random_search_forest,
    param_grid_randomized,
    return_train_score=True,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Run the randomized search on the standardized training features.
random_search.fit(X=X_train_scaled, y=y_train)

In [None]:
# Display the estimator that the randomized search refit with its best-scoring
# sampled parameter setting (only available once `random_search.fit` has completed).
random_search.best_estimator_

In [None]:
# The original line was a syntax error: a constructor call had been pasted into
# the middle of the identifier `random_search_forest_regressor`. Restored as an
# assignment to that name, which also avoids overwriting the fitted
# `random_search` object.
# NOTE(review): the parameter values presumably come from
# `random_search.best_estimator_` -- confirm against a completed search run.
random_search_forest_regressor = RandomForestRegressor(
    max_features=12,
    min_samples_split=3,
    n_estimators=150,
)