In [None]:
import pandas as pd
import numpy as np

#For model building
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

#For evaluation
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
# from sklearn.metrics import mean_squared_error

# Load the raw listings; the path is relative to the notebook's working directory.
df = pd.read_csv('Bengaluru_House_Data.csv')
# Last expression of the cell: shows (rows, columns) of the freshly loaded frame.
df.shape

: 

In [None]:
df.isna().sum()

: 

In [None]:
# Remove the 'society' column, then show the remaining per-column missing-value counts.
df = df.drop(columns=['society'])
df.isna().sum()

: 

In [None]:
df.isna().sum()

: 

In [None]:
df=df.dropna()

: 

In [None]:
df.isna().sum()

: 

In [None]:
df.shape

: 

In [None]:
df.head()

: 

In [None]:
# BHK is the integer that precedes the first space in the 'size' text.
df['BHK'] = df['size'].map(lambda size_text: int(size_text.split(' ', 1)[0]))
df.head()

: 

In [None]:
df=df.drop('size',axis=1)

: 

In [None]:
df.shape

: 

In [None]:
df['total_sqft'].unique()

: 

In [None]:
# Creating a function for checking float value
def is_float(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Only the errors a failed conversion can raise are caught, so
    KeyboardInterrupt / SystemExit are no longer swallowed as they were
    with a bare ``except``.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable text;
        # OverflowError: an integer too large to represent as a float.
        return False
    return True

#df['total_sqft']=df['total_sqft'].apply(is_float)

: 

In [None]:
df[~df['total_sqft'].apply(is_float)].head(10)

: 

In [None]:
def preprocess_sqft(x):
    """Convert a raw ``total_sqft`` entry to a float.

    * ``"a - b"`` where both sides are numeric -> the midpoint ``(a + b) / 2``.
    * anything ``float()`` accepts (numeric text or a number) -> that float.
    * everything else (units such as ``"34.46Sq. Meter"``, ``None``, ...) -> ``None``.

    The function never raises: previously a non-string input, or a string with
    exactly one ``-`` whose sides were not both numeric (``"-5"``, ``"1e-3"``,
    ``"abc - def"``), raised instead of being handled.
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range; fall through and try the value as a whole.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None
        
        

# Replace each raw total_sqft entry with the value preprocess_sqft returns for it.
df['total_sqft'] = df['total_sqft'].map(preprocess_sqft)
df.shape

: 

In [None]:
df.total_sqft.head()

: 

In [None]:
df[~df['total_sqft'].apply(is_float)].head(10)

: 

In [None]:
df[~df.total_sqft.notnull()]

: 

In [None]:
df.total_sqft.isnull().sum() 

: 

In [None]:
df=df.dropna()

: 

In [None]:
df[~df.total_sqft.notnull()]

: 

In [None]:
df.shape

: 

In [None]:
# Creating a column for price per sq.feet
# NOTE(review): the 100000 multiplier suggests 'price' is quoted in lakhs
# (1 lakh = 100,000) — confirm against the dataset description.
df['price_per_sqft']=df['price']*100000/df['total_sqft']
df.head()

: 

In [None]:
df.shape

: 

In [None]:
# Trim surrounding whitespace from every location name, then count listings per
# location (value_counts sorts by count, largest first, by default).
df.location = df.location.map(str.strip)
loc_stats = df.location.value_counts()
df.location

: 

In [None]:
# Number of locations that have more than 10 listings.
# NOTE(review): the filter built in the following cells uses 15 as its cut-off, not 10.
len(loc_stats[loc_stats>10])

: 

In [None]:
loc_stats_less_than_15=loc_stats[loc_stats<15]

: 

In [None]:


# Drop every listing whose location is one of the rare ones collected in
# loc_stats_less_than_15. A location literally named "other" is excluded too,
# matching the earlier relabel-then-drop behaviour.
rare_location = df.location.isin(loc_stats_less_than_15.index)
# .copy() makes df an independent frame, so the column assignments in later
# cells do not write into a slice of the previous frame.
df = df[~rare_location & (df.location != "other")].copy()
loc_stats = df.location.value_counts(ascending=False)
loc_stats

: 

In [None]:
# Not the last expression of the cell, so this value is not displayed.
df.shape
# Binds a second name to the same DataFrame object (no copy is made).
data = df

: 

In [None]:
# Collapse availability to two labels: keep "Ready To Move", replace everything else.
df.availability = df.availability.where(df.availability == "Ready To Move", 'Not Ready to Move Yet')
df.availability.unique()

: 

In [None]:
df.location[df['location']!='other']

: 

In [None]:
df.location.unique()

: 

In [None]:
df.shape

: 

In [None]:
df.price_per_sqft.describe()

: 

In [None]:
# Removing outliers
def remove_pps_outliers(df):
    """Keep, per location, the rows whose price_per_sqft is within one std of the mean.

    For every ``location`` group the mean ``m`` and the population standard
    deviation ``st`` (ddof=0) of ``price_per_sqft`` are computed, and only rows
    with ``m - st < price_per_sqft <= m + st`` are kept. The lower bound is
    strict, so a group whose prices are all identical (``st == 0``) keeps no rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        Surviving rows ordered by location group, with a fresh 0..n-1 index.
        For an input without any group an empty frame with the input's
        columns is returned.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        m = pps.mean()
        st = pps.std(ddof=0)  # population std
        kept.append(subdf[(pps > (m - st)) & (pps <= (m + st))])
    if not kept:
        # pd.concat rejects an empty list; preserve the column layout instead.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)
    
df = remove_pps_outliers(df)
df.shape

: 

In [None]:
def remove_bhk_outliers(df):
    """Drop listings priced below the mean of the next-smaller BHK in the same location.

    Within each ``location``, a row with ``BHK == k`` is removed when the
    ``BHK == k - 1`` group of that location has more than 5 rows and the row's
    ``price_per_sqft`` is below that group's mean ``price_per_sqft``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``BHK`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the excluded rows; the remaining index labels are unchanged.
    """
    # Labels are collected in a plain list so they keep their original type;
    # appending to a float ndarray would coerce integer labels to float64.
    exclude_labels = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('BHK'))
        bhk_stats = {
            bhk: {'mean': bhk_df.price_per_sqft.mean(), 'count': bhk_df.shape[0]}
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                below_smaller_mean = (bhk_df.price_per_sqft < stats['mean']).to_numpy()
                exclude_labels.extend(bhk_df.index[below_smaller_mean])
    return df.drop(index=exclude_labels)

df = remove_bhk_outliers(df)
df.shape

: 

In [None]:
# 'data' refers to the same DataFrame object as 'df' at this point (no copy is made).
data = df
# Written relative to the working directory; with the default arguments to_csv
# also writes the row index as the first column.
data.to_csv("Cleaned_data.csv")

: 

In [None]:
# One-hot encode the categorical area_type column: one indicator column per distinct value.
dummies_a = pd.get_dummies(df.area_type)
dummies_a

: 

In [None]:
dummies = pd.get_dummies(df.location)
dummies

: 

In [None]:
dummies_av = pd.get_dummies(df.availability)
dummies_av

: 

In [None]:
df=pd.concat([df,dummies],axis='columns')
# imp

: 

In [None]:

df=pd.concat([df,dummies_a],axis='columns')
df=pd.concat([df,dummies_av],axis='columns')
df.head()

: 

In [None]:
# Remove the three original categorical columns in a single call.
df = df.drop(columns=['location', 'availability', 'area_type'])

: 

In [None]:
df.shape

: 

In [None]:
# Model building
# price_per_sqft is computed from the target (price * 100000 / total_sqft), so
# leaving it in X together with total_sqft lets a model reconstruct price
# exactly. It is excluded from the features along with the target itself.
X = df.drop(columns=["price", "price_per_sqft"])
Y = df.price

: 

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)
X_train.shape
X_test.shape

: 

In [None]:
# Baseline random forest on the one-hot encoded frame. random_state fixes the
# forest's internal randomness so the printed score is the same on every run.
rf_clf = RandomForestRegressor(random_state=42)
rf_clf.fit(X_train, Y_train)
rf_Y_pred = rf_clf.predict(X_test)
# For a regressor, score() is the R^2 of the predictions on the given data.
rf_score = rf_clf.score(X_test, Y_test)
print(rf_score)

: 

In [None]:
# EXTRAS










: 

In [None]:
data.head()

: 

In [None]:
# Rebinding instead of inplace=True, plus errors='ignore', lets this cell be
# re-run without a KeyError once the column has already been removed.
data = data.drop(columns=['price_per_sqft'], errors='ignore')

: 

In [None]:
data.head()

: 

In [None]:
data.to_csv("Cleaned_data.csv")

: 

In [None]:
X = data.drop(columns=['price'])
y = data['price']

: 

In [None]:
# Keep only the first row for each distinct (location, availability, area_type) combination.
data_temp = data.drop_duplicates(subset=["location","availability","area_type"], keep = 'first')

: 

In [None]:
X_temp = data_temp.drop(columns=['price'])
y_temp = data_temp['price']

: 

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# DataFrame.append / Series.append were removed in pandas 2.0; pd.concat is the
# supported way to stack the extra rows under the training split.
# NOTE(review): X_temp / y_temp are taken from the full data set, so rows that
# were assigned to X_test can also end up in the training data here
# (train/test leakage). Confirm this is acceptable for the reported scores.
X_train = pd.concat([X_train, X_temp], ignore_index=True)
y_train = pd.concat([y_train, y_temp], ignore_index=True)
print(X_train.shape)
print(X_test.shape)

: 

In [None]:
# One-hot encode the three categorical columns and pass every other column through.
# handle_unknown='ignore' encodes a category that was not present at fit time as
# all zeros instead of raising when the fitted pipeline is asked to predict.
column_trans = make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['location', 'area_type', 'availability']),
    remainder='passthrough',
)
scaler = StandardScaler()

: 

In [None]:

# One estimator per candidate model, each wrapped as: column_trans -> scaler -> model.
# Every pipeline is built from the same column_trans and scaler objects.
rf = RandomForestRegressor()
pipe_rf = make_pipeline(column_trans, scaler, rf)

lr = LinearRegression()
pipe_lr = make_pipeline(column_trans, scaler, lr)

dt = DecisionTreeRegressor()
pipe_dt = make_pipeline(column_trans, scaler, dt)

lasso = linear_model.Lasso()
pipe_lasso = make_pipeline(column_trans, scaler, lasso)

ridge = linear_model.Ridge()
pipe_ridge = make_pipeline(column_trans, scaler, ridge)

: 

In [None]:
# Fit every candidate pipeline on the same training split.
# NOTE(review): rf and dt are created without random_state, so their fitted
# trees presumably depend on NumPy's global RNG state and therefore on the
# order of these calls — confirm before reordering them.
pipe_rf.fit(X_train,y_train)
pipe_lr.fit(X_train,y_train)
pipe_dt.fit(X_train,y_train)
pipe_lasso.fit(X_train,y_train)
pipe_ridge.fit(X_train,y_train)

: 

In [None]:
y_pred_lr = pipe_lr.predict(X_test)
y_pred_dt = pipe_dt.predict(X_test)
y_pred_rf = pipe_rf.predict(X_test)
y_pred_lasso = pipe_lasso.predict(X_test)
y_pred_ridge = pipe_ridge.predict(X_test)

: 

In [None]:
print("Linear Regressor model accuracy is: ")
r2_score(y_test,y_pred_lr)

: 

In [None]:
# The displayed value is the R^2 score of the decision-tree pipeline on the test split.
print("Decision Tree model accuracy is: ")
r2_score(y_test,y_pred_dt)

: 

In [None]:
print("Lasso model accuracy is: ")
r2_score(y_test,y_pred_lasso)

: 

In [None]:
print("Ridge model accuracy is: ")
r2_score(y_test,y_pred_ridge)

: 

In [None]:
print("Random Forest model accuracy is: ")
r2_score(y_test,y_pred_rf)

: 

In [None]:
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)
# Sweep forest size and tree depth together: i*10 trees of depth i, for i = 1..10.
for i in range(1, 11):
  print(f"With {i*10} estimators and max_depth {i} of a decision tree is:")
  clf2 = RandomForestRegressor(n_estimators=i*10, max_depth=i, random_state=i)
  pipe = make_pipeline(column_trans, scaler, clf2)
  pipe.fit(X_train, y_train)
  y_pred = pipe.predict(X_test)
  # ':.2f' prints two decimals. The previous ':2f' was a minimum field width of 2
  # with the default six decimals.
  print(f"Accuracy: {r2_score(y_test,y_pred)*100:.2f}%")

: 

In [None]:
# Persist the fitted random-forest pipeline as a pickle file.
import pickle

# The context manager closes the file handle even if pickling raises.
with open("RFModel.pkl", 'wb') as model_file:
    pickle.dump(pipe_rf, model_file)

: 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

def find_best_model_using_gridsearchcv(X,y):
    """Grid-search each candidate regressor and report its best cross-validated score.

    Every candidate is placed at the end of a pipeline (one-hot encoding of
    'location', 'area_type' and 'availability' -> standard scaling -> model)
    and the search runs over the whole pipeline. The encoder and scaler are
    therefore re-fitted on the training part of every split, rather than once
    on all rows before splitting.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the 'location', 'area_type' and
        'availability' columns; all other columns are passed through.
    y : array-like
        Target values aligned with ``X``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with columns 'model', 'best_score' and
        'best_params'. 'best_params' is keyed by the estimator's own parameter
        names (e.g. 'n_estimators'), without any pipeline step prefix.
    """
    algos = {
        'RandomForest': {
            # Fixed seed so repeated searches give the same ranking.
            'model': RandomForestRegressor(random_state=0),
            'params': {
              'n_estimators': [100, 300, 600, 1000],
              'max_depth': [2,4,8,20],
            }
        }
    }
    categorical_columns = ['location','area_type','availability']
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        # handle_unknown='ignore': a validation split may contain a category
        # that is absent from the corresponding training split.
        encoder = make_column_transformer(
            (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_columns),
            remainder='passthrough',
        )
        pipe_gcv = make_pipeline(encoder, StandardScaler(), config['model'])
        # Pipeline parameters are addressed as '<step name>__<parameter>'.
        prefix = f"{pipe_gcv.steps[-1][0]}__"
        param_grid = {prefix + name: values for name, values in config['params'].items()}
        gs = GridSearchCV(pipe_gcv, param_grid, cv=cv, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            # Strip the step prefix so callers see the same keys as before.
            'best_params': {name[len(prefix):]: value for name, value in gs.best_params_.items()}
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])



find_best_model_using_gridsearchcv(X,y)

: 