In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from plotly.subplots import make_subplots
from utils import outlier as ot
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Load the raw rental listings; "Posted On" is parsed into datetimes while reading.
DATA_FILE = "data/House_Rent_Dataset.csv"
df = pd.read_csv(DATA_FILE, parse_dates=["Posted On"])

df.head()

In [None]:
# "Floor" is a string of the form "<rental floor> out of <total floors>".
# Split every value once with the vectorised string accessor instead of running
# two Python-level lambdas; missing values propagate as NaN rather than raising.
floor_parts = df["Floor"].str.split(" out of ")

# First piece is the floor being rented, last piece is the building height.
# Values without the separator yield a one-element list, so both columns
# receive the same text for those rows (same as the original split logic).
df["Rental Floor"] = floor_parts.str[0]
df["Total Number of Floor"] = floor_parts.str[-1]

df.sample(5)

In [None]:
# Distinct raw labels in the rental-floor column (order of first appearance).
pd.unique(df["Rental Floor"])

In [None]:
# Distinct raw labels in the total-floors column (order of first appearance).
pd.unique(df["Total Number of Floor"])

In [None]:
# Turn the textual floor labels into integers so both columns can be cast to int:
# ground level is 0 and the two basement levels are negative.
rental_floor_codes = {"Ground": 0, "Upper Basement": -1, "Lower Basement": -2}
df["Rental Floor"] = df["Rental Floor"].replace(rental_floor_codes).astype(int)
df["Total Number of Floor"] = (
    df["Total Number of Floor"].replace({"Ground": 0}).astype(int)
)

# The original combined column is no longer needed once both parts are numeric.
df = df.drop(columns="Floor")
df.sample()

In [None]:
# One histogram per engineered floor column, drawn side by side.
num_var = ["Rental Floor", "Total Number of Floor"]
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 7))

for ax, column in zip(axes.ravel(), num_var):
    sns.histplot(data=df, x=column, ax=ax)

In [None]:
# Frequency of every level of the four categorical columns on a 2x2 grid.
cat_var = ["Area Type", "Furnishing Status", "Tenant Preferred", "Point of Contact"]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

for position, column in enumerate(cat_var):
    sns.countplot(data=df, x=column, ax=axes.flat[position])

In [9]:
# Fold two category labels into their neighbours everywhere in the frame.
# NOTE(review): presumably these levels are too rare to keep on their own
# (see the count plots in the previous cell) - confirm.
label_merges = {"Built Area": "Super Area", "Contact Builder": "Contact Owner"}
df.replace(label_merges, inplace=True)

In [None]:
# Average of every numeric column per "Area Type", restricted to listings whose
# BHK and Bathroom counts add up to 4.
bhk_bath_is_four = (df["BHK"] + df["Bathroom"]) == 4
area_mean = (
    df[bhk_bath_is_four]
    .groupby("Area Type", as_index=False)
    .mean(numeric_only=True)
)
area_mean

In [None]:
# Mean "Size" of each area type, addressed by its "Area Type" label.
# Looking rows up by label (instead of positional rows 0 and 1) keeps the two
# values correct even if the set or ordering of area types changes, and raises a
# KeyError instead of silently using the wrong row when a type is missing.
size_by_area_type = area_mean.set_index("Area Type")["Size"]
carpet_area_size = size_by_area_type["Carpet Area"]
super_area_size = size_by_area_type["Super Area"]

# Area percentage multiplier: factor that scales a carpet-area size up to the
# comparable super-area size (relative difference + 1).
area_per_multi = (super_area_size - carpet_area_size) / carpet_area_size + 1
area_per_multi

In [None]:
# Put every listing on the same size basis: carpet-area sizes are scaled by
# `area_per_multi`, all other rows keep their original "Size".
carpet_rows = df["Area Type"] == "Carpet Area"
fixed_size = df.loc[carpet_rows, "Size"] * area_per_multi

# Assign the finished column in one step. The previous
# `df["Fixed Size"].fillna(..., inplace=True)` was a chained in-place call on a
# temporary Series: pandas 2.x warns about it and with Copy-on-Write it no longer
# updates `df`, which would leave NaN in every non-carpet row.
df["Fixed Size"] = (
    np.round(fixed_size, 2)
    .reindex(df.index)      # NaN for rows that are not carpet area
    .fillna(df["Size"])     # those rows fall back to the unscaled size
)
df.head()

In [13]:
# Remove the locality text and posting date; they are not used from here on.
df = df.drop(columns=["Area Locality", "Posted On"])

In [None]:
# Share (pie, left) and absolute counts (histogram, right) of "Point of Contact".
contact = df["Point of Contact"]

contact_pie = go.Pie(
    labels=contact,
    name="Point of Contact",
    textinfo='percent+label',
    textposition='inside',
    marker=dict(line=dict(color='white', width=1)),
)
contact_hist = go.Histogram(x=contact, name="Point of Contact")

# The pie needs a 'domain' cell, the histogram a regular cartesian ('xy') cell.
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'xy'}]])
fig.add_trace(contact_pie, row=1, col=1)
fig.add_trace(contact_hist, row=1, col=2)

fig.update_layout(showlegend=False, title="Point of Contact", title_x=.5)

fig.show()

In [None]:
# Number of listings per city, bars ordered from most to fewest listings and
# labelled with their count.
city_hist = go.Histogram(x=df["City"], name="City", texttemplate="%{y}")

fig = go.Figure(data=[city_hist])
fig.update_xaxes(categoryorder='total descending')
fig.update_layout(title="Total Number of Rental Houses in Cities", title_x=.5)
fig.show()

In [None]:
# Rent per unit of (normalised) size, stored with two decimals.
square_feet_rent = df["Rent"].div(df["Fixed Size"])
df["Square Feet Rent"] = square_feet_rent.round(2)
df.head()

In [None]:
# Box plot of rent per square foot, one box per city, to eyeball outliers.
fig = px.box(data_frame=df, x="Square Feet Rent", color="City")
fig.show()

In [None]:
# Flag outliers in "Square Feet Rent" with the project helper `outlier_detect_MAD`
# (imported from `utils.outlier` as `ot`), using a threshold of 3.5.
# NOTE(review): the helper is not visible in this notebook - presumably it returns
# something usable as a row selector with `df.loc` (it is used that way in the
# next cell); confirm its exact return type.
index = ot.outlier_detect_MAD(data=df, col="Square Feet Rent", threshold=3.5)

In [None]:
# Display the "Square Feet Rent" values of the rows selected by `index`,
# sorted in ascending order.
df.loc[index, "Square Feet Rent"].sort_values()

In [20]:
# Discard listings whose rent per square foot exceeds the cut-off.
# NOTE(review): 57.94 is hard-coded - presumably read off the sorted outlier
# values shown in the previous cell; confirm if the data changes.
SQFT_RENT_CUTOFF = 57.94
above_cutoff = df.index[df["Square Feet Rent"] > SQFT_RENT_CUTOFF]
df = df.drop(index=above_cutoff)

In [21]:
# "Fixed Size" has replaced the raw size / area-type pair, so drop both.
df = df.drop(columns=["Size", "Area Type"])

In [None]:
# Pairwise correlation of the numeric columns, annotated with the coefficients.
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(data=corr_matrix, annot=True);

In [None]:
# Target is the monthly rent; every remaining column is used as a feature.
# NOTE(review): X still contains "Square Feet Rent" (computed earlier as
# Rent / Fixed Size) together with "Fixed Size", so the target can be rebuilt from
# the features. This looks like target leakage and would make the scores below
# optimistic - confirm whether "Square Feet Rent" should be a model input.
y = df['Rent']
X = df.drop(columns='Rent')

# 75 % / 25 % shuffled hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=101,
)
X_train.shape, X_test.shape

In [None]:
# Peek at the first five training rows.
X_train.head(n=5)

In [None]:
# Baseline 1: XGBoost regressor with default hyper-parameters.
# Columns are addressed by position in X.
# NOTE(review): the positions assume the column order produced by the preceding
# cells - re-check them whenever a feature is added or removed.
numeric_positions = [0, 4, 6, 7, 8, 9]
categoric_positions = [1, 2, 3, 5]

num_pipe = Pipeline(steps=[("scaler", StandardScaler())])
cat_pipe = Pipeline(steps=[("encode", OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", num_pipe, numeric_positions),
        ("categoric", cat_pipe, categoric_positions),
    ]
)

pipeline_xgbr = Pipeline(steps=[('prep', preprocessor), ('algo', XGBRegressor())])
pipeline_xgbr.fit(X_train, y_train)

# Evaluate on the hold-out set.
y_pred = pipeline_xgbr.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Score    : {r2}")
print(f"mae      : {mae}")
print(f"root mse : {rmse}")

In [None]:
# Baseline 2: LightGBM regressor with default hyper-parameters, using the same
# preprocessing layout (scaled numeric columns, one-hot encoded categoricals).
# NOTE(review): column positions assume the column order produced by the
# preceding cells - re-check them whenever a feature is added or removed.
numeric_positions = [0, 4, 6, 7, 8, 9]
categoric_positions = [1, 2, 3, 5]

cat_pipe = Pipeline(steps=[("encode", OneHotEncoder(handle_unknown="ignore"))])
num_pipe = Pipeline(steps=[("scaler", StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", num_pipe, numeric_positions),
        ("categoric", cat_pipe, categoric_positions),
    ]
)

pipeline_lgbm = Pipeline(
    steps=[
        ('prep', preprocessor),
        ('algo', LGBMRegressor()),
    ]
)
pipeline_lgbm.fit(X_train, y_train)

# Evaluate on the hold-out set.
y_pred = pipeline_lgbm.predict(X_test)
holdout_r2 = r2_score(y_test, y_pred)
holdout_mae = mean_absolute_error(y_test, y_pred)
holdout_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Score    : {holdout_r2}")
print(f"mae      : {holdout_mae}")
print(f"root mse : {holdout_rmse}")

In [27]:
from jcopml.tuning.space import Integer, Real

In [28]:
# Search space for the randomized search; every key targets the 'algo' step of
# the pipeline via the `algo__` prefix.
# NOTE(review): for prior='log-uniform' the low/high values are presumably
# exponents rather than raw values (jcopml convention) - confirm.
params = {
    # tree depth, step size and number of boosting rounds
    'algo__max_depth': Integer(low=1, high=10),
    'algo__learning_rate': Real(low=-2, high=0, prior='log-uniform'),
    'algo__n_estimators': Integer(low=100, high=200),
    # row / column sub-sampling
    'algo__subsample': Real(low=0.3, high=0.8, prior='uniform'),
    'algo__colsample_bytree': Real(low=0.1, high=1, prior='uniform'),
    # L1 / L2 regularisation strength
    'algo__reg_alpha': Real(low=-3, high=1, prior='log-uniform'),
    'algo__reg_lambda': Real(low=-3, high=1, prior='log-uniform'),
}

In [None]:
# Hyper-parameter tuning: randomized search over `params` for the XGBoost
# pipeline (50 sampled candidates, 3-fold CV, fixed seed).
# NOTE(review): column positions assume the column order produced by the
# preceding cells - re-check them whenever a feature is added or removed.
numeric_positions = [0, 4, 6, 7, 8, 9]
categoric_positions = [1, 2, 3, 5]

num_pipe = Pipeline(steps=[("scaler", StandardScaler())])
cat_pipe = Pipeline(steps=[("encode", OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", num_pipe, numeric_positions),
        ("categoric", cat_pipe, categoric_positions),
    ]
)

pipeline = Pipeline(steps=[('prep', preprocessor), ('algo', XGBRegressor())])

model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=params,
    n_iter=50,
    cv=3,
    n_jobs=1,
    verbose=1,
    random_state=101,
)
model.fit(X_train, y_train)

# `predict` on the fitted search uses the refitted best estimator.
y_pred = model.predict(X_test)

tuned_r2 = r2_score(y_test, y_pred)
tuned_mae = mean_absolute_error(y_test, y_pred)
tuned_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Score    : {tuned_r2}")
print(f"mae      : {tuned_mae}")
print(f"root mse : {tuned_rmse}")

In [None]:
# First two feature rows, as a template for the hand-built sample below.
X_train.head(n=2)

In [None]:
# Matching first two target values.
y_train.head(n=2)

In [None]:
# Smoke-test the LightGBM pipeline on one hand-written listing.
# The values are given in the same order as the training columns.
columns = X_train.columns

sample_listing = [2, "Hyderabad", "Unfurnished", "Bachelors/Family", 2, "Contact Owner", 4, 12, 1000.0, 12.25]
data = pd.DataFrame(data=[sample_listing], columns=columns)

pred = pipeline_lgbm.predict(data)
pred

In [33]:
import pickle

# Destination read by the FastAPI app.
TARGET_MODEL_FILE = "./FastAPI/app/lgbm_model.pkl"

# Persist the fitted LightGBM pipeline (preprocessing + default-parameter model).
# Note that this is `pipeline_lgbm`, not the tuned XGBoost search stored in `model`.
with open(TARGET_MODEL_FILE, mode="wb") as model_file:
    pickle.dump(pipeline_lgbm, model_file)