In [1]:
import pandas as pd

# Read the listings (the first CSV column becomes the row index) and work on a
# reproducible 10,000-row random sample (fixed seed).
data = pd.read_csv("house_prices.csv", index_col=0).sample(10000, random_state=0)
data

Unnamed: 0,property_type,price,location,city,baths,purpose,bedrooms,Area_in_Marla
108839,House,13800000,Pak Arab Housing Society,Lahore,3,For Sale,3,5.0
97355,House,17500000,Marghzar Officers Colony,Lahore,6,For Sale,6,10.0
125129,House,12500000,Adiala Road,Rawalpindi,5,For Sale,5,10.0
155467,Lower Portion,47000,Satellite Town,Rawalpindi,3,For Rent,3,7.0
81132,House,7800000,Shalimar Housing Scheme,Lahore,4,For Sale,3,4.0
...,...,...,...,...,...,...,...,...
122491,House,19000000,Lake City,Lahore,5,For Sale,4,10.0
44101,Upper Portion,40000,Korang Town,Islamabad,5,For Rent,4,20.0
99634,House,42500000,DHA Defence,Lahore,5,For Sale,4,10.0
147606,Flat,6800000,Bahria Town Karachi,Karachi,2,For Sale,2,4.2


In [2]:
# Keep only the listings whose area is strictly positive.
has_positive_area = data["Area_in_Marla"] > 0
data = data[has_positive_area]
data

Unnamed: 0,property_type,price,location,city,baths,purpose,bedrooms,Area_in_Marla
108839,House,13800000,Pak Arab Housing Society,Lahore,3,For Sale,3,5.0
97355,House,17500000,Marghzar Officers Colony,Lahore,6,For Sale,6,10.0
125129,House,12500000,Adiala Road,Rawalpindi,5,For Sale,5,10.0
155467,Lower Portion,47000,Satellite Town,Rawalpindi,3,For Rent,3,7.0
81132,House,7800000,Shalimar Housing Scheme,Lahore,4,For Sale,3,4.0
...,...,...,...,...,...,...,...,...
122491,House,19000000,Lake City,Lahore,5,For Sale,4,10.0
44101,Upper Portion,40000,Korang Town,Islamabad,5,For Rent,4,20.0
99634,House,42500000,DHA Defence,Lahore,5,For Sale,4,10.0
147606,Flat,6800000,Bahria Town Karachi,Karachi,2,For Sale,2,4.2


In [3]:
from sklearn.model_selection import train_test_split

# 70% train; the remaining 30% is halved into validation and test splits.
# Both splits use a fixed seed so the partition is reproducible.
train_data, holdout_data = train_test_split(data, test_size=0.3, random_state=0)
valid_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=0)
len(train_data), len(valid_data), len(test_data)

(6999, 1500, 1500)

In [4]:
# Name of the target column; the cells below use it to select the label from
# each split.
label = "price"
train_data[label]

57800     29000000
166018     6000000
150291      100000
133319    10500000
119191     9800000
            ...   
101191    14700000
48738     43500000
69691     33500000
160108       53000
124749       30000
Name: price, Length: 6999, dtype: int64

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

import numpy as np

# Target transform: log2 of the price followed by standardisation. Prices in
# this dataset span several orders of magnitude (the samples shown above range
# from tens of thousands to tens of millions), so the model is trained on the
# compressed, standardised value. `inverse_func` lets
# `label_pipeline.inverse_transform` map predictions back to prices.
label_pipeline = make_pipeline(
    FunctionTransformer(np.log2, inverse_func=np.exp2),
    StandardScaler(),
)

# Read the raw prices from `data` (looked up by row index) instead of from the
# split's own label column. The splits' label columns are overwritten below, so
# transforming them in place would make this cell non-idempotent: a second run
# would take log2 of already-standardised (partly negative) values and refit
# the scaler on the result. `data` itself is never modified, so re-running this
# cell always yields the same labels.
# NOTE(review): assumes the row index of `data` is unique — confirm.
train_data[label] = label_pipeline.fit_transform(data.loc[train_data.index, [label]])
valid_data[label] = label_pipeline.transform(data.loc[valid_data.index, [label]])
test_data[label] = label_pipeline.transform(data.loc[test_data.index, [label]])

train_data[label]

57800     0.983117
166018    0.356093
150291   -1.273352
133319    0.578806
119191    0.551349
            ...   
101191    0.712714
48738     1.144482
69691     1.040525
160108   -1.526018
124749   -1.752503
Name: price, Length: 6999, dtype: float64

In [6]:
# Model inputs: two text columns followed by three numeric columns.
features = ["city", "location", "Area_in_Marla", "bedrooms", "baths"]

train_data[features]

Unnamed: 0,city,location,Area_in_Marla,bedrooms,baths
57800,Karachi,Cantt,11.4,3,3
166018,Lahore,Green Cap Housing Society,3.0,4,4
150291,Karachi,DHA Defence,20.0,2,2
133319,Rawalpindi,Bahria Town Rawalpindi,5.0,3,4
119191,Lahore,Canal Garden,5.0,3,3
...,...,...,...,...,...
101191,Rawalpindi,Bahria Town Rawalpindi,7.0,5,5
48738,Lahore,DHA Defence,20.0,5,6
69691,Karachi,Gulshan-e-Iqbal Town,12.0,3,3
160108,Karachi,Cantt,11.4,3,3


In [7]:
from sklearn.dummy import DummyRegressor

# Reference model that always predicts the mean training label; any real model
# should beat its score on the test split.
# (Use DummyClassifier() instead for classification.)
baseline = DummyRegressor(strategy="mean").fit(train_data[features], train_data[label])

baseline_score = baseline.score(test_data[features], test_data[label])
print(f"{baseline_score:,.3f}")

-0.002


In [8]:
# Distinct city names seen in the training split, in order of first appearance,
# as a plain Python list.
cities = list(train_data["city"].unique())
cities

['Karachi', 'Lahore', 'Rawalpindi', 'Islamabad', 'Faisalabad']

In [9]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer as TfidfVec
from sklearn.linear_model import SGDRegressor

def create_pipeline():
    """Build a new, unfitted preprocessing + SGD regression pipeline.

    Column handling:
      * ``city``: one-hot encoded. A city that was not present when the
        encoder was fitted (possible inside a cross-validation fold or at
        inference time) is encoded as all zeros instead of raising.
      * ``location``: TF-IDF over 1- to 3-grams, keeping terms that occur in
        at least 5 rows and in at most half of the rows. The column is given
        as a plain string (not a list) so the vectorizer receives a 1-D
        sequence of documents.
      * ``Area_in_Marla``, ``bedrooms``, ``baths``: standardised per column.
      * Any other column is passed through unchanged.

    Returns:
        An unfitted scikit-learn ``Pipeline`` ending in
        ``SGDRegressor(random_state=0)``. A fresh object is returned on every
        call, so callers do not share fitted state.
    """
    numeric_features = ["Area_in_Marla", "bedrooms", "baths"]
    return make_pipeline(
        make_column_transformer(
            (OneHotEncoder(handle_unknown="ignore"), ["city"]),
            (TfidfVec(ngram_range=(1, 3), min_df=5, max_df=0.5), "location"),
            # One scaler over all numeric columns; statistics are still
            # computed independently for each column.
            (StandardScaler(), numeric_features),
            remainder="passthrough",
        ),
        SGDRegressor(random_state=0),
    )

# Instantiate an unfitted pipeline and display it.
pipeline = create_pipeline()
pipeline

In [10]:
# Fit on the training split, then report the score on that same split as a
# quick sanity check (not a generalisation estimate).
X_train, y_train = train_data[features], train_data[label]
train_score = pipeline.fit(X_train, y_train).score(X_train, y_train)
print(f"{train_score:,.3f}")

0.372


In [11]:
# Number of columns the final estimator receives after preprocessing.
pipeline[-1].n_features_in_

458

In [12]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyper-parameters for the final SGDRegressor step, addressed through
# the pipeline's `<step>__<param>` naming.
params = {
    "sgdregressor__penalty": ["l1", "l2", "elasticnet"],
    "sgdregressor__alpha": [0.00001, 0.0001, 0.001],
    "sgdregressor__l1_ratio": [0.01, 0.1, 1],
}

# Try 10 random combinations with 5-fold cross-validation on the training split.
search = RandomizedSearchCV(
    create_pipeline(),
    params,
    n_iter=10,
    cv=5,
    n_jobs=2,
    random_state=0,
)
search.fit(train_data[features], train_data[label])

print(f"{search.best_score_:,.3f}", search.best_params_)

0.363 {'sgdregressor__penalty': 'elasticnet', 'sgdregressor__l1_ratio': 0.01, 'sgdregressor__alpha': 1e-05}


In [13]:
# Score the best pipeline found by the search on the held-out test split.
test_score = search.best_estimator_.score(test_data[features], test_data[label])
print(f"{test_score:,.3f}")

0.365


In [14]:
import joblib

# Persist the artifacts the app needs.
# Save the tuned model that was just scored on the test split
# (`search.best_estimator_`), not the untuned `pipeline` fitted earlier, so the
# deployed model is the one whose test score is reported above.
joblib.dump(search.best_estimator_, "pipeline.joblib")
joblib.dump(label_pipeline, "label_pipeline.joblib")
joblib.dump(cities, "cities.joblib")

['cities.joblib']

In [15]:
# Reload the artifacts from disk to check that they round-trip, then display
# the loaded pipeline. From here on `pipeline` refers to the loaded object.
pipeline = joblib.load("pipeline.joblib")
label_pipeline = joblib.load("label_pipeline.joblib")
cities = joblib.load("cities.joblib")
pipeline

In [16]:
%%writefile app.py
# !pip install gradio ipywidgets
import pandas as pd
import gradio as gr
import joblib

# "Artifacts"
# Loaded once at import time from the current working directory: the fitted
# feature/model pipeline, the fitted label transform, and the city names
# offered in the dropdown.
# NOTE: joblib.load unpickles its input, so only load files you trust.
pipeline = joblib.load("pipeline.joblib")
label_pipeline = joblib.load("label_pipeline.joblib")
cities = joblib.load("cities.joblib")

def predict(city, location, area, bedrooms, baths):
    """Predict the price of a single listing.

    Args:
        city: City name (one of the dropdown choices).
        location: Free-text location within the city.
        area: Area, passed to the model as ``Area_in_Marla``.
        bedrooms: Number of bedrooms.
        baths: Number of bathrooms.

    Returns:
        The predicted price as an ``int`` (fractional part truncated), after
        undoing the label transform.

    Raises:
        gr.Error: If any numeric input is missing, so the UI shows a readable
            message instead of an opaque failure from inside the pipeline.
    """
    # An emptied numeric field may arrive as None; the scalers cannot handle it.
    if any(value is None for value in (area, bedrooms, baths)):
        raise gr.Error("Please provide the area, bedrooms and baths.")

    # Keys must match the feature names the pipeline was fitted on.
    sample = {
        "city": city,
        "location": location or "",  # treat a missing location as empty text
        "Area_in_Marla": area,
        "bedrooms": bedrooms,
        "baths": baths,
    }

    scaled_price = pipeline.predict(pd.DataFrame([sample]))
    # inverse_transform expects a 2-D (n_samples, 1) array.
    price = label_pipeline.inverse_transform(scaled_price.reshape(-1, 1))
    return int(price[0][0])

# https://www.gradio.app/guides
# UI definition: one input component per `predict` argument, a numeric output,
# and a button that wires them together.
with gr.Blocks() as blocks:
    city = gr.Dropdown(cities, value=cities[0], label="City")
    location = gr.Textbox(label="Location")
    area = gr.Number(label="Area", value=1, minimum=0.5, step=0.5)
    bedrooms = gr.Slider(label="Bedrooms", minimum=0, maximum=10, step=1)
    baths = gr.Slider(label="Baths", minimum=0, maximum=10, step=1)
    price = gr.Number(label="Price")

    # Order must match the positional parameters of `predict`.
    inputs = [city, location, area, bedrooms, baths]
    outputs = [price]

    predict_btn = gr.Button("Predict")
    predict_btn.click(predict, inputs=inputs, outputs=outputs)

# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    blocks.launch() # Local machine only
    # blocks.launch(server_name="0.0.0.0") # LAN access to local machine
    # blocks.launch(share=True) # Public access to local machine

Writing app.py


In [23]:
# Execute app.py as a script inside this kernel, which starts the Gradio server.
%run app.py

  from .autonotebook import tqdm as notebook_tqdm


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


In [None]:
!pip freeze > requirements.txt

: 

In [None]:
# !git clone xxx
# OR
# !git init
# !git remote add origin xxx

# !git add app.py pipeline.joblib label_pipeline.joblib cities.joblib
# !git config --local user.name "First Last"
# !git config --local user.email "first@last.com"
# !git commit -m "Add artifacts"
# !git push origin master

: 