In [1]:
# !pip install numpy==1.23.1
# !pip install pandas==1.4.3
# !pip install matplotlib==3.3.2
# !pip install seaborn==0.11.0
# !pip install joblib==1.1.0
# !pip install nltk==3.7
# !pip install wordcloud==1.8.2.2
# !pip install scikit_learn==1.0.2
# !pip install scipy==1.9.0

# Contents:

I. [Loading the Data:](#Loading-the-Data:)

II. [Helper functions to prepare the data:](#Helper-functions-to-prepare-the-data:)

III. [Test Inference:](#Test-Inference:)

## Loading the Data:

([Contents:](#Contents:))

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

import matplotlib.pyplot as plt
import seaborn as sns

import os
import time 
import re

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn import metrics

from scipy import stats

import requests
import pickle
import joblib

import re
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

## Helper functions to prepare the data:

([Contents:](#Contents:))

In [3]:
# average property area (feature cleaning)
def avg_property_area(x):
    """
    Turn a free-text area value into a single number.

    x: a string such as "492" or "1000 to 1200"

    return: the number itself when exactly one number is found, the mean of the
            two when exactly two are found, and the sentinel -99 otherwise
            (no number, or more than two).

    The capture group of the pattern excludes the optional sign, so a leading
    "+" or "-" is not part of the parsed value.
    """
    numbers = re.compile(r"[-+]?(\d*\.\d+|\d+)")
    matches = numbers.findall(x)
    # Builtin float instead of the np.float alias, which current NumPy no longer provides.
    if len(matches) == 1:
        return float(matches[0])
    if len(matches) == 2:
        return (float(matches[0]) + float(matches[1])) / 2
    return -99
    
# Outlier treatment
def clip_outliers(df,col):
    """
    Clip one column of `df` to its own 25th and 95th percentiles.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    column = df[col]
    floor = column.quantile(0.25)
    ceiling = column.quantile(0.95)
    df[col] = column.clip(lower=floor, upper=ceiling)
    return df

# Text cleaning
# Pre-compiled patterns for preprocessing the text data.
# Raw strings keep the regex escapes (\[ \] \|) away from Python's own string
# escape processing, which otherwise warns about invalid escape sequences.
# The compiled patterns are the same as before.
REPLACE_BY_SPACE_RE = re.compile(r"[/(){}\[\]\|@,;!]")  # punctuation that is turned into a space
BAD_SYMBOLS_RE = re.compile(r"[^0-9a-z #+_]")  # everything outside digits, a-z, space, '#', '+', '_'
# Start from NLTK's English stopword list; the custom words below are merged in afterwards.
STOPWORDS_nlp = set(stopwords.words('english'))

# Custom stoplist: extra words to drop in addition to the NLTK list.
# Several entries appear more than once; that is harmless because the list is
# only used to update a set.
stoplist = ["i","project","living","home",'apartment',"pune","me","my","myself","we","our","ours","ourselves","you","you're","you've","you'll","you'd","your",
            "yours","yourself","yourselves","he","him","his","himself","she","she's","her","hers","herself","it",
            "it's","its","itself","they","them","their","theirs","themselves","what","which","who","whom","this","that","that'll",
            "these","those","am","is","are","was","were","be","been","being","have","has","had","having","do","does","did",
            "doing","a","an","the","and","but","if","or","because","as","until","while","of","at","by","for","with","about",
            "against","between","into","through","during","before","after","above","below","to","from","up","down","in","out",
            "on","off","over","under","again","further","then","once","here","there","when","where","why","all","any",
            "both","each","few","more","most","other","some","such","no","nor","not","only","own","same","so","than","too",
            "very","s","t","can","will","just","don","don't","should","should've","now","d","ll","m","o","re","ve","y","ain",
            "aren","couldn","didn","doesn","hadn","hasn",
            "haven","isn","ma","mightn","mustn","needn","shan","shan't",
            "shouldn","wasn","weren","won","rt","rt","qt","for",
            "the","with","in","of","and","its","it","this","i","have","has","would","could","you","a","an",
            "be","am","can","edushopper","will","to","on","is","by","ive","im","your","we","are","at","as","any","ebay","thank","hello","know",
            "need","want","look","hi","sorry","http", "https","body","dear","hello","hi","thanks","sir","tomorrow","sent","send","see","there","welcome","what","well","us"]

# Merge the custom words into the stopword set used by the text helpers.
STOPWORDS_nlp.update(stoplist)

# Function to preprocess the text
def text_prepare(text):
    """
        text: a string

        return: the cleaned string - mentions/URLs removed, lowercased, digits
                removed, punctuation and other symbols replaced by spaces, and
                stopwords (STOPWORDS_nlp) dropped; words are joined by single spaces
    """
    # str.replace matches literally, so this only affects the literal character
    # sequence backslash, 'd', '+'. It does NOT remove digits (that happens below).
    # Written as a raw string to avoid an invalid escape sequence; kept so the
    # function's output is unchanged.
    text = text.replace(r"\d+", " ")
    # Removing mentions and urls. This runs before lowercasing, so the pattern
    # only matches a lowercase "http"/"https" scheme.
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = text.lower()  # lowercase text
    # Removing digits: they are deleted, not replaced by a space, so the
    # characters on either side are joined (e.g. "3bhk" -> "bhk").
    text = re.sub(r"[0-9]+", "", text)
    text = REPLACE_BY_SPACE_RE.sub(" ", text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub(" ", text)  # replace symbols matched by BAD_SYMBOLS_RE by space
    text = ' '.join([word for word in text.split() if word not in STOPWORDS_nlp])  # delete stopwords from text
    text = text.strip()
    return text

# Pos counter
def pos_counter(x,pos):
    """
    Count how many tokens of `x` carry exactly the part-of-speech tag `pos`.

    The text is lowercased and tokenized, tokens found in STOPWORDS_nlp are
    discarded, and the remaining tokens are tagged with nltk.pos_tag.
    A tag that never occurs yields 0.

    Example tags:
    NN - Noun
    VB - Verb
    JJ - Adjective
    RB - Adverb
    """
    kept_tokens = []
    for token in nltk.word_tokenize(x.lower()):
        if token not in STOPWORDS_nlp:
            kept_tokens.append(token)

    tagged = nltk.pos_tag(nltk.Text(kept_tokens))

    tag_counts = Counter()
    for _word, tag in tagged:
        tag_counts[tag] += 1
    return tag_counts[pos]

In [4]:
def preprocess(df):
    """
    Clean the raw property columns and return only the cleaned feature columns.

    df: DataFrame with the raw columns read below ('Location', 'Propert Type',
        'Sub-Area', 'Company Name', 'TownShip Name/ Society Name', 'Description',
        the seven yes/no amenity columns and 'Property Area in Sq. Ft.').
        The frame passed in is not modified.

    return: a new DataFrame with 'Country' and the '... Cleaned' columns, with
            rows containing any missing value dropped. Yes/no values other than
            'yes'/'no' become missing and are therefore dropped.

    raises: IndexError if a 'Location' value has fewer than three
            comma-separated parts.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    df = df.copy()

    def _clean_text(value):
        # Lowercase and trim surrounding whitespace.
        return value.lower().strip()

    # Country is the third comma-separated part of Location.
    df['Country'] = df['Location'].apply(lambda x: _clean_text(x.split(',')[2]))

    # First number found in the property type (kept as the matched string);
    # the integer 0 is used when no number is present.
    numbers = re.compile(r"[-+]?(\d*\.\d+|\d+)")

    def _first_number(value):
        found = numbers.findall(value)
        return found[0] if found else 0

    df['Property Type Cleaned'] = df['Propert Type'].apply(_first_number)

    # Cleaning the text columns
    for col in ['Sub-Area', 'Company Name', 'TownShip Name/ Society Name', 'Description']:
        df[col + ' Cleaned'] = df[col].apply(_clean_text)

    # Cleaning and encoding binary features (source column -> cleaned column).
    # The school column name carries a trailing space in the source data.
    binary_columns = {
        'ClubHouse': 'ClubHouse Cleaned',
        'School / University in Township ': 'School / University in Township Cleaned',
        'Hospital in TownShip': 'Hospital in TownShip Cleaned',
        'Mall in TownShip': 'Mall in TownShip Cleaned',
        'Park / Jogging track': 'Park / Jogging track Cleaned',
        'Swimming Pool': 'Swimming Pool Cleaned',
        'Gym': 'Gym Cleaned',
    }
    for source, target in binary_columns.items():
        df[target] = df[source].apply(_clean_text).map({'yes': 1, 'no': 0})

    # Cleaning numerical columns
    df['Property Area in Sq. Ft. Cleaned'] = df['Property Area in Sq. Ft.'].apply(lambda x: avg_property_area(str(x)))

    # Selecting the required columns by name rather than by position, so the
    # result does not depend on how many columns the input frame has.
    features = ['Country',
                'Property Type Cleaned',
                'Sub-Area Cleaned',
                'Company Name Cleaned',
                'TownShip Name/ Society Name Cleaned',
                'Description Cleaned',
                'ClubHouse Cleaned',
                'School / University in Township Cleaned',
                'Hospital in TownShip Cleaned',
                'Mall in TownShip Cleaned',
                'Park / Jogging track Cleaned',
                'Swimming Pool Cleaned',
                'Gym Cleaned',
                'Property Area in Sq. Ft. Cleaned']
    df_final = df[features].dropna()

    return df_final

In [5]:
def _load_artifact(file_name, artifact_dir='.'):
    """Unpickle one saved artifact from `artifact_dir`."""
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # artifacts from a trusted source.
    with open(os.path.join(artifact_dir, file_name), 'rb') as f:
        return pickle.load(f)


def create_features(df, artifact_dir='.'):
    """
    Build the model-ready feature frame from a preprocessed frame.

    df: DataFrame holding the cleaned columns used below ('Property Area in
        Sq. Ft. Cleaned', 'Sub-Area Cleaned', the seven amenity columns and
        'Description Cleaned'). The frame passed in is not modified.
    artifact_dir: directory containing the pickled artifacts
        (sub_area_price_map.pkl, amenities_score_price_map.pkl,
        count_vectorizer.pkl, raw_features_mapping.pkl, features.pkl).
        Defaults to the current working directory.

    return: a new DataFrame with the columns listed in features.pkl
            (without 'Price_in_lakhs'), after renaming via raw_features_mapping.pkl.
    """
    # Work on a copy: clip_outliers and the assignments below write to the frame.
    df = df.copy()

    # Outlier treatment on the numeric columns. The clipping bounds are
    # computed from the frame passed in.
    cols_to_treat = ['Property Area in Sq. Ft. Cleaned']
    for col in cols_to_treat:
        df = clip_outliers(df, col)

    # creating the price by sub-area feature
    sub_area_price_map = _load_artifact('sub_area_price_map.pkl', artifact_dir)
    df['Price by sub-area'] = df['Sub-Area Cleaned'].map(sub_area_price_map)

    # Adding the amenities score feature: row-wise sum of the 0/1 amenity flags.
    amenities_col = ['ClubHouse Cleaned',
                     'School / University in Township Cleaned',
                     'Hospital in TownShip Cleaned',
                     'Mall in TownShip Cleaned',
                     'Park / Jogging track Cleaned',
                     'Swimming Pool Cleaned',
                     'Gym Cleaned']
    df['Amenities score'] = df[amenities_col].sum(axis=1)

    # creating the price by amenities score feature
    amenities_score_price_map = _load_artifact('amenities_score_price_map.pkl', artifact_dir)
    df['Price by Amenities score'] = df['Amenities score'].map(amenities_score_price_map)

    # cleaning the description column and creating pos features
    df['Description Cleaned'] = df['Description Cleaned'].astype(str).apply(text_prepare)
    df['Noun_Counts'] = df['Description Cleaned'].apply(lambda x: pos_counter(x, 'NN'))
    # Verb_Counts is the sum of the 'VB' and 'RB' tag counts.
    df['Verb_Counts'] = df['Description Cleaned'].apply(lambda x: (pos_counter(x, 'VB') + pos_counter(x, 'RB')))
    df['Adjective_Counts'] = df['Description Cleaned'].apply(lambda x: pos_counter(x, 'JJ'))

    # Ngram features from the fitted vectorizer
    cv_object = _load_artifact('count_vectorizer.pkl', artifact_dir)
    X = cv_object.transform(df['Description Cleaned'])
    # get_feature_names() is gone from newer scikit-learn releases; prefer
    # get_feature_names_out() and fall back for older ones.
    if hasattr(cv_object, 'get_feature_names_out'):
        ngram_names = cv_object.get_feature_names_out()
    else:
        ngram_names = cv_object.get_feature_names()
    df_ngram = pd.DataFrame(X.toarray(), columns=ngram_names)

    # Adding this to the main dataframe; both indexes are reset so rows line up by position.
    df_final = pd.concat([df.reset_index(drop=True), df_ngram.reset_index(drop=True)], axis=1)

    # selecting the final model ready features
    feature_mapping = _load_artifact('raw_features_mapping.pkl', artifact_dir)
    feature_list = _load_artifact('features.pkl', artifact_dir)

    # Removing price column as it is not available in test data
    # (filtering does not fail when the name is absent from the list).
    feature_list = [name for name in feature_list if name != 'Price_in_lakhs']

    df_final = df_final.rename(columns=feature_mapping)
    df_final = df_final[feature_list]

    return df_final

## Test Inference:

([Contents:](#Contents:))

In [6]:
# Loading the data
# Read the workbook and discard both price columns in one chained step.
data = pd.read_excel('../real_state_/data/Pune Real Estate Data.xlsx').drop(
    columns=['Price in Millions','Price in lakhs'])
print(data.shape)
data.head()

(200, 16)


Unnamed: 0,Sr. No.,Location,Sub-Area,Propert Type,Property Area in Sq. Ft.,Company Name,TownShip Name/ Society Name,Total TownShip Area in Acres,ClubHouse,School / University in Township,Hospital in TownShip,Mall in TownShip,Park / Jogging track,Swimming Pool,Gym,Description
0,1,"Pune, Maharashtra, India",Bavdhan,1 BHK,492,Shapoorji Paloonji,Vanaha,1000.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Shapoorji Paloonji comunity located in the sub...
1,2,"Pune, Maharashtra, India",Bavdhan,2 BHK,774,Shapoorji Paloonji,Vanaha,1000.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Vanaha Township located near the lonavala hill...
2,3,"Pune, Maharashtra, India",Bavdhan,3 BHK,889,Shapoorji Paloonji,Vanaha,1000.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Vanaha Society is suitable for all aged group ...
3,4,"Pune, Maharashtra, India",Bavdhan,3 BHK Grand,1018,Shapoorji Paloonji,Vanaha,1000.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Vanaha township are offering 3BHK grand prpoer...
4,5,"Pune, Maharashtra, India",Mahalunge,2BHK,743,Godrej Properties,Godrej Hills retreat,100.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,The area is a hub of prestigious schools like ...


In [7]:
# Clean the raw columns; the result holds only the cleaned feature columns.
df_preprocess = preprocess(data)

In [8]:
# Shape and first rows of the preprocessed frame.
print(df_preprocess.shape)
df_preprocess.head()

(200, 14)


Unnamed: 0,Country,Property Type Cleaned,Sub-Area Cleaned,Company Name Cleaned,TownShip Name/ Society Name Cleaned,Description Cleaned,ClubHouse Cleaned,School / University in Township Cleaned,Hospital in TownShip Cleaned,Mall in TownShip Cleaned,Park / Jogging track Cleaned,Swimming Pool Cleaned,Gym Cleaned,Property Area in Sq. Ft. Cleaned
0,india,1,bavdhan,shapoorji paloonji,vanaha,shapoorji paloonji comunity located in the sub...,1,1,1,1,1,1,1,492.0
1,india,2,bavdhan,shapoorji paloonji,vanaha,vanaha township located near the lonavala hill...,1,1,1,1,1,1,1,774.0
2,india,3,bavdhan,shapoorji paloonji,vanaha,vanaha society is suitable for all aged group ...,1,1,1,1,1,1,1,889.0
3,india,3,bavdhan,shapoorji paloonji,vanaha,vanaha township are offering 3bhk grand prpoer...,1,1,1,1,1,1,1,1018.0
4,india,2,mahalunge,godrej properties,godrej hills retreat,the area is a hub of prestigious schools like ...,1,1,1,1,1,1,1,743.0


In [9]:
# Build the model-ready feature frame (reads the pickled artifacts from disk).
df_features = create_features(df_preprocess)


In [10]:
# Shape and first rows of the final feature frame.
print(df_features.shape)
df_features.head()

(200, 25)


Unnamed: 0,PropertyType,ClubHouse,School_University_in_Township,Hospital_in_TownShip,Mall_in_TownShip,Park_Jogging_track,Swimming_Pool,Gym,Property_Area_in_Sq_Ft,Price_by_sub_area,Amenities_score,Price_by_Amenities_score,Noun_Counts,Verb_Counts,Adjective_Counts,boasts_elegant,elegant_towers,every_day,great_community,mantra_gold,offering_bedroom,quality_specification,stories_offering,towers_stories,world_class
0,1,1,1,1,1,1,1,1,670.0,58.044,7,72.666667,9,1,3,0,0,0,0,0,0,0,0,0,0
1,2,1,1,1,1,1,1,1,774.0,58.044,7,72.666667,9,1,3,0,0,0,0,0,0,0,0,0,0
2,3,1,1,1,1,1,1,1,889.0,58.044,7,72.666667,9,1,3,0,0,0,0,0,0,0,0,0,0
3,3,1,1,1,1,1,1,1,1018.0,58.044,7,72.666667,8,1,3,0,0,0,0,0,0,0,0,0,0
4,2,1,1,1,1,1,1,1,743.0,73.555556,7,72.666667,12,1,6,0,0,0,0,0,0,0,0,0,0


In [11]:
# Feature names in the order they appear in the frame; show the first five.
columns = df_features.columns.tolist()
columns[:5]

['PropertyType',
 'ClubHouse',
 'School_University_in_Township',
 'Hospital_in_TownShip',
 'Mall_in_TownShip']

In [12]:
# Take row 3 of the feature frame as a plain dict (its values may still be NumPy scalars).
payload = df_features.iloc[3].to_dict()
# json.dumps cannot serialise NumPy scalar types such as int64, even when they
# sit inside another object like a dictionary; it fails with
# "TypeError: Object of type int64 is not JSON serializable".
import json
# Define a JSONEncoder subclass to avoid that.
class NpEncoder(json.JSONEncoder):
    """
    JSON encoder that converts NumPy values to their plain Python equivalents.

    NumPy booleans, integers and floats become bool / int / float, and arrays
    become (nested) lists. Anything else is passed to the base class, which
    raises TypeError for unsupported objects.
    """
    def default(self, obj):
        # np.bool_ is neither np.integer nor np.floating, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
    
# Serialise the row dict to a JSON string; NpEncoder handles the NumPy values.
payload = json.dumps(payload,cls=NpEncoder)

In [13]:
# Inspect the JSON string that will be sent to the service.
payload

'{"PropertyType": "3", "ClubHouse": 1, "School_University_in_Township": 1, "Hospital_in_TownShip": 1, "Mall_in_TownShip": 1, "Park_Jogging_track": 1, "Swimming_Pool": 1, "Gym": 1, "Property_Area_in_Sq_Ft": 1018.0, "Price_by_sub_area": 58.044000000000004, "Amenities_score": 7, "Price_by_Amenities_score": 72.66666666666667, "Noun_Counts": 8, "Verb_Counts": 1, "Adjective_Counts": 3, "boasts_elegant": 0, "elegant_towers": 0, "every_day": 0, "great_community": 0, "mantra_gold": 0, "offering_bedroom": 0, "quality_specification": 0, "stories_offering": 0, "towers_stories": 0, "world_class": 0}'

In [14]:
# Send one row (index 3) to the prediction service.
payload = df_features.iloc[3].to_dict()
payload = json.dumps(payload,cls=NpEncoder)

# requests has no default timeout, so without one a stalled connection would
# block this cell indefinitely.
out =  requests.post(url='https://property-price-prediction-live.herokuapp.com/predict',
                data=payload, timeout=30)
# Fail here on an HTTP error status instead of carrying an error body forward.
out.raise_for_status()

In [15]:
# Raw response body returned by the service for the single row.
out.text

'"[85.55245708]"'

In [16]:
# Score every row of df_features, one request per row.
PREDICT_URL = 'https://property-price-prediction-live.herokuapp.com/predict'

output = []
for i in range(len(df_features)):
    payload = df_features.iloc[i].to_dict()
    payload = json.dumps(payload,cls=NpEncoder)

    # Bounded wait per request; an HTTP error status raises instead of being
    # parsed as if it were a prediction.
    out =  requests.post(url=PREDICT_URL, data=payload, timeout=30)
    out.raise_for_status()

    # The body looks like '"[85.55245708]"': strip the quotes and brackets.
    # '+' and '-' are kept so a negative value or an exponent sign survives.
    # Builtin float replaces the np.float alias that newer NumPy no longer has.
    result = float(re.sub(r'[^A-Za-z0-9.+-]+', '', out.text))
    output.append(result)

In [17]:
# Display the list of predictions, one per row of df_features.
output

[41.73696173,
 57.70878214,
 74.58849812,
 85.55245708,
 66.79123803,
 84.59893199,
 77.66213533,
 108.16028495,
 78.39072133,
 109.57364513,
 159.10621088,
 176.79921586,
 173.62768353,
 82.30834029,
 118.12537567,
 50.24301879,
 57.96254756,
 79.05972822,
 52.32957137,
 69.86078406,
 74.73422689,
 103.96509677,
 38.42625687,
 62.09826553,
 52.03298601,
 56.40783384,
 67.41405046,
 88.43816135,
 71.24488624,
 73.88559514,
 83.5413806,
 84.88514865,
 103.69906684,
 102.73270284,
 115.79059498,
 61.73645075,
 106.18832373,
 85.2035976,
 119.28481463,
 42.50511413,
 74.52375717,
 143.78489959,
 30.68352346,
 37.17655204,
 41.66563326,
 84.43268267,
 87.32860919,
 115.21966742,
 109.0564247,
 123.23834368,
 168.31854671,
 42.84774364,
 51.65022681,
 55.37759299,
 65.17603304,
 60.00438345,
 48.02535026,
 58.46073342,
 64.29291026,
 56.84451424,
 86.81352401,
 42.71811613,
 53.57938083,
 71.38925869,
 68.10531012,
 89.62387405,
 80.54255645,
 198.02413568,
 206.04585597,
 71.86346632,
 75.

In [18]:
# data= pd.read_excel(r'../data/Pune Real Estate Data.xlsx')
# data.head()

In [19]:
def get_prediction_interval(interval_estimate, prediction):
    '''
    Build a symmetric interval around a single prediction.

    INPUTS:
        - interval_estimate: half-width of the interval; it is subtracted from
          and added to the prediction
        - prediction: the predicted value the interval is centred on
    OUTPUT:
        - (lower, upper) tuple for that prediction
    '''
    return prediction - interval_estimate, prediction + interval_estimate

In [20]:
# pickle is already imported at the top of the notebook; this repeat is harmless.
import pickle
# NOTE(review): the recorded run of this cell raised FileNotFoundError for this
# path - confirm where interval_est.pkl lives relative to the notebook.
fileName = 'model/interval_est.pkl'
with open(fileName,'rb') as f:
    interval = pickle.load(f)

# Display the loaded interval estimate.
interval

FileNotFoundError: [Errno 2] No such file or directory: 'model/interval_est.pkl'

In [None]:
# getting prediction intervals for the test data
lower_vet = []
upper_vet = []

# The loop variable is named `prediction` so it does not overwrite `out`,
# which holds the HTTP response from the scoring cells.
for prediction in output:
    lower, upper = get_prediction_interval(interval, prediction)
    lower_vet.append(lower)
    upper_vet.append(upper)

In [None]:
# One row per prediction: interval bounds next to the point prediction (column 'mean').
pd.DataFrame(zip(lower_vet,upper_vet,output),columns=['lower','upper','mean'])