In [356]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

df = pd.read_csv("OFFICIAL_CSV_Labeling_dataset_facebook-groups-scraper_raw_SVM_PREDICTIONS.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Column1                            1000 non-null   int64  
 1   Supply(Selling)_or_Demand(Buying)  1000 non-null   object 
 2   text                               937 non-null    object 
 3   title                              133 non-null    object 
 4   url                                1000 non-null   object 
 5   legacyId                           1000 non-null   float64
 6   time                               1000 non-null   object 
 7   commentsCount                      1000 non-null   int64  
 8   likesCount                         1000 non-null   int64  
 9   sharesCount                        1000 non-null   int64  
 10  topComments/0/text                 457 non-null    object 
 11  topComments/1/likesCount           189 non-null    float6

In [357]:
#keep only the extracted features:
df = df[['Supply(Selling)_or_Demand(Buying)','rent', 'fromDate', 'toDate', 'location', 'bedrooms', 'bathrooms']].copy()

In [358]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Supply(Selling)_or_Demand(Buying)  1000 non-null   object
 1   rent                               702 non-null    object
 2   fromDate                           851 non-null    object
 3   toDate                             454 non-null    object
 4   location                           566 non-null    object
 5   bedrooms                           499 non-null    object
 6   bathrooms                          416 non-null    object
dtypes: object(7)
memory usage: 54.8+ KB


In [359]:
# Cleaning the data
# Remove spaces
# Remove symbols: $,-<
# Change Not found/anomalies into empty fields
df["rent"] = df["rent"].str.replace("$","")
df["rent"] = df["rent"].str.replace(",","")
df["rent"] = df["rent"].str.replace("-","")
df["rent"] = df["rent"].str.replace(" ","")
df["rent"] = df["rent"].str.replace("<","")
df["rent"] = df["rent"].str.replace("Notfound","")
df["rent"] = df["rent"].str.replace("NotFound","")
df["rent"] = df["rent"].str.replace("Unknown","")
df["rent"] = df["rent"].str.replace("notfound","")
df["rent"] = df["rent"].str.replace("unavailable","")
df["rent"] = df["rent"].str.replace("800to1000","800")
df["rent"] = df["rent"].str.replace("Under1000","1000")
df["rent"] = df["rent"].str.replace("8001k","800")

# Replace empty lines with nan
df = df.replace(r'^\s*$', np.nan, regex=True)
# Replace nan with 0
df["rent"] = df["rent"].fillna(0)
# Convert to float data type
df.rent = df.rent.astype(float)

In [360]:
# Check dataset
df.isnull().sum()

Supply(Selling)_or_Demand(Buying)      0
rent                                   0
fromDate                             149
toDate                               546
location                             434
bedrooms                             501
bathrooms                            584
dtype: int64

In [361]:
# Empty location fields can be filled with San Jose, CA since the search radius was relatively small
df.location = df.location.fillna('San Jose, CA')

In [362]:
# Fill NaN values with 1
# A listing assumes there is at least 1 bedroom/bathroom available
df.bedrooms = df.bedrooms.fillna('1')

In [363]:
df.bathrooms = df.bathrooms.fillna('1')

In [364]:
print(df)

    Supply(Selling)_or_Demand(Buying)    rent    fromDate      toDate  \
0                              Supply     0.0  04/01/2024  05/01/2024   
1                              Supply  1280.0  06/01/2024  08/01/2024   
2                              Demand  1250.0  07/27/2024  08/01/2024   
3                              Supply  1220.0  04/27/2024         NaN   
4                              Supply  1000.0  03/01/2024  03/01/2025   
..                                ...     ...         ...         ...   
995                            Supply     0.0  04/27/2024  06/01/2023   
996                            Demand     0.0  05/28/2024  08/18/2024   
997                            Supply  1000.0  06/01/2023  12/01/2023   
998                            Supply  1800.0  04/27/2024         NaN   
999                           Unknown     0.0         NaN         NaN   

                                     location bedrooms bathrooms  
0                                San Jose, CA        2  

In [365]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Supply(Selling)_or_Demand(Buying)  1000 non-null   object 
 1   rent                               1000 non-null   float64
 2   fromDate                           851 non-null    object 
 3   toDate                             454 non-null    object 
 4   location                           1000 non-null   object 
 5   bedrooms                           1000 non-null   object 
 6   bathrooms                          1000 non-null   object 
dtypes: float64(1), object(6)
memory usage: 54.8+ KB


In [366]:
print(df)

    Supply(Selling)_or_Demand(Buying)    rent    fromDate      toDate  \
0                              Supply     0.0  04/01/2024  05/01/2024   
1                              Supply  1280.0  06/01/2024  08/01/2024   
2                              Demand  1250.0  07/27/2024  08/01/2024   
3                              Supply  1220.0  04/27/2024         NaN   
4                              Supply  1000.0  03/01/2024  03/01/2025   
..                                ...     ...         ...         ...   
995                            Supply     0.0  04/27/2024  06/01/2023   
996                            Demand     0.0  05/28/2024  08/18/2024   
997                            Supply  1000.0  06/01/2023  12/01/2023   
998                            Supply  1800.0  04/27/2024         NaN   
999                           Unknown     0.0         NaN         NaN   

                                     location bedrooms bathrooms  
0                                San Jose, CA        2  

In [367]:
# Convert non-numerical fields into numerical only
# "Not found" was converted to 1 as a listing assumes there is at least 1 bedroom available
# For a range of bedrooms, the higher value was chosen
df["bedrooms"] = df["bedrooms"].str.replace("Not found","1")
df["bedrooms"] = df["bedrooms"].str.replace("Shared","0.5")
df["bedrooms"] = df["bedrooms"].str.replace("2-3","3")
df["bedrooms"] = df["bedrooms"].str.replace("double occupancy","2")
df["bedrooms"] = df["bedrooms"].str.replace("1 single, 1 shared","2")
df["bedrooms"] = df["bedrooms"].str.replace("2 or 3","3")
df["bedrooms"] = df["bedrooms"].str.replace("3-4","4")
df["bedrooms"] = df["bedrooms"].str.replace("4-5","5")
df.bedrooms = df.bedrooms.astype(float)

In [368]:
# "Not found" was converted to 1 as a listing assumes there is at least 1 bathroom available
# For a range of bathrooms, the higher value was chosen
df["bathrooms"] = df["bathrooms"].str.replace("Not found","1")
df["bathrooms"] = df["bathrooms"].str.replace("1 private, shared","1.5")
df["bathrooms"] = df["bathrooms"].str.replace("1 or 2","2")
df["bathrooms"] = df["bathrooms"].str.replace("1-3","3")
df.bathrooms = df.bathrooms.astype(float)

In [369]:
#did not implement due to Excessive Rate Limiter (bot abuse prevention)

#from geopy.geocoders import Nominatim
#
#geolocator = Nominatim(user_agent="myApp")
#
#df[['latitude', 'longitude']] = df['location'].apply(
#    geolocator.geocode).apply(lambda x: pd.Series(
#        [x.latitude, x.longitude], index=['location_lat', 'location_long']))

In [370]:
print(df)

    Supply(Selling)_or_Demand(Buying)    rent    fromDate      toDate  \
0                              Supply     0.0  04/01/2024  05/01/2024   
1                              Supply  1280.0  06/01/2024  08/01/2024   
2                              Demand  1250.0  07/27/2024  08/01/2024   
3                              Supply  1220.0  04/27/2024         NaN   
4                              Supply  1000.0  03/01/2024  03/01/2025   
..                                ...     ...         ...         ...   
995                            Supply     0.0  04/27/2024  06/01/2023   
996                            Demand     0.0  05/28/2024  08/18/2024   
997                            Supply  1000.0  06/01/2023  12/01/2023   
998                            Supply  1800.0  04/27/2024         NaN   
999                           Unknown     0.0         NaN         NaN   

                                     location  bedrooms  bathrooms  
0                                San Jose, CA       2.

In [371]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Supply(Selling)_or_Demand(Buying)  1000 non-null   object 
 1   rent                               1000 non-null   float64
 2   fromDate                           851 non-null    object 
 3   toDate                             454 non-null    object 
 4   location                           1000 non-null   object 
 5   bedrooms                           1000 non-null   float64
 6   bathrooms                          1000 non-null   float64
dtypes: float64(3), object(4)
memory usage: 54.8+ KB


In [372]:
#drop buyer listings
df = df.drop(df[df['Supply(Selling)_or_Demand(Buying)']== 0].index)

#drop unknown listings
df = df.drop(df[df['Supply(Selling)_or_Demand(Buying)']== 2].index)

#only the supply listings remain
#drop column for supply/demand/unknown labels since we know that every entry is part of supply
df = df.drop(df.columns[0], axis=1)

In [373]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   rent       1000 non-null   float64
 1   fromDate   851 non-null    object 
 2   toDate     454 non-null    object 
 3   location   1000 non-null   object 
 4   bedrooms   1000 non-null   float64
 5   bathrooms  1000 non-null   float64
dtypes: float64(3), object(3)
memory usage: 47.0+ KB


In [374]:
df_train = df[["rent", "fromDate", "toDate", "location", "bedrooms", "bathrooms"]]

In [375]:
# Encode category type columns into numerical type
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

df_categories = ['location', 'toDate', 'fromDate']

for column in df_categories:
    df_train[column] = label_encoder.fit_transform(df_train[column])

In [382]:
# Normalize the numerical features
numerical_features = ['rent', 'bedrooms', 'toDate', 'fromDate', 'bathrooms']

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_train[numerical_features] = scaler.fit_transform(df_train[numerical_features])

In [383]:
# Use the Nearest Neighbors model
nn_model = NearestNeighbors(metric='euclidean')
nn_model.fit(df_train)

# Disable warning about not having feature names
nn_model.feature_names_in_ = None

In [384]:
def recommend_similar_listings(index, num_recommendations=5):
    distances, indexes = nn_model.kneighbors(df_train.iloc[index].values.reshape(1, -1), n_neighbors=num_recommendations+1) #including the initial interest for NN calc
    recommended_indexes = indexes.squeeze()[1:] #only want to keep the indexes, exclude the initial interest
    interested_listing  =  df_train.iloc[[index]]
    recommended_listings = df_train.iloc[recommended_indexes]
    mse = np.mean(distances ** 2)
    #rmse = np.sqrt(mse)
    
    #sum the mse values for averaging later
    global mseSum
    mseSum += mse

    return print("--------------------------------------------------------------------\nYour Interested Listing Index #", x, "\n", interested_listing, "\n\nHere are", num_recommendations, "similar listings as recommendation:\n", recommended_listings, "\n\nMean Squared Error between interested and recommended listings: ", mse, "\n")

In [385]:
def myFunction(x):
    index = x # Index x in the dataframe
    recommend_similar_listings(index)

In [386]:
# Get the number of rows in the DataFrame
num_rows = len(df_train)
mseSum = 0

# Loop this function for every row in the dataframe to find five recommendations for each index
# Calculates mean squared error after every iteration

for x in range(num_rows):
    myFunction(x)

--------------------------------------------------------------------
Your Interested Listing Index # 0 
        rent  fromDate    toDate  location  bedrooms  bathrooms
0 -0.197975 -1.131042 -1.968106       171  0.536416   0.955185 

Here are 5 similar listings as recommendation:
          rent  fromDate    toDate  location  bedrooms  bathrooms
20  -0.197628 -1.176041 -2.111773       171  0.536416   0.955185
942 -0.197241 -0.996043 -1.704715       171  0.536416   0.955185
146 -0.197375 -1.423539 -2.423054       171  0.536416   0.955185
135 -0.197218 -1.378539 -1.465268       171  0.536416   0.955185
708 -0.197388 -0.501048 -1.297656       171  0.536416   0.955185 

Mean Squared Error between interested and recommended listings:  0.260549197078215 

--------------------------------------------------------------------
Your Interested Listing Index # 1 
        rent  fromDate    toDate  location  bedrooms  bathrooms
1 -0.197349 -0.636046 -1.034265        69  0.536416    2.41126 

Here are 

In [387]:
#divides the sum of all mse values and divides by the number of rows to give the average mse from all runs
avg_mse = mseSum / num_rows
print("Average Mean Squared Error: ", avg_mse)

Average Mean Squared Error:  2.616198851668428
