# Imports

In [1]:
import pandas as pd
import numpy as np

# Column -> dtype schema passed to pd.read_csv for the train/test CSVs.
# Each dtype object is aliased once so every entry reads as "column=kind".
# String, integer, float and boolean columns use pandas extension dtypes;
# low-cardinality text columns are read as 'category'.
STRING_DTYPE = pd.StringDtype()
INT_DTYPE = pd.Int64Dtype()
FLOAT_DTYPE = pd.Float64Dtype()
BOOL_DTYPE = pd.BooleanDtype()
CATEGORY_DTYPE = 'category'

dtype_mapping = dict(
    propertyId=STRING_DTYPE,
    localityName=CATEGORY_DTYPE,
    landMarks=STRING_DTYPE,
    locality=STRING_DTYPE,
    price=INT_DTYPE,
    nameOfSociety=STRING_DTYPE,
    projectName=STRING_DTYPE,
    carpetArea=INT_DTYPE,
    coveredArea=INT_DTYPE,
    carpetAreaSqft=INT_DTYPE,
    possessionStatus=STRING_DTYPE,
    developerName=STRING_DTYPE,
    flooringType=STRING_DTYPE,
    floorNumber=INT_DTYPE,
    unitCountonFloor=INT_DTYPE,
    totalFloorNumber=INT_DTYPE,
    electricityStatus=STRING_DTYPE,
    waterStatus=STRING_DTYPE,
    longitude=FLOAT_DTYPE,
    latitude=FLOAT_DTYPE,
    transactionType=CATEGORY_DTYPE,
    facing=STRING_DTYPE,
    ownershipType=STRING_DTYPE,
    carParking=STRING_DTYPE,
    furnished=CATEGORY_DTYPE,
    bedrooms=INT_DTYPE,
    bathrooms=INT_DTYPE,
    numberOfBalconied=INT_DTYPE,
    propertyType=CATEGORY_DTYPE,
    additionalRooms=STRING_DTYPE,
    bookingAmountExact=INT_DTYPE,
    maintenanceChargesFrequency=CATEGORY_DTYPE,
    maintenanceCharges=INT_DTYPE,
    ageofcons=CATEGORY_DTYPE,
    isVerified=CATEGORY_DTYPE,
    listingTypeDesc=CATEGORY_DTYPE,
    premiumProperty=BOOL_DTYPE,
    noOfLifts=INT_DTYPE,
    propertyAmenities=STRING_DTYPE,
    facilitiesDesc=STRING_DTYPE,
    uuid=STRING_DTYPE,
    # One-hot style flags derived from flooringType.
    flooringType_Vitrified=BOOL_DTYPE,
    flooringType_CeramicTiles=BOOL_DTYPE,
    flooringType_Marble=BOOL_DTYPE,
    flooringType_NormalTilesKotahStone=BOOL_DTYPE,
    flooringType_Granite=BOOL_DTYPE,
    flooringType_Wooden=BOOL_DTYPE,
    flooringType_Mosaic=BOOL_DTYPE,
    flooringType_Marbonite=BOOL_DTYPE,
    # Flags for additional rooms.
    additionalRoom_PujaRoom=BOOL_DTYPE,
    additionalRoom_Study=BOOL_DTYPE,
    additionalRoom_Store=BOOL_DTYPE,
    additionalRoom_ServantRoom=BOOL_DTYPE,
    # Parking columns.
    carParking_Open=INT_DTYPE,
    carParking_Covered=INT_DTYPE,
    ReservedParking=BOOL_DTYPE,
)

# Columns removed from both CSV splits right after loading. Kept as one
# flat list, grouped by theme; the order matches the original definition.
COLUMNS_TO_DROP = [
    'coveredArea',
    'ReservedParking',

    # Listing / building attributes that are not used downstream.
    'unitCountonFloor',
    'electricityStatus',
    'waterStatus',
    'facing',
    'bookingAmountExact',
    'isVerified',
    'listingTypeDesc',
    'maintenanceCharges',
    'maintenanceChargesFrequency',
    'latitude',
    'longitude',
    'carParking_Open',
    'carParking_Covered',
    'numberOfBalconied',
    'premiumProperty',
    'projectName',
    'nameOfSociety',
    'url',
    'uuid',
    'carpetAreaSqft',
    'noOfLifts',
    'ownershipType',
    'possessionStatus',
    'propertyType',

    # Flooring flags.
    'flooringType_Vitrified',
    'flooringType_CeramicTiles',
    'flooringType_Marble',
    'flooringType_NormalTilesKotahStone',
    'flooringType_Granite',
    'flooringType_Wooden',
    'flooringType_Mosaic',
    'flooringType_Marbonite',

    # Additional-room flags.
    'additionalRoom_PujaRoom',
    'additionalRoom_Study',
    'additionalRoom_Store',
    'additionalRoom_ServantRoom',

    # Free-text location / developer fields.
    'landMarks',
    'locality',
    'developerName',
]

################################################################################
# ONLY USING THE RAW SETS, NOT THE IMPUTED SET
# (rows with any missing value are dropped instead of being imputed)
################################################################################
def _load_raw_split(csv_path):
    """Load one raw CSV split and reduce it to complete rows of the kept columns.

    The file is read with `dtype_mapping`, using its first column as the
    index; every column in `COLUMNS_TO_DROP` is removed, and then any row
    that still contains a missing value is discarded.
    """
    return (
        pd.read_csv(csv_path, dtype=dtype_mapping, index_col=0)
        .drop(columns=COLUMNS_TO_DROP)
        .dropna(axis=0)
    )


df_train = _load_raw_split('Data/train.csv')
df_test = _load_raw_split('Data/test.csv')

In [2]:
# Dtypes to enforce on the combined frame: nullable integers for the numeric
# features and 'category' for the categorical ones.
mapping = {
    **dict.fromkeys(
        ["carpetArea", "floorNumber", "totalFloorNumber", "bedrooms", "bathrooms"],
        pd.Int64Dtype(),
    ),
    **dict.fromkeys(
        ["localityName", "transactionType", "furnished", "ageofcons"],
        'category',
    ),
}

# Stack train on top of test, keeping the original index labels, then
# re-apply the dtypes to the stacked frame.
df = pd.concat([df_train, df_test], ignore_index=False)
df = df.astype(mapping)

In [7]:
########################################################################
# DROPPING LOCALITY FOR NEAREST NEIGHBORS
########################################################################

# df.drop(columns=['localityName'], inplace=True)

In [8]:
# Summarise the combined frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21392 entries, 74208793 to 75682303
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   localityName      21392 non-null  category
 1   price             21392 non-null  Int64   
 2   carpetArea        21392 non-null  Int64   
 3   floorNumber       21392 non-null  Int64   
 4   totalFloorNumber  21392 non-null  Int64   
 5   transactionType   21392 non-null  category
 6   furnished         21392 non-null  category
 7   bedrooms          21392 non-null  Int64   
 8   bathrooms         21392 non-null  Int64   
 9   ageofcons         21392 non-null  category
dtypes: Int64(6), category(4)
memory usage: 1.4 MB


# Linear Encoding for Nearest Neighbors

In [9]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np # For numerical operations if needed later

# 1. Feature groups, by type. "localityName" is intentionally left out of the
#    categorical group (and therefore out of everything derived from it).
numerical_cols = ["carpetArea", "bedrooms", "bathrooms", "floorNumber", "totalFloorNumber"]
categorical_cols = ["transactionType", "furnished", "ageofcons"]

# 2. Order in which the features are expected from the user: numerical
#    features first, then categorical ones. Each categorical feature is later
#    expanded into several columns by the OneHotEncoder.
user_input_order = [*numerical_cols, *categorical_cols]

# 3. Per-feature weights (currently all 1.0), applied *after* the initial
#    scaling/encoding. A weight on a categorical feature applies to every
#    one-hot column generated from it.
feature_weights = dict(
    carpetArea=1.0,
    bedrooms=1.0,
    bathrooms=1.0,
    floorNumber=1.0,
    totalFloorNumber=1.0,
    transactionType=1.0,
    furnished=1.0,
    ageofcons=1.0,
)

# --- Preprocessing ---

# Feature matrix: numerical columns followed by the categorical ones. 'price'
# is in neither list, so it is excluded. The frame's index (property IDs) is
# kept aside so the encoded rows can be labelled with it again.
features_to_encode = [*numerical_cols, *categorical_cols]
property_ids = df.index
X = df.loc[:, features_to_encode]

# One transformer per feature group:
# - numerical features are standard-scaled;
# - categorical features are one-hot encoded. handle_unknown="ignore" makes a
#   category that was not seen during fitting encode as all zeros for that
#   feature, and sparse_output=False yields a dense numpy array.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    categorical_cols,
)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="passthrough",  # no other columns are expected in X
    verbose_feature_names_out=False,  # numerical features keep their plain names
)

# Fit on the full feature matrix and encode it in one go.
print("Fitting and transforming data...")
X_encoded = preprocessor.fit_transform(X)
print(f"Data transformed. Shape: {X_encoded.shape}")

# Output column names: the numerical names are unchanged, and the one-hot
# columns are named '<feature>_<category>' (e.g. 'furnished_Unfurnished').
encoded_feature_names = preprocessor.get_feature_names_out()

# Wrap the encoded array in a DataFrame labelled with those names and indexed
# by property ID.
encoded_df = pd.DataFrame(
    data=X_encoded, index=property_ids, columns=encoded_feature_names
)

# --- Apply Weighting ---
print("Applying feature weights...")
weighted_df = encoded_df.copy()  # work on a copy; encoded_df stays unweighted

for feature_name, weight in feature_weights.items():
    # A weight of exactly 1.0 is a no-op, so the feature is skipped.
    if weight == 1.0:
        continue

    # Resolve the encoded columns that belong to this original feature:
    # a numerical feature maps to the single column of the same name, a
    # categorical one to every column named '<feature>_...' (or, as a
    # fallback, a column named exactly like the feature).
    if feature_name in numerical_cols:
        cols_to_weight = [feature_name]
    elif feature_name in categorical_cols:
        ohe_prefix = feature_name + "_"
        cols_to_weight = [
            col
            for col in encoded_feature_names
            if col == feature_name or col.startswith(ohe_prefix)
        ]
    else:
        cols_to_weight = []  # feature belongs to neither group

    if cols_to_weight:
        # Scale the matching columns by the feature's weight.
        weighted_df[cols_to_weight] = weighted_df[cols_to_weight] * weight
    else:
        print(f"Warning: No columns found for weighting feature '{feature_name}'")

# --- Reorder Columns to Match User Input Structure ---
# Walk the features in user-input order and expand each categorical feature
# into its generated one-hot columns.
print("Reordering columns...")
current_encoded_cols = weighted_df.columns.tolist()
final_column_order = []

for feature in user_input_order:
    if feature in numerical_cols:
        # Numerical features map to exactly one column.
        final_column_order.append(feature)
        continue
    if feature not in categorical_cols:
        # Features in neither group contribute no columns.
        continue

    # All columns generated from this categorical feature, in alphabetical
    # order so the layout is deterministic.
    generated_cols = sorted(
        col
        for col in current_encoded_cols
        if col == feature or col.startswith(feature + "_")
    )
    final_column_order.extend(generated_cols)

# Final vectors: the weighted columns, laid out in the order built above.
final_encoded_vectors = weighted_df.loc[:, final_column_order]

# --- Output ---
print("\n--- Final Encoded Vectors ---")
print(f"Shape: {final_encoded_vectors.shape}")
print("Columns:", final_encoded_vectors.columns.tolist())


# --- Important for Backend ---
# You will need to SAVE:
# 1. The `final_encoded_vectors` DataFrame (or its numpy array version).
#    This contains the vectors you'll search against.
#    Example: final_encoded_vectors.to_pickle("property_vectors.pkl")
#             or np.save("property_vectors.npy", final_encoded_vectors.values)
#             and save property_ids separately if using numpy array.
#
# 2. The fitted `preprocessor` object. You NEED this to encode the user's input
#    in the exact same way before performing the nearest neighbor search.
#    Example: import joblib
#             joblib.dump(preprocessor, 'preprocessor.joblib')
#
# 3. The `final_column_order` list (or derive it again in the backend) if needed
#    for verification, although the preprocessor handles the transformation order.
#
# 4. The `property_ids` (which are the index of `final_encoded_vectors`).
#    You need these to map the indices returned by NearestNeighbors back to actual IDs.
#    Example: (already saved if using pickle for the DataFrame)
#             or save separately: pd.Series(property_ids).to_pickle("property_ids.pkl")



Fitting and transforming data...
Data transformed. Shape: (21392, 17)
Applying feature weights...
Reordering columns...

--- Final Encoded Vectors ---
Shape: (21392, 17)
Columns: ['carpetArea', 'bedrooms', 'bathrooms', 'floorNumber', 'totalFloorNumber', 'transactionType_New Property', 'transactionType_Resale', 'furnished_Furnished', 'furnished_Semi-Furnished', 'furnished_Unfurnished', 'ageofcons_10 to 15 years', 'ageofcons_15 to 20 years', 'ageofcons_5 to 10 years', 'ageofcons_Above 20 years', 'ageofcons_Less than 5 years', 'ageofcons_New Construction', 'ageofcons_Under Construction']


In [10]:
# Summarise the encoded vectors: one row per property, one column per encoded feature.
final_encoded_vectors.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21392 entries, 74208793 to 75682303
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   carpetArea                    21392 non-null  float64
 1   bedrooms                      21392 non-null  float64
 2   bathrooms                     21392 non-null  float64
 3   floorNumber                   21392 non-null  float64
 4   totalFloorNumber              21392 non-null  float64
 5   transactionType_New Property  21392 non-null  float64
 6   transactionType_Resale        21392 non-null  float64
 7   furnished_Furnished           21392 non-null  float64
 8   furnished_Semi-Furnished      21392 non-null  float64
 9   furnished_Unfurnished         21392 non-null  float64
 10  ageofcons_10 to 15 years      21392 non-null  float64
 11  ageofcons_15 to 20 years      21392 non-null  float64
 12  ageofcons_5 to 10 years       21392 non-null  float64
 

# Fitting the Nearest Neighbors Model

In [11]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import joblib # For saving the model later

# --- Inputs ---
# `final_encoded_vectors` comes from the encoding step above: property IDs
# form its index and the encoded features its columns. If it was saved, it
# could be reloaded instead, e.g.:
# final_encoded_vectors = pd.read_pickle("property_vectors.pkl")
property_ids = final_encoded_vectors.index  # row position -> property ID

# --- Configuration ---
N_NEIGHBORS = 10  # default number of neighbours returned per query
METRIC = 'cosine'  # alternatives: 'euclidean'/'l2', 'manhattan'/'l1'

# --- Fit Nearest Neighbors Model ---
print(f"Fitting NearestNeighbors model (k={N_NEIGHBORS}, metric='{METRIC}')...")

# Build the model and fit it on the raw NumPy array behind the DataFrame.
# n_jobs=-1 asks scikit-learn to use all available CPU cores; fit() returns
# the estimator itself, so construction and fitting can be chained.
nn_model = NearestNeighbors(
    n_neighbors=N_NEIGHBORS,
    metric=METRIC,
    algorithm='auto',
    n_jobs=-1,
).fit(final_encoded_vectors.values)

print("NearestNeighbors model fitted successfully.")

Fitting NearestNeighbors model (k=10, metric='cosine')...
NearestNeighbors model fitted successfully.


In [12]:
# # --- Trial Run: Find neighbors for a sample property ---

# # 1. Choose a sample property to find neighbors for (e.g., the first one)
# sample_index_position = 0 # Taking the first property in the DataFrame
# sample_property_id = property_ids[sample_index_position]
# sample_vector = final_encoded_vectors.iloc[[sample_index_position]] # Keep as DataFrame row initially

# print(f"\n--- Finding neighbors for sample property ID: {sample_property_id} ---")
# print(f"Sample Vector (head): \n{sample_vector.iloc[:, :5]}") # Show first few features

# # 2. Prepare the sample vector for kneighbors (needs to be 2D NumPy array)
# sample_vector_np = sample_vector.values # Get NumPy array (already 2D)

# # 3. Use the fitted model to find neighbors
# #    kneighbors returns distances and indices
# distances, indices = nn_model.kneighbors(sample_vector_np)

# print(f"\nRaw distances: {distances}")
# print(f"Raw indices: {indices}")

# # The `indices` array contains the row positions (0-based) in the
# # original `final_encoded_vectors` data that are the nearest neighbors.
# # Since we queried with one sample, indices[0] contains the list of neighbor indices.

# # 4. Extract the indices for our single sample query
# neighbor_indices = indices[0]
# print(f"\nIndices of the {N_NEIGHBORS} nearest neighbors: {neighbor_indices}")

# # 5. Map these indices back to the actual Property IDs
# #    We use the original `property_ids` Series/Index we stored earlier
# retrieved_neighbor_ids = property_ids[neighbor_indices].tolist()

# print(f"\nRetrieved Property IDs of neighbors: {retrieved_neighbor_ids}")

# # --- Verification (Optional) ---
# # The first neighbor (index 0) should usually be the sample property itself,
# # unless there's an exact duplicate vector elsewhere.
# if retrieved_neighbor_ids[0] == sample_property_id:
#     print("\nVerification: The first neighbor is the sample property itself (expected).")
# else:
#     print("\nVerification: The first neighbor is NOT the sample property itself.")


# # --- Important for Backend ---
# # Now you need to SAVE the fitted `nn_model` object.
# # Example:
# # joblib.dump(nn_model, 'nearest_neighbors_model.joblib')
# #
# # Remember you also need the saved `preprocessor` and the `property_ids`
# # (or the `final_encoded_vectors` DataFrame which includes the IDs as index)
# # from the previous step.

In [13]:
import joblib
import pandas as pd
import numpy as np # Only needed if you were saving numpy arrays directly

# Output locations for the artifacts written below.
NN_MODEL_FILE = 'PipelinesAndModels/nearest_neighbors_model.joblib'
RECOMMENDATION_PREPROCESSOR_FILE = 'PipelinesAndModels/recommendation_preprocessor.joblib'
PROPERTY_VECTORS_FILE = 'PipelinesAndModels/property_vectors.pkl'  # DataFrame pickle

# The fitted estimators are written with joblib at compression level 3.
for artifact, destination in (
    (nn_model, NN_MODEL_FILE),
    (preprocessor, RECOMMENDATION_PREPROCESSOR_FILE),
):
    joblib.dump(artifact, destination, compress=3)

# The encoded vectors are pickled as a DataFrame, so the property IDs travel
# with them as the index.
final_encoded_vectors.to_pickle(PROPERTY_VECTORS_FILE)

# --- Optional: Save Property IDs separately (if needed, but redundant if saving the DataFrame) ---
# property_ids = final_encoded_vectors.index
# PROPERTY_IDS_FILE = 'property_ids.pkl'
# print(f"Saving property IDs separately to: {PROPERTY_IDS_FILE}")
# pd.Series(property_ids).to_pickle(PROPERTY_IDS_FILE)
# print("-> Saved property_ids.")

# Compiling Property Metadata

## Checking which properties are available

In [47]:
import requests
import pandas as pd

# Load the raw extracted property details and index them by propertyId,
# with the IDs converted to strings.
rawData = (
    pd.read_csv("../Data/rawExtractedPropertyDetails.csv")
    .astype({'propertyId': str})
    .set_index('propertyId')
)

  rawData = pd.read_csv("../Data/rawExtractedPropertyDetails.csv")


In [53]:
# Spot-check one listing: print its propertyId and display its URL.
# The position lives in `sample_position` rather than `id`, so the built-in
# id() is not shadowed for the rest of the session, and the lookup frame is
# built once instead of twice.
sample_position = 0
sample_row = rawData.loc[df.index, ['url']].reset_index().iloc[sample_position]
print(sample_row['propertyId'])
sample_row['url']

74208793


'https://www.magicbricks.com/propertyDetails/3-BHK-1662-Sq-ft-Multistorey-Apartment-FOR-Sale-Hinjewadi-in-Pune&id=4d423734323038373933'

In [33]:
# Preview the first 100 index labels (property IDs) of the combined frame.
df.index[:100]

Index(['74208793', '73773015', '75162077', '76084143', '75658157', '48337347',
       '74400799', '70265331', '75679247', '73281529', '73508741', '74837269',
       '75692605', '75577751', '75665103', '74903023', '75933165', '73774411',
       '75181979', '75746521', '75687865', '75691897', '72278837', '72991527',
       '75657681', '75658553', '75198967', '74598267', '75709545', '75665171',
       '74185523', '75206021', '76071257', '75694581', '76015789', '75688537',
       '75147705', '74212763', '75658603', '73279171', '75382247', '73693493',
       '75666349', '75668555', '63269545', '62164211', '75672063', '70861971',
       '74918157', '74702487', '75668101', '71333765', '74717153', '75671737',
       '74654451', '73625611', '75702115', '71242815', '74328807', '73877431',
       '72510057', '75665359', '75617881', '75658549', '75378819', '71772115',
       '75885301', '76141693', '75915281', '70762573', '71833131', '76172989',
       '74729393', '76114639', '75664943', '75689699

In [35]:
# URLs of the first 100 properties in df, looked up in rawData by property ID.
# The result is a Series of URLs indexed by property ID.
property_urls = rawData.loc[df.index[:100], 'url']

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# `property_urls` is expected to be a pandas Series with propertyId as the
# index and the listing URL as the value, for example:
# property_urls = pd.Series({
#     101: "https://example.com/property/101",
#     102: "https://example.com/property/102",
#     ...
# })

# Throttling and timeout settings. The values are the ones the loop actually
# used before; the progress message below now reports them truthfully.
PAUSE_EVERY_N_REQUESTS = 1000   # pause after this many processed URLs
PAUSE_SECONDS = 15              # length of each pause
PAGE_LOAD_TIMEOUT_SECONDS = 10  # max wait for <body> to appear

# Set up Chrome options (more options can be added if needed)
chrome_options = Options()
# chrome_options.add_argument("--headless")  # run in headless mode if you don't need a GUI
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.page_load_strategy = "eager"

# Initialize the webdriver (path to chromedriver may be needed)
driver = webdriver.Chrome(options=chrome_options)

# Outcome per property ID:
#   True  -> the page loaded and does not match the error-page structure
#   False -> the page matches the error-page structure (listing is gone)
#   None  -> the check itself failed (timeout, network issue, ...)
results = {}

# Number of URLs processed so far; drives the periodic pause.
request_count = 0

# try/finally guarantees the browser is closed even if the loop is
# interrupted (e.g. KeyboardInterrupt), which `except Exception` does not catch.
try:
    for property_id, url in property_urls.items():
        try:
            driver.get(url)

            # Wait until the <body> element is present before inspecting it.
            WebDriverWait(driver, PAGE_LOAD_TIMEOUT_SECONDS).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            body_element = driver.find_element(By.TAG_NAME, "body")
            # A missing class attribute is treated as "no classes" rather
            # than as a failure of the check.
            body_class = body_element.get_attribute("class") or ""

            # Error pages are recognised by <body class="... error ..."> that
            # contains a <main class="content error"> element.
            if "error" in body_class.split():
                try:
                    driver.find_element(By.CSS_SELECTOR, "main.content.error")
                    # Both markers found: the property does not exist.
                    results[property_id] = False
                except NoSuchElementException:
                    # Only the body class matched; assume the property is valid.
                    results[property_id] = True
            else:
                results[property_id] = True

        except Exception as e:
            # Best effort: log the failure, record it as "unknown" and move on.
            print(f"Error processing property {property_id}: {e}")
            results[property_id] = None

        request_count += 1

        # Periodic pause to avoid overloading the server.
        if request_count % PAUSE_EVERY_N_REQUESTS == 0:
            print(
                f"Processed {request_count} requests, "
                f"sleeping for {PAUSE_SECONDS} seconds..."
            )
            time.sleep(PAUSE_SECONDS)
finally:
    # Close the browser whether or not the loop completed.
    driver.quit()

# Series of outcomes indexed by property ID, for further processing.
results_series = pd.Series(results)


In [42]:
# Tally the availability outcomes. dropna=False keeps properties whose check
# failed (recorded as None) visible in the tally; value_counts() drops
# missing values by default.
results_series.value_counts(dropna=False)

True     51
False    49
Name: count, dtype: int64

## Saving all the recommendation metadata

In [63]:
import json
import pandas as pd

# Image paths per property, stored relative to a shared host prefix.
imagePaths = {
    'prefix': "https://img.staticmb.com",
    'imagePaths': {}
}
noImages = 0  # properties for which no image list could be found

for propertyId in df.index:
    # Each property has its own JSON details file named after its ID.
    with open(f'../Data/propertyDetails/{propertyId}.json', 'r', encoding='utf-8') as f:
        propertyDetails = json.load(f)

    # The image list sits under propertyDetailInfoBeanData -> propertyDetail
    # -> detailBean -> allImgPath; the outer bean and the list may be absent.
    info_bean = propertyDetails.get('propertyDetailInfoBeanData')
    if info_bean is None:
        image_urls = None
    else:
        image_urls = info_bean['propertyDetail']['detailBean'].get('allImgPath')

    if image_urls is None:
        noImages += 1
    else:
        # Strip the shared host so only the relative path is stored.
        imagePaths['imagePaths'][propertyId] = [
            image_url.removeprefix(imagePaths['prefix']) for image_url in image_urls
        ]


In [64]:
# Number of properties for which no image list was found.
noImages

1708

In [62]:
# Serialise the collected image paths to JSON. ensure_ascii=False keeps
# non-ASCII characters as they are; indent=0 puts each item on its own line
# without any leading indentation.
with open('Data/imagePaths.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(imagePaths, ensure_ascii=False, indent=0))

In [3]:
# Summary of the combined frame before the metadata columns are added.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21392 entries, 74208793 to 75682303
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   localityName      21392 non-null  category
 1   price             21392 non-null  Int64   
 2   carpetArea        21392 non-null  Int64   
 3   floorNumber       21392 non-null  Int64   
 4   totalFloorNumber  21392 non-null  Int64   
 5   transactionType   21392 non-null  category
 6   furnished         21392 non-null  category
 7   bedrooms          21392 non-null  Int64   
 8   bathrooms         21392 non-null  Int64   
 9   ageofcons         21392 non-null  category
dtypes: Int64(6), category(4)
memory usage: 1.4 MB


In [9]:
# Price per unit of carpet area, rounded to the nearest whole number and
# stored as a nullable integer.
price_per_sqft = df['price'].div(df['carpetArea'])
df['pricePerSqft'] = price_per_sqft.round(0).astype('Int64')

In [10]:
# Load the cleaned data set with propertyId converted to strings.
temp = pd.read_csv('../Data/cleaned_data.csv').astype({'propertyId': str})

In [11]:
# Attach the society name of every property in df, looked up by propertyId
# in the cleaned data set.
# Present names are converted to plain str. Missing names are kept as pd.NA
# (the same marker used for the 'imagePaths' and 'url' columns) instead of
# being turned into the literal string "nan" by a blanket astype(str).
society_names = temp.set_index('propertyId').loc[df.index, 'nameOfSociety']
df['nameOfSociety'] = society_names.astype(str).where(society_names.notna(), pd.NA)

In [4]:
import json

# Attach the image list and the listing URL of every property to df.
# Both values live in the same per-property JSON file, so each file is read
# and parsed once and both columns are filled in the same pass.
noImages = 0  # properties without an image list
noURLs = 0    # properties without a URL
df['imagePaths'] = pd.NA
df['url'] = pd.NA

for propertyId in df.index:
    with open(f'../Data/propertyDetails/{propertyId}.json', 'r', encoding='utf-8') as f:
        propertyDetails = json.load(f)

    # Without the outer bean neither value is available for this property.
    if propertyDetails.get('propertyDetailInfoBeanData') is None:
        noImages += 1
        noURLs += 1
        continue

    detail_bean = propertyDetails['propertyDetailInfoBeanData']['propertyDetail']['detailBean']

    # Full image URLs are stored as-is (the host prefix is not stripped here).
    image_paths = detail_bean.get('allImgPath')
    if image_paths is None:
        noImages += 1
    else:
        df.at[propertyId, 'imagePaths'] = image_paths

    listing_url = detail_bean.get('url')
    if listing_url is None:
        noURLs += 1
    else:
        df.at[propertyId, 'url'] = listing_url

In [5]:
# Stamp every row with the same hard-coded date string.
# NOTE(review): presumably the date the listings were collected — confirm.
df['lastUpdatedDate'] = '21 Dec, 2024'

In [12]:
# Persist the enriched frame (features plus metadata columns) as a pickle.
df.to_pickle('Data/recommendations.pkl')

In [13]:
# Summary of the frame that was saved, including the added metadata columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21392 entries, 74208793 to 75682303
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   localityName      21392 non-null  category
 1   price             21392 non-null  Int64   
 2   carpetArea        21392 non-null  Int64   
 3   floorNumber       21392 non-null  Int64   
 4   totalFloorNumber  21392 non-null  Int64   
 5   transactionType   21392 non-null  category
 6   furnished         21392 non-null  category
 7   bedrooms          21392 non-null  Int64   
 8   bathrooms         21392 non-null  Int64   
 9   ageofcons         21392 non-null  category
 10  imagePaths        19684 non-null  object  
 11  url               21390 non-null  object  
 12  lastUpdatedDate   21392 non-null  object  
 13  pricePerSqft      21392 non-null  Int64   
 14  nameOfSociety     21392 non-null  object  
dtypes: Int64(7), category(4), object(4)
memory usage: 2.7+ MB


# API trial

In [14]:
import requests

# Sample query for the recommendation endpoint: one value per feature the
# user is expected to provide.
request_body = {
    'carpetArea': 800,
    'bedrooms': 2,
    'bathrooms': 2,
    'floorNumber': 4,
    'totalFloorNumber': 8,
    'localityName': 'EON Free Zone, Kharadi',
    'transactionType': 'New Property',
    'furnished': 'Semi-Furnished',
    'ageofcons': 'Under Construction'
}

# POST the query to the locally running API. An explicit timeout (in seconds)
# keeps the cell from hanging indefinitely if the server is not responding.
response = requests.post(
    "http://localhost:8000/recommend",
    json=request_body,
    timeout=10,
)

In [15]:
# Decode the JSON body of the API response and display it.
response.json()

{'recommendations': [{'propertyId': '75432267',
   'localityName': 'Ravet, Pimpri Chinchwad',
   'price': 6600000,
   'carpetArea': 762,
   'floorNumber': 4,
   'totalFloorNumber': 7,
   'transactionType': 'New Property',
   'furnished': 'Semi-Furnished',
   'bedrooms': 2,
   'bathrooms': 2,
   'ageofcons': 'Under Construction',
   'imagePaths': ['https://img.staticmb.com/mbphoto/property/cropped_images/2024/Oct/12/Photo_h300_w450/75432267_6_apture17_300_450.jpg',
    'https://img.staticmb.com/mbphoto/property/cropped_images/2024/Oct/12/Photo_h300_w450/75432267_2_apture16_300_450.jpg',
    'https://img.staticmb.com/mbphoto/property/cropped_images/2024/Oct/12/Photo_h300_w450/75432267_3_apture15_300_450.jpg',
    'https://img.staticmb.com/mbphoto/property/cropped_images/2024/Oct/12/Photo_h300_w450/75432267_5_apture18_300_450.jpg',
    'https://img.staticmb.com/mbphoto/property/cropped_images/2024/Oct/12/Photo_h300_w450/75432267_7_apture14_300_450.jpg'],
   'url': 'https://www.magicbricks