In [1]:
import pandas as pd

# **Load Dataset**

In [2]:
# Read the outlier-handled Gurgaon listings into the working DataFrame.
# NOTE(review): the absolute "/content/..." path looks environment-specific
# (presumably a hosted-notebook upload directory) - adjust when running elsewhere.
df = pd.read_csv(filepath_or_buffer="/content/gurgaonrealestate_outlierhandling.csv")

In [3]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
df.head()

Unnamed: 0,property_type,society,sector,price,price_per_sqft,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,study room,servant room,store room,pooja room,others,furnishing_type,luxury_score
0,flat,signature global park 4,sector 36,0.82,7585.0,3,2,2,2.0,New Property,850.0,0,0,0,0,0,0,8
1,flat,smart world gems,sector 89,0.95,8600.0,2,2,2,4.0,New Property,1226.0,1,1,0,0,0,0,38
2,flat,breez global hill view,sohna road,0.32,5470.0,2,2,1,17.0,New Property,1000.0,0,0,0,0,0,0,49
3,flat,bestech park view sanskruti,sector 92,1.6,8020.0,3,4,3+,10.0,Relatively New,1615.0,0,1,0,0,1,1,174
4,flat,suncity avenue,sector 102,0.48,9022.0,2,2,1,5.0,Relatively New,582.0,0,0,1,0,0,0,159


In [4]:
# List the column labels available for feature engineering.
df.columns

Index(['property_type', 'society', 'sector', 'price', 'price_per_sqft',
       'bedRoom', 'bathroom', 'balcony', 'floorNum', 'agePossession',
       'built_up_area', 'study room', 'servant room', 'store room',
       'pooja room', 'others', 'furnishing_type', 'luxury_score'],
      dtype='object')

In [5]:
# Inspect the distinct values stored in furnishing_type.
df['furnishing_type'].unique()

array([0, 1, 2])

In [6]:
#Function that converts the numeric luxury score into a category label

def categorize_luxury(score):
    """Map a luxury score to "Low", "Medium" or "High".

    Bands:
        [0, 50)    -> "Low"
        [50, 150)  -> "Medium"
        [150, 175] -> "High"

    Any score outside 0..175, or one for which the range comparison is
    false (such as NaN), yields None.
    """
    # Guard clause: reject everything outside the supported range first.
    if not (0 <= score <= 175):
        return None
    if score < 50:
        return "Low"
    if score < 150:
        return "Medium"
    return "High"

In [7]:
#Derive the luxury_category column by bucketing every luxury_score value

df['luxury_category'] = df['luxury_score'].map(categorize_luxury)

In [8]:
#Binning for floors

def categorize_floor(floor):
    """Bin a floor number into "Low Floor", "Mid Floor" or "High Floor".

    The bands are contiguous half-open intervals so that fractional values
    (floorNum is stored as a float) can never fall between two bins:

        [0, 3)   -> "Low Floor"
        [3, 11)  -> "Mid Floor"
        [11, 51] -> "High Floor"

    For whole-number floors this is identical to 0-2 / 3-10 / 11-51.

    Returns None for values outside 0..51, including NaN (every comparison
    with NaN is false).
    """
    if 0 <= floor < 3:
        return "Low Floor"
    elif 3 <= floor < 11:
        return "Mid Floor"
    elif 11 <= floor <= 51:
        return "High Floor"
    else:
        return None

In [9]:
#Derive the floor_category column by binning every floorNum value

df['floor_category'] = df['floorNum'].map(categorize_floor)

In [10]:
#Dropping the floorNum and luxury_score columns now that their binned
#counterparts (floor_category, luxury_category) have been added.
#Reassign instead of mutating in place, matching the style of the next drop.

df = df.drop(columns=['floorNum', 'luxury_score'])

In [11]:
#Drop price_per_sqft because it is correlated with price

df = df.drop(columns='price_per_sqft')

In [12]:
import pickle
from sklearn.preprocessing import OrdinalEncoder

#Encode into a copy so the un-encoded frame `df` stays available
data_label_encoded = df.copy()

#Every object-dtype column is treated as categorical
categorical_cols = df.select_dtypes(include=['object']).columns

#One fitted OrdinalEncoder per column, keyed by column name
encoders = {}

#Fit each column's encoder, replace the column with its ordinal codes,
#keep the fitted encoder and show the categories it learned

for col in categorical_cols:
    oe = OrdinalEncoder().fit(data_label_encoded[[col]])
    data_label_encoded[col] = oe.transform(data_label_encoded[[col]])
    encoders[col] = oe
    print(oe.categories_)

#Persist the dictionary of fitted encoders with pickle

with open('ordinal_encoders.pkl', mode='wb') as f:
    pickle.dump(encoders, f)


[array(['flat', 'house'], dtype=object)]
[array(['.', '4s aradhya homes', '511 sarahah tower', 'aardhya homesh',
       'ace palm floors', 'adani aangan arcade', 'adani aangan galleria',
       'adani brahma samsara', 'adani brahma samsara vilasa',
       'adani m2k oyster grande', 'adani oyster greens', 'adarsh nagar',
       'aditya apartment', 'afnhb jalvayu vihar',
       'aindependentt raj estates', 'aipl club residences',
       'aipl joy square', 'aipl peaceful homes',
       'aipl the peaceful homes', 'aipl zen residences', 'allure homes',
       'alpha corp gurgaonone', 'ambience creacions', 'ambience lagoon',
       'anamika enclave', 'ansal api celebrity suites',
       'ansal api esencia', 'ansal api spanish court',
       'ansal api sushant estate', 'ansal celebrity homes',
       'ansal estella', 'ansal florence super', 'ansal harmony homes',
       'ansal height 86', 'ansal heights', 'ansal heights 86',
       'ansal housing highland park', 'ansal maple crescent',
      

In [13]:
#Save the same encoders dictionary a second time, this time with joblib
#(the pickle file written earlier holds the same object)

import joblib
joblib.dump(value=encoders, filename='ordinal_enc.joblib')

['ordinal_enc.joblib']

In [14]:
from sklearn.preprocessing import OrdinalEncoder

#Start again from a fresh copy of the un-encoded frame

data_label_encoded = df.copy()

categorical_cols = df.select_dtypes(include=['object']).columns

#Replace each object-dtype column with its ordinal codes
#(these encoders are not stored; only their categories are printed)

for col in categorical_cols:
    oe = OrdinalEncoder().fit(data_label_encoded[[col]])
    data_label_encoded[col] = oe.transform(data_label_encoded[[col]])
    print(oe.categories_)

#Separate the target (price) from the feature matrix

y_label = data_label_encoded['price']
X_label = data_label_encoded.drop(columns='price')

[array(['flat', 'house'], dtype=object)]
[array(['.', '4s aradhya homes', '511 sarahah tower', 'aardhya homesh',
       'ace palm floors', 'adani aangan arcade', 'adani aangan galleria',
       'adani brahma samsara', 'adani brahma samsara vilasa',
       'adani m2k oyster grande', 'adani oyster greens', 'adarsh nagar',
       'aditya apartment', 'afnhb jalvayu vihar',
       'aindependentt raj estates', 'aipl club residences',
       'aipl joy square', 'aipl peaceful homes',
       'aipl the peaceful homes', 'aipl zen residences', 'allure homes',
       'alpha corp gurgaonone', 'ambience creacions', 'ambience lagoon',
       'anamika enclave', 'ansal api celebrity suites',
       'ansal api esencia', 'ansal api spanish court',
       'ansal api sushant estate', 'ansal celebrity homes',
       'ansal estella', 'ansal florence super', 'ansal harmony homes',
       'ansal height 86', 'ansal heights', 'ansal heights 86',
       'ansal housing highland park', 'ansal maple crescent',
      

In [15]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor  #Import RandomForestRegressor

#Initialize the base estimator.
#A fixed random_state makes the forest, and therefore the importance
#ranking below, reproducible across kernel restarts.

estimator = RandomForestRegressor(random_state=42)

#Fit RFE on the label-encoded feature matrix.
#n_features_to_select is set to the total number of columns, so no feature is
#actually eliminated; the result is the forest fitted on every column.

selector_label = RFE(estimator, n_features_to_select=X_label.shape[1], step=1)
selector_label = selector_label.fit(X_label, y_label)

#Get the selected features based on RFE

selected_features = X_label.columns[selector_label.support_]

#Extract the feature importances of the random forest that RFE fitted on the
#selected features (the variable name is kept for compatibility)

selected_coefficients = selector_label.estimator_.feature_importances_

#Organize the results into a DataFrame, most important feature first

fi_df6 = pd.DataFrame({
    'feature': selected_features,
    'rfe_score': selected_coefficients
}).sort_values(by='rfe_score', ascending=False)

fi_df6

Unnamed: 0,feature,rfe_score
7,built_up_area,0.630642
0,property_type,0.099701
2,sector,0.092027
1,society,0.04812
4,bathroom,0.043425
3,bedRoom,0.021294
6,agePossession,0.018634
13,furnishing_type,0.009032
5,balcony,0.007976
9,servant room,0.005684


In [16]:
# Remove the three room-flag columns from the features, then re-attach the
# target so that price ends up as the last column of the exported frame.
export_df = (
    X_label
    .drop(columns=['pooja room', 'study room', 'others'])
    .assign(price=y_label)
)

In [17]:
#Write the selected features plus price to CSV, without the index column

export_df.to_csv(path_or_buf='gurgaon_properties_post_feature_selection.csv', index=False)

In [18]:
# Preview the first rows of the frame that was exported.
export_df.head()

Unnamed: 0,property_type,society,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0.0,516.0,35.0,3,2,2.0,1.0,850.0,0,0,0,1.0,1.0,0.82
1,0.0,529.0,93.0,2,2,2.0,1.0,1226.0,1,0,0,1.0,2.0,0.95
2,0.0,102.0,101.0,2,2,1.0,1.0,1000.0,0,0,0,1.0,0.0,0.32
3,0.0,81.0,97.0,3,4,4.0,3.0,1615.0,1,0,1,0.0,2.0,1.6
4,0.0,555.0,5.0,2,2,1.0,3.0,582.0,0,1,0,0.0,2.0,0.48
