In [25]:
import pandas as pd
import numpy as np
import seaborn as sns

In [26]:
df = pd.read_parquet(r'C:\Users\aryan\OneDrive\Desktop\Capstone Project\Data Preprocessing New\cleaned_gurgaon_properties_v2.parquet')

In [27]:
train_df = df.drop(columns= 'Sector_encoded').copy()

In [28]:
from sklearn.preprocessing import OrdinalEncoder

categorical_col = df.select_dtypes(include= 'object').columns

for col in categorical_col:
    encoder = OrdinalEncoder()
    train_df[col] = encoder.fit_transform(train_df[[col]])
    print(encoder.categories_)
    
X_label = train_df.drop('Price', axis= 1)
y_label = train_df['Price']

[array(['Gwal Pahari', 'Sector 1', 'Sector 10', 'Sector 102', 'Sector 103',
       'Sector 104', 'Sector 105', 'Sector 106', 'Sector 107',
       'Sector 108', 'Sector 109', 'Sector 110', 'Sector 111',
       'Sector 112', 'Sector 113', 'Sector 12', 'Sector 13', 'Sector 14',
       'Sector 15', 'Sector 17', 'Sector 2', 'Sector 21', 'Sector 22',
       'Sector 23', 'Sector 24', 'Sector 25', 'Sector 26', 'Sector 27',
       'Sector 28', 'Sector 29', 'Sector 3', 'Sector 30', 'Sector 31',
       'Sector 32', 'Sector 33', 'Sector 35', 'Sector 36', 'Sector 37',
       'Sector 38', 'Sector 39', 'Sector 4', 'Sector 40', 'Sector 41',
       'Sector 42', 'Sector 43', 'Sector 45', 'Sector 46', 'Sector 47',
       'Sector 48', 'Sector 49', 'Sector 5', 'Sector 50', 'Sector 51',
       'Sector 52', 'Sector 53', 'Sector 54', 'Sector 55', 'Sector 56',
       'Sector 57', 'Sector 58', 'Sector 59', 'Sector 6', 'Sector 60',
       'Sector 61', 'Sector 62', 'Sector 63', 'Sector 65', 'Sector 66',
       'S

In [6]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [35]:
X_label.head(1)

Unnamed: 0,Sector,Built Up Area,Bedroom,Bathroom,Balcony,Others,Pooja Room,Servant Room,Store Room,Study Room,Floor Num,Total Floor,Facing_North,Facing_North-East,Facing_North-West,Facing_South,Facing_South-East,Facing_South-West,Facing_Unknown,Facing_West,Property Age,Overlooking_Club,Overlooking_Lake facing,Overlooking_Main Road,Overlooking_Others,Overlooking_Park,Overlooking_Pool,NearbyPlace_Business,NearbyPlace_Education,NearbyPlace_Financial,NearbyPlace_Healthcare,NearbyPlace_Leisure,NearbyPlace_Religious,NearbyPlace_Residentail,NearbyPlace_Shopping,NearbyPlace_Transport,Property Ownership,Furnishing,Power Backup,Covered_Parking,Open_Parking,Total Parking,Rating
0,82.0,1031,2,2,2,1,0,0,0,0,10,14,0,0,0,0,0,0,0,0,3,1,0,0,1,1,1,0,1,0,0,0,0,0,1,1,1,1,2,1,0,1,3.6


**Technique 1 - Correlation Analysis**

In [36]:
fi_df1 = train_df.corr()['Price'].to_frame().reset_index().rename(columns={'index':'feature','Price':'corr_coeff'})
fi_df1

Unnamed: 0,feature,corr_coeff
0,Sector,-0.16476
1,Price,1.0
2,Built Up Area,0.762695
3,Bedroom,0.632636
4,Bathroom,0.642988
5,Balcony,0.220662
6,Others,-0.042783
7,Pooja Room,0.185509
8,Servant Room,0.466315
9,Store Room,0.071166


**Technique 2 - Random Forest Importance**

In [37]:
from sklearn.ensemble import RandomForestRegressor

rf_label = RandomForestRegressor(random_state= 42)
rf_label.fit(X_label, y_label)

fi_df2 = pd.DataFrame({
    'feature': X_label.columns,
    'rf_importance': rf_label.feature_importances_
}).sort_values(by= 'rf_importance', ascending= False)

In [38]:
fi_df2

Unnamed: 0,feature,rf_importance
1,Built Up Area,0.634818
0,Sector,0.119477
11,Total Floor,0.056425
42,Rating,0.030492
3,Bathroom,0.020716
39,Covered_Parking,0.016537
10,Floor Num,0.012572
2,Bedroom,0.011154
34,NearbyPlace_Shopping,0.009644
32,NearbyPlace_Religious,0.007921


**Technique 3 - Gradient Boosting Feature importances**

In [39]:
from sklearn.ensemble import GradientBoostingRegressor

gb_label = GradientBoostingRegressor()
gb_label.fit(X_label, y_label)

fi_df3 = pd.DataFrame({
    'feature': X_label.columns,
    'gb_importance': gb_label.feature_importances_
}).sort_values(by= 'gb_importance', ascending= False)

In [40]:
fi_df3

Unnamed: 0,feature,gb_importance
1,Built Up Area,0.6477236
0,Sector,0.1282527
3,Bathroom,0.05761028
11,Total Floor,0.05271764
39,Covered_Parking,0.04669781
42,Rating,0.01055318
34,NearbyPlace_Shopping,0.009263098
2,Bedroom,0.009018211
7,Servant Room,0.00578745
30,NearbyPlace_Healthcare,0.005774495


**Technique 4 - Permutation Importance**

In [41]:
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(X_label, y_label, test_size= 0.2, random_state= 42)

rf_label = RandomForestRegressor(n_estimators= 100, random_state= 42)
rf_label.fit(X_train_label, y_train_label)

perm_importance = permutation_importance(rf_label, X_test_label, y_test_label, n_repeats= 30, random_state= 42)

fi_df4 = pd.DataFrame({
    'feature': X_label.columns,
    'permutation_importance': perm_importance.importances_mean
}).sort_values(by= 'permutation_importance', ascending= False)

In [42]:
fi_df4

Unnamed: 0,feature,permutation_importance
1,Built Up Area,0.836734
0,Sector,0.250771
11,Total Floor,0.11937
3,Bathroom,0.038803
42,Rating,0.027993
2,Bedroom,0.021405
39,Covered_Parking,0.014573
34,NearbyPlace_Shopping,0.013469
30,NearbyPlace_Healthcare,0.008863
32,NearbyPlace_Religious,0.006837


**Technique 5 - LASSO**

In [43]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_label)

lasso = Lasso(alpha= 0.01, random_state= 42)
lasso.fit(X_scaled, y_label)

fi_df5 = pd.DataFrame({
    'feature': X_label.columns,
    'lasso_coeff': lasso.coef_
}).sort_values(by= 'lasso_coeff', ascending= False)

In [44]:
fi_df5

Unnamed: 0,feature,lasso_coeff
1,Built Up Area,0.576724
11,Total Floor,0.155131
39,Covered_Parking,0.12296
2,Bedroom,0.09882
37,Furnishing,0.094529
3,Bathroom,0.081175
29,NearbyPlace_Financial,0.073463
7,Servant Room,0.062777
35,NearbyPlace_Transport,0.059379
38,Power Backup,0.053236


**Technique 6 - RFE**

In [45]:
from sklearn.feature_selection import RFE

estimator = RandomForestRegressor(random_state= 42)

selector_label = RFE(estimator= estimator, n_features_to_select= X_label.shape[0], step= 1)
selector_label = selector_label.fit(X_label, y_label)

selected_features = X_label.columns[selector_label.support_]

selected_coefficients = selector_label.estimator_.feature_importances_

fi_df6 = pd.DataFrame({
    'feature': selected_features,
    'rfe_score': selected_coefficients
}).sort_values(by='rfe_score', ascending=False)



In [46]:
fi_df6

Unnamed: 0,feature,rfe_score
1,Built Up Area,0.634818
0,Sector,0.119477
11,Total Floor,0.056425
42,Rating,0.030492
3,Bathroom,0.020716
39,Covered_Parking,0.016537
10,Floor Num,0.012572
2,Bedroom,0.011154
34,NearbyPlace_Shopping,0.009644
32,NearbyPlace_Religious,0.007921


**Technique 7 - Linear Regression Weights**

In [47]:
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
linear_reg.fit(X_scaled, y_label)

fi_df7 = pd.DataFrame({
    'feature': X_label.columns,
    'reg_coeffs': linear_reg.coef_
}).sort_values(by='reg_coeffs', ascending=False)

In [54]:
fi_df7

Unnamed: 0,feature,reg_coeffs
1,Built Up Area,0.573029
11,Total Floor,0.177556
2,Bedroom,0.110942
37,Furnishing,0.100299
29,NearbyPlace_Financial,0.099446
39,Covered_Parking,0.091428
3,Bathroom,0.081654
35,NearbyPlace_Transport,0.080981
7,Servant Room,0.068674
38,Power Backup,0.068219


In [49]:
final_fi_df = fi_df1.merge(fi_df2,on='feature').merge(fi_df3,on='feature').merge(fi_df4,on='feature').merge(fi_df5,on='feature').merge(fi_df6,on='feature').merge(fi_df7,on='feature').set_index('feature')

In [51]:
final_fi_df.sort_values(by= 'rf_importance', ascending= False)

Unnamed: 0_level_0,corr_coeff,rf_importance,gb_importance,permutation_importance,lasso_coeff,rfe_score,reg_coeffs
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Built Up Area,0.762695,0.634818,0.6477236,0.836734,0.576724,0.634818,0.573029
Sector,-0.16476,0.119477,0.1282527,0.250771,-0.070938,0.119477,-0.078132
Total Floor,0.040764,0.056425,0.05271764,0.11937,0.155131,0.056425,0.177556
Rating,0.075547,0.030492,0.01055318,0.027993,0.010109,0.030492,0.022511
Bathroom,0.642988,0.020716,0.05761028,0.038803,0.081175,0.020716,0.081654
Covered_Parking,0.506119,0.016537,0.04669781,0.014573,0.12296,0.016537,0.091428
Floor Num,0.025437,0.012572,0.0008114965,0.004347,0.0,0.012572,-0.008204
Bedroom,0.632636,0.011154,0.009018211,0.021405,0.09882,0.011154,0.110942
NearbyPlace_Shopping,-0.083636,0.009644,0.009263098,0.013469,-0.047413,0.009644,-0.053436
NearbyPlace_Religious,0.165636,0.007921,0.003859081,0.006837,-0.050809,0.007921,-0.079507


In [52]:
final_fi_df.to_csv(r'C:\Users\aryan\OneDrive\Desktop\Capstone Project\Misc\feature_importance.csv')

In [55]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
final_fi_df_scaled = min_max_scaler.fit_transform(final_fi_df)

In [56]:
final_fi_df_scaled = pd.DataFrame(final_fi_df_scaled, columns= final_fi_df.columns, index= final_fi_df.index)

In [57]:
final_fi_df_scaled

Unnamed: 0_level_0,corr_coeff,rf_importance,gb_importance,permutation_importance,lasso_coeff,rfe_score,reg_coeffs
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Sector,0.080119,0.187836,0.198005,0.299727,0.010865,0.187836,0.01092
Built Up Area,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Bedroom,0.871002,0.01712,0.013923,0.025616,0.270126,0.01712,0.298113
Bathroom,0.88127,0.032189,0.088943,0.046407,0.243178,0.032189,0.253626
Balcony,0.462393,0.01079,0.000412,0.004745,0.119205,0.01079,0.098835
Others,0.201099,0.001563,0.0,0.0,0.119205,0.001563,0.145346
Pooja Room,0.427528,0.002463,0.00065,0.000328,0.11437,0.002463,0.099204
Servant Room,0.706041,0.006035,0.008935,0.003583,0.21508,0.006035,0.233911
Store Room,0.314118,0.00284,0.000105,0.001253,0.136818,0.00284,0.168945
Study Room,0.281157,0.003702,0.0,0.000707,0.119205,0.003702,0.143915


In [58]:
final_fi_df_scaled[['rf_importance','gb_importance','permutation_importance','rfe_score']].mean(axis=1).sort_values(ascending=False)

feature
Built Up Area              1.000000
Sector                     0.218351
Total Floor                0.100253
Bathroom                   0.049932
Rating                     0.036244
Covered_Parking            0.035189
Bedroom                    0.018444
NearbyPlace_Shopping       0.014978
Floor Num                  0.011298
NearbyPlace_Healthcare     0.010443
NearbyPlace_Religious      0.009553
Property Age               0.007116
Balcony                    0.006684
Servant Room               0.006147
Furnishing                 0.005218
NearbyPlace_Education      0.004401
NearbyPlace_Transport      0.004294
Power Backup               0.004076
NearbyPlace_Residentail    0.003375
Open_Parking               0.002392
Total Parking              0.002264
Study Room                 0.002028
Overlooking_Club           0.001767
Store Room                 0.001760
Overlooking_Main Road      0.001663
Facing_Unknown             0.001570
Pooja Room                 0.001476
NearbyPlace_Business

In [60]:
from sklearn.model_selection import cross_val_score

rf = RandomForestRegressor(n_estimators= 1000, random_state= 42)
scores = cross_val_score(rf, X_label, y_label, cv= 10, scoring= 'r2', n_jobs= -1)

print(scores.mean())

0.8568099337904226


In [61]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['NearbyPlace_Residentail', 'Open_Parking', 'Total Parking', 'Study Room', 'Overlooking_Club', 'Store Room', 'Overlooking_Main Road', 'Facing_Unknown', 'Pooja Room', 'NearbyPlace_Business', 'Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 25: ", scores.mean())

Bottom 25:  0.8076301182032815


In [62]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Open_Parking', 'Total Parking', 'Study Room', 'Overlooking_Club', 'Store Room', 'Overlooking_Main Road', 'Facing_Unknown', 'Pooja Room', 'NearbyPlace_Business', 'Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 24: ", scores.mean())

Bottom 24:  0.8078238353042135


In [63]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Total Parking', 'Study Room', 'Overlooking_Club', 'Store Room', 'Overlooking_Main Road', 'Facing_Unknown', 'Pooja Room', 'NearbyPlace_Business', 'Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 23: ", scores.mean())

Bottom 23:  0.8084725305958376


In [64]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Study Room', 'Overlooking_Club', 'Store Room', 'Overlooking_Main Road', 'Facing_Unknown', 'Pooja Room', 'NearbyPlace_Business', 'Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 22: ", scores.mean())

Bottom 22:  0.8086794017814709


In [65]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Overlooking_Club', 'Store Room', 'Overlooking_Main Road', 'Facing_Unknown', 'Pooja Room', 'NearbyPlace_Business', 'Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 21: ", scores.mean())

Bottom 21:  0.8068915456274766


In [66]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Store Room', 'Overlooking_Main Road', 'Facing_Unknown', 'Pooja Room', 'NearbyPlace_Business', 'Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 20: ", scores.mean())

Bottom 20:  0.8065455241126532


In [67]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Overlooking_Main Road', 'Facing_Unknown', 'Pooja Room', 'NearbyPlace_Business', 'Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 19: ", scores.mean())

Bottom 19:  0.8109153783161165


In [68]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Facing_Unknown', 'Pooja Room', 'NearbyPlace_Business', 'Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 18: ", scores.mean())

Bottom 18:  0.8109742201811596


*Most R2 (as of now)*

In [69]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Pooja Room', 'NearbyPlace_Business', 'Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 17: ", scores.mean())

Bottom 17:  0.8125907615952738


In [70]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['NearbyPlace_Business', 'Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 16: ", scores.mean())

Bottom 16:  0.8094156153074579


In [71]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 15: ", scores.mean())

Bottom 15:  0.8097162563741769


In [72]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 14: ", scores.mean())

Bottom 14:  0.8106011837485457


In [73]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 13: ", scores.mean())

Bottom 13:  0.8115550655082903


In [74]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 12: ", scores.mean())

Bottom 12:  0.8112380238897321


In [75]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 11: ", scores.mean())

Bottom 11:  0.8093291003412084


In [76]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 10: ", scores.mean())

Bottom 10:  0.8101772378908269


In [77]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 9: ", scores.mean())

Bottom 9:  0.8105204298123952


In [78]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 8: ", scores.mean())

Bottom 8:  0.8099320404809165


In [79]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 7: ", scores.mean())

Bottom 7:  0.8104782243238097


In [80]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 6: ", scores.mean())

Bottom 6:  0.8077008570781599


In [81]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 5: ", scores.mean())

Bottom 5:  0.8107430269343043


In [82]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 4: ", scores.mean())

Bottom 4:  0.806974491226811


In [83]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 3: ", scores.mean())

Bottom 3:  0.8087320375006023


In [84]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 2: ", scores.mean())

Bottom 2:  0.8076319460504816


In [85]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Property Ownership']), y_label, cv=5, scoring='r2')
print("Bottom 1: ", scores.mean())

Bottom 1:  0.8084774502906431


checking by also dropping Facing Unknown and not dropping Facing North East

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

rf = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf, X_label.drop(columns=['Facing_North-East', 'Pooja Room', 'NearbyPlace_Business', 'Facing_Unknown', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership']), y_label, cv=5, scoring='r2')
print(scores.mean())

0.8109742201811596


Also dropping Facing Unknown and Sector encoded

In [None]:
# export_df = df.drop(columns= ['Facing_North-East', 'Facing_Unknown', 'Sector_encoded', 'Pooja Room', 'NearbyPlace_Business', 'Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership', 'Price'])
# export_df['Price'] = df['Price']

In [29]:
df = df.drop(columns= ['Facing_North-East', 'Facing_Unknown', 'Sector_encoded', 'Pooja Room', 'NearbyPlace_Business', 'Facing_North-East', 'Facing_North', 'Overlooking_Others', 'NearbyPlace_Leisure', 'NearbyPlace_Financial', 'Overlooking_Pool', 'Overlooking_Park', 'Overlooking_Lake facing', 'Others', 'Facing_West', 'Facing_South-West', 'Facing_North-West', 'Facing_South-East', 'Facing_South', 'Property Ownership'])

In [30]:
nearby_cols = [col for col in df.columns if 'NearbyPlace_' in col]
df['Nearby'] = df[nearby_cols].idxmax(axis=1)
df['Nearby'] = df['Nearby'].str.replace('NearbyPlace_', '')

overlooking_cols = [col for col in df.columns if 'Overlooking_' in col]
df['Overlooking'] = df[overlooking_cols].idxmax(axis=1)
df['Overlooking'] = df['Overlooking'].str.replace('Overlooking_', '')

In [31]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [32]:
df = df.drop(columns=nearby_cols + overlooking_cols)

In [33]:
df.sample(1)

Unnamed: 0,Sector,Price,Built Up Area,Bedroom,Bathroom,Balcony,Servant Room,Store Room,Study Room,Floor Num,Total Floor,Property Age,Furnishing,Power Backup,Covered_Parking,Open_Parking,Total Parking,Rating,Nearby,Overlooking
3131,Sector 108,3.15,1764,2,2,2,0,0,0,7,24,2,1,2,1,0,1,3.7,Education,Club


In [34]:
df['Furnishing'].replace({0: 'Unfurnished', 1: 'Semi Furnished', 2: 'Furnished'}, inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Furnishing'].replace({0: 'Unfurnished', 1: 'Semi Furnished', 2: 'Furnished'}, inplace= True)


In [35]:
df['Power Backup'].replace({2: 'Full', 0: 'None', 1: 'Partial'}, inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Power Backup'].replace({2: 'Full', 0: 'None', 1: 'Partial'}, inplace= True)


In [36]:
df['Property Age'].replace({0: 'Under Construction', 1: '0 to 1 Year Old', 2: '1 to 5 Year Old', 3: '5 to 10 Year Old', 4: '10+ Year Old'}, inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Property Age'].replace({0: 'Under Construction', 1: '0 to 1 Year Old', 2: '1 to 5 Year Old', 3: '5 to 10 Year Old', 4: '10+ Year Old'}, inplace= True)


In [37]:
export_df = df.drop(columns= 'Price')
export_df['Price'] = df['Price']

In [38]:
export_df.to_parquet(r'C:\Users\aryan\OneDrive\Desktop\Capstone Project\Data Preprocessing New\gurgaon_properties_post_feature_selection_wo_onehot_features.parquet')

In [39]:
df.sample(1)

Unnamed: 0,Sector,Price,Built Up Area,Bedroom,Bathroom,Balcony,Servant Room,Store Room,Study Room,Floor Num,Total Floor,Property Age,Furnishing,Power Backup,Covered_Parking,Open_Parking,Total Parking,Rating,Nearby,Overlooking
46,Sector 85,2.45,2165,4,4,3,0,0,0,8,12,5 to 10 Year Old,Semi Furnished,Full,2,1,3,3.92,Shopping,Club
