In [32]:
import pandas as pd

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

print('The shape of training dataset : ' + str(train_data.shape))
print('The shape of testing dataset : ' + str(test_data.shape))

The shape of training dataset : (31599, 15)
The shape of testing dataset : (7900, 14)


In [33]:
good_features = train_data.dropna(axis=1).columns.tolist()
missing_features = [col for col in train_data.columns if col not in good_features]

print('Features with missing values :', missing_features)
print('Features without missing values :', good_features)

Features with missing values : ['Number_of_Windows', 'Furnishing', 'Frequency_of_Powercuts', 'Crime_Rate', 'Dust_and_Noise']
Features without missing values : ['Id', 'Property_Type', 'Property_Area', 'Number_of_Doors', 'Power_Backup', 'Water_Supply', 'Traffic_Density_Score', 'Air_Quality_Index', 'Neighborhood_Review', 'Habitability_score']


In [34]:
good_features.extend([missing_features[0]])
print(good_features)

['Id', 'Property_Type', 'Property_Area', 'Number_of_Doors', 'Power_Backup', 'Water_Supply', 'Traffic_Density_Score', 'Air_Quality_Index', 'Neighborhood_Review', 'Habitability_score', 'Number_of_Windows']


In [36]:
temp_data = train_data[good_features]
temp_data.head()

Unnamed: 0,Id,Property_Type,Property_Area,Number_of_Doors,Power_Backup,Water_Supply,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Habitability_score,Number_of_Windows
0,0x21e3,Apartment,106,1,No,Once in a day - Morning,5.89,90.0,3.86,71.98,
1,0x68d4,Apartment,733,2,No,Once in a day - Evening,4.37,96.0,3.55,71.2,2.0
2,0x7d81,Apartment,737,2,No,Once in a day - Morning,7.45,121.0,3.81,71.39,4.0
3,0x7a57,Apartment,900,2,Yes,Once in a day - Morning,6.16,100.0,1.34,31.46,3.0
4,0x9409,Bungalow,2238,6,No,All time,5.46,116.0,4.77,93.7,14.0


In [37]:
from sklearn.preprocessing import OrdinalEncoder

temp_data = temp_data.drop(columns=['Id'])
categorical_features = ['Property_Type', 'Power_Backup', 'Water_Supply'] 

ordinal_encoder = OrdinalEncoder()
temp_data[categorical_features] = ordinal_encoder.fit_transform(temp_data[categorical_features])

temp_data.head()

Unnamed: 0,Property_Type,Property_Area,Number_of_Doors,Power_Backup,Water_Supply,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Habitability_score,Number_of_Windows
0,1.0,106,1,1.0,3.0,5.89,90.0,3.86,71.98,
1,1.0,733,2,1.0,2.0,4.37,96.0,3.55,71.2,2.0
2,1.0,737,2,1.0,3.0,7.45,121.0,3.81,71.39,4.0
3,1.0,900,2,2.0,3.0,6.16,100.0,1.34,31.46,3.0
4,2.0,2238,6,1.0,0.0,5.46,116.0,4.77,93.7,14.0


In [38]:
missing_set = temp_data[temp_data.isnull().any(axis=1)]
print(missing_set.shape)

temp_data.dropna(axis=0, inplace=True)
print(temp_data.shape)

(1333, 10)
(30266, 10)


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import numpy as np

X_train = temp_data.drop(columns=['Number_of_Windows'])
y_train = temp_data['Number_of_Windows']

model = LinearRegression()
model.fit(X_train, y_train)

X_test = missing_set.drop(columns=['Number_of_Windows'])
predicted_values = model.predict(X_test)

print(predicted_values)

[2.37332398 2.8866663  4.33774741 ... 3.49503777 4.83939758 4.46964845]


In [40]:
missing_set['Number_of_Windows'] = predicted_values
combined_data = pd.concat([temp_data, missing_set])
combined_data.sort_index(inplace=True)

print(combined_data)

       Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0                1.0            106                1           1.0   
1                1.0            733                2           1.0   
2                1.0            737                2           1.0   
3                1.0            900                2           2.0   
4                2.0           2238                6           1.0   
...              ...            ...              ...           ...   
31594            1.0            851                1           1.0   
31595            3.0            315                1           2.0   
31596            4.0            480                3           1.0   
31597            1.0            642                2           1.0   
31598            5.0           1738                4           1.0   

       Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0               3.0                   5.89               90.0   
1               2.0          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Number_of_Windows'] = predicted_values


In [41]:
temp_data = combined_data
temp_data['Furnishing'] = train_data['Furnishing']

categorical_features = ['Furnishing'] 

ordinal_encoder = OrdinalEncoder()
temp_data[categorical_features] = ordinal_encoder.fit_transform(temp_data[categorical_features])

missing_set = temp_data[temp_data.isnull().any(axis=1)]
temp_data.dropna(axis=0, inplace=True)

X_train = temp_data.drop(columns=['Furnishing'])
y_train = temp_data['Furnishing']

model = LinearRegression()
model.fit(X_train, y_train)
X_test = missing_set.drop(columns=['Furnishing'])
predicted_values = model.predict(X_test)

missing_set['Furnishing'] = predicted_values
combined_data = pd.concat([temp_data, missing_set])
combined_data.sort_index(inplace=True)

print(combined_data)

       Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0                1.0            106                1           1.0   
1                1.0            733                2           1.0   
2                1.0            737                2           1.0   
3                1.0            900                2           2.0   
4                2.0           2238                6           1.0   
...              ...            ...              ...           ...   
31594            1.0            851                1           1.0   
31595            3.0            315                1           2.0   
31596            4.0            480                3           1.0   
31597            1.0            642                2           1.0   
31598            5.0           1738                4           1.0   

       Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0               3.0                   5.89               90.0   
1               2.0          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Furnishing'] = predicted_values


In [42]:
temp_data = combined_data
temp_data['Frequency_of_Powercuts'] = train_data['Frequency_of_Powercuts']

missing_set = temp_data[temp_data.isnull().any(axis=1)]
temp_data.dropna(axis=0, inplace=True)

X_train = temp_data.drop(columns=['Frequency_of_Powercuts'])
y_train = temp_data['Frequency_of_Powercuts']

model = LinearRegression()
model.fit(X_train, y_train)
X_test = missing_set.drop(columns=['Frequency_of_Powercuts'])
predicted_values = model.predict(X_test)

missing_set['Frequency_of_Powercuts'] = predicted_values
combined_data = pd.concat([temp_data, missing_set])
combined_data.sort_index(inplace=True)

print(combined_data)

       Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0                1.0            106                1           1.0   
1                1.0            733                2           1.0   
2                1.0            737                2           1.0   
3                1.0            900                2           2.0   
4                2.0           2238                6           1.0   
...              ...            ...              ...           ...   
31594            1.0            851                1           1.0   
31595            3.0            315                1           2.0   
31596            4.0            480                3           1.0   
31597            1.0            642                2           1.0   
31598            5.0           1738                4           1.0   

       Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0               3.0                   5.89               90.0   
1               2.0          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Frequency_of_Powercuts'] = predicted_values


In [43]:
temp_data = combined_data
temp_data['Crime_Rate'] = train_data['Crime_Rate']

categorical_features = ['Crime_Rate'] 

ordinal_encoder = OrdinalEncoder()
temp_data[categorical_features] = ordinal_encoder.fit_transform(temp_data[categorical_features])

missing_set = temp_data[temp_data.isnull().any(axis=1)]
temp_data.dropna(axis=0, inplace=True)

X_train = temp_data.drop(columns=['Crime_Rate'])
y_train = temp_data['Crime_Rate']

model = LinearRegression()
model.fit(X_train, y_train)
X_test = missing_set.drop(columns=['Crime_Rate'])
predicted_values = model.predict(X_test)

missing_set['Crime_Rate'] = predicted_values
combined_data = pd.concat([temp_data, missing_set])
combined_data.sort_index(inplace=True)

print(combined_data)

       Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0                1.0            106                1           1.0   
1                1.0            733                2           1.0   
2                1.0            737                2           1.0   
3                1.0            900                2           2.0   
4                2.0           2238                6           1.0   
...              ...            ...              ...           ...   
31594            1.0            851                1           1.0   
31595            3.0            315                1           2.0   
31596            4.0            480                3           1.0   
31597            1.0            642                2           1.0   
31598            5.0           1738                4           1.0   

       Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0               3.0                   5.89               90.0   
1               2.0          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Crime_Rate'] = predicted_values


In [44]:
temp_data = combined_data
temp_data['Dust_and_Noise'] = train_data['Dust_and_Noise']

categorical_features = ['Dust_and_Noise'] 

ordinal_encoder = OrdinalEncoder()
temp_data[categorical_features] = ordinal_encoder.fit_transform(temp_data[categorical_features])

missing_set = temp_data[temp_data.isnull().any(axis=1)]
temp_data.dropna(axis=0, inplace=True)

X_train = temp_data.drop(columns=['Dust_and_Noise'])
y_train = temp_data['Dust_and_Noise']

model = LinearRegression()
model.fit(X_train, y_train)
X_test = missing_set.drop(columns=['Dust_and_Noise'])
predicted_values = model.predict(X_test)

missing_set['Dust_and_Noise'] = predicted_values
combined_data = pd.concat([temp_data, missing_set])
combined_data.sort_index(inplace=True)

print(combined_data)

       Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0                1.0            106                1           1.0   
1                1.0            733                2           1.0   
2                1.0            737                2           1.0   
3                1.0            900                2           2.0   
4                2.0           2238                6           1.0   
...              ...            ...              ...           ...   
31594            1.0            851                1           1.0   
31595            3.0            315                1           2.0   
31596            4.0            480                3           1.0   
31597            1.0            642                2           1.0   
31598            5.0           1738                4           1.0   

       Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0               3.0                   5.89               90.0   
1               2.0          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Dust_and_Noise'] = predicted_values


In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import root_mean_squared_error
from scipy.stats import randint

X = combined_data.drop(columns=['Habitability_score'])
y = combined_data['Habitability_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

model = RandomForestRegressor()
param_grid = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5)
}

# Step 3: Use RandomizedSearchCV with cross-validation for hyperparameter tuning
kf = KFold(n_splits=3, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=300, scoring='neg_mean_squared_error', cv=kf, verbose=4, random_state=42)
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_

# Step 4: Train the final model on the combined training and validation sets using the best hyperparameters
final_model = RandomForestRegressor(**best_params)
final_model.fit(X_valid, y_valid)

# Step 5: Evaluate the final model on the unseen test set
y_pred_test = final_model.predict(X_test)
mse_test = root_mean_squared_error(y_test, y_pred_test)
print("Root Mean Squared Error (Test Set):", mse_test)

Fitting 3 folds for each of 300 candidates, totalling 900 fits
[CV 1/3] END max_depth=7, min_samples_leaf=4, min_samples_split=6, n_estimators=114;, score=-42.656 total time=   2.1s
[CV 2/3] END max_depth=7, min_samples_leaf=4, min_samples_split=6, n_estimators=114;, score=-42.862 total time=   2.0s
[CV 3/3] END max_depth=7, min_samples_leaf=4, min_samples_split=6, n_estimators=114;, score=-42.257 total time=   2.1s
[CV 1/3] END max_depth=11, min_samples_leaf=4, min_samples_split=6, n_estimators=120;, score=-37.186 total time=   3.8s
[CV 2/3] END max_depth=11, min_samples_leaf=4, min_samples_split=6, n_estimators=120;, score=-37.839 total time=   4.6s
[CV 3/3] END max_depth=11, min_samples_leaf=4, min_samples_split=6, n_estimators=120;, score=-37.162 total time=   5.2s
[CV 1/3] END max_depth=7, min_samples_leaf=2, min_samples_split=4, n_estimators=174;, score=-42.365 total time=   4.7s
[CV 2/3] END max_depth=7, min_samples_leaf=2, min_samples_split=4, n_estimators=174;, score=-42.710 t

In [22]:
print("Best hyperparameters for Random Forest:")
print(random_search.best_params_)

Best hyperparameters for Random Forest:
{'max_depth': 13, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 225}


In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

X = combined_data.drop(columns=['Habitability_score'])
y = combined_data['Habitability_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_regressor = RandomForestRegressor(max_depth=13, min_samples_leaf=1, min_samples_split=3, n_estimators=225)
rf_regressor.fit(X_train, y_train)
y_pred = rf_regressor.predict(X_test)

mse = root_mean_squared_error(y_test, y_pred)
print("Root Mean Squared Error (Random Forest):", mse)

Root Mean Squared Error (Random Forest): 5.503267803764468


Root Mean Squared Error (Random Forest): 5.504527736257842

In [54]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression
from scipy.stats import randint, uniform

X = combined_data.drop(columns=['Habitability_score'])
y = combined_data['Habitability_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_param_grid = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5)
}

xgb_param_grid = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(1, 20),
    'learning_rate': uniform(0.01, 0.3)
}

et_param_grid = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5)
}

kf = KFold(n_splits=3, shuffle=True, random_state=42)
# rf_grid = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=rf_param_grid, n_iter=300, scoring='neg_mean_squared_error', cv=kf, verbose=4, random_state=42)
xgb_grid = RandomizedSearchCV(estimator=XGBRegressor(), param_distributions=xgb_param_grid, n_iter=300, scoring='neg_mean_squared_error', cv=kf, verbose=4, random_state=42)
et_grid = RandomizedSearchCV(estimator=ExtraTreesRegressor(), param_distributions=et_param_grid, n_iter=300, scoring='neg_mean_squared_error', cv=kf, verbose=4, random_state=42)

# rf_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
et_grid.fit(X_train, y_train)

best_rf_model = RandomForestRegressor(max_depth=13, min_samples_leaf=1, min_samples_split=3, n_estimators=225)
best_rf_model.fit(X_train, y_train)

# best_rf_model = rf_grid.best_estimator_
best_xgb_model = xgb_grid.best_estimator_
best_et_model = et_grid.best_estimator_

rf_preds = best_rf_model.predict(X_test)
xgb_preds = best_xgb_model.predict(X_test)
et_preds = best_et_model.predict(X_test)

stacked_X_train = np.column_stack((rf_preds, xgb_preds, et_preds))

meta_param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False]
}

meta_grid = GridSearchCV(estimator=LinearRegression(), param_grid=meta_param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose= 2)
meta_grid.fit(stacked_X_train, y_test)

best_meta_learner = meta_grid.best_estimator_

stacked_preds = best_meta_learner.predict(stacked_X_train)

mse_stacked_tuned = root_mean_squared_error(y_test, stacked_preds)
print("Root Mean Squared Error (Tuned Stacked Model):", mse_stacked_tuned)


Fitting 3 folds for each of 300 candidates, totalling 900 fits
[CV 1/3] END learning_rate=0.12236203565420874, max_depth=15, n_estimators=206;, score=-38.874 total time=   4.4s
[CV 2/3] END learning_rate=0.12236203565420874, max_depth=15, n_estimators=206;, score=-38.944 total time=   4.4s
[CV 3/3] END learning_rate=0.12236203565420874, max_depth=15, n_estimators=206;, score=-39.190 total time=   4.0s
[CV 1/3] END learning_rate=0.24390730008183079, max_depth=7, n_estimators=221;, score=-39.277 total time=   0.4s
[CV 2/3] END learning_rate=0.24390730008183079, max_depth=7, n_estimators=221;, score=-38.832 total time=   0.4s
[CV 3/3] END learning_rate=0.24390730008183079, max_depth=7, n_estimators=221;, score=-38.557 total time=   0.4s
[CV 1/3] END learning_rate=0.05679835610086079, max_depth=11, n_estimators=187;, score=-35.881 total time=   1.3s
[CV 2/3] END learning_rate=0.05679835610086079, max_depth=11, n_estimators=187;, score=-35.584 total time=   1.4s
[CV 3/3] END learning_rate=0

In [55]:
print("\nBest hyperparameters for XGBoost:")
print(xgb_grid.best_params_)

print("\nBest hyperparameters for Extra Trees:")
print(et_grid.best_params_)


Best hyperparameters for XGBoost:
{'learning_rate': 0.02604558902421067, 'max_depth': 9, 'n_estimators': 246}

Best hyperparameters for Extra Trees:
{'max_depth': 17, 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 260}


In [56]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression

X = combined_data.drop(columns=['Habitability_score']) 
y = combined_data['Habitability_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(max_depth=13, min_samples_leaf=1, min_samples_split=3, n_estimators=225)
xgb_model = XGBRegressor(max_depth=9, learning_rate=0.02604558902421067, n_estimators=246)
et_model = ExtraTreesRegressor(max_depth=17, min_samples_leaf=1, min_samples_split=9, n_estimators=260)

rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
et_model.fit(X_train, y_train)

rf_preds = rf_model.predict(X_test)
xgb_preds = xgb_model.predict(X_test)
et_preds = et_model.predict(X_test)

stacked_X_train = np.column_stack((rf_preds, xgb_preds, et_preds))

meta_learner = LinearRegression()
meta_learner.fit(stacked_X_train, y_test)

stacked_preds = meta_learner.predict(stacked_X_train)

mse_stacked = root_mean_squared_error(y_test, stacked_preds)
print("Root Mean Squared Error (Stacked Model):", mse_stacked)

Root Mean Squared Error (Stacked Model): 5.444075324348654


# Test Dataset

In [107]:
import pandas as pd

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

print('The shape of training dataset : ' + str(train_data.shape))
print('The shape of testing dataset : ' + str(test_data.shape))

The shape of training dataset : (31599, 15)
The shape of testing dataset : (7900, 14)


In [108]:
good_features = test_data.dropna(axis=1).columns.tolist()
missing_features = [col for col in test_data.columns if col not in good_features]

print('Features with missing values :', missing_features)
print('Features without missing values :', good_features)

Features with missing values : ['Number_of_Windows', 'Furnishing', 'Frequency_of_Powercuts', 'Crime_Rate', 'Dust_and_Noise']
Features without missing values : ['Id', 'Property_Type', 'Property_Area', 'Number_of_Doors', 'Power_Backup', 'Water_Supply', 'Traffic_Density_Score', 'Air_Quality_Index', 'Neighborhood_Review']


In [109]:
good_features.extend([missing_features[0]])
print(good_features)

['Id', 'Property_Type', 'Property_Area', 'Number_of_Doors', 'Power_Backup', 'Water_Supply', 'Traffic_Density_Score', 'Air_Quality_Index', 'Neighborhood_Review', 'Number_of_Windows']


In [110]:
temp_data = test_data[good_features]
temp_data.head()

Unnamed: 0,Id,Property_Type,Property_Area,Number_of_Doors,Power_Backup,Water_Supply,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Number_of_Windows
0,0x6808,Single-family home,1521,1,No,All time,7.61,156.0,4.71,5.0
1,0x6a98,Single-family home,2233,1,No,Once in a day - Morning,5.28,82.0,4.5,2.0
2,0xacc0,Apartment,986,1,No,All time,7.72,91.0,4.79,1.0
3,0x8225,Single-family home,1625,1,No,Once in a day - Evening,6.19,159.0,4.09,2.0
4,0xaee8,Single-family home,1166,2,No,Once in a day - Evening,5.84,82.0,3.56,2.0


In [111]:
from sklearn.preprocessing import OrdinalEncoder

temp_data = temp_data.drop(columns=['Id'])
categorical_features = ['Property_Type', 'Power_Backup', 'Water_Supply'] 

ordinal_encoder = OrdinalEncoder()
temp_data[categorical_features] = ordinal_encoder.fit_transform(temp_data[categorical_features])

temp_data.head()

Unnamed: 0,Property_Type,Property_Area,Number_of_Doors,Power_Backup,Water_Supply,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Number_of_Windows
0,5.0,1521,1,1.0,0.0,7.61,156.0,4.71,5.0
1,5.0,2233,1,1.0,3.0,5.28,82.0,4.5,2.0
2,1.0,986,1,1.0,0.0,7.72,91.0,4.79,1.0
3,5.0,1625,1,1.0,2.0,6.19,159.0,4.09,2.0
4,5.0,1166,2,1.0,2.0,5.84,82.0,3.56,2.0


In [112]:
missing_set = temp_data[temp_data.isnull().any(axis=1)]
print(missing_set.shape)

temp_data.dropna(axis=0, inplace=True)
print(temp_data.shape)

(321, 9)
(7579, 9)


In [113]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import numpy as np

X_train = temp_data.drop(columns=['Number_of_Windows'])
y_train = temp_data['Number_of_Windows']

model = LinearRegression()
model.fit(X_train, y_train)

X_test = missing_set.drop(columns=['Number_of_Windows'])
predicted_values = model.predict(X_test)

print(predicted_values)

[3.88238936 4.30907509 3.31853319 3.32534974 3.15428861 3.47289286
 4.31390312 2.56390777 4.72645942 2.62089728 2.65468693 3.69858819
 3.32013255 4.15414715 4.11524689 2.54336615 6.72995612 4.07430669
 3.87494772 3.66322971 2.89544055 4.05661236 3.22581637 5.17299366
 3.32878263 4.51761147 3.23879915 4.11273257 3.62062128 4.38138647
 3.22720243 4.40383702 3.35545796 3.9472239  4.60974803 4.65636319
 3.92637396 4.14424541 5.51461568 3.10612938 4.1805483  2.68238399
 3.29920486 3.03155371 4.25495565 4.83275922 2.65373266 6.42306036
 4.09929131 5.34793514 3.1635618  7.38449878 3.32930009 3.29320043
 2.35853323 4.45289567 5.77507697 5.41667392 4.43231953 4.00375611
 3.38571138 3.23573981 2.58729581 5.50952428 5.68320285 3.43864451
 4.6596327  4.13417274 4.37570861 5.56602872 3.18271913 3.97713427
 4.92038022 2.61553981 4.62283386 3.87024809 2.7289888  3.99152497
 3.85595812 2.60393848 3.29324638 4.18767154 6.52153993 2.43933145
 2.48526865 4.39079912 5.32938331 3.82084646 9.8454314  4.7957

In [114]:
missing_set['Number_of_Windows'] = predicted_values
combined_test = pd.concat([temp_data, missing_set])
combined_test.sort_index(inplace=True)

print(combined_test)

      Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0               5.0           1521                1           1.0   
1               5.0           2233                1           1.0   
2               1.0            986                1           1.0   
3               5.0           1625                1           1.0   
4               5.0           1166                2           1.0   
...             ...            ...              ...           ...   
7895            5.0           1120                2           1.0   
7896            1.0            445                3           1.0   
7897            2.0           3780                6           2.0   
7898            5.0           1266                1           1.0   
7899            5.0           1229                4           1.0   

      Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0              0.0                   7.61              156.0   
1              3.0                   5.28  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Number_of_Windows'] = predicted_values


In [116]:
temp_data = combined_test
temp_data['Furnishing'] = train_data['Furnishing']

categorical_features = ['Furnishing'] 

ordinal_encoder = OrdinalEncoder()
temp_data[categorical_features] = ordinal_encoder.fit_transform(temp_data[categorical_features])

missing_set = temp_data[temp_data.isnull().any(axis=1)]
temp_data.dropna(axis=0, inplace=True)

X_train = temp_data.drop(columns=['Furnishing'])
y_train = temp_data['Furnishing']

model = LinearRegression()
model.fit(X_train, y_train)
X_test = missing_set.drop(columns=['Furnishing'])
predicted_values = model.predict(X_test)

missing_set['Furnishing'] = predicted_values
combined_test = pd.concat([temp_data, missing_set])
combined_test.sort_index(inplace=True)

print(combined_test)

      Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0               5.0           1521                1           1.0   
1               5.0           2233                1           1.0   
2               1.0            986                1           1.0   
3               5.0           1625                1           1.0   
4               5.0           1166                2           1.0   
...             ...            ...              ...           ...   
7895            5.0           1120                2           1.0   
7896            1.0            445                3           1.0   
7897            2.0           3780                6           2.0   
7898            5.0           1266                1           1.0   
7899            5.0           1229                4           1.0   

      Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0              0.0                   7.61              156.0   
1              3.0                   5.28  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Furnishing'] = predicted_values


In [117]:
temp_data = combined_test
temp_data['Frequency_of_Powercuts'] = test_data['Frequency_of_Powercuts']

missing_set = temp_data[temp_data.isnull().any(axis=1)]
temp_data.dropna(axis=0, inplace=True)

X_train = temp_data.drop(columns=['Frequency_of_Powercuts'])
y_train = temp_data['Frequency_of_Powercuts']

model = LinearRegression()
model.fit(X_train, y_train)
X_test = missing_set.drop(columns=['Frequency_of_Powercuts'])
predicted_values = model.predict(X_test)

missing_set['Frequency_of_Powercuts'] = predicted_values
combined_test = pd.concat([temp_data, missing_set])
combined_test.sort_index(inplace=True)

print(combined_test)

      Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0               5.0           1521                1           1.0   
1               5.0           2233                1           1.0   
2               1.0            986                1           1.0   
3               5.0           1625                1           1.0   
4               5.0           1166                2           1.0   
...             ...            ...              ...           ...   
7895            5.0           1120                2           1.0   
7896            1.0            445                3           1.0   
7897            2.0           3780                6           2.0   
7898            5.0           1266                1           1.0   
7899            5.0           1229                4           1.0   

      Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0              0.0                   7.61              156.0   
1              3.0                   5.28  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Frequency_of_Powercuts'] = predicted_values


In [118]:
temp_data = combined_test
temp_data['Crime_Rate'] = test_data['Crime_Rate']

categorical_features = ['Crime_Rate'] 

ordinal_encoder = OrdinalEncoder()
temp_data[categorical_features] = ordinal_encoder.fit_transform(temp_data[categorical_features])

missing_set = temp_data[temp_data.isnull().any(axis=1)]
temp_data.dropna(axis=0, inplace=True)

X_train = temp_data.drop(columns=['Crime_Rate'])
y_train = temp_data['Crime_Rate']

model = LinearRegression()
model.fit(X_train, y_train)
X_test = missing_set.drop(columns=['Crime_Rate'])
predicted_values = model.predict(X_test)

missing_set['Crime_Rate'] = predicted_values
combined_test = pd.concat([temp_data, missing_set])
combined_test.sort_index(inplace=True)

print(combined_test)

      Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0               5.0           1521                1           1.0   
1               5.0           2233                1           1.0   
2               1.0            986                1           1.0   
3               5.0           1625                1           1.0   
4               5.0           1166                2           1.0   
...             ...            ...              ...           ...   
7895            5.0           1120                2           1.0   
7896            1.0            445                3           1.0   
7897            2.0           3780                6           2.0   
7898            5.0           1266                1           1.0   
7899            5.0           1229                4           1.0   

      Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0              0.0                   7.61              156.0   
1              3.0                   5.28  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Crime_Rate'] = predicted_values


In [119]:
temp_data = combined_test
temp_data['Dust_and_Noise'] = test_data['Dust_and_Noise']

categorical_features = ['Dust_and_Noise'] 

ordinal_encoder = OrdinalEncoder()
temp_data[categorical_features] = ordinal_encoder.fit_transform(temp_data[categorical_features])

missing_set = temp_data[temp_data.isnull().any(axis=1)]
temp_data.dropna(axis=0, inplace=True)

X_train = temp_data.drop(columns=['Dust_and_Noise'])
y_train = temp_data['Dust_and_Noise']

model = LinearRegression()
model.fit(X_train, y_train)
X_test = missing_set.drop(columns=['Dust_and_Noise'])
predicted_values = model.predict(X_test)

missing_set['Dust_and_Noise'] = predicted_values
combined_test = pd.concat([temp_data, missing_set])
combined_test.sort_index(inplace=True)

print(combined_test)

      Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0               5.0           1521                1           1.0   
1               5.0           2233                1           1.0   
2               1.0            986                1           1.0   
3               5.0           1625                1           1.0   
4               5.0           1166                2           1.0   
...             ...            ...              ...           ...   
7895            5.0           1120                2           1.0   
7896            1.0            445                3           1.0   
7897            2.0           3780                6           2.0   
7898            5.0           1266                1           1.0   
7899            5.0           1229                4           1.0   

      Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0              0.0                   7.61              156.0   
1              3.0                   5.28  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Dust_and_Noise'] = predicted_values


In [120]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression

X = combined_data.drop(columns=['Habitability_score']) 
y = combined_data['Habitability_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(max_depth=13, min_samples_leaf=1, min_samples_split=3, n_estimators=225)
xgb_model = XGBRegressor(max_depth=9, learning_rate=0.02604558902421067, n_estimators=246)
et_model = ExtraTreesRegressor(max_depth=17, min_samples_leaf=1, min_samples_split=9, n_estimators=260)

rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
et_model.fit(X_train, y_train)

rf_preds = rf_model.predict(X_test)
xgb_preds = xgb_model.predict(X_test)
et_preds = et_model.predict(X_test)

stacked_X_train = np.column_stack((rf_preds, xgb_preds, et_preds))

meta_learner = LinearRegression()
meta_learner.fit(stacked_X_train, y_test)

stacked_preds = meta_learner.predict(stacked_X_train)

mse_stacked = root_mean_squared_error(y_test, stacked_preds)
print("Root Mean Squared Error (Stacked Model):", mse_stacked)

Root Mean Squared Error (Stacked Model): 5.445734366170904


In [121]:
combined_test.head()

Unnamed: 0,Property_Type,Property_Area,Number_of_Doors,Power_Backup,Water_Supply,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Number_of_Windows,Furnishing,Frequency_of_Powercuts,Crime_Rate,Dust_and_Noise
0,5.0,1521,1,1.0,0.0,7.61,156.0,4.71,5.0,1.0,0.0,3.0,2.0
1,5.0,2233,1,1.0,3.0,5.28,82.0,4.5,2.0,2.0,0.0,3.0,2.0
2,1.0,986,1,1.0,0.0,7.72,91.0,4.79,1.0,0.0,0.0,3.0,2.0
3,5.0,1625,1,1.0,2.0,6.19,159.0,4.09,2.0,2.0,0.0,3.0,2.0
4,5.0,1166,2,1.0,2.0,5.84,82.0,3.56,2.0,0.0,0.0,1.0,2.0


In [122]:
import pandas as pd

# Assuming test_data contains the test dataset without the 'Id' column
test_data = combined_test
test1 = pd.read_csv("test.csv")

# Make predictions on the test data using base models
rf_preds = rf_model.predict(test_data)
xgb_preds = xgb_model.predict(test_data)
et_preds = et_model.predict(test_data)

# Create stacked dataset
stacked_test_data = np.column_stack((rf_preds, xgb_preds, et_preds))

# Make predictions on the stacked test data using meta-learner
stacked_preds = meta_learner.predict(stacked_test_data)

# Add the predictions to the test_data DataFrame
test_data['Habitability_score'] = stacked_preds

test_data.head()

# Create a new DataFrame with 'Id' and 'Habitability_score' columns
predictions_df = pd.DataFrame({'Id': test1['Id'], 'Habitability_score': test_data['Habitability_score']})

# Write the predictions to a new CSV file
predictions_df.to_csv("predicted_test13.csv", index=False)

In [125]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

X = combined_data.drop(columns=['Habitability_score'])
y = combined_data['Habitability_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data into LightGBM Dataset format
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Set parameters for LightGBM
params = {
    'objective': 'regression',
    'metric': 'rmse',  # Root Mean Squared Error
    'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree
    'learning_rate': 0.1,
    'num_leaves': 31,  # Maximum number of leaves in one tree
    'max_depth': 10,  # No limit on the depth of trees
    'random_state': 42
}

# Train the model
num_round = 1000  # Number of boosting iterations
lgb_model = lgb.train(params, train_data, num_round, valid_sets=[test_data])

# Make predictions on the test set
y_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

# Evaluate the model
mse = root_mean_squared_error(y_test, y_pred)
print("Mean Squared Error (LightGBM):", mse)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001617 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2294
[LightGBM] [Info] Number of data points in the train set: 25279, number of used features: 13
[LightGBM] [Info] Start training from score 73.381298
Mean Squared Error (LightGBM): 5.698803335188904
