In [1]:
import pandas as pd

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

print('The shape of training dataset : ' + str(train_data.shape))
print('The shape of testing dataset : ' + str(test_data.shape))

The shape of training dataset : (31599, 15)
The shape of testing dataset : (7900, 14)


In [2]:
good_features = train_data.dropna(axis=1).columns.tolist()
missing_features = [col for col in train_data.columns if col not in good_features]

print('Features with missing values :', missing_features)
print('Features without missing values :', good_features)

Features with missing values : ['Number_of_Windows', 'Furnishing', 'Frequency_of_Powercuts', 'Crime_Rate', 'Dust_and_Noise']
Features without missing values : ['Id', 'Property_Type', 'Property_Area', 'Number_of_Doors', 'Power_Backup', 'Water_Supply', 'Traffic_Density_Score', 'Air_Quality_Index', 'Neighborhood_Review', 'Habitability_score']


In [3]:
good_features.extend([missing_features[0]])
print(good_features)

['Id', 'Property_Type', 'Property_Area', 'Number_of_Doors', 'Power_Backup', 'Water_Supply', 'Traffic_Density_Score', 'Air_Quality_Index', 'Neighborhood_Review', 'Habitability_score', 'Number_of_Windows']


In [4]:
temp_data = train_data[good_features]
temp_data.head()

Unnamed: 0,Id,Property_Type,Property_Area,Number_of_Doors,Power_Backup,Water_Supply,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Habitability_score,Number_of_Windows
0,0x21e3,Apartment,106,1,No,Once in a day - Morning,5.89,90.0,3.86,71.98,
1,0x68d4,Apartment,733,2,No,Once in a day - Evening,4.37,96.0,3.55,71.2,2.0
2,0x7d81,Apartment,737,2,No,Once in a day - Morning,7.45,121.0,3.81,71.39,4.0
3,0x7a57,Apartment,900,2,Yes,Once in a day - Morning,6.16,100.0,1.34,31.46,3.0
4,0x9409,Bungalow,2238,6,No,All time,5.46,116.0,4.77,93.7,14.0


In [5]:
from sklearn.preprocessing import OrdinalEncoder

temp_data = temp_data.drop(columns=['Id'])
categorical_features = ['Property_Type', 'Power_Backup', 'Water_Supply'] 

ordinal_encoder = OrdinalEncoder()
temp_data[categorical_features] = ordinal_encoder.fit_transform(temp_data[categorical_features])

temp_data.head()

Unnamed: 0,Property_Type,Property_Area,Number_of_Doors,Power_Backup,Water_Supply,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Habitability_score,Number_of_Windows
0,1.0,106,1,1.0,3.0,5.89,90.0,3.86,71.98,
1,1.0,733,2,1.0,2.0,4.37,96.0,3.55,71.2,2.0
2,1.0,737,2,1.0,3.0,7.45,121.0,3.81,71.39,4.0
3,1.0,900,2,2.0,3.0,6.16,100.0,1.34,31.46,3.0
4,2.0,2238,6,1.0,0.0,5.46,116.0,4.77,93.7,14.0


In [6]:
missing_set = temp_data[temp_data.isnull().any(axis=1)]
print(missing_set.shape)

temp_data.dropna(axis=0, inplace=True)
print(temp_data.shape)

(1333, 10)
(30266, 10)


In [41]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

# Separate the features and target variable
X = temp_data.drop(columns=['Number_of_Windows'])
y = temp_data['Number_of_Windows']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote_enn = SMOTEENN(random_state=42)
X_smote_enn, y_smote_enn = smote_enn.fit_resample(X_train, y_train)

# Define the parameter distributions to search
param_dist = {
    'n_estimators': [50, 100, 200],
    'learning_rate': uniform(0.01, 1.0),
    'max_depth': [3, 5, 7]
}

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=GradientBoostingClassifier(), param_distributions=param_dist, n_iter=10, cv=3, random_state=42, n_jobs=-1, verbose=2)

# Fit the random search to the data
random_search.fit(X_smote_enn, y_smote_enn)

# Get the best parameters found by RandomizedSearch
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Use the best model found by RandomizedSearch
best_model = random_search.best_estimator_

# Predict missing values using the best model
missing_values_predicted = best_model.predict(missing_set_X)

# Evaluate the best model
accuracy = accuracy_score(y_test, best_model.predict(X_test))
precision = precision_score(y_test, best_model.predict(X_test), average='weighted')
recall = recall_score(y_test, best_model.predict(X_test), average='weighted')
f1 = f1_score(y_test, best_model.predict(X_test), average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Obtain confusion matrix for the best model
conf_matrix = confusion_matrix(y_test, best_model.predict(X_test))
print("Confusion Matrix:")
print(conf_matrix)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'learning_rate': 0.16601864044243653, 'max_depth': 7, 'n_estimators': 200}
Accuracy: 0.23769408655434424
Precision: 0.22898341412985077
Recall: 0.23769408655434424
F1-score: 0.1729241877000511
Confusion Matrix:
[[ 96   9   1   1   0   0   1   0   0   1   0   0   0   0   0   0]
 [ 92 516  37  22  14   1   4   0   0   0   1   1   1   0   0   0]
 [101 473  71  59  31 320   2   6   0   3   2   0   0   0   0   0]
 [ 93 471  67  48  41 368   0   2   0   1   1   3   0   1   0   1]
 [  1 495  75  52  29 365 118 118   8   6   8   9   9   7   6   9]
 [  0   5  28  40  19 390 120 117   1   3  12   9   5  11  12   7]
 [  0   3   0   0   0   1 118  97   8   7   3   8   9   8   3   5]
 [  0   1   0   0   1   1 121 129   9   4   9   9   5  10   6   5]
 [  0   0   0   0   0   0   0   0   6   7   7   5   9  11   4   6]
 [  0   0   0   0   0   0   0   1   5   8  10  11   6   5   5   6]
 [  0   0   0   0   0   0   0   0   8   

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from imblearn.combine import SMOTEENN
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Separate the features and target variable
X = temp_data.drop(columns=['Number_of_Windows'])
y = temp_data['Number_of_Windows']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote_enn = SMOTEENN(random_state=42)
X_smote_enn, y_smote_enn = smote_enn.fit_resample(X_train, y_train)

clf = RandomForestClassifier()
# clf = SVC()
# clf = GradientBoostingClassifier(learning_rate= 0.16601864044243653, max_depth= 7, n_estimators= 200)
# clf = KNeighborsClassifier()
clf.fit(X_smote_enn, y_smote_enn)

# Predict missing values using the trained model
missing_set_X = missing_set.drop(columns=['Number_of_Windows'])
missing_values_predicted = clf.predict(missing_set_X)

# Evaluate the classifier model
accuracy = accuracy_score(y_test, clf.predict(X_test))
precision = precision_score(y_test, clf.predict(X_test), average='weighted')
recall = recall_score(y_test, clf.predict(X_test), average='weighted')
f1 = f1_score(y_test, clf.predict(X_test), average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Obtain confusion matrix
conf_matrix = confusion_matrix(y_test, clf.predict(X_test))
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.23951106706309877
Precision: 0.1499295995990905
Recall: 0.23951106706309877
F1-score: 0.13818555904483387
Confusion Matrix:
[[103   4   0   0   0   0   2   0   0   0   0   0   0   0   0   0]
 [ 97 577   2   2   0   0   4   1   1   1   2   0   0   1   0   1]
 [106 538   1   4   0 408   4   2   0   0   0   2   0   1   0   2]
 [102 522   2   4   1 457   0   0   1   0   4   2   2   0   0   0]
 [  2 557   6   6   0 440 119 119  11   5  14  11   4   9   3   9]
 [  0   3   4   2   0 465 117 129   9   3  11  13   5   8   5   5]
 [  0   0   0   0   0   0 124  94   7  14   4   6   9   5   3   4]
 [  0   1   0   0   0   0 121 130   4   4  11   8   5  13   6   7]
 [  0   0   0   0   0   0   1   1   4   5   8   4  10   8   9   5]
 [  0   0   0   0   0   0   0   0   5  10   6  12   9   4   5   6]
 [  0   0   0   0   0   0   1   0   9   3  13   7   5   4   1   3]
 [  0   0   0   0   0   0   1   0   6  13  10   3   4   2   7   3]
 [  0   0   0   0   0   0   0   0   4  13   9   8   4   4   

In [9]:
missing_set['Number_of_Windows'] = missing_values_predicted
combined_data = pd.concat([temp_data, missing_set])
combined_data.sort_index(inplace=True)

print(combined_data)

       Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0                1.0            106                1           1.0   
1                1.0            733                2           1.0   
2                1.0            737                2           1.0   
3                1.0            900                2           2.0   
4                2.0           2238                6           1.0   
...              ...            ...              ...           ...   
31594            1.0            851                1           1.0   
31595            3.0            315                1           2.0   
31596            4.0            480                3           1.0   
31597            1.0            642                2           1.0   
31598            5.0           1738                4           1.0   

       Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0               3.0                   5.89               90.0   
1               2.0          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Number_of_Windows'] = missing_values_predicted


In [10]:
temp_data = combined_data
temp_data['Furnishing'] = train_data['Furnishing']

categorical_features = ['Furnishing'] 

ordinal_encoder = OrdinalEncoder()
temp_data[categorical_features] = ordinal_encoder.fit_transform(temp_data[categorical_features])

missing_set = temp_data[temp_data.isnull().any(axis=1)]
temp_data.dropna(axis=0, inplace=True)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Separate the features and target variable
X = temp_data.drop(columns=['Furnishing'])
y = temp_data['Furnishing']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=5, min_samples_leaf=5)
# clf = SVC()
# clf = GradientBoostingClassifier()
# clf = KNeighborsClassifier()
clf.fit(X_train, y_train)

# Predict missing values using the trained model
missing_set_X = missing_set.drop(columns=['Furnishing'])
missing_values_predicted = clf.predict(missing_set_X)

# Evaluate the classifier model
accuracy = accuracy_score(y_test, clf.predict(X_test))
precision = precision_score(y_test, clf.predict(X_test), average='weighted')
recall = recall_score(y_test, clf.predict(X_test), average='weighted')
f1 = f1_score(y_test, clf.predict(X_test), average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Obtain confusion matrix
conf_matrix = confusion_matrix(y_test, clf.predict(X_test))
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.7251015434606012
Precision: 0.7386408036742939
Recall: 0.7251015434606012
F1-score: 0.7054308466518507
Confusion Matrix:
[[ 485  731   23]
 [ 100 3129  115]
 [ 124  599  849]]


In [12]:
missing_set['Furnishing'] = missing_values_predicted
combined_data = pd.concat([temp_data, missing_set])
combined_data.sort_index(inplace=True)

print(combined_data)

       Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0                1.0            106                1           1.0   
1                1.0            733                2           1.0   
2                1.0            737                2           1.0   
3                1.0            900                2           2.0   
4                2.0           2238                6           1.0   
...              ...            ...              ...           ...   
31594            1.0            851                1           1.0   
31595            3.0            315                1           2.0   
31596            4.0            480                3           1.0   
31597            1.0            642                2           1.0   
31598            5.0           1738                4           1.0   

       Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0               3.0                   5.89               90.0   
1               2.0          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Furnishing'] = missing_values_predicted


In [13]:
temp_data = combined_data
temp_data['Frequency_of_Powercuts'] = train_data['Frequency_of_Powercuts']

missing_set = temp_data[temp_data.isnull().any(axis=1)]
temp_data.dropna(axis=0, inplace=True)

In [14]:
print(combined_data.isnull().sum())

Property_Type             0
Property_Area             0
Number_of_Doors           0
Power_Backup              0
Water_Supply              0
Traffic_Density_Score     0
Air_Quality_Index         0
Neighborhood_Review       0
Habitability_score        0
Number_of_Windows         0
Furnishing                0
Frequency_of_Powercuts    0
dtype: int64


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Separate the features and target variable
X = temp_data.drop(columns=['Frequency_of_Powercuts'])
y = temp_data['Frequency_of_Powercuts']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier
clf = RandomForestClassifier()
# clf = SVC()
# clf = GradientBoostingClassifier()
# clf = KNeighborsClassifier()
clf.fit(X_train, y_train)

# Predict missing values using the trained model
missing_set_X = missing_set.drop(columns=['Frequency_of_Powercuts'])
missing_values_predicted = clf.predict(missing_set_X)

# Evaluate the classifier model
accuracy = accuracy_score(y_test, clf.predict(X_test))
precision = precision_score(y_test, clf.predict(X_test), average='weighted')
recall = recall_score(y_test, clf.predict(X_test), average='weighted')
f1 = f1_score(y_test, clf.predict(X_test), average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Obtain confusion matrix
conf_matrix = confusion_matrix(y_test, clf.predict(X_test))
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.7138406034765497
Precision: 0.6915230609297979
Recall: 0.7138406034765497
F1-score: 0.6964118568404333
Confusion Matrix:
[[3400  498   12    9]
 [ 695  845   39   15]
 [ 141  173   49   41]
 [  44   54   24   59]]


In [16]:
missing_set['Frequency_of_Powercuts'] = missing_values_predicted
combined_data = pd.concat([temp_data, missing_set])
combined_data.sort_index(inplace=True)

print(combined_data)

       Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0                1.0            106                1           1.0   
1                1.0            733                2           1.0   
2                1.0            737                2           1.0   
3                1.0            900                2           2.0   
4                2.0           2238                6           1.0   
...              ...            ...              ...           ...   
31594            1.0            851                1           1.0   
31595            3.0            315                1           2.0   
31596            4.0            480                3           1.0   
31597            1.0            642                2           1.0   
31598            5.0           1738                4           1.0   

       Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0               3.0                   5.89               90.0   
1               2.0          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Frequency_of_Powercuts'] = missing_values_predicted


In [17]:
temp_data = combined_data
temp_data['Crime_Rate'] = train_data['Crime_Rate']

categorical_features = ['Crime_Rate'] 

ordinal_encoder = OrdinalEncoder()
temp_data[categorical_features] = ordinal_encoder.fit_transform(temp_data[categorical_features])

missing_set = temp_data[temp_data.isnull().any(axis=1)]
temp_data.dropna(axis=0, inplace=True)

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Separate the features and target variable
X = temp_data.drop(columns=['Crime_Rate'])
y = temp_data['Crime_Rate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier
clf = RandomForestClassifier()
# clf = SVC()
# clf = GradientBoostingClassifier()
# clf = KNeighborsClassifier()
clf.fit(X_train, y_train)

# Predict missing values using the trained model
missing_set_X = missing_set.drop(columns=['Crime_Rate'])
missing_values_predicted = clf.predict(missing_set_X)

# Evaluate the classifier model
accuracy = accuracy_score(y_test, clf.predict(X_test))
precision = precision_score(y_test, clf.predict(X_test), average='weighted')
recall = recall_score(y_test, clf.predict(X_test), average='weighted')
f1 = f1_score(y_test, clf.predict(X_test), average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Obtain confusion matrix
conf_matrix = confusion_matrix(y_test, clf.predict(X_test))
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.9707781724249274
Precision: 0.9709489612458123
Recall: 0.9707781724249274
F1-score: 0.9708235243359467
Confusion Matrix:
[[ 785   22   23    2]
 [  25 1843    4   38]
 [  11    1  366    0]
 [   4   47    4 3019]]


In [19]:
missing_set['Crime_Rate'] = missing_values_predicted
combined_data = pd.concat([temp_data, missing_set])
combined_data.sort_index(inplace=True)

print(combined_data)

       Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0                1.0            106                1           1.0   
1                1.0            733                2           1.0   
2                1.0            737                2           1.0   
3                1.0            900                2           2.0   
4                2.0           2238                6           1.0   
...              ...            ...              ...           ...   
31594            1.0            851                1           1.0   
31595            3.0            315                1           2.0   
31596            4.0            480                3           1.0   
31597            1.0            642                2           1.0   
31598            5.0           1738                4           1.0   

       Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0               3.0                   5.89               90.0   
1               2.0          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Crime_Rate'] = missing_values_predicted


In [20]:
temp_data = combined_data
temp_data['Dust_and_Noise'] = train_data['Dust_and_Noise']

categorical_features = ['Dust_and_Noise'] 

ordinal_encoder = OrdinalEncoder()
temp_data[categorical_features] = ordinal_encoder.fit_transform(temp_data[categorical_features])

missing_set = temp_data[temp_data.isnull().any(axis=1)]
temp_data.dropna(axis=0, inplace=True)

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Separate the features and target variable
X = temp_data.drop(columns=['Dust_and_Noise'])
y = temp_data['Dust_and_Noise']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier
clf = RandomForestClassifier()
# clf = SVC()
# clf = GradientBoostingClassifier()
# clf = KNeighborsClassifier()
clf.fit(X_train, y_train)

# Predict missing values using the trained model
missing_set_X = missing_set.drop(columns=['Dust_and_Noise'])
missing_values_predicted = clf.predict(missing_set_X)

# Evaluate the classifier model
accuracy = accuracy_score(y_test, clf.predict(X_test))
precision = precision_score(y_test, clf.predict(X_test), average='weighted')
recall = recall_score(y_test, clf.predict(X_test), average='weighted')
f1 = f1_score(y_test, clf.predict(X_test), average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Obtain confusion matrix
conf_matrix = confusion_matrix(y_test, clf.predict(X_test))
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.9709150326797386
Precision: 0.9709286065732323
Recall: 0.9709150326797386
F1-score: 0.9695851739783165
Confusion Matrix:
[[ 417    4   83]
 [   5  254   75]
 [   8    3 5271]]


In [22]:
missing_set['Dust_and_Noise'] = missing_values_predicted
combined_data = pd.concat([temp_data, missing_set])
combined_data.sort_index(inplace=True)

print(combined_data)

       Property_Type  Property_Area  Number_of_Doors  Power_Backup  \
0                1.0            106                1           1.0   
1                1.0            733                2           1.0   
2                1.0            737                2           1.0   
3                1.0            900                2           2.0   
4                2.0           2238                6           1.0   
...              ...            ...              ...           ...   
31594            1.0            851                1           1.0   
31595            3.0            315                1           2.0   
31596            4.0            480                3           1.0   
31597            1.0            642                2           1.0   
31598            5.0           1738                4           1.0   

       Water_Supply  Traffic_Density_Score  Air_Quality_Index  \
0               3.0                   5.89               90.0   
1               2.0          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_set['Dust_and_Noise'] = missing_values_predicted


In [23]:
print(combined_data.isnull().sum())

Property_Type             0
Property_Area             0
Number_of_Doors           0
Power_Backup              0
Water_Supply              0
Traffic_Density_Score     0
Air_Quality_Index         0
Neighborhood_Review       0
Habitability_score        0
Number_of_Windows         0
Furnishing                0
Frequency_of_Powercuts    0
Crime_Rate                0
Dust_and_Noise            0
dtype: int64


In [104]:
combined_data.to_csv('combined_test1.csv', index=False) 

In [105]:
combined_data = pd.read_csv("combined_data1.csv")

In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

X = combined_data.drop(columns=['Habitability_score'])
y = combined_data['Habitability_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train, y_train)
y_pred = rf_regressor.predict(X_test)

mse = root_mean_squared_error(y_test, y_pred)
print("Root Mean Squared Error (Random Forest):", mse)

Root Mean Squared Error (Random Forest): 5.603791970896622


In [31]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_trains = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_trains, columns=X_train.columns)
X_tests = scaler.fit_transform(X_test)
X_test = pd.DataFrame(X_tests, columns=X_test.columns)

# Initialize the MLPRegressor with early stopping
mlp_regressor = MLPRegressor(early_stopping=True, random_state=42)

# Fit the MLPRegressor model
mlp_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = mlp_regressor.predict(X_test)

# Calculate the root mean squared error
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Root Mean Squared Error (MLPRegressor with Early Stopping):", rmse)


Root Mean Squared Error (MLPRegressor with Early Stopping): 6.005103097997346




In [109]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression

X = combined_data.drop(columns=['Habitability_score']) 
y = combined_data['Habitability_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(max_depth= 20, max_features= 'sqrt', min_samples_leaf= 1, min_samples_split= 2, n_estimators=300)
xgb_model = XGBRegressor(colsample_bytree= 1.0, learning_rate= 0.1, max_depth= 7, n_estimators= 100, subsample= 1.0)
et_model = ExtraTreesRegressor(max_depth= None, max_features= 'sqrt', min_samples_leaf= 1, min_samples_split= 5, n_estimators= 300)

rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
et_model.fit(X_train, y_train)

rf_preds = rf_model.predict(X_test)
xgb_preds = xgb_model.predict(X_test)
et_preds = et_model.predict(X_test)

stacked_X_train = np.column_stack((rf_preds, xgb_preds, et_preds))

meta_learner = LinearRegression()
meta_learner.fit(stacked_X_train, y_test)

stacked_preds = meta_learner.predict(stacked_X_train)

mse_stacked = root_mean_squared_error(y_test, stacked_preds)
print("Root Mean Squared Error (Stacked Model):", mse_stacked)

Root Mean Squared Error (Stacked Model): 5.557848961092746


In [111]:
combined_test = pd.read_csv("combined_test1.csv")

In [112]:
import pandas as pd

# Assuming test_data contains the test dataset without the 'Id' column
test_data = combined_test
test1 = pd.read_csv("test.csv")

# Make predictions on the test data using base models
rf_preds = rf_model.predict(test_data)
xgb_preds = xgb_model.predict(test_data)
et_preds = et_model.predict(test_data)

# Create stacked dataset
stacked_test_data = np.column_stack((rf_preds, xgb_preds, et_preds))

# Make predictions on the stacked test data using meta-learner
stacked_preds = meta_learner.predict(stacked_test_data)

# Add the predictions to the test_data DataFrame
test_data['Habitability_score'] = stacked_preds

test_data.head()

# Create a new DataFrame with 'Id' and 'Habitability_score' columns
predictions_df = pd.DataFrame({'Id': test1['Id'], 'Habitability_score': test_data['Habitability_score']})

# Write the predictions to a new CSV file
predictions_df.to_csv("predicted_test11.csv", index=False)

In [114]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Define parameter distributions for random search
rf_param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

xgb_param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': [3, 5, 7],
    'learning_rate': uniform(0.01, 0.3)
}

et_param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

# Create RandomizedSearchCV instances
rf_random = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=rf_param_dist, scoring='neg_mean_squared_error', cv=3, n_iter=100, random_state=42, verbose=4)
xgb_random = RandomizedSearchCV(estimator=XGBRegressor(), param_distributions=xgb_param_dist, scoring='neg_mean_squared_error', cv=3, n_iter=100, random_state=42, verbose=4)
et_random = RandomizedSearchCV(estimator=ExtraTreesRegressor(), param_distributions=et_param_dist, scoring='neg_mean_squared_error', cv=3, n_iter=100, random_state=42, verbose=4)

# Fit RandomizedSearchCV instances
rf_random.fit(X_train, y_train)
xgb_random.fit(X_train, y_train)
et_random.fit(X_train, y_train)

# Get best models from RandomizedSearchCV
best_rf_model = rf_random.best_estimator_
best_xgb_model = xgb_random.best_estimator_
best_et_model = et_random.best_estimator_

# Predictions using best models
rf_preds = best_rf_model.predict(X_test)
xgb_preds = best_xgb_model.predict(X_test)
et_preds = best_et_model.predict(X_test)

# Create stacked features
stacked_X_train = np.column_stack((rf_preds, xgb_preds, et_preds))

# Define parameter distributions for meta learner
meta_param_dist = {
    'fit_intercept': [True, False],
    'positive': [True, False]
}

# Create RandomizedSearchCV for meta learner
meta_random = RandomizedSearchCV(estimator=LinearRegression(), param_distributions=meta_param_dist, scoring='neg_mean_squared_error', cv=3, n_iter=100, random_state=42, verbose=4)
meta_random.fit(stacked_X_train, y_test)

# Get best meta learner
best_meta_learner = meta_random.best_estimator_

# Predict using stacked ensemble
stacked_preds = best_meta_learner.predict(stacked_X_train)

# Calculate MSE
mse_stacked_tuned = mean_squared_error(y_test, stacked_preds)
print("Mean Squared Error (Tuned Stacked Model):", mse_stacked_tuned)


Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV 1/3] END max_depth=20, min_samples_leaf=4, min_samples_split=9, n_estimators=288;, score=-34.064 total time=  24.8s
[CV 2/3] END max_depth=20, min_samples_leaf=4, min_samples_split=9, n_estimators=288;, score=-34.884 total time=  24.2s
[CV 3/3] END max_depth=20, min_samples_leaf=4, min_samples_split=9, n_estimators=288;, score=-35.200 total time=  25.1s
[CV 1/3] END max_depth=None, min_samples_leaf=3, min_samples_split=4, n_estimators=174;, score=-34.019 total time=  17.3s
[CV 2/3] END max_depth=None, min_samples_leaf=3, min_samples_split=4, n_estimators=174;, score=-34.936 total time=  16.6s
[CV 3/3] END max_depth=None, min_samples_leaf=3, min_samples_split=4, n_estimators=174;, score=-35.119 total time=  16.7s
[CV 1/3] END max_depth=20, min_samples_leaf=4, min_samples_split=6, n_estimators=199;, score=-34.044 total time=  17.4s
[CV 2/3] END max_depth=20, min_samples_leaf=4, min_samples_split=6, n_estimators=199;, scor

