In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, r2_score, root_mean_squared_error

In [None]:
# Load the raw dataset.
# TODO(tbc): the CSV path is still blank; read_csv cannot run until it is filled in.
df = pd.read_csv('') #tbc

# DataFrame.replace returns a new frame, so the result must be assigned;
# the bare call used previously left df unchanged.
# NOTE(review): mapping NaN -> None turns affected columns into object dtype;
# confirm that is what the modelling cells downstream expect.
df = df.replace({np.nan: None})

# Placeholders for the rows to score and their true targets. Both are empty
# here and must be populated before the prediction / metric cells are run.
pred_set = []
pred_target = []

# Preview goes last: only a cell's final expression is rendered.
df.head()

In [None]:
# Separate the predictor columns from the target column.
target_col = 'resale_price'  # TODO(tbc): confirm the target column name
feature_mask = df.columns != target_col

X = df.loc[:, feature_mask]   # every column except the target
y = df.loc[:, target_col]     # the target as a Series

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# The target column is resale_price and the rest of the notebook scores with
# R2 / RMSE, so this is a regression problem: use a regressor rather than a
# classifier (a classifier would treat each distinct price as its own class).
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# For a regressor, score() reports R^2 on the held-out split.
rf.score(X_test, y_test)

In [None]:
# 10-fold cross-validation of rf over the full X / y, scored with R2.
kf = KFold(n_splits=10)
scores = cross_val_score(estimator=rf, X=X, y=y, scoring='r2', cv=kf)

# Report the per-fold scores followed by their mean.
mean_r2 = scores.mean()
print("Cross Validation Scores: ", scores)
print("R2 Score: ", mean_r2)

In [None]:
# Compare 10-fold cross-validated R2 across several forest sizes.
# One seeded loop replaces four copy-pasted runs, and the means are shown
# together (previously only the last np.average was displayed, and the
# unseeded forests gave different numbers on every run).
n_estimators_grid = [5, 10, 30, 40]

cv_scores_by_size = {
    n_trees: cross_val_score(
        RandomForestRegressor(n_estimators=n_trees, random_state=42),
        X,
        y,
        cv=10,
        scoring='r2',
    )
    for n_trees in n_estimators_grid
}

# Keep the original per-run names available for any later cell that uses them.
scores1, scores2, scores3, scores4 = (
    cv_scores_by_size[n_trees] for n_trees in n_estimators_grid
)

# Final expression: mean R2 per forest size, rendered as the cell output.
pd.Series(
    {n_trees: np.average(fold_scores) for n_trees, fold_scores in cv_scores_by_size.items()},
    name='mean_r2',
).rename_axis('n_estimators')

In [None]:
# Score the rows held in pred_set with the fitted forest.
# NOTE(review): pred_set is initialised as an empty list in the data-loading
# cell; it presumably has to be filled with feature rows laid out like X
# before this cell can run -- confirm where that happens.
prediction = rf.predict(X=pred_set)

In [None]:
# Regression metrics for the predictions: pred_target holds the true values,
# prediction holds the model output.
r2 = r2_score(y_true=pred_target, y_pred=prediction)
rmse = root_mean_squared_error(y_true=pred_target, y_pred=prediction)

print("R2 score of Random Forest:", r2)
print("Root Mean Squared Error of Random Forest:", rmse)

In [None]:
# Classification metrics for the predictions.
# NOTE(review): these scorers expect discrete class labels, and precision_score /
# recall_score default to average='binary'. The target used in this notebook is
# resale_price and the other cells score with R2 / RMSE, so confirm whether this
# cell applies here; as written it is only meaningful for a two-class target.
accuracy = accuracy_score(pred_target, prediction)
precision = precision_score(pred_target, prediction)
recall = recall_score(pred_target, prediction)
roc_auc = roc_auc_score(pred_target, prediction)
cm = confusion_matrix(pred_target, prediction)

print("Accuracy of Random Forest:", accuracy)
print("Precision of Random Forest:", precision)
# Label corrected: it previously read "Tecall".
print("Recall of Random Forest:", recall)
print("ROC AUC of Random Forest:", roc_auc)
print(f"Confusion Matrix for Random Forest:\n{cm}")

In [None]:
# Pair each importance score with its column name and order the pairs from
# most to least important (ties fall back to the column name, since the
# pairs are compared as tuples).
importance = rf.feature_importances_
feature_importances = zip(importance, X.columns)
sorted_feature_importances = list(feature_importances)
sorted_feature_importances.sort(reverse=True)

# Keep the 15 highest-ranked pairs and split them into parallel lists
# for the plotting cell.
top_15_predictors = sorted_feature_importances[:15]
values = [pair[0] for pair in top_15_predictors]
predictors = [pair[1] for pair in top_15_predictors]
print(predictors)

In [None]:
# Bar chart of the top predictors, drawn through the explicit Axes interface.
# Axis labels are added so the figure stands on its own, and the cell ends
# with plt.show() so no tick-object repr is printed under the plot.
fig, ax = plt.subplots()

bar_positions = range(len(predictors))
ax.bar(bar_positions, values, color="r", align="center")

# One tick per bar, labelled with the feature name; rotated so long names fit.
ax.set_xticks(list(bar_positions))
ax.set_xticklabels(predictors, rotation=90)

ax.set_title("Feature importances")
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")

plt.show()