# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, f1_score,
                             matthews_corrcoef, roc_curve, roc_auc_score, precision_recall_curve)

# Read the whole dataset from the CSV file into a DataFrame.
file_path = "data.csv"
data = pd.read_csv(file_path)

# Convention used throughout this script: the final column holds the target,
# every column before it is a feature.
feature_frame = data.iloc[:, :-1]
target_column = data.iloc[:, -1]
X = feature_frame.values
Y = target_column.values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

# Build a 100-tree Random Forest (seeded) and fit it on the training portion.
# fit() returns the estimator itself, so construction and training can be chained.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, Y_train)

# Hard class predictions for the held-out rows.
Y_pred = rf_classifier.predict(X_test)

# Per-class probabilities; only the column at index 1 is kept for the
# threshold-based curves drawn later in the script.
class_probabilities = rf_classifier.predict_proba(X_test)
Y_pred_proba = class_probabilities[:, 1]

# Evaluate the model
accuracy = accuracy_score(Y_test, Y_pred)

# Pass the label set explicitly so the matrix keeps one row/column per known
# class even when the test split happens to contain only one of them.
class_labels = np.union1d(rf_classifier.classes_, Y_test)
conf_matrix = confusion_matrix(Y_test, Y_pred, labels=class_labels)

# NOTE: F1 is averaged over classes weighted by support, while the
# sensitivity/specificity/precision values below describe the second class only.
f1 = f1_score(Y_test, Y_pred, average='weighted')
mcc = matthews_corrcoef(Y_test, Y_pred)

# The four-way unpacking below is only meaningful for a 2x2 matrix; fail with
# an explicit message instead of an opaque unpacking error.
if conf_matrix.shape != (2, 2):
    raise ValueError(
        "Sensitivity, specificity and precision are computed for binary targets "
        f"only, but {conf_matrix.shape[0]} classes were found."
    )


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# Sensitivity, Specificity, and other metrics from confusion matrix.
# For a binary matrix, ravel() yields the cells in the order TN, FP, FN, TP.
TN, FP, FN, TP = (int(cell) for cell in conf_matrix.ravel())
sensitivity = _safe_ratio(TP, TP + FN)
specificity = _safe_ratio(TN, TN + FP)
precision = _safe_ratio(TP, TP + FP)
recall = sensitivity  # Recall is the same as sensitivity

# Print out metrics
print(f"Accuracy: {accuracy}")
print(f"Sensitivity (Recall): {sensitivity}")
print(f"Specificity: {specificity}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")
print(f"Matthews Correlation Coefficient (MCC): {mcc}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report(Y_test, Y_pred))

# Confusion matrix rendered as an annotated heatmap (integer cell counts).
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix Heatmap')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

# ROC curve for the held-out predictions, with the AUC shown in the legend.
fpr, tpr, _ = roc_curve(Y_test, Y_pred_proba)
auc_value = roc_auc_score(Y_test, Y_pred_proba)
roc_label = 'ROC Curve (AUC = {:.2f})'.format(auc_value)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='blue', label=roc_label)
# Dashed diagonal drawn as a visual reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
roc_ax.set_title('ROC Curve')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
plt.show()

# Precision-recall trade-off across the probability thresholds.
precision_curve, recall_curve, _ = precision_recall_curve(Y_test, Y_pred_proba)

pr_fig, pr_ax = plt.subplots(figsize=(8, 6))
pr_ax.plot(recall_curve, precision_curve, color='green', label='Precision-Recall Curve')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.legend()
plt.show()

# Feature Importances: horizontal bars sorted from most to least important.
importances = rf_classifier.feature_importances_
indices = np.argsort(importances)[::-1]

# Label each bar with the real column name from the CSV header rather than a
# bare positional index, so the chart can be read without the column order.
feature_names = data.columns[:-1]
bar_positions = range(len(indices))

plt.figure(figsize=(10, 8))
plt.barh(bar_positions, importances[indices], align='center')
plt.yticks(bar_positions, [str(feature_names[i]) for i in indices])
plt.xlabel('Feature Importance')
plt.title('Feature Importance in Random Forest')
# barh draws position 0 at the bottom; flip the axis so the top bar is the
# most important feature.
plt.gca().invert_yaxis()
plt.show()

# Histogram (30 bins, with a KDE overlay) of the probabilities kept in Y_pred_proba.
proba_fig, proba_ax = plt.subplots(figsize=(8, 6))
sns.histplot(Y_pred_proba, bins=30, kde=True, color='purple', ax=proba_ax)
proba_ax.set_title('Distribution of Predicted Probabilities')
proba_ax.set_xlabel('Predicted Probability')
proba_ax.set_ylabel('Frequency')
plt.show()

# Pairplot of Features (with the target appended as an extra "Target" column).
# sns.pairplot is a figure-level function that builds its own figure, so no
# plt.figure() call is made beforehand (it would only leave an empty figure).
sns.pairplot(data.iloc[:, :-1].assign(Target=Y))
# Title the whole grid; plt.title would only label the last subplot drawn.
# y > 1 lifts the title above the top row of panels.
plt.suptitle('Pairplot of Features', y=1.02)
plt.show()

# One box per feature column, side by side on a shared axis.
box_fig, box_ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=data.iloc[:, :-1], ax=box_ax)
box_ax.set_title('Boxplot of Features')
# Rotate the tick labels so column names do not overlap.
plt.xticks(rotation=90)
plt.show()

# Violin Plot of Feature Distributions by Target.
# All feature values are pooled into one long vector. Y has one entry per
# sample while the pooled vector has one entry per sample *per feature*, so
# each target label is repeated once for every feature of its row.
feature_values = data.iloc[:, :-1].values
pooled_values = feature_values.flatten()  # row-major: a row's features stay adjacent
pooled_targets = np.repeat(Y, feature_values.shape[1])

plt.figure(figsize=(10, 8))
sns.violinplot(x=pooled_targets, y=pooled_values, scale='width')
plt.title('Violin Plot of Feature Distributions by Target')
plt.show()

# Pairwise correlations between the feature columns, shown as an annotated heatmap.
corr_matrix = data.iloc[:, :-1].corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Correlation Matrix Heatmap')
plt.show()

# Learning curve: accuracy as a function of how much training data is used.
from sklearn.model_selection import learning_curve

# Ten evenly spaced training-set fractions from 10% to 100%, scored with
# 5-fold cross-validation on the full dataset.
size_fractions = np.linspace(0.1, 1.0, 10)
train_sizes, train_scores, test_scores = learning_curve(
    rf_classifier, X, Y, cv=5, scoring='accuracy', train_sizes=size_fractions
)

# Average the per-fold scores (axis 1) to get one value per training size.
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

curve_fig, curve_ax = plt.subplots(figsize=(8, 6))
curve_ax.plot(train_sizes, train_scores_mean, 'o-', color='blue', label='Training Accuracy')
curve_ax.plot(train_sizes, test_scores_mean, 'o-', color='red', label='Validation Accuracy')
curve_ax.set_title('Learning Curve')
curve_ax.set_xlabel('Training Size')
curve_ax.set_ylabel('Accuracy')
curve_ax.legend()
plt.show()
