In [None]:
# If you don't have imbalanced-learn, you can install it
# !pip install imbalanced-learn

# Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE  # Import SMOTE
from scipy import stats


In [None]:
# Read the Kaggle Stroke Prediction Dataset into a DataFrame.
# The URL below is a placeholder and must point at the real CSV before running.
url = "https://path-to-dataset.csv"  # Replace with actual dataset path
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as a quick sanity check of the load
df.head(5)


In [None]:
# Structural overview: column dtypes and non-null counts.
# df.info() prints its report itself, so it needs no wrapper.
df.info()

# Only the last expression of a cell is rendered automatically; these
# summaries sit in the middle of the cell, so they are printed explicitly
# (as bare expressions their results were silently discarded).
print(df.isnull().sum())   # missing values per column
print(df.describe())       # basic statistics of the numeric columns

# Visualize the distribution of the target variable (stroke)
sns.countplot(data=df, x='stroke', palette='coolwarm')
plt.title('Stroke Distribution')
plt.show()

# Visualize correlations between numeric features.
# The categorical columns are only label-encoded later in the notebook, so the
# frame is restricted to numeric dtypes first: DataFrame.corr() raises on
# string columns in pandas >= 2.0 instead of skipping them.
numeric_corr = df.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Select the columns that are screened for outliers with the Z-score method.
# The target ('stroke') and any other two-valued indicator column are left
# out: for a 0/1 column whose positive share is below 10%, the value 1 lies
# more than 3 standard deviations from the mean, so screening it would delete
# every positive row (for the target, the entire minority class).
numeric_cols = pd.Index([
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col != 'stroke' and df[col].nunique(dropna=True) > 2
])

# Apply Z-score method.
# nan_policy='omit' derives mean/std from the observed values only. With the
# default ('propagate') one missing value turns the whole column's scores into
# NaN, and the comparison below then rejects every row of the dataset.
z_scores = np.abs(
    stats.zscore(df[numeric_cols].to_numpy(dtype=float), nan_policy='omit')
)

# Set a threshold for Z-scores to define outliers
threshold = 3

# Keep a row when every screened value is within the threshold. A missing
# value has no score (NaN) and is not counted as an outlier here; such rows
# are retained and still need imputing or dropping before modelling.
within_threshold = (z_scores < threshold) | np.isnan(z_scores)
df_no_outliers = df[within_threshold.all(axis=1)]

# Check the shape of the dataset after removing outliers
print(f"Shape before removing outliers: {df.shape}")
print(f"Shape after removing outliers: {df_no_outliers.shape}")


In [None]:
# Use the dataset without outliers for further processing
# (or df_no_outliers_iqr if using IQR method).
# An explicit copy is taken: df_no_outliers is a boolean-filtered selection of
# the original frame, and assigning columns into it raises
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
df = df_no_outliers.copy()

# Convert categorical columns to numeric using Label Encoding
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# One encoder per column, kept in a dict so each column's category -> integer
# mapping stays available (e.g. for inverse_transform or encoding new data).
# Re-fitting a single shared encoder would retain only the last mapping.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Backward-compatible alias: as before, `label_encoder` ends up fitted on the
# last encoded column ('smoking_status').
label_encoder = label_encoders[categorical_cols[-1]]

# Define feature variables (X) and target variable (y)
X = df.drop('stroke', axis=1)  # Features (all columns except target)
# NOTE(review): the Kaggle stroke dataset ships a row identifier column 'id'.
# It carries no predictive signal, so it is removed when present — confirm
# against the actual CSV schema.
X = X.drop(columns=['id'], errors='ignore')
y = df['stroke']  # Target variable (stroke)


In [None]:
# Split the data into training and testing sets (80% train, 20% test).
# stratify=y keeps the stroke / no-stroke ratio identical in both splits; with
# a rare positive class a purely random split can leave the test set with very
# few positives and make the reported metrics unstable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fill missing feature values with the TRAINING-set medians. The statistics
# come from the training split only so nothing leaks from the test split, and
# the step is a no-op when there are no missing values. It is needed because
# SMOTE and the classifiers fitted later do not accept NaN inputs.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Scaling the data (important for models like SVM and Logistic Regression).
# The scaler is fitted on the training split only and then applied to both.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Oversample the minority class with SMOTE, using the scaled training split
# only (the test split is left untouched).
smote = SMOTE(
    sampling_strategy='auto',
    random_state=42,
)
X_train_smote, y_train_smote = smote.fit_resample(
    X_train_scaled,
    y_train,
)

# Report the class balance before and after resampling
print(f"Original class distribution in y_train: {y_train.value_counts()}")
print(f"Resampled class distribution in y_train_smote: {pd.Series(y_train_smote).value_counts()}")


In [None]:
# Logistic regression baseline, trained on the SMOTE-balanced training data.
# fit() returns the estimator itself, so construction and fitting are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train_smote, y_train_smote)

# Score the held-out test split (scaled with the training scaler)
y_pred_log_reg = log_reg.predict(X_test_scaled)

# Report accuracy and the per-class precision / recall / F1 table
print(
    "Logistic Regression Accuracy:",
    accuracy_score(y_test, y_pred_log_reg),
)
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred_log_reg),
)


In [None]:
# Random Forest Model, trained on the scaled, SMOTE-balanced training data
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_smote, y_train_smote)

# Predict on the test set.
# X_train_smote is derived from X_train_scaled, so the forest's split
# thresholds live in standardized units; the test features must therefore be
# the scaled ones as well. Predicting on the raw X_test compares values on a
# different scale against those thresholds and yields meaningless predictions.
y_pred_rf = rf_clf.predict(X_test_scaled)

# Evaluate the model
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


In [None]:
# XGBoost Model, trained on the scaled, SMOTE-balanced training data.
# - eval_metric='logloss' is the binary log-loss; 'mlogloss' is the multiclass
#   variant and does not match this two-class target.
# - use_label_encoder is no longer passed: it has been deprecated since
#   XGBoost 1.6 and is not a recognised parameter in 2.x.
xgb_clf = XGBClassifier(eval_metric='logloss')
xgb_clf.fit(X_train_smote, y_train_smote)

# Predict on the test set.
# The model was fitted on standardized features (X_train_smote comes from
# X_train_scaled), so it must be given the scaled test features too; the raw
# X_test is on a different scale than the learned split thresholds.
y_pred_xgb = xgb_clf.predict(X_test_scaled)

# Evaluate the model
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))


In [None]:
# Linear-kernel support vector classifier, trained on the SMOTE-balanced data.
# fit() returns the estimator itself, so construction and fitting are chained.
svm_clf = SVC(kernel='linear', random_state=42).fit(X_train_smote, y_train_smote)

# Score the held-out test split (scaled with the training scaler)
y_pred_svm = svm_clf.predict(X_test_scaled)

# Report accuracy and the per-class precision / recall / F1 table
print(
    "SVM Accuracy:",
    accuracy_score(y_test, y_pred_svm),
)
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred_svm),
)


In [None]:
# Collect each model's test-set predictions under its display name
# (insertion order fixes the left-to-right order of the bars).
predictions_by_model = {
    'Logistic Regression': y_pred_log_reg,
    'Random Forest': y_pred_rf,
    'XGBoost': y_pred_xgb,
    'SVM': y_pred_svm,
}

# Model labels and their accuracies, in matching order
models = list(predictions_by_model)
accuracies = [
    accuracy_score(y_test, predicted)
    for predicted in predictions_by_model.values()
]

# Bar chart comparing test accuracy across the four models
plt.figure(figsize=(8, 6))
sns.barplot(x=models, y=accuracies, palette='viridis')
plt.title('Model Comparison After SMOTE')
plt.ylabel('Accuracy')
plt.show()


In [None]:
# From the comparison above, choose the best model based on accuracy
# Example: If XGBoost gave the highest accuracy, you can finalize it as the best model
final_model = xgb_clf  # Chosen model based on previous comparison

# Retrain the final model on the full prepared training set, i.e. the scaled
# and SMOTE-balanced data that every model in this notebook was evaluated on.
# Fitting on the raw X, y instead would (a) feed unscaled features to a
# pipeline that scales at prediction time, (b) drop the SMOTE balancing the
# output filename advertises, and (c) mix the held-out test rows into
# training, invalidating the metrics reported above.
final_model.fit(X_train_smote, y_train_smote)

# Save the model using joblib for future use.
# The fitted scaler is persisted as well: new samples must be transformed with
# it before being passed to the saved model.
import joblib
joblib.dump(final_model, 'stroke_prediction_model_with_smote.pkl')
joblib.dump(scaler, 'stroke_prediction_scaler.pkl')


In [None]:
# Additional imports for hyperparameter tuning and ROC curve plotting
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [None]:
# Hyperparameter grid for RandomForestClassifier
param_grid_rf = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    # Number of features to consider for the best split.
    # 'auto' is intentionally absent: for classifiers it was an alias of
    # 'sqrt', and it was removed in scikit-learn 1.3, where it makes every
    # candidate using it fail to fit. None (use all features) is searched instead.
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Initialize the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV (3-fold CV, all cores, ranked by ROC AUC)
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=3, n_jobs=-1, verbose=2, scoring='roc_auc')

# Fit GridSearchCV to training data.
# NOTE(review): the folds are cut from data that was already SMOTE-resampled,
# so synthetic points interpolated from a validation-fold sample can land in
# the training folds; the cross-validated AUC is therefore optimistic.
# Resampling inside each fold (imblearn.pipeline.Pipeline) would avoid this.
grid_search_rf.fit(X_train_smote, y_train_smote)

# Best hyperparameters for Random Forest
print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# Get the best model (refitted on the whole of X_train_smote by GridSearchCV)
best_rf = grid_search_rf.best_estimator_


In [None]:
# Hyperparameter grid for XGBoost
param_grid_xgb = {
    'learning_rate': [0.01, 0.05, 0.1],  # Step size at each iteration
    'n_estimators': [50, 100, 200],  # Number of boosting rounds
    'max_depth': [3, 6, 10],  # Maximum depth of a tree
    'min_child_weight': [1, 5, 10],  # Minimum sum of instance weight (hessian) in a child
    'subsample': [0.6, 0.8, 1.0],  # Fraction of samples to use for fitting trees
    'colsample_bytree': [0.6, 0.8, 1.0]  # Fraction of features to choose from for each tree
}

# Initialize XGBClassifier.
# - eval_metric='logloss' is the binary log-loss; 'mlogloss' is the multiclass
#   variant and does not match this two-class target.
# - use_label_encoder is no longer passed: it has been deprecated since
#   XGBoost 1.6 and is not a recognised parameter in 2.x.
xgb = XGBClassifier(eval_metric='logloss')

# Initialize GridSearchCV (3-fold CV, all cores, ranked by ROC AUC)
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=3, n_jobs=-1, verbose=2, scoring='roc_auc')

# Fit GridSearchCV to training data.
# NOTE(review): the folds are cut from data that was already SMOTE-resampled,
# so the cross-validated AUC is optimistic; resampling inside each fold
# (imblearn.pipeline.Pipeline) would avoid this.
grid_search_xgb.fit(X_train_smote, y_train_smote)

# Best hyperparameters for XGBoost
print("Best parameters for XGBoost:", grid_search_xgb.best_params_)

# Get the best model (refitted on the whole of X_train_smote by GridSearchCV)
best_xgb = grid_search_xgb.best_estimator_


In [None]:
# Search space for the support vector classifier
param_grid_svm = dict(
    C=[0.1, 1, 10],                    # regularization strength
    kernel=['linear', 'rbf', 'poly'],  # kernel family
    gamma=['scale', 'auto'],           # kernel coefficient setting
    class_weight=[None, 'balanced'],   # optional per-class weighting
)

# Base estimator; probability=True enables predict_proba on the fitted model
svm = SVC(probability=True, random_state=42)

# Exhaustive 3-fold search over the grid, ranked by ROC AUC, using all cores
grid_search_svm = GridSearchCV(
    estimator=svm,
    param_grid=param_grid_svm,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='roc_auc',
)

# Run the search on the SMOTE-balanced training data
grid_search_svm.fit(X_train_smote, y_train_smote)

# Report the winning combination
print("Best parameters for SVM:", grid_search_svm.best_params_)

# Keep the refitted best estimator for later evaluation
best_svm = grid_search_svm.best_estimator_


In [None]:
# Probability of the positive class (column 1 of predict_proba) on the scaled
# test split, for each tuned model
y_pred_prob_rf = best_rf.predict_proba(X_test_scaled)[:, 1]
y_pred_prob_xgb = best_xgb.predict_proba(X_test_scaled)[:, 1]
y_pred_prob_svm = best_svm.predict_proba(X_test_scaled)[:, 1]

# ROC curve points (false / true positive rates); thresholds are not needed
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_prob_rf)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_prob_xgb)
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_pred_prob_svm)

# Area under each ROC curve
roc_auc_rf = auc(fpr_rf, tpr_rf)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)
roc_auc_svm = auc(fpr_svm, tpr_svm)

# One entry per model: curve points, line colour and legend label.
# List order is the drawing order (Random Forest, XGBoost, SVM).
roc_series = [
    (fpr_rf, tpr_rf, 'blue', 'Random Forest (AUC = %0.2f)' % roc_auc_rf),
    (fpr_xgb, tpr_xgb, 'green', 'XGBoost (AUC = %0.2f)' % roc_auc_xgb),
    (fpr_svm, tpr_svm, 'red', 'SVM (AUC = %0.2f)' % roc_auc_svm),
]

plt.figure(figsize=(10, 8))
for fpr, tpr, line_colour, legend_label in roc_series:
    plt.plot(fpr, tpr, color=line_colour, lw=2, label=legend_label)

# Dashed diagonal: the no-skill reference line
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# Axis ranges, labels, title and legend
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Report the test-set AUC of each tuned model, one line per model
for model_label, model_auc in (
    ("Random Forest", roc_auc_rf),
    ("XGBoost", roc_auc_xgb),
    ("SVM", roc_auc_svm),
):
    print(model_label + " AUC: ", model_auc)
