In [None]:
import os
import subprocess

import numpy as np
import pandas as pd
import pandas_profiling
import plotnine
from plotnine import *  # Provides a ggplot-like interface to matplotlib.
from IPython.display import display

## Plot setup.
# Make theme_bw with base_size 11 the default theme for plots drawn after this cell.
theme_set(theme_bw(base_size=11))

def get_boxplot_fun_data(df):
  """Builds the one-row annotation frame used to label a ggplot boxplot.

  Args:
    df: The values to summarise. It is handed to the built-in max() and len(),
      so a pandas Series (or any other sized iterable) works.
  Returns:
    A single-row data frame (index 0) whose column y holds max(df) and whose
    column label holds the text 'N = <len(df)>'.
  """
  highest = max(df)
  count_label = f'N = {len(df)}'
  return pd.DataFrame(data={'y': highest, 'label': count_label}, index=[0])

# NOTE: if you get any errors from this cell, restart your kernel and run it again.


In [None]:
# This snippet assumes you run setup first

# This code copies a file from your Google Bucket and loads it into a dataframe

# Set this to the file to download: either a full 'gs://...' URI, or just the name of
# a file stored under '<WORKSPACE_BUCKET>/data/' (don't delete the quotation marks)
name_of_file_in_bucket = 'gs://fc-secure-f691f9bf-9814-462b-be62-53c8dff2e698/data/full_data_set.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# Resolve the object to copy. A value that already is a full gs:// URI is used as-is;
# prefixing it with the bucket would build an invalid path ('gs://<bucket>/data/gs://...').
if name_of_file_in_bucket.startswith('gs://'):
    source_uri = name_of_file_in_bucket
else:
    if not my_bucket:
        raise RuntimeError('WORKSPACE_BUCKET is not set; cannot locate the file in the bucket')
    source_uri = f'{my_bucket}/data/{name_of_file_in_bucket}'

# gsutil copies the object into the current directory under its base name
local_filename = os.path.basename(source_uri)

# copy csv file from the bucket to the current working space
# (argument list instead of a shell string, so the URI needs no quoting)
copy_result = subprocess.run(['gsutil', 'cp', source_uri, '.'], capture_output=True, text=True)
if copy_result.returncode != 0:
    # Surface gsutil's own error instead of reporting success unconditionally
    raise RuntimeError(f'gsutil cp failed for {source_uri}:\n{copy_result.stderr}')

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
# load the downloaded csv file into a dataframe
my_dataframe = pd.read_csv(local_filename)
my_dataframe.head()


In [None]:
# Work with the loaded data under the shorter name `df` (same object, not a copy).
df = my_dataframe

# Number of rows loaded.
print(len(df))

In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import numpy as np
import pandas as pd
from tabulate import tabulate
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

In [None]:
# Choose the predictor columns and the target column.
features = [
    'first_prescribed',
    'gender',
    'race',
    'ethnicity',
    'sex_at_birth',
    'wheelchair-user-status',
    'weight',
    'heart-rate-mean',
    'heart-rhythm-status',
    'waist-circumference-mean',
    'hip-circumference-mean',
    'blood-pressure-systolic-mean',
    'blood-pressure-diastolic-mean',
    'height',
    'bmi',
]
X = df[features]
# Encode the True/False outcome as 1/0 labels.
y = df['successful'].replace({True: 1, False: 0})

In [None]:
# One-hot encode the categorical predictors; drop_first=True omits the first level of
# each column so every variable contributes one fewer indicator than it has levels.
X = pd.get_dummies(
    X,
    columns=[
        'first_prescribed',
        'gender',
        'race',
        'ethnicity',
        'sex_at_birth',
        'wheelchair-user-status',
        'heart-rhythm-status',
    ],
    drop_first=True,
)

In [None]:
# Turn the 'all_disorders' text into TF-IDF features; missing text becomes an empty string.
# NOTE(review): the vectorizer is fitted on every row, before the train/test split, so its
# vocabulary and IDF weights also reflect rows that later end up in the test set.
X_text = df['all_disorders'].fillna('')
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X_text)

In [None]:
# Combine numerical and text features.
# The TF-IDF frame reuses X's index so the column-wise concat pairs rows by position even
# when X does not carry a default 0..n-1 index; with mismatched indexes pandas aligns on
# labels and pads the result with NaN rows.
tfidf_features = pd.DataFrame(
    X_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=X.index,
)
X_combined = pd.concat([X, tfidf_features], axis=1)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Replace NaN values with the per-column means of the training data.
# The means are computed once, before X_train is overwritten, so both splits are filled
# with exactly the same values and nothing is recomputed on already-imputed data.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)  # training means only: no test-set statistics are used

In [None]:
# Fit a multinomial Naive Bayes model on the training split
naive_bayes_model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out rows
predictions = naive_bayes_model.predict(X_test)

# Report overall accuracy and the per-class metrics
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

In [None]:
# 1. Naive Bayes (Bernoulli variant; rebinds naive_bayes_model)
naive_bayes_model = BernoulliNB().fit(X_train, y_train)
naive_bayes_predictions = naive_bayes_model.predict(X_test)

# Report accuracy and the per-class metrics on the test split
print("Naive Bayes:")
print("Accuracy:", accuracy_score(y_test, naive_bayes_predictions))
print("Classification Report:\n", classification_report(y_test, naive_bayes_predictions))

In [None]:
# 2. Logistic Regression
# max_iter is raised to 1000 (scikit-learn's default is 100) to match the other
# LogisticRegression instances in this notebook and give the solver room to converge.
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)
logistic_regression_predictions = logistic_regression_model.predict(X_test)

# Report accuracy and the per-class metrics on the test split
print("Logistic Regression:")
print("Accuracy:", accuracy_score(y_test, logistic_regression_predictions))
print("Classification Report:\n", classification_report(y_test, logistic_regression_predictions))

In [None]:
# 3. Decision Tree
# A fixed random_state (the same seed as the train/test split) makes the fitted tree,
# and therefore the reported metrics, identical on every run.
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train, y_train)
decision_tree_predictions = decision_tree_model.predict(X_test)

# Report accuracy and the per-class metrics on the test split
print("Decision Tree:")
print("Accuracy:", accuracy_score(y_test, decision_tree_predictions))
print("Classification Report:\n", classification_report(y_test, decision_tree_predictions))

In [None]:
# 4. Support Vector Machine (default SVC settings)
svm_model = SVC().fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

# Report accuracy and the per-class metrics on the test split
print("Support Vector Machine:")
print("Accuracy:", accuracy_score(y_test, svm_predictions))
print("Classification Report:\n", classification_report(y_test, svm_predictions))

In [None]:
# 5. K Nearest Neighbors (default KNeighborsClassifier settings)
knn_model = KNeighborsClassifier().fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)

# Report accuracy and the per-class metrics on the test split
print("K Nearest Neighbors:")
print("Accuracy:", accuracy_score(y_test, knn_predictions))
print("Classification Report:\n", classification_report(y_test, knn_predictions))

In [None]:
# 6. Random Forest Classifier
# A fixed random_state (the same seed as the train/test split) makes the bootstrap
# samples and feature subsets, and therefore the reported metrics, repeatable.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)
random_forest_predictions = random_forest_model.predict(X_test)

# Report accuracy and the per-class metrics on the test split
print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(y_test, random_forest_predictions))
print("Classification Report:\n", classification_report(y_test, random_forest_predictions))

In [None]:
# Candidate models, all trained and scored on the same split.
# The tree-based scikit-learn estimators get a fixed seed so the table is reproducible.
models = {
    'Naive Bayes': BernoulliNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'K Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest Classifier': RandomForestClassifier(random_state=42),
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier()
}

# Collect one record per model and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies the
# whole frame on every iteration.
result_rows = []

# Loop through each model
for model_name, model in models.items():
    # Fit the model and make predictions
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, output_dict=True)

    # The report is keyed by the string form of each class label; '1' is the class
    # that y encodes for a True outcome.
    positive_class = report['1']
    result_rows.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': positive_class['precision'],
        'Recall': positive_class['recall'],
        'F1-Score': positive_class['f1-score']
    })

# Table of results, one row per model, with a fixed column order
results_df = pd.DataFrame(result_rows, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Display the results table using tabulate
table = tabulate(results_df, headers='keys', tablefmt='pretty', showindex=False)
print(table)

In [None]:
# Preview the first rows of the model comparison table.
results_df.head()

In [None]:
import os
import subprocess
import numpy as np
import pandas as pd




In [None]:
# Save the model comparison table to 'model_results.csv' in the current working
# directory, without writing the row index as a column.
results_df.to_csv('model_results.csv', index=False)

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
# (note: this rebinds my_dataframe, which previously held the loaded data set)
my_dataframe = results_df

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'model_results.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    # Without this guard the destination would be the literal local path 'None/data/'
    raise RuntimeError('WORKSPACE_BUCKET is not set; cannot upload to the bucket')

# copy csv file to the bucket
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, capture_output=True, text=True)

# print what gsutil wrote to stderr (decoded text), then raise CalledProcessError
# if the copy failed instead of continuing silently
print(output.stderr)
output.check_returncode()

In [None]:
# This snippet assumes that you run setup first

# This code lists objects in your Google Bucket

# Get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError('WORKSPACE_BUCKET is not set; cannot list the bucket')

# List objects in the bucket
# (argument list without a shell: the bucket name reaches gsutil verbatim and is
# never interpreted by a shell)
print(subprocess.check_output(["gsutil", "ls", "-r", my_bucket]).decode('utf-8'))




In [None]:
# Candidate models, all trained and scored on the same split.
# The tree-based scikit-learn estimators get a fixed seed so the charts are reproducible.
models = {
    'Naive Bayes': BernoulliNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'K Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest Classifier': RandomForestClassifier(random_state=42),
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier()
}

# Collect one record per model and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies the
# whole frame on every iteration.
result_rows = []

# Loop through each model
for model_name, model in models.items():
    # Fit the model and make predictions
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, output_dict=True)

    # The report is keyed by the string form of each class label; '1' is the class
    # that y encodes for a True outcome.
    positive_class = report['1']
    result_rows.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': positive_class['precision'],
        'Recall': positive_class['recall'],
        'F1-Score': positive_class['f1-score']
    })

# Table of results, one row per model, with a fixed column order
results_df = pd.DataFrame(result_rows, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Plot one bar chart per metric on a 2x2 grid (row-major order: Accuracy, Precision,
# Recall, F1-Score), each on the same 0-1 scale with rotated x-axis labels
metric_colors = {
    'Accuracy': 'blue',
    'Precision': 'green',
    'Recall': 'orange',
    'F1-Score': 'red',
}
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

for ax, (metric, color) in zip(axes.flat, metric_colors.items()):
    ax.bar(results_df['Model'], results_df[metric], color=color)
    ax.set_title(metric)
    ax.set_ylim(0, 1)
    # Rotate the model names and right-align them so they end under their bars
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

fig.tight_layout()
plt.show()

In [None]:
# Models compared by ROC curve. SVC is left out on purpose: it only provides
# predict_proba when constructed with probability=True.
# The tree-based scikit-learn estimators get a fixed seed so the curves are reproducible.
models = {
    'Naive Bayes': BernoulliNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'K Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest Classifier': RandomForestClassifier(random_state=42),
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier()
}

# Plotting AUC curves for all models
plt.figure(figsize=(10, 8))

for model_name, model in models.items():
    # Fit the model and predict probabilities; column 1 is the probability of class 1
    model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Calculate ROC curve and AUC
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    # Plot ROC curve, with the AUC shown in the legend entry
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')

# Plot the random guess line
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')

# Customize the plot
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Different Models')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()


In [None]:
pip install shap

In [None]:
import shap

In [None]:
# Show every column name of the test feature matrix as a plain Python list.
print(X_test.columns.tolist())

In [None]:
# Refit a logistic regression (max_iter=1000) on the training split; this is the model
# whose predictions are explained below.
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

# Create a SHAP explainer object; X_train is passed as its second argument
# (presumably used as the background data -- confirm against the shap docs)
explainer = shap.Explainer(logistic_regression_model, X_train)

# Calculate SHAP values for every row of X_test (pass a slice of X_test here to explain fewer samples)
shap_values = explainer.shap_values(X_test)

# Summary plot, labelled with the test matrix's column names
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)
