In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt
from ipywidgets import interact, FloatSlider
import seaborn as sns

In [2]:
def generate_species_data(n_samples=1000, presence_ratio=0.3, random_state=None):
    """Simulate presence/absence survey sites described by temperature and salinity.

    Presence and absence sites draw each feature from a normal distribution
    with class-specific mean and spread; the two groups are then concatenated
    and shuffled together.

    Parameters
    ----------
    n_samples : int, default 1000
        Total number of sites to generate.
    presence_ratio : float, default 0.3
        Fraction of sites labelled as present; must lie in [0, 1].
    random_state : int or None, default None
        Seed for a private generator so the output is reproducible. When
        None, the global NumPy random state is used.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, 2)
        Column 0 is temperature, column 1 is salinity.
    y : numpy.ndarray of shape (n_samples,)
        1.0 for presence sites and 0.0 for absence sites.

    Raises
    ------
    ValueError
        If n_samples is negative or presence_ratio is outside [0, 1].
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")
    if not 0 <= presence_ratio <= 1:
        raise ValueError(f"presence_ratio must be between 0 and 1, got {presence_ratio}")

    # The np.random module and a RandomState instance expose the same
    # normal()/permutation() methods, so either can serve as the generator.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Round instead of truncating: float error would otherwise drop a site
    # (100 * 0.29 evaluates to 28.999999999999996).
    n_present = int(round(n_samples * presence_ratio))
    n_absent = n_samples - n_present

    # Generate features for presence sites
    # Green crabs prefer warmer temps (between 64 and 79 degrees Fahrenheit)
    # and salinity between 26 and 39 ppt
    temp_present = rng.normal(loc=71, scale=4, size=n_present)
    salinity_present = rng.normal(loc=32, scale=3, size=n_present)
    X_present = np.column_stack([temp_present, salinity_present])
    y_present = np.ones(n_present)

    # Generate features for absence sites
    # Intended as sites with warmer temps or lower salinity.
    # NOTE(review): the presence mean (71) is described in Fahrenheit, but a
    # mean of 26 is only "warmer" if it is in Celsius (26 C is about 79 F).
    # The value is left unchanged here -- confirm the intended unit.
    temp_absent = rng.normal(loc=26, scale=3, size=n_absent)
    salinity_absent = rng.normal(loc=28, scale=2, size=n_absent)
    X_absent = np.column_stack([temp_absent, salinity_absent])
    y_absent = np.zeros(n_absent)

    # Combine both classes, then shuffle features and labels with one
    # shared permutation so rows stay aligned
    X = np.concatenate([X_present, X_absent])
    y = np.concatenate([y_present, y_absent])

    shuffle_idx = rng.permutation(n_samples)
    X = X[shuffle_idx]
    y = y[shuffle_idx]

    return X, y

Create a function that draws a bar plot of the species presence/absence distribution

In [3]:
def plot_class_distribution(y):
    """Draw a bar chart of absent (0) versus present (1) site counts.

    Each bar is annotated with its share of all sites as a percentage.

    Parameters
    ----------
    y : array-like
        Class labels, where 0 marks an absence site and 1 a presence site.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize = (8,4))

    # Count each class explicitly so both bars exist (possibly at zero)
    # even when one class is missing from y
    labels = np.asarray(y)
    class_counts = [int(np.sum(labels == 0)), int(np.sum(labels == 1))]

    # Create a bar plot of absent and present species
    sns.barplot(x = ['Absent', 'Present'], y = class_counts, color = "seagreen")
    plt.title('Distribution of species presence or absence')
    plt.ylabel('number of sampling sites')

    # Add percent over each bar
    total = len(labels)
    for i, count in enumerate(class_counts):
        # Guard the empty-input case instead of dividing by zero
        percentage = count/total * 100 if total else 0.0
        plt.text(i, count, f'{percentage:.1f}%', ha = 'center', va = 'bottom')

    # Show once, after every label is in place; showing inside the loop
    # would render the figure before the second label is drawn
    plt.show()
    

Create a function that plots a confusion matrix

In [4]:
def plot_confusion_matrix(y_true, y_pred):
    """Plot a 2x2 confusion-matrix heatmap and print the metrics derived from it.

    Parameters
    ----------
    y_true : array-like
        True labels, where 0 is absent and 1 is present.
    y_pred : array-like
        Predicted labels using the same coding.

    Returns
    -------
    None
        The heatmap is displayed, then the cell counts, accuracy,
        majority-class baseline, sensitivity and specificity are printed.
        A rate whose denominator is zero is reported as nan.
    """

    def _ratio(numerator, denominator):
        # nan (rather than an exception or warning) when the rate is undefined
        return numerator / denominator if denominator else float('nan')

    # Create confusion matrix; fixing the labels keeps it 2x2 even when a
    # class never occurs in y_true/y_pred, so the indexing below is safe
    cm = confusion_matrix(y_true, y_pred, labels = [0, 1])

    # Create confusion matrix plot
    plt.figure(figsize = (8,6))
    sns.heatmap(cm, annot = True, fmt = 'd', cmap = 'GnBu',
               xticklabels = ['Absent', 'Present'],
               yticklabels = ['Absent', 'Present'])
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # Unpack the cells: rows are true labels, columns are predicted labels
    TP = int(cm[1,1])
    TN = int(cm[0,0])
    FP = int(cm[0,1])
    FN = int(cm[1,0])

    print('\nMetrics from Confusion Matrix:')
    print(f'True Positives (correctly predicted presence): {TP}')
    print(f'True Negatives (correctly predicted absence): {TN}')
    print(f'False Positive (incorrectly predicted presence):{FP}')
    print(f'False Negatives (incoreetly predicted absence):{FN}')

    # Calculate summary metrics
    total = TP + TN + FP + FN
    n_present = TP + FN
    n_absent = TN + FP
    accuracy = _ratio(TP + TN, total)
    # Accuracy obtained by always predicting the more common class
    majority_baseline = _ratio(max(n_present, n_absent), total)
    sensitivity = _ratio(TP, n_present)
    specificity = _ratio(TN, n_absent)

    # Print metrics
    print(f'\nModel Performance Metrics:')
    print(f'Accuracy: {accuracy: .3f}')
    print(f'Majority-class baseline accuracy: {majority_baseline: .3f}')
    print(f'Sensitivity (true positive rate): {sensitivity: .3f}')
    print(f'Specificity (true negative rate): {specificity: .3f}')

In [5]:
def plot_roc_curve(y_test, y_pred_prob):
    """Plot the ROC curve for predicted probabilities against the true labels.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred_prob : array-like
        Predicted scores/probabilities for the positive class.

    Returns
    -------
    None
        The figure, including a diagonal reference line for a random
        classifier and the AUC in the legend, is displayed.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, y_pred_prob)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    _, ax = plt.subplots(figsize=(8, 6))

    # Model curve
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        color='seagreen',
        lw=2,
        label=f'ROC curve(AUC = {area_under_curve:.2f})',
    )
    # Diagonal: what a classifier guessing at random would achieve
    ax.plot(
        [0, 1],
        [0, 1],
        color='navy',
        lw=2,
        linestyle='--',
        label="Random classifier (AUC = 0.5)",
    )

    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True positive rate')
    ax.set_title('ROC Curve: Species Presence Prediction')
    ax.legend()
    ax.grid(True)
    plt.show()

Create a function that runs a logistic regression and outputs a distribution plot, confusion matrix, and ROC curve

In [6]:
def interactive_logistic_regression(presence_ratio = 0.3):
    """Generate data, fit a logistic regression and display its diagnostics.

    Simulates sites with the requested presence ratio, plots the class
    distribution, trains a logistic regression on a 70/30 train/test split,
    and then shows the confusion matrix and ROC curve for the test set.

    Parameters
    ----------
    presence_ratio : float, default 0.3
        Fraction of simulated sites where the species is present.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    X, y = generate_species_data(presence_ratio = presence_ratio)

    # Plot class distribution
    print("\nClass Distribution")
    plot_class_distribution(y)

    # Split the data (fixed random_state keeps the split itself repeatable)
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)

    # Train the model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Hard class predictions for the confusion matrix, and the predicted
    # probability of the positive class (column 1) for the ROC curve
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    # Plot confusion matrix
    print("\nConfusion Matrix")
    plot_confusion_matrix(y_test, y_pred)

    # Plot ROC curve
    print("\nROC Curve:")
    plot_roc_curve(y_test, y_pred_prob)

Create interactivity

In [7]:
# Build the slider-driven widget
def generate_log_regression():
    """Display a slider that re-runs the logistic regression demo on change.

    The slider feeds ``presence_ratio`` of ``interactive_logistic_regression``
    with values from 0.1 to 0.9 in steps of 0.1, starting at 0.3.
    """
    # NOTE(review): the label reads "% Present" but the slider value is a
    # ratio between 0.1 and 0.9, not a percentage.
    ratio_slider = FloatSlider(
        value = 0.3,
        min = 0.1,
        max = 0.9,
        step = .1,
        description = "% Present",
    )
    interact(interactive_logistic_regression, presence_ratio = ratio_slider)

generate_log_regression()

interactive(children=(FloatSlider(value=0.3, description='% Present', max=0.9, min=0.1), Output()), _dom_class…