<a href="https://colab.research.google.com/github/inbalv/tictactoe/blob/master/eda_help_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
# Lift pandas' display limits so DataFrames are shown in full (None = no limit
# on the number of rows / columns rendered) instead of being truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_histograms_by_label(df, numeric_cols, figsize=(10,25)):
    """
    Draw a log-scaled step histogram for every requested column, split by 'label'.

    Parameters:
    - df: pandas DataFrame holding the data; it is handed to seaborn with hue='label'.
    - numeric_cols: list of column names (str) to plot, one histogram per column.
    - figsize: tuple giving the overall figure size.

    The subplot grid is allocated with len(numeric_cols) + 1 rows and 2 columns;
    every axis that does not receive a histogram is hidden before the figure is shown.
    """
    n_plots = len(numeric_cols)
    _, grid = plt.subplots(nrows=n_plots + 1, ncols=2, figsize=figsize)
    panels = grid.flatten()

    # The grid always holds more cells than there are columns, so pairing the
    # two sequences never drops a column.
    for panel, column in zip(panels, numeric_cols):
        sns.histplot(
            data=df,
            x=column,
            hue='label',
            ax=panel,
            log_scale=True,
            element="step",
            fill=False,
        )
        panel.set_title(f"Collisions Outcome by {column}")
        panel.set_xlabel(column)
        panel.set_ylabel("Count")

    # Blank out the cells that were allocated but not drawn on.
    for spare in panels[n_plots:]:
        spare.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def plot_monthly_feature_averages_with_error(df, month_col='month', numeric_cols=None, date_format='%b-%y'):
    """
    Plots the monthly average of selected numeric features with standard error bars.

    Parameters:
    - df: pandas DataFrame containing the data. It is NOT modified.
    - month_col: the name of the column containing month information (default 'month').
    - numeric_cols: list of numeric column names to plot. Must be provided and non-empty.
    - date_format: datetime format to parse the month column (default '%b-%y').

    Raises:
    - ValueError: if numeric_cols is None or empty.

    This function:
      1. Parses the month column into datetimes (values that do not match
         date_format become NaT and are left out of the grouping).
      2. For each numeric column, calculates the monthly average and the
         standard error (std / sqrt(count)) per month.
      3. Plots each feature on a separate subplot with error bars.
    """
    if numeric_cols is None:
        raise ValueError("Please provide a list of numeric columns to examine.")
    if len(numeric_cols) == 0:
        raise ValueError("numeric_cols is empty; provide at least one numeric column to examine.")

    # Parse the months into a standalone Series rather than writing a helper
    # column into `df`, so the caller's DataFrame is left untouched.
    month_dates = pd.to_datetime(df[month_col], format=date_format, errors='coerce').rename('month_date')

    # groupby sorts its keys, so the monthly results come out in chronological order.
    by_month = df.groupby(month_dates)

    # Set up the subplots: one row per feature. squeeze=False always returns a
    # 2-D array of axes, so a single feature needs no special handling.
    num_features = len(numeric_cols)
    fig, axs = plt.subplots(num_features, 1, figsize=(12, 6 * num_features), squeeze=False)
    axs = axs.ravel()

    # Calculate the monthly mean and standard error of each feature, then plot.
    for ax, feat in zip(axs, numeric_cols):
        grouped = by_month[feat]
        monthly_mean = grouped.mean()
        # count() ignores missing values, matching what mean() and std() use.
        monthly_se = grouped.std() / np.sqrt(grouped.count())

        ax.errorbar(
            monthly_mean.index,
            monthly_mean.values,
            yerr=monthly_se.values,
            marker='o',
            linestyle='-',
            capsize=5
        )
        ax.set_title(f"Average {feat} Over Time")
        ax.set_xlabel("Month")
        ax.set_ylabel(f"Average {feat}")
        ax.grid(True)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, average_precision_score, precision_recall_curve, auc
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from tabulate import tabulate

def print_formatted_metrics(y_test, y_pred):
    """
    Print a nicely formatted confusion matrix and classification report.

    For a 2x2 confusion matrix the rows/columns are labelled
    Negative/Positive. For any other size (a single class, or more than two
    classes) they are labelled with the class values themselves, so the
    function no longer fails on non-binary input.

    Args:
        y_test: Array-like of true labels.
        y_pred: Array-like of predicted labels.
    """
    cm = confusion_matrix(y_test, y_pred)

    if cm.shape == (2, 2):
        row_names = ["Actual Negative", "Actual Positive"]
        col_names = ["Predicted Negative", "Predicted Positive"]
    else:
        # confusion_matrix orders its rows/columns by the sorted labels seen in
        # either input, which is the order np.union1d returns.
        classes = np.union1d(y_test, y_pred)
        row_names = [f"Actual {c}" for c in classes]
        col_names = [f"Predicted {c}" for c in classes]

    cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)

    # Print the confusion matrix using tabulate for prettier formatting.
    print("\nConfusion Matrix:")
    print(tabulate(cm_df, headers="keys", tablefmt="psql"))

    # Build the classification report as a DataFrame (one row per class/average).
    report = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    # Print the classification report using tabulate.
    print("\nClassification Report:")
    print(tabulate(report_df, headers="keys", tablefmt="psql"))

def evaluate_model(model, X_test, y_test):
    """
    Evaluate the trained model on the test data using AUC-PR as the main metric.

    This function:
      - Computes predictions and predicted probabilities.
      - Calculates ROC AUC and Average Precision (AUC-PR).
      - Prints the metrics, confusion matrix, and classification report.
      - Plots the Precision-Recall Curve.
    """
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for the positive class.
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, y_pred_proba)
    avg_precision = average_precision_score(y_test, y_pred_proba)

    print("ROC AUC Score: {:.4f}".format(roc_auc))
    print("Average Precision (AUC-PR): {:.4f}".format(avg_precision))
    print_formatted_metrics(y_test, y_pred)

    # Plot the Precision-Recall curve, annotated with the area under it.
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = auc(recall, precision)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, marker='.', label='Precision-Recall curve (AUC = {:.4f})'.format(pr_auc))
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, average_precision_score, precision_recall_curve, auc
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from tabulate import tabulate

def print_formatted_metrics(y_test, y_pred):
    """
    Print a nicely formatted confusion matrix and classification report.

    For a 2x2 confusion matrix the rows/columns are labelled
    Negative/Positive. For any other size (a single class, or more than two
    classes) they are labelled with the class values themselves, so the
    function no longer fails on non-binary input.

    Args:
        y_test: Array-like of true labels.
        y_pred: Array-like of predicted labels.
    """
    cm = confusion_matrix(y_test, y_pred)

    if cm.shape == (2, 2):
        row_names = ["Actual Negative", "Actual Positive"]
        col_names = ["Predicted Negative", "Predicted Positive"]
    else:
        # confusion_matrix orders its rows/columns by the sorted labels seen in
        # either input, which is the order np.union1d returns.
        classes = np.union1d(y_test, y_pred)
        row_names = [f"Actual {c}" for c in classes]
        col_names = [f"Predicted {c}" for c in classes]

    cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)

    # Print the confusion matrix using tabulate for prettier formatting.
    print("\nConfusion Matrix:")
    print(tabulate(cm_df, headers="keys", tablefmt="psql"))

    # Build the classification report as a DataFrame (one row per class/average).
    report = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    # Print the classification report using tabulate.
    print("\nClassification Report:")
    print(tabulate(report_df, headers="keys", tablefmt="psql"))



def evaluate_model(model, X_test, y_test):
    """
    Score a fitted classifier on held-out data, with AUC-PR as the headline metric.

    Steps, in order:
      - Predict labels and take column 1 of predict_proba as the score.
      - Print ROC AUC and average precision (AUC-PR).
      - Print the formatted confusion matrix and classification report.
      - Draw the precision-recall curve, with the area under it in the legend.
    """
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Both scores are computed before anything is printed.
    headline_metrics = (
        ("ROC AUC Score: {:.4f}", roc_auc_score(y_test, positive_scores)),
        ("Average Precision (AUC-PR): {:.4f}", average_precision_score(y_test, positive_scores)),
    )
    for template, value in headline_metrics:
        print(template.format(value))

    print_formatted_metrics(y_test, predicted_labels)

    # The thresholds returned by precision_recall_curve are not needed for the plot.
    precision, recall, _ = precision_recall_curve(y_test, positive_scores)
    curve_label = 'Precision-Recall curve (AUC = {:.4f})'.format(auc(recall, precision))

    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, marker='.', label=curve_label)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_feature_importance(model, importance_type='gain', top_n=40, figsize=(20,10)):
    """
    Draw a horizontal bar chart of the highest-scoring features of an XGBoost model.

    Parameters:
        model: The trained XGBoost model; its booster supplies the scores.
        importance_type (str): Importance measure passed to the booster's
            get_score ('gain', 'weight', etc.). Default is 'gain'.
        top_n (int): How many of the highest-scoring features to plot. Default is 40.
        figsize (tuple): Size of the figure for the plot. Default is (20, 10).
    """
    # Mapping of feature name -> importance score, as reported by the booster.
    scores_by_feature = model.get_booster().get_score(importance_type=importance_type)
    feature_names = list(scores_by_feature.keys())
    scores = list(scores_by_feature.values())

    # One-column table of scores indexed by feature name, highest score first.
    score_table = pd.DataFrame(data=scores, index=feature_names, columns=["score"])
    ranked = score_table.sort_values(by="score", ascending=False)
    top_features = ranked.nlargest(top_n, columns="score")

    top_features.plot(kind='barh', figsize=figsize)
    plt.title(f'Top {top_n} Features by {importance_type.capitalize()} Importance')
    plt.xlabel("Importance Score")
    plt.ylabel("Features")
    plt.show()


In [None]:

def missing_and_distinct_stats(df):
    """
    Summarise distinct and missing values per column and show descriptive statistics.

    Two tables are shown: the per-column summary, then the transposed output
    of df.describe(). Inside IPython/Jupyter they are rendered with `display`;
    elsewhere the function falls back to `print` instead of failing.

    Parameters:
        df: pandas DataFrame to summarise.

    Returns:
        A DataFrame indexed by column name with the columns
        'Distinct Count' (unique non-null values), 'Missing Count',
        'Missing%' (missing values as a percentage of the row count) and
        'Data Type'.
    """
    # `display` is only defined inside IPython/Jupyter sessions.
    try:
        show = display
    except NameError:
        show = print

    total_count = len(df)
    distinct_counts = df.nunique(dropna=True)
    missing_counts = df.isnull().sum()

    summary = pd.DataFrame({
        'Distinct Count': distinct_counts,
        'Missing Count': missing_counts,
        'Missing%': (missing_counts / total_count) * 100,
        'Data Type': df.dtypes
    })

    show(summary)

    # Transposed so each described column becomes a row.
    summary_stats = df.describe()
    show(summary_stats.T)
    return summary
