In [1]:
import polars as pl
from plotnine import *
import pandas as pd

# Load the full dataset from disk into a polars DataFrame
df = pl.read_csv("familyData.csv")

# Column count, taken from the header list rather than the shape tuple
num_columns = len(df.columns)

# Report how wide the dataset is, then list every column name
print(f"The dataset has {num_columns} columns.")
print(df.columns)


The dataset has 74 columns.
['#', 'Name', 'TE COMMENTS JJ', 'PLAY#', 'pff_DRIVE', 'pff_DRIVEPLAY', 'pff_QUARTER', 'pff_DOWN', 'DN', 'pff_DISTANCE', 'DIST', 'pff_FIELDPOSITION', 'FPOS', 'pff_GAINLOSS', 'GAIN', 'pff_PENALTY', 'pff_PENALTYYARDS', 'pff_RUNPASS', 'R/P', 'pff_PASSRESULT', 'P RES', 'PERS O', 'pff_OFFPERSONNELBASIC', 'QB#', 'pff_MDUN_O_TEALIGN', 'pff_TES', 'FIB', 'FORM', 'Protection', 'PLAY', 'Play Family', 'pff_RUNCONCEPTPRIMARY', 'pff_RUNCONCEPTSECONDARY', 'pff_MDUN_O_PTRUN', 'pff_MDUN_O_PTCONCEPT', 'PT Concept 1', 'PT Concept 2', 'PT Concept 3', 'Run Concept', 'Screen Concept', 'pff_MDUN_O_FORMSTR', 'pff_HASH', 'pff_SPOTLEFT', 'pff_DEFPERSONNEL', 'DEF PERS', 'pff_DLTECHNIQUES', 'pff_MDUN_O_FRONT', 'FRONT', 'Front Family', 'FRONT ADJ', 'pff_STUNT', 'STUNT 1', 'STUNT 2', 'pff_BLITZDOG', 'pff_MDUN_O_BLITZERDB', 'pff_MDUN_O_BLITZERLB', 'BLITZ', 'Blitz Family', 'BLITZ TYPE', 'COVER', 'COVER FAMILY', 'QB Comments', 'Open/Closed', 'Man/Zone', 'pff_MDUN_O_FORMATION', 'pff_MDUN_D_FO

In [2]:
# Columns needed for the Front Family analysis; everything else is skipped
selected_columns = [
    "pff_HASH", "pff_SPOTLEFT",
    "pff_DLTECHNIQUES", "pff_MDUN_O_FRONT",
    "Front Family",
]

# Re-read the CSV, parsing only the listed columns.
# (This is a column projection: it limits columns, it does not filter rows.)
df_push = pl.read_csv("familyData.csv", columns=selected_columns)

# Preview the first five rows
print(df_push.head(5))

# Persist the reduced table for the modelling cells.
# Some rows contain nulls; no null handling is done at this point.
df_push.write_csv("familyDataPush.csv")


shape: (5, 5)
┌──────────┬──────────────┬────────────────────┬──────────────────┬──────────────┐
│ pff_HASH ┆ pff_SPOTLEFT ┆ pff_DLTECHNIQUES   ┆ pff_MDUN_O_FRONT ┆ Front Family │
│ ---      ┆ ---          ┆ ---                ┆ ---              ┆ ---          │
│ str      ┆ i64          ┆ str                ┆ str              ┆ str          │
╞══════════╪══════════════╪════════════════════╪══════════════════╪══════════════╡
│ R        ┆ 33           ┆ DLT (18); DRT (27) ┆ EVEN G           ┆ FOG          │
│ R        ┆ 33           ┆ DLT (19); DRT (27) ┆ EVEN G           ┆ FOG          │
│ R        ┆ 33           ┆ DLT (19); DRT (27) ┆ EVEN G           ┆ FOG          │
│ R        ┆ 33           ┆ DLT (19); DRT (27) ┆ EVEN G           ┆ FOG          │
│ R        ┆ 33           ┆ DLT (19); DRT (28) ┆ EVEN G           ┆ FOG          │
└──────────┴──────────────┴────────────────────┴──────────────────┴──────────────┘


In [3]:
# Drop rows with a null 'Front Family' so nulls do not form their own group
df_filtered = df_push.filter(pl.col("Front Family").is_not_null())

# Count rows per 'Front Family'.
# pl.len() is the replacement for pl.count(), deprecated in polars 0.20.5.
front_family_counts = (
    df_filtered.group_by("Front Family")
      .agg(pl.len().alias("Count"))
)

# Convert to pandas for manipulation and plotting
pdf = front_family_counts.to_pandas()

# Calculate each family's share of the non-null rows
total = pdf["Count"].sum()
pdf["Percentage"] = (pdf["Count"] / total) * 100

# Make 'Front Family' an ordered categorical so the bars are sorted by
# percentage. group_by returns groups in no guaranteed order, so ties are
# broken by name to keep the bar order identical from run to run.
pdf["Front Family"] = pd.Categorical(
    pdf["Front Family"],
    categories=pdf.sort_values(["Percentage", "Front Family"], ascending=True)["Front Family"],
    ordered=True
)

# Horizontal bar chart of the percentages
plot = (
    ggplot(pdf, aes(x="Front Family", y="Percentage")) +
    geom_bar(stat="identity", fill="#3498db") +
    coord_flip() +
    labs(
        title="Percentage by Front Family",
        x="Front Family",
        y="Percentage (%)"
    ) +
    theme_classic()  # White background
)

# Save the plot
plot.save("front_family_percentages.png", dpi=300)


(Deprecated in version 0.20.5)
(Deprecated in version 0.20.5)


In [12]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def model_feature_importance_by_label(csv_path, label_value, save_dir="."):
    """
    Train a Random Forest to predict whether Front Family == label_value and
    save a bar chart of the 20 most important features.

    Parameters
    ----------
    csv_path : str
        CSV file to load. It must contain a "Front Family" column; every
        other column is used as a feature.
    label_value : str
        "Front Family" value treated as the positive class (one-vs-rest).
    save_dir : str, default "."
        Directory the PNG is written to; it is created if it does not exist.

    Returns
    -------
    None
        The plot is written to
        ``{save_dir}/feature_importance_{label_value}.png`` and the path is
        printed.

    Raises
    ------
    ValueError
        If, after dropping rows with a null "Front Family", the binary
        target does not contain both classes (for example when label_value
        never occurs in the data).
    """
    import os  # local import: the notebook's import cells do not bring in os

    # Load data; rows without a label cannot be used for training
    df = pd.read_csv(csv_path).dropna(subset=["Front Family"])

    # Binary one-vs-rest target
    y = (df["Front Family"] == label_value).astype(int)

    # With a single class the forest cannot learn anything and every
    # importance is zero, so fail loudly instead of saving a meaningless plot.
    if y.nunique() < 2:
        raise ValueError(
            f"Cannot model Front Family == {label_value!r}: after dropping "
            f"null labels the target needs both matching and non-matching "
            f"rows in {csv_path!r}."
        )

    # One-hot encode all string/categorical feature columns
    X = pd.get_dummies(df.drop(columns=["Front Family"]))

    # Train/test split. The held-out part is not evaluated here; the split is
    # kept so the fitted model (and its importances) match earlier runs.
    X_train, _, y_train, _ = train_test_split(X, y, stratify=y, random_state=42)

    # Train model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Feature importances, aligned with the columns of X
    importances = model.feature_importances_
    features = X.columns

    # Indices of the 20 largest importances, in ascending order so the most
    # important feature ends up at the top of the horizontal bar chart
    sorted_idx = importances.argsort()[-20:]

    # Make sure the output directory exists before writing into it
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    filename = f"{save_dir}/feature_importance_{label_value}.png"

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        ax.barh(features[sorted_idx], importances[sorted_idx])
        ax.set_xlabel("Feature Importance")
        ax.set_title(f"Top Features Predicting {label_value}")
        fig.tight_layout()

        # Save to disk
        fig.savefig(filename, dpi=300)
    finally:
        # Always release the figure, even if plotting or saving fails
        plt.close(fig)

    print(f"Feature importance plot saved as: {filename}")


In [13]:
# Produce one feature-importance plot per front family of interest,
# in the order FOG, FUG, FUN
for front_family in ("FOG", "FUG", "FUN"):
    model_feature_importance_by_label("familyDataPush.csv", front_family)


Feature importance plot saved as: ./feature_importance_FOG.png
Feature importance plot saved as: ./feature_importance_FUG.png
Feature importance plot saved as: ./feature_importance_FUN.png


The top features that contribute to predicting whether a row is labeled as FOG:

- pff_MDUN_O_FRONT_EVEN: The strongest indicator.
- pff_SPOTLEFT: Higher or specific values of this numeric feature help differentiate FOG.
- Techniques like DLT (19); DRT (27) and DLT (19); DRT (28): These specific combinations of defensive line techniques are common in FOG cases.
- Hashes (pff_HASH_L, pff_HASH_R, etc.): The ball alignment also plays a moderate role.

# Why a Random Forest?
The Random Forest model is a robust ensemble learning method that combines multiple decision trees to improve prediction accuracy and control overfitting. It is particularly effective for classification tasks with complex interactions among features, making it suitable for our dataset.
