In [20]:
import os
import pandas as pd
import numpy as np
from typing import List
from sklearn.preprocessing import PowerTransformer
from scipy.stats import skew, shapiro

pd.set_option('display.max_rows', 500)

os.chdir("C:/Users/jerem/OneDrive/Documents/rocketleague_ml")
file_name = "features"

processed_file = f"data/features/{file_name}.csv"
features = pd.read_csv(processed_file, low_memory=False) # type: ignore
features.columns.values

array(['Fifty-Fifty Touch Percentage', 'Towards Goal Touch Percentage',
       'Towards Teammate Touch Percentage',
       'Towards Opponent Touch Percentage',
       'Towards Open Space Touch Percentage',
       'Percent Time while Closest to Ball',
       'Average Stint while Closest to Ball',
       'Percent Time while Farthest from Ball',
       'Average Stint while Farthest from Ball',
       'Average Distance to Ball', 'Average Distance to Teammates',
       'Average Distance to Opponents', 'Percent Time In Offensive Half',
       'Average Stint In Offensive Half',
       'Percent Time In Defensive Half',
       'Average Stint In Defensive Half', 'Percent Time In Left Half',
       'Average Stint In Left Half', 'Percent Time In Right Half',
       'Average Stint In Right Half', 'Percent Time In Highest Half',
       'Average Stint In Highest Half', 'Percent Time In Lowest Half',
       'Average Stint In Lowest Half', 'Percent Time In Offensive Third',
       'Average Stint In Off

In [21]:
# Define spatial keywords by dimension
field_zones = ["Offensive", "Neutral", "Defensive"]
field_lateral = ["Left", "Middle", "Right"]
field_height = ["Lowest", "Middle-Aerial", "Highest"]

def count_keywords(name: str, keywords: List[str]):
    return sum(1 for k in keywords if k in name)

def is_nested_spatial_col(name: str):
    # Count how many distinct dimensions appear
    zone_hits = count_keywords(name, field_zones)
    lat_hits = count_keywords(name, field_lateral)
    height_hits = count_keywords(name, field_height)
    total_hits = zone_hits + lat_hits + height_hits
    
    # Drop if column name contains 2+ of these components
    return total_hits >= 2 and "Third" in name  # only apply to positional "Third" metrics

drop_nested_spatial = [c for c in features.columns if is_nested_spatial_col(c)]

drop_features = [
    # --- Redundant "Average Stint ..." versions (keep only Percent Time) ---
    *[c for c in features.columns if "Average Stint" in c],

    # --- Team-level or opponent-level fields ---
    *[c for c in features.columns if any(x in c for x in [
        "Team in", "Team With", "Team with", "Opponent Team", "Percentage Blue", "Percentage Orange"
    ])],

    # --- Aggregated spatial layers (too coarse or sums of others) ---
    *[c for c in features.columns if any(x in c for x in [
        "Half"
    ])],

    # --- Nested redundant thirds (we keep basic Third layers, not all cross-combos) ---
    *drop_nested_spatial,

    # --- Rotations: drop one full system (we’ll keep "Full" and drop "Simple") ---
    *[c for c in features.columns if "Simple" in c],

    # --- Speed bins: keep Slow, Boost Speed, Supersonic, Average Speed only ---
    *[c for c in features.columns if any(x in c for x in [
        "Semi-", "Medium-Speed",
    ]) and "Speed" not in c],

    # --- Boost ranges: drop all detailed bin segments ---
    *[c for c in features.columns if any(x in c for x in [
        # "<=25", ">25", "<= 50", ">50", "<= 75", ">75", "<= 100"
    ])],

    # --- Redundant Boost Efficiency subtypes (keep just "Boost Efficiency" and maybe "Far From Ball") ---
    *[c for c in features.columns if any(x in c for x in [
        "Supersonic Speed Boost Efficiency",
        "Drive to Boost Speed Boost Efficiency",
        "Simple Boost Efficiency"
    ])],

    # --- Scored/outcome metrics (not behavior) ---
    "Scored Goal",
    "Team Scored Goal",
]
features = features.drop(columns=drop_features, errors='ignore')

In [22]:
results = []
for col in features.columns:
    if pd.api.types.is_numeric_dtype(features[col]):
        data = features[col].dropna()
        s = skew(data)

        # Skip if not numeric or near-symmetric
        if np.isnan(s) or abs(s) < 0.5:
            continue
        # Shapiro-Wilk test: tests normality (p < 0.05 means NOT normal)
        stat, p = shapiro(data.sample(min(len(data), 5000), random_state=42))  # cap sample size
        results.append({
            "feature": col,
            "skewness": s,
            "shapiro_p": p
        })

skew_features = pd.DataFrame(results).sort_values("skewness", ascending=False)
skew_features

  s = skew(data)
  res = hypotest_fun_out(*samples, **kwds)


Unnamed: 0,feature,skewness,shapiro_p
36,Boost Efficiency,21.828044,1.988141e-45
35,Far From Ball Boost Efficiency,21.744522,2.039253e-45
25,Percent Time While Boosting,14.83223,9.724063e-45
31,Percent Time With Full Boost,11.551655,5.872601999999999e-44
43,Percent Rotating From Full 3 to 1,8.756227,2.7789620000000002e-43
19,Percent Time while Stationary,7.720916,2.4223839999999998e-42
41,Percent Rotating From Full 1 to 3,7.327582,3.4428229999999996e-42
18,Percent Time In Opponents Goal,6.971626,1.376906e-40
20,Percent Time while Slow,6.266241,5.412251e-38
17,Percent Time In Own Goal,4.921164,4.207525e-37


In [26]:
# Make a working copy
skewed_features = features.copy()

records = []  # to collect before/after skewness

for col in skewed_features.select_dtypes(include=["float", "int"]).columns:
    data = skewed_features[col].dropna() * 100
    if len(data) == 0:
        continue

    s_before = skew(data)

    # Skip if not numeric or near-symmetric
    if np.isnan(s_before) or abs(s_before) < 0.5:
        continue

    # Special-case extreme zero-heavy fields
    special_cols = ["Boost Efficiency", "Far From Ball Boost Efficiency", "Percent Time while Slow"]
    if col in special_cols:
        x = np.log1p(data * 100)
        pt = PowerTransformer(method="yeo-johnson")
        data = pt.fit_transform(x.values.reshape(-1, 1)).flatten()
        method = "log1p+yeo-johnson"
    else:
        # Apply transform depending on skew direction
        if s_before > 1:
            # Heavily right-skewed
            if (data <= 1).all() and (data == 0).any():
                # Zero-heavy proportion → Yeo-Johnson
                pt = PowerTransformer(method='yeo-johnson')
                data = pt.fit_transform(data).flatten()
                method = "yeo-johnson"
            else:
                # Count-like → log1p
                data = np.log1p(data)
                method = "log1p"
        elif s_before > 0.5:
            data = np.sqrt(data)
            method = "sqrt"
        elif s_before < -0.5:
            # left-skewed — use square or exp to pull tail right
            data = np.power(data, 2)
            method = "square"
        else:
            method = "none"

    s_after = skew(data)

    # if np.isnan(s_after) or abs(s_after) < 0.5:
    #     continue

    records.append({
        "feature": col,
        "skew_before": s_before,
        "skew_after": s_after,
        "transform": method
    })

# Create comparison table
skew_compare = pd.DataFrame(records).sort_values("skew_after", ascending=False).reset_index(drop=True)
skew_compare


  s_before = skew(data)


Unnamed: 0,feature,skew_before,skew_after,transform
0,Percent Time While Boosting,14.83223,6.137107,log1p
1,Percent Time With Full Boost,11.551655,5.544611,log1p
2,Percent Rotating From Full 3 to 1,8.756227,4.488561,log1p
3,Percent Rotating From Full 1 to 3,7.327582,3.715727,log1p
4,Percent Time In Opponents Goal,6.971626,2.888428,log1p
5,Percent Time while Stationary,7.720916,2.398218,log1p
6,Boost Efficiency,21.828044,2.19477,log1p+yeo-johnson
7,Far From Ball Boost Efficiency,21.744522,1.99221,log1p+yeo-johnson
8,Percent Time On Front Wall,3.109558,1.815974,log1p
9,Average Overfill,1.788854,1.788854,log1p


In [24]:
features[skew_compare["feature"]].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,524,525,526,527,528,529,530,531,532,533
Boost Efficiency,5.297655e-31,1.313748e-14,9.877749e-07,2.728388e-11,6.149588000000001e-32,0.223389,7.957653000000001e-29,2.569619e-11,0.0,2.139881e-10,...,4.46137e-21,6.571524e-07,4.958158e-43,5.215143e-17,1.288813e-12,9e-06,2.235776e-41,1.323633e-13,1.28658e-13,2.6e-05
Far From Ball Boost Efficiency,5.297655e-31,1.097304e-12,4.317803e-06,3.473368e-09,2.279976e-28,0.24821,8.461438e-25,7.483562e-10,0.0,1.154807e-09,...,8.394856e-21,6.571524e-07,7.913064e-40,4.355929e-15,1.030164e-06,0.000201,1.5597589999999999e-37,2.017426e-13,1.792149e-12,2.6e-05
Percent Time while Slow,0.0673646,0.09334154,0.03487447,0.1211867,0.05672987,0.144721,0.03955501,0.04104517,0.08429,0.007493664,...,0.1345145,0.02275763,0.0789487,0.03723911,0.02131147,0.022758,0.07003195,0.1326241,0.0897094,0.034523
Percent Time While Boosting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Percent Rotating From Full 3 to 1,0.0,0.0,0.0,0.0,0.0,0.0,0.1818182,0.125,0.0,0.0,...,0.0,0.0,0.08888889,0.0,0.0,0.0,0.02941176,0.0,0.0,0.0
Percent Time With Full Boost,0.0,0.0,0.0,0.0,0.01783905,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008099389,0.0
Percent Rotating From Full 1 to 3,0.0,0.06666667,0.0,0.0,0.0,0.0,0.1470588,0.07142857,0.0,0.0,...,0.0,0.0,0.07317073,0.0,0.0,0.0,0.04651163,0.0,0.0,0.0
Percent Time In Opponents Goal,0.0,0.06922888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001122164,0.0,0.0,0.0,0.0,0.0,0.008071303,0.0
Percent Time while Stationary,0.00111378,0.06139815,0.0,0.0,0.002548313,0.0,0.0,0.00143983,0.0,0.0,...,0.05936061,0.0,0.022488,0.0,0.0,0.0,0.0191523,0.07932888,0.05995411,0.0
Percent Time In Highest Third,0.02301965,0.0,0.01453154,0.0,0.01561043,0.0,0.02190644,0.0,0.0,0.0,...,0.0,0.0,0.02501607,0.0,0.03847226,0.0,0.0,0.0,0.02566169,0.0
