# Premier League Tactical Landscape

EPL heatmap, global tactical map, team evolution, and style-vs-results analysis.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from IPython.display import display, Image
import warnings
warnings.filterwarnings('ignore')

# -- Paths --
# Use the first directory under Documents whose name contains both "Jorge" and
# "MacBook"; the data folders and the repository are resolved relative to it.
docs = Path("/Users/jorgepadilla/Documents")
for d in docs.iterdir():
    if "Jorge" in d.name and "MacBook" in d.name and d.is_dir():
        RAW = d / "thesis_data" / "raw_data"
        PROCESSED = d / "thesis_data" / "processed_data"
        REPO = d / "twelve-transfer-modelling"
        break
else:
    # Fail here with a clear message; otherwise the missing folder only shows
    # up further down as a NameError on REPO.
    raise FileNotFoundError(
        f"No directory with 'Jorge' and 'MacBook' in its name found under {docs}"
    )

# Output directory for saved figures; created on first run.
FIG_DIR = REPO / "clustering" / "team_dna" / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)

# One colour per tactical family; looked up by family_id.
FAMILY_COLORS = [
    '#1B4F72',
    '#922B21',
    '#1E8449',
    '#B7950B',
    '#6C3483',
]

# Per dimension: (label for negative z-scores, label for positive z-scores).
BIPOLAR = {
    "Defence": ("Low Block", "High Press"),
    "Def. Transition": ("Regroup", "Counter-Press"),
    "Att. Transition": ("Build-Up", "Counter-Attack"),
    "Attack": ("Short Passing", "Direct / Long"),
    "Penetration": ("Crosses", "Carries"),
    "Chance Creation": ("Sustained", "Direct Chances"),
    "Outcome": ("Worse", "Better"),
}

team_styles_dir = PROCESSED / "team_styles"

# Clustered team-seasons plus the six style columns used throughout.
qdf_clean = pd.read_parquet(team_styles_dir / "team_style_clusters.parquet")
style_dims = [
    "Defence",
    "Def. Transition",
    "Att. Transition",
    "Attack",
    "Penetration",
    "Chance Creation",
]

# Rebuild the family_id -> family_name lookup (and the family count) from the data.
family_pairs = qdf_clean[["family_id", "family_name"]].drop_duplicates()
family_map = family_pairs.set_index("family_id")["family_name"].to_dict()
FAMILY_NAMES = family_map
BEST_K = len(FAMILY_NAMES)

# Full quality table, kept for the background swarm in dot-strips.
qdf = pd.read_parquet(team_styles_dir / "team_qualities.parquet")

# Axis captions in "negative pole -> positive pole" form.
DISPLAY_NAMES = {
    "Defence": "Low Block \u2192 High Press",
    "Def. Transition": "Regroup \u2192 Counter-Press",
    "Att. Transition": "Build-Up \u2192 Counter-Attack",
    "Attack": "Short Passing \u2192 Direct/Long",
    "Penetration": "Crosses \u2192 Carries",
    "Chance Creation": "Sustained \u2192 Direct Chances",
}

n_team_seasons = len(qdf_clean)
print(f"Loaded: {n_team_seasons:,} clustered team-seasons, {BEST_K} families")

def save_and_show(fig, filename):
    """Write *fig* into FIG_DIR under *filename*, then display the saved image.

    The static file is shown instead of the interactive figure to keep the
    notebook output lightweight.
    """
    png_path = str(FIG_DIR / filename)
    fig.write_image(png_path, scale=2)
    saved_image = Image(filename=png_path)
    display(saved_image)


def twelve_dot_strip(team_row, all_data, dims, team_name, subtitle, filename,
                     bg='#1a2332', dot_bg='rgba(74,124,89,0.25)', dot_hl='#6ee06e'):
    """Draw a Twelve Football style dot-strip for one team and save it.

    One horizontal strip is drawn per dimension: every non-null value of
    ``all_data[dim]`` becomes a small jittered background dot and the team's
    own value a large highlighted marker. The BIPOLAR labels of the dimension
    are written at the left and right ends of its strip.

    Args:
        team_row: Object indexable by dimension name (e.g. a DataFrame row)
            holding the highlighted team's value for each entry of ``dims``.
        all_data: DataFrame with one column per entry of ``dims``; source of
            the background swarm.
        dims: Ordered dimension names; the first is drawn at the top.
        team_name: Shown in the title and in the hover box.
        subtitle: Smaller second line of the title.
        filename: File name passed on to ``save_and_show``.
        bg: Paper and plot background colour.
        dot_bg: Colour of the background dots.
        dot_hl: Colour of the highlighted team marker.
    """
    fig = go.Figure()
    n_dims = len(dims)

    for i, dim in enumerate(dims):
        all_vals = all_data[dim].dropna().values
        team_val = team_row[dim]
        # Seed depends only on the strip index, so the jitter is reproducible.
        rng = np.random.default_rng(42 + i)
        jitter = rng.uniform(-0.18, 0.18, len(all_vals))

        # Background swarm
        fig.add_trace(go.Scatter(
            x=all_vals, y=[i + j for j in jitter],
            mode='markers',
            marker=dict(color=dot_bg, size=4),
            showlegend=False, hoverinfo='skip',
        ))
        # Highlighted team
        fig.add_trace(go.Scatter(
            x=[team_val], y=[i],
            mode='markers',
            marker=dict(color=dot_hl, size=14,
                        line=dict(color='white', width=2)),
            showlegend=False,
            hovertemplate=f'{dim}: {team_val:+.2f}<extra>{team_name}</extra>',
        ))

    # Build y-axis tick labels with bipolar indicators
    tick_labels = list(dims)
    left_annots = []
    right_annots = []
    for i, dim in enumerate(dims):
        # Dimensions without a BIPOLAR entry fall back to a plain minus / plus.
        # The minus sign is written as an escape so it cannot be mis-encoded.
        left, right = BIPOLAR.get(dim, ("\u2212", "+"))
        left_annots.append(
            dict(x=-3.3, y=i, text=f"<i>{left}</i>", showarrow=False,
                 font=dict(color='#999', size=9), xanchor='right', yanchor='middle',
                 xref='x', yref='y')
        )
        right_annots.append(
            dict(x=3.3, y=i, text=f"<i>{right}</i>", showarrow=False,
                 font=dict(color='#999', size=9), xanchor='left', yanchor='middle',
                 xref='x', yref='y')
        )

    fig.update_layout(
        height=55 * n_dims + 120, width=750,
        template='plotly_dark',
        paper_bgcolor=bg, plot_bgcolor=bg,
        title=dict(
            text=(f"<b>{team_name}</b>"
                  f"<br><span style='font-size:12px;color:#aaa'>{subtitle}</span>"),
            font=dict(size=16, color='white'),
        ),
        yaxis=dict(
            tickvals=list(range(n_dims)), ticktext=tick_labels,
            tickfont=dict(color='white', size=11), showgrid=False,
            # Reversed so the first dimension sits at the top.
            autorange='reversed',
        ),
        xaxis=dict(
            showgrid=False, zeroline=True,
            zerolinecolor='rgba(255,255,255,0.2)', zerolinewidth=1,
            # Fixed range: values beyond +/-3.5 fall outside the visible strip.
            showticklabels=False, range=[-3.5, 3.5],
        ),
        annotations=left_annots + right_annots,
        margin=dict(t=80, b=30, l=140, r=120),
    )
    save_and_show(fig, filename)


## Premier League: Who Plays Like Who?

In [None]:
# team_id -> short display name used on the charts.
TEAM_NAMES = {
    1609: 'Arsenal',
    1610: 'Chelsea',
    1611: 'Man United',
    1612: 'Liverpool',
    1613: 'Newcastle',
    1614: 'Aston Villa',
    1616: 'Fulham',
    1619: 'Southampton',
    1620: 'West Brom',
    1623: 'Everton',
    1624: 'Tottenham',
    1625: 'Man City',
    1626: 'Watford',
    1627: 'Swansea',
    1628: 'Crystal Palace',
    1629: 'Wolves',
    1630: 'Leeds',
    1631: 'Leicester',
    1632: 'Sunderland',
    1633: 'West Ham',
    1634: 'Ipswich',
    1636: 'Sheffield Utd',
    1639: 'Stoke',
    1642: "Nott'm Forest",
    1644: 'Middlesbrough',
    1646: 'Burnley',
    1650: 'Huddersfield',
    1651: 'Brighton',
    1659: 'Bournemouth',
    1660: 'Luton',
    1669: 'Brentford',
    1672: 'Cardiff',
    1673: 'Norwich',
    10529: 'Sheffield Utd (2)',
    10531: 'Coventry',
}

# Rows with competition_id 364 (treated as the EPL here), three latest seasons.
epl_seasons = sorted(qdf_clean[qdf_clean["competition_id"] == 364]["season"].unique())
last_3 = epl_seasons[-3:]
print(f"EPL seasons: {last_3}")

epl = qdf_clean[(qdf_clean["competition_id"] == 364) & (qdf_clean["season"].isin(last_3))].copy()
# Unmapped team ids fall back to the id itself as a string.
epl["team_name"] = epl["team_id"].map(TEAM_NAMES).fillna(epl["team_id"].astype(str))

# Keep only teams present in every selected season so all heatmaps share rows.
team_season_counts = epl.groupby("team_name")["season"].nunique()
teams_all_3 = team_season_counts[team_season_counts == len(last_3)].index.tolist()
epl = epl[epl["team_name"].isin(teams_all_3)].copy()
print(f"Teams with all {len(last_3)} seasons: {len(teams_all_3)}")

all_teams_sorted = sorted(epl["team_name"].unique())
display_cols = [DISPLAY_NAMES[d] for d in style_dims]

# One heatmap trace per season; the buttons toggle which trace is visible.
fig = go.Figure()
buttons = []
for idx, season in enumerate(last_3):
    season_data = epl[epl["season"] == season].copy().set_index("team_name")
    z_matrix = np.full((len(all_teams_sorted), len(style_dims)), np.nan)
    hover_text = [['' for _ in style_dims] for _ in all_teams_sorted]
    display_text = [['' for _ in style_dims] for _ in all_teams_sorted]
    for i_row, team in enumerate(all_teams_sorted):
        if team not in season_data.index:
            continue
        r = season_data.loc[team]
        # Duplicate team rows come back as a DataFrame; use the first one.
        if isinstance(r, pd.DataFrame):
            r = r.iloc[0]
        # The family is the same for every dimension of this row.
        fam = r.get("family_name", "")
        for j_col, dim in enumerate(style_dims):
            val = r[dim]
            z_matrix[i_row, j_col] = val
            display_text[i_row][j_col] = f"{val:.2f}"
            hover_text[i_row][j_col] = f"{team}<br>{DISPLAY_NAMES[dim]}: {val:.2f}<br>Family: {fam}"
    visible = idx == 0
    fig.add_trace(go.Heatmap(
        z=z_matrix, x=display_cols, y=all_teams_sorted,
        colorscale='RdBu_r', zmin=-2, zmax=2,
        text=display_text, texttemplate="%{text}", textfont=dict(size=10),
        hovertext=hover_text, hovertemplate='%{hovertext}<extra></extra>',
        visible=visible,
        colorbar=dict(title="z-score"),
    ))
    s_label = f"{season}/{season+1}"
    vis = [False] * len(last_3)
    vis[idx] = True
    # Button args go to plotly.js unprocessed, so the title must be addressed
    # as 'title.text'; a bare string under 'title' is rejected by plotly.js v3.
    buttons.append(dict(label=s_label, method='update',
                        args=[{'visible': vis},
                              {'title.text': f'EPL Tactical Heatmap \u2014 {s_label}'}]))

fig.update_layout(
    updatemenus=[dict(type='buttons', direction='right',
                      x=0.5, xanchor='center', y=1.12, buttons=buttons,
                      showactive=True, bgcolor='#E8E8E8')],
    height=max(400, 28 * len(all_teams_sorted) + 100),
    width=900, template='plotly_white',
    title=f'EPL Tactical Heatmap \u2014 {last_3[0]}/{last_3[0]+1}',
    title_font_size=16, margin=dict(t=80, l=130, b=40),
)
# Heatmap is lightweight, show interactive
fig.show()


## Premier League in the Global Tactical Landscape

In [None]:
from sklearn.preprocessing import StandardScaler

# Re-standardize the style columns before projecting them to 2-D.
X_raw = qdf_clean[style_dims].values
scaler = StandardScaler()
X = scaler.fit_transform(X_raw)

# Prefer UMAP; any ImportError raised along the way switches to PCA instead.
try:
    import umap
    reducer = umap.UMAP(
        n_components=2,
        random_state=42,
        n_neighbors=30,
        min_dist=0.3,
    )
    embedding = reducer.fit_transform(X)
    dim_label = "UMAP"
except ImportError:
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    embedding = pca.fit_transform(X)
    dim_label = "PCA"

print(f"Using {dim_label}")


def _axis_correlations(axis):
    """Pearson r between one embedding axis and each tactical dimension."""
    coords = embedding[:, axis]
    return {d: np.corrcoef(coords, qdf_clean[d].values)[0, 1] for d in style_dims}


def _strongest_two(corrs):
    """The two (dimension, r) pairs with the largest absolute correlation."""
    ranked = sorted(corrs.items(), key=lambda pair: abs(pair[1]), reverse=True)
    return ranked[:2]


# Correlate axes with tactical dimensions
corr_x = _axis_correlations(0)
corr_y = _axis_correlations(1)

top_x = _strongest_two(corr_x)
top_y = _strongest_two(corr_y)

def axis_annotation(top_corrs):
    """Format (dimension, r) pairs as one axis caption.

    Each pair becomes "<arrow> <dimension> (r=<signed r>)", with an up arrow
    when r is positive and a down arrow otherwise; pairs are joined by "  |  ".
    """
    up, down = "\u2191", "\u2193"
    return "  |  ".join(
        f"{up if r > 0 else down} {dim} (r={r:+.2f})"
        for dim, r in top_corrs
    )

x_label = axis_annotation(top_x)
y_label = axis_annotation(top_y)
print(f"X-axis correlates with: {x_label}")
print(f"Y-axis correlates with: {y_label}")

# Background: every clustered team-season, faint, coloured by family.
fig = go.Figure()
for fid in range(BEST_K):
    mask = qdf_clean["family_id"].values == fid
    fig.add_trace(go.Scatter(
        x=embedding[mask, 0], y=embedding[mask, 1],
        mode='markers',
        marker=dict(color=FAMILY_COLORS[fid], size=4, opacity=0.12),
        name=FAMILY_NAMES.get(fid, f"Family {fid}"),
        hoverinfo='skip', showlegend=True, legendgroup=f'fam_{fid}',
    ))

# zip instead of last_3[0..2] so fewer than three seasons does not raise.
season_symbols = dict(zip(last_3, ('circle', 'square', 'diamond')))
season_legend_added = set()
qdf_clean_idx = qdf_clean.index.tolist()
# Index label -> row position in `embedding`, built once. setdefault keeps the
# first occurrence of a repeated label, as list.index would.
row_position = {}
for pos, label in enumerate(qdf_clean_idx):
    row_position.setdefault(label, pos)

# Foreground: one labelled marker per EPL team-season.
for season in last_3:
    s_data = epl[epl["season"] == season]
    s_label = f"{season}/{season + 1}"
    for _, r in s_data.iterrows():
        idx = row_position.get(r.name)
        if idx is None:
            continue
        # Only the first marker of each season gets a legend entry.
        show_legend = season not in season_legend_added
        season_legend_added.add(season)
        # Cast so list indexing also works if the id arrives as a float.
        fam_id = int(r["family_id"])
        fig.add_trace(go.Scatter(
            x=[embedding[idx, 0]], y=[embedding[idx, 1]],
            mode='markers+text',
            marker=dict(color=FAMILY_COLORS[fam_id], size=10,
                        symbol=season_symbols.get(season, 'circle'),
                        line=dict(color='black', width=1)),
            text=[r["team_name"]], textposition='top center',
            textfont=dict(size=9, color='black'),
            name=f"EPL {s_label}" if show_legend else None,
            showlegend=show_legend, legendgroup=f'epl_{season}',
            hovertemplate=f"{r['team_name']} ({s_label})<br>Family: {FAMILY_NAMES[fam_id]}<extra></extra>",
        ))

fig.update_layout(
    height=750, width=1050, template='plotly_white',
    title=f'Premier League in the Global Tactical Landscape ({dim_label})',
    title_font_size=18,
    xaxis_title=x_label, yaxis_title=y_label,
    xaxis=dict(showticklabels=False, title_font=dict(size=11, color='#555')),
    yaxis=dict(showticklabels=False, title_font=dict(size=11, color='#555')),
    legend=dict(orientation='v', yanchor='top', y=0.98, xanchor='left', x=1.02,
                font=dict(size=10),
                title=dict(text='Tactical Families / EPL Seasons', font=dict(size=11))),
)
save_and_show(fig, "umap_landscape.png")


## How Teams Evolve

In [None]:
# (team_id, display name) of the clubs that get an evolution chart.
focus_teams = [
    (1609, 'Arsenal'), (1612, 'Liverpool'),
    (1625, 'Man City'), (1651, 'Brighton'),
]

epl_all = qdf_clean[qdf_clean["competition_id"] == 364].copy()
epl_all["team_name"] = epl_all["team_id"].map(TEAM_NAMES).fillna(epl_all["team_id"].astype(str))
# Up to the five most recent seasons, one shade each (light -> dark).
avail_seasons = sorted(epl_all["season"].unique())[-5:]
SEASON_SHADES = ['#a8d5a2', '#6ec469', '#3da83e', '#1f8a1f', '#0d6b0d']

for tid, tname in focus_teams:
    t_data = epl_all[(epl_all["team_id"] == tid) & (epl_all["season"].isin(avail_seasons))]
    if len(t_data) == 0:
        continue
    seasons_present = sorted(t_data["season"].unique())
    n_seasons = len(seasons_present)

    fig = go.Figure()
    n_dims = len(style_dims)

    # Faint background swarm of all clustered team-seasons, one strip per dimension.
    for i, dim in enumerate(style_dims):
        all_vals = qdf_clean[dim].dropna().values
        rng = np.random.default_rng(42 + i)
        jit = rng.uniform(-0.18, 0.18, len(all_vals))
        fig.add_trace(go.Scatter(
            x=all_vals, y=[i + j for j in jit],
            mode='markers',
            marker=dict(color='rgba(74,124,89,0.08)', size=3),
            showlegend=False, hoverinfo='skip',
        ))

    for s_idx, season in enumerate(seasons_present):
        s_row = t_data[t_data["season"] == season].iloc[0]
        s_label = f"{season % 100:02d}/{(season + 1) % 100:02d}"
        color = SEASON_SHADES[min(s_idx, len(SEASON_SHADES) - 1)]
        # Spread the seasons evenly over [-0.2, +0.2] around each strip's
        # centre line; a team with a single season sits on the line itself.
        # The offset depends only on the season, so compute it once here.
        if n_seasons > 1:
            y_offset = -0.2 + s_idx * (0.4 / (n_seasons - 1))
        else:
            y_offset = 0.0
        for i, dim in enumerate(style_dims):
            val = s_row[dim]
            fig.add_trace(go.Scatter(
                x=[val], y=[i + y_offset],
                mode='markers',
                marker=dict(color=color, size=12,
                            line=dict(color='white', width=1.5)),
                # One legend entry per season: only the first dimension's trace shows.
                showlegend=(i == 0), name=s_label if i == 0 else None,
                legendgroup=s_label,
                hovertemplate=f'{dim}: {val:+.2f}<extra>{tname} {s_label}</extra>',
            ))

    # Pole labels at the left and right end of every strip.
    left_annots, right_annots = [], []
    for i, dim in enumerate(style_dims):
        left, right = BIPOLAR.get(dim, ("-", "+"))
        left_annots.append(dict(x=-3.3, y=i, text=f"<i>{left}</i>", showarrow=False,
                                font=dict(color='#999', size=9), xanchor='right',
                                yanchor='middle', xref='x', yref='y'))
        right_annots.append(dict(x=3.3, y=i, text=f"<i>{right}</i>", showarrow=False,
                                 font=dict(color='#999', size=9), xanchor='left',
                                 yanchor='middle', xref='x', yref='y'))

    fig.update_layout(
        height=55 * n_dims + 140, width=750,
        template='plotly_dark',
        paper_bgcolor='#1a2332', plot_bgcolor='#1a2332',
        title=dict(
            text=f"<b>{tname}</b><br><span style='font-size:12px;color:#aaa'>Tactical Evolution</span>",
            font=dict(size=16, color='white')),
        yaxis=dict(tickvals=list(range(n_dims)), ticktext=list(style_dims),
                   tickfont=dict(color='white', size=11), showgrid=False,
                   autorange='reversed'),
        xaxis=dict(showgrid=False, zeroline=True,
                   zerolinecolor='rgba(255,255,255,0.2)', zerolinewidth=1,
                   showticklabels=False, range=[-3.5, 3.5]),
        annotations=left_annots + right_annots,
        legend=dict(orientation='h', yanchor='bottom', y=-0.15,
                    xanchor='center', x=0.5, font=dict(color='white', size=10)),
        margin=dict(t=80, b=70, l=140, r=120),
    )
    save_and_show(fig, f"evolution_{tname.lower().replace(' ', '_')}.png")


## Style vs Results

In [None]:
# Mean Outcome score and size of every tactical family.
outcomes = []
for fid in range(BEST_K):
    mask = qdf_clean["family_id"] == fid
    family_outcome = qdf_clean.loc[mask, "Outcome"]
    outcomes.append(dict(
        family=FAMILY_NAMES[fid],
        outcome=family_outcome.mean(),
        n=mask.sum(),
        color=FAMILY_COLORS[fid],
    ))
outcomes_df = pd.DataFrame(outcomes).sort_values("outcome")

# Horizontal bars in ascending outcome order, labelled with the signed mean.
bar_labels = outcomes_df["outcome"].apply(lambda score: f"{score:+.2f}")
outcome_bars = go.Bar(
    x=outcomes_df["outcome"],
    y=outcomes_df["family"],
    orientation='h',
    marker_color=outcomes_df["color"],
    text=bar_labels,
    textposition='outside',
)
fig = go.Figure(outcome_bars)
fig.update_layout(
    title="Do Certain Playing Styles Win More?",
    title_font_size=18,
    xaxis_title="Average Outcome Score",
    height=400,
    width=700,
    template='plotly_white',
    margin=dict(l=180),
)
save_and_show(fig, "style_vs_results.png")

# Per-dimension correlation with Outcome; |r| <= 0.1 is reported as no link.
print("\nCorrelation between style and results:")
for dim in style_dims:
    r = qdf_clean[dim].corr(qdf_clean["Outcome"])
    if r > 0.1:
        arrow = "\u2192 wins more"
    elif r < -0.1:
        arrow = "\u2192 wins less"
    else:
        arrow = "\u2192 no clear link"
    print(f"  {dim:20s}  r={r:+.3f}  {arrow}")


## Assumptions & Design Decisions

In [None]:
# (title, detail) pairs printed as a numbered checklist below.
assumptions = [
    (
        "Division 1 filter [CRITICAL]",
        "Only top-flight leagues (division == 1)",
    ),
    (
        "higher_is_better flags",
        "See table in markdown cell above",
    ),
    (
        "Quality weights",
        "Per supervisor's teams_qualities.md",
    ),
    (
        "Z-score grouping",
        "Within (competition_id, season)",
    ),
    (
        "Minimum group size",
        "3 teams per (competition, season)",
    ),
    (
        "Number of families",
        "k = 5 (tested 3-8)",
    ),
    (
        "Outcome excluded from clustering",
        "Only 6 style dims used for K-Means",
    ),
]

rule = "=" * 60
print(rule)
print("  ASSUMPTIONS CHECKLIST")
print(rule)
for i, (title, detail) in enumerate(assumptions, 1):
    # Title line and indented detail line, emitted in a single call.
    print(f"  \u2705  {i}. {title}\n       {detail}")
print(rule)
# Dataset-level totals that close the checklist.
summary_lines = [
    f"\n  Total team-seasons: {len(qdf_clean):,}",
    f"  Total leagues:      {qdf_clean.competition_id.nunique()}",
    f"  Seasons:            {qdf_clean.season.min()}\u2013{qdf_clean.season.max()}",
    f"  Families:           {BEST_K}",
]
print("\n".join(summary_lines))
