# Allsvenskan Transfers 2020-2025

Clean dataset of all transfers involving Allsvenskan:
- **Entry:** Players who joined Allsvenskan from another league
- **Exit:** Players who left Allsvenskan for another league
- **Intra:** Team changes within Allsvenskan

In [None]:
import pandas as pd
from pathlib import Path

# Root folder of the thesis data, given relative to the working directory.
DATA_PATH = Path('../../thesis_data')

# Processed outputs go into a sub-folder of DATA_PATH. Only this last level is
# created if missing; DATA_PATH itself has to exist already.
OUTPUT_PATH = DATA_PATH.joinpath('processed')
OUTPUT_PATH.mkdir(exist_ok=True)

## 1. Load Data

In [None]:
# Transfer history
transfer_history_file = DATA_PATH / 'tm_data/transfer_history_all.parquet'
th = pd.read_parquet(transfer_history_file)
print(f"Transfer history: {len(th):,} records")

# Names from Twelve (wy_player_id -> short_name).
# When a player_id appears on several rows, the first row's short_name wins.
twelve_file = DATA_PATH / 'raw_data_twelve/Twelve/male_transfer_model.parquet'
twelve = pd.read_parquet(twelve_file)
one_row_per_player = twelve.drop_duplicates(subset='player_id')
name_lookup = one_row_per_player.set_index('player_id')['short_name'].to_dict()
print(f"Name lookup: {len(name_lookup):,} players")

## 2. Filter Allsvenskan Transfers (2020-2025)

In [None]:
# Parse dates. Values that cannot be parsed become NaT, get a missing year,
# and therefore never satisfy the year window below.
th['transfer_date'] = pd.to_datetime(th['date'], errors='coerce')
th['transfer_year'] = th['transfer_date'].dt.year

# Row masks.
# NOTE(review): this is a case-insensitive substring match, so any competition
# whose name merely contains "Allsvenskan" is matched too — confirm against the
# distinct competition names present in the data.
is_from_allsv = th['competition_name_from'].str.contains('Allsvenskan', case=False, na=False)
is_to_allsv = th['competition_name_to'].str.contains('Allsvenskan', case=False, na=False)
in_window = th['transfer_year'].between(2020, 2025)  # both bounds inclusive

# Categorize: each in-window row touching Allsvenskan lands in exactly one frame.
entries = th[~is_from_allsv & is_to_allsv & in_window].copy()   # joined the league
exits = th[is_from_allsv & ~is_to_allsv & in_window].copy()     # left the league
intra = th[is_from_allsv & is_to_allsv & in_window].copy()      # moved within it

for frame, label in ((entries, 'entry'), (exits, 'exit'), (intra, 'intra')):
    frame['transfer_type'] = label

# Report the size of each category and the grand total.
counts = {'Entries': len(entries), 'Exits': len(exits), 'Intra': len(intra)}
for heading, n_rows in counts.items():
    print(f"{heading}: {n_rows:,}")
print(f"Total: {sum(counts.values()):,}")

## 3. Build Clean Dataset

In [None]:
# Combine the three categories into a single frame
allsv = pd.concat([entries, exits, intra], ignore_index=True)

# `.dt.year` produces floats (2020.0, ...) as soon as one source date is NaT.
# Every row kept here passed the 2020-2025 window filter, so no year is
# missing and the column can safely be stored as whole numbers. This keeps the
# saved dataset and the per-year tables free of ".0" years.
allsv['transfer_year'] = allsv['transfer_year'].astype('int64')

# Add player names from Twelve; ids that are not in the lookup stay missing
allsv['player_name'] = allsv['wy_player_id'].map(name_lookup)

# Rename player_id to tm_player_id for clarity
allsv = allsv.rename(columns={'player_id': 'tm_player_id'})

# Columns kept in the clean dataset, in output order
OUTPUT_COLUMNS = [
    # IDs
    'wy_player_id', 'tm_player_id', 'player_name',
    # Transfer info
    'transfer_type', 'transfer_date', 'transfer_year',
    # From
    'team_id_from', 'team_name_from',
    'competition_id_from', 'competition_name_from', 'competition_country_from',
    # To
    'team_id_to', 'team_name_to',
    'competition_id_to', 'competition_name_to', 'competition_country_to',
    # Value
    'age_at_transfer', 'transfer_fee', 'transfer_value',
    'remaining_contract_period', 'contract_until_date',
]

# Select columns (raises KeyError if any is absent), then order chronologically
allsv = (
    allsv[OUTPUT_COLUMNS]
    .sort_values(['transfer_date', 'wy_player_id'])
    .reset_index(drop=True)
)

# Name coverage: how many rows found a name in the Twelve lookup
has_name = allsv['player_name'].notna()

print(f"\nDataset shape: {allsv.shape}")
print(f"Players with name: {has_name.sum():,} ({has_name.mean()*100:.1f}%)")
allsv.head(10)

## 4. Quick Stats

In [None]:
# By year and type
print("TRANSFERS BY YEAR AND TYPE")
print("=" * 50)

# Row counts per (year, type); combinations that never occur are shown as 0.
pivot = allsv.pivot_table(
    index='transfer_year',
    columns='transfer_type',
    aggfunc='size',
    fill_value=0,
)

# Append a per-year total across all transfer types.
row_totals = pivot.sum(axis=1)
pivot['total'] = row_totals
print(pivot)

In [None]:
# Exit fees
print("EXIT FEES")
print("=" * 50)

# Keep exits with a strictly positive fee; rows with a missing fee fail the
# comparison and are left out as well.
is_exit = allsv['transfer_type'] == 'exit'
has_positive_fee = allsv['transfer_fee'] > 0
exit_fees = allsv[is_exit & has_positive_fee]

# Summary figures are printed in millions.
paid = exit_fees['transfer_fee']
print(f"Exits with fee > 0: {len(exit_fees):,}")
print(f"Total: €{paid.sum()/1e6:.1f}M")
print(f"Avg: €{paid.mean()/1e6:.2f}M")
print(f"Max: €{paid.max()/1e6:.2f}M")

In [None]:
# Top 20 exits by fee
print("TOP 20 EXITS BY FEE")
print("=" * 80)

exit_rows = allsv[allsv['transfer_type'] == 'exit']
top_exits = exit_rows.nlargest(20, 'transfer_fee')

# One ranked line per transfer: name, fee in millions, from-team → to-team (league).
for rank, row in enumerate(top_exits.itertuples(index=False), start=1):
    if pd.notna(row.player_name):
        name = row.player_name
    else:
        # No name found for this player: show the id instead.
        name = f"ID:{row.wy_player_id}"
    print(f"{rank:2}. {name:<22} €{row.transfer_fee/1e6:>5.2f}M  {row.team_name_from} → {row.team_name_to} ({row.competition_name_to})")

In [None]:
# Destination leagues (exits)
print("TOP DESTINATION LEAGUES (Exits)")
print("=" * 50)

# The 15 most frequent competitions that exiting players moved to.
exit_destinations = allsv.loc[allsv['transfer_type'] == 'exit', 'competition_name_to']
print(exit_destinations.value_counts().head(15).to_string())

In [None]:
# Source leagues (entries)
print("TOP SOURCE LEAGUES (Entries)")
print("=" * 50)

# The 15 most frequent competitions that entering players came from.
entry_sources = allsv.loc[allsv['transfer_type'] == 'entry', 'competition_name_from']
print(entry_sources.value_counts().head(15).to_string())

## 5. Save

In [None]:
# Write the clean dataset into the processed-data folder, without the row index.
output_file = OUTPUT_PATH / 'allsvenskan_transfers_2020_2025.parquet'
allsv.to_parquet(output_file, index=False)

# Confirmation: file name, row count and the column list that was written.
print(f"✅ Saved: {output_file.name}")
print(f"   {len(allsv):,} records")
print(f"   Columns: {list(allsv.columns)}")

---

## ⚠️ Data Gap

This dataset contains only **transfers**. It does not include:

- Players who played in Allsvenskan without making a transfer
- Performance metrics per season
- Minutes played

**Action:** Request complete Allsvenskan 2020-2025 dataset from Twelve.