In [9]:
import pandas as pd

In [10]:
# Load the player valuation table. Per the output below it has one row per valuation,
# with columns such as player_id, age_at_valuation, position, market_value_in_eur,
# date and player_club_id (player_club_id is empty for some rows).
df = pd.read_csv('data/player_valuations_with_age_and_club.csv')
df

Unnamed: 0,player_id,name,age_at_valuation,position,height_in_cm,foot,market_value_in_eur,date,player_club_id
0,11757,Adam Kwarasey,24.553046,Goalkeeper,190.0,right,750000,2012-07-01,
1,22474,Martin Örnskov,26.724162,Midfield,185.0,left,1000000,2012-07-01,
2,24112,Rune Jarstein,27.753593,Goalkeeper,192.0,right,1000000,2012-07-01,
3,26751,Johan Björdal,26.157426,Defender,188.0,right,750000,2012-07-01,
4,37287,Tarik Elyounoussi,24.353183,Attack,172.0,right,1500000,2012-07-01,
...,...,...,...,...,...,...,...,...,...
351247,896154,Leonid Mina,19.482546,Attack,182.0,right,100000,2023-09-21,2671.0
351248,932259,Konstantinos Chrysopoulos,20.336756,Defender,193.0,left,500000,2023-09-21,2441.0
351249,950091,Stefanos Tzimas,17.705681,Attack,184.0,both,1000000,2023-09-21,1091.0
351250,1166093,Georgios Konstantakopoulos,18.724162,Midfield,182.0,both,50000,2023-09-21,30120.0


In [11]:
def analyse_club(club_id, max_age=24, min_stay_at_club=1, max_age_at_start=21, position=None,
                 debug=False, data=None):
    """Summarise how the market value of a club's young players developed.

    For every player of ``club_id`` the first and last valuation recorded
    before ``max_age`` (ordered by ``date``) are compared, and the relative
    value change is converted into a per-year figure.

    Parameters
    ----------
    club_id
        Value matched against the ``player_club_id`` column.
    max_age : float
        Only valuations with ``age_at_valuation < max_age`` are considered.
    min_stay_at_club : float
        Minimum span (in years of age) between a player's first and last
        considered valuation; players with a shorter span are dropped.
    max_age_at_start : float
        Players whose first considered valuation was at this age or older
        are dropped.
    position : str or None
        ``None`` or ``'All'`` keeps every position, any other value keeps only
        rows whose ``position`` equals it. Note that only the literal
        ``'All'`` raises the minimum sample size to 10 players; ``None`` uses
        6 like a single position.
    debug : bool
        When true, return the per-player DataFrame instead of the summary.
        The too-few-players early return below still yields ``(0, 0, 0)``.
    data : pandas.DataFrame or None
        Valuation table to analyse. Defaults to the notebook-level ``df``.

    Returns
    -------
    tuple or pandas.DataFrame
        ``(median_increase_per_year, perc_1_5, perc_2)``: the median of the
        per-year figure and the share of players whose figure exceeds 1.5
        and 2 respectively. ``(0, 0, 0)`` when too few players qualify.

    NOTE(review): the per-year figure is the ``age_diff``-th root of the
    *relative change* ``(last - first) / first``, not of the value ratio
    ``last / first``, so it is not a conventional annualised growth rate and
    ``> 2`` does not mean "doubled per year" -- confirm this is intended.
    """
    valuations = df if data is None else data

    club = valuations[(valuations['player_club_id'] == club_id)
                      & (valuations['age_at_valuation'] < max_age)]
    # Chronological order makes 'first'/'last' below mean earliest/latest valuation.
    club = club.sort_values(by='date')

    if position is not None and position != 'All':
        club = club[club['position'] == position]

    players = club.groupby(['player_id', 'name', 'position']).agg(
        {'market_value_in_eur': ['first', 'last'],
         'age_at_valuation': ['first', 'last']})
    players.columns = ['first_market_value_in_eur', 'last_market_value_in_eur',
                       'first_age_at_valuation', 'last_age_at_valuation']
    players = players.reset_index()

    players['age_diff'] = players['last_age_at_valuation'] - \
                          players['first_age_at_valuation']
    eligible = (players['age_diff'] > min_stay_at_club) & \
               (players['first_age_at_valuation'] < max_age_at_start)
    # Explicit copy: new columns are added below, and assigning onto a filtered
    # slice would raise pandas' SettingWithCopyWarning.
    players = players[eligible].copy()

    # Require a minimum sample so a handful of players cannot dominate the score.
    min_players = 10 if position == 'All' else 6
    if len(players) < min_players:
        return 0, 0, 0

    players['market_value_diff'] = players['last_market_value_in_eur'] - \
                                   players['first_market_value_in_eur']
    players['market_value_diff_percent'] = players['market_value_diff'] / \
                                           players['first_market_value_in_eur']

    # Take the age_diff-th root of the relative change to make stays of
    # different length comparable.
    players['market_value_diff_percent_per_year'] = players['market_value_diff_percent'] ** \
                                                    (1 / players['age_diff'])
    # The root is NaN when it is undefined (typically a negative relative change,
    # i.e. the value dropped); mark those players with the sentinel -1.
    players['market_value_diff_percent_per_year'] = \
        players['market_value_diff_percent_per_year'].fillna(-1)

    if debug:
        return players

    per_year = players['market_value_diff_percent_per_year']
    n_players = len(players)

    median_increase_per_year = per_year.median()
    perc_1_5 = int((per_year > 1.5).sum()) / n_players
    perc_2 = int((per_year > 2).sum()) / n_players

    return median_increase_per_year, perc_1_5, perc_2

In [18]:
# Per-position result lists; each entry is
# (club_id, median_increase_per_year, perc_1_5, perc_2) as returned by analyse_club.
positions = {
    "Defender": [],
    "Midfield": [],
    "Attack": [],
    "All": []
}

# Clubs to score. Valuations without a club id (NaN) are dropped up front, and the
# list is computed once instead of once per position.
club_ids = df['player_club_id'].dropna().unique()

for position in positions:
    for club_id in club_ids:
        median_increase_per_year, perc_1_5, perc_2 = \
            analyse_club(club_id, max_age=23, position=position)
        result = (club_id, median_increase_per_year, perc_1_5, perc_2)
        print(result)
        positions[position].append(result)


(317.0, 1.4426980607686692, 0.4166666666666667, 0.16666666666666666)
(6414.0, 0, 0, 0)
(190.0, 1.9994024520298161, 0.5, 0.5)
(865.0, 1.2812292416210311, 0.3333333333333333, 0.16666666666666666)
(2778.0, 1.7017088253183097, 0.5882352941176471, 0.29411764705882354)
(2414.0, 0, 0, 0)
(173.0, 1.0, 0.2222222222222222, 0.1111111111111111)
(1177.0, 1.4033220594135405, 0.3, 0.2)
(5724.0, 0, 0, 0)
(678.0, 1.239791953338447, 0.3333333333333333, 0.1111111111111111)
(5817.0, 1.0, 0.2857142857142857, 0.14285714285714285)
(1053.0, 1.3418431362396435, 0.2857142857142857, 0.14285714285714285)
(206.0, 1.0, 0.2727272727272727, 0.2727272727272727)
(3426.0, 0, 0, 0)
(2578.0, 0, 0, 0)
(964.0, 0, 0, 0)
(1083.0, 0.8175531220022518, 0.25, 0.25)
(11107.0, 0, 0, 0)
(84.0, 0, 0, 0)
(3725.0, 0, 0, 0)
(121.0, 1.1094747849550677, 0.3333333333333333, 0.16666666666666666)
(6251.0, 0, 0, 0)
(10719.0, 0, 0, 0)
(932.0, 0, 0, 0)
(697.0, 0, 0, 0)
(306.0, 1.295804897737454, 0.2727272727272727, 0.09090909090909091)
(621.0, 

In [19]:
# Club metadata table; save_to_df below reads its 'club_id' and 'name' columns.
clubs_df = pd.read_csv('data/clubs.csv')

def save_to_df(key=1, n=10):
    """Build a ranking table of the top clubs for every entry in ``positions``.

    Reads the notebook-level ``positions`` dict (position -> list of result
    tuples whose element 0 is the club id) and ``clubs_df`` (for club names).

    Parameters
    ----------
    key : int
        Index into each result tuple of the value to rank by, descending.
    n : int
        Maximum number of clubs per position. Positions with fewer than ``n``
        results contribute all of their results instead of raising.

    Returns
    -------
    pandas.DataFrame
        Columns ``club_name``, ``club_id``, ``position``, ``value_increase``,
        grouped by position in the order of ``positions``.

    Raises
    ------
    IndexError
        If a club id has no matching row in ``clubs_df``.
    """
    rows = []
    for position, results in positions.items():
        ranked = sorted(results, key=lambda result: result[key], reverse=True)
        # Slice instead of indexing range(n) so short result lists are handled;
        # max() keeps a negative n meaning "no rows".
        for result in ranked[:max(n, 0)]:
            club_id = result[0]
            club_name = clubs_df[clubs_df['club_id'] == club_id]['name'].iloc[0]
            rows.append([club_name, int(club_id), position, result[key]])
    return pd.DataFrame(rows, columns=['club_name', 'club_id', 'position', 'value_increase'])

# Build the ranking table with the defaults (rank by tuple element 1, up to 10 clubs
# per position) and display it.
df2 = save_to_df()
df2

Unnamed: 0,club_name,club_id,position,value_increase
0,Valencia CF,1049,Defender,2.454419
1,Villarreal CF,1050,Defender,2.29546
2,Hertha BSC,44,Defender,2.227763
3,SC Freiburg,60,Defender,2.181853
4,SL Benfica,294,Defender,2.175313
5,Ajax Amsterdam,610,Defender,2.103348
6,Feyenoord Rotterdam,234,Defender,2.024306
7,West Ham United,379,Defender,2.000617
8,FC Copenhagen,190,Defender,1.999402
9,Olympique Lyon,1041,Defender,1.98528


In [14]:
# Inspect the per-player table behind club 281's score: debug=True returns the
# DataFrame instead of the summary tuple (all positions, valuations before age 23).
analyse_club(club_id=281, debug=True, position='All', max_age=23)

Unnamed: 0,player_id,name,position,first_market_value_in_eur,last_market_value_in_eur,first_age_at_valuation,last_age_at_valuation,age_diff,market_value_diff,market_value_diff_percent,market_value_diff_percent_per_year
3,121474,Emyr Huws,Midfield,300000,1500000,20.364134,22.921287,2.557153,1200000,4.0,1.719656
4,134425,Raheem Sterling,Attack,50000000,50000000,20.848734,22.874743,2.02601,0,0.0,0.0
5,143559,Matija Nastasic,Defender,17000000,21000000,19.838467,21.374401,1.535934,4000000,0.235294,0.38983
11,195605,José Pozo,Midfield,500000,3000000,19.482546,22.206708,2.724162,2500000,5.0,1.805434
16,258878,Tosin Adarabioyo,Defender,250000,3500000,19.200548,22.061602,2.861054,3250000,13.0,2.451022
20,277179,Angeliño,Defender,300000,16000000,20.054757,22.9295,2.874743,15700000,52.333333,3.961771
22,295330,Kelechi Iheanacho,Attack,750000,15000000,19.028063,20.733744,1.705681,14250000,19.0,5.619564
26,363205,Gabriel Jesus,Attack,25000000,70000000,19.88501,22.685832,2.800821,45000000,1.8,1.233508
29,398184,Ferran Torres,Attack,45000000,45000000,20.621492,21.815195,1.193703,0,0.0,0.0
30,406635,Phil Foden,Attack,5000000,110000000,17.598905,22.433949,4.835044,105000000,21.0,1.877007
