In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
from matplotlib.ticker import FormatStrFormatter
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
pd.set_option("display.max_rows", None) #, "display.max_columns", None)

The following cell loads the data I collected from transfer windows in my own leagues.
It's broken down by position

In [2]:
Ngames = 24. # Data was taken after day 23 or 24 depending on the league, doesn't make a big difference. 

data_strikers = pd.read_csv("mercato_strikers.dat", delimiter = ',', comment='#')
data_mids = pd.read_csv("mercato_mids.dat", delimiter = ',', comment='#')
data_defs = pd.read_csv("mercato_defs.dat", delimiter = ',', comment='#')
data_keepers = pd.read_csv("mercato_keepers.dat", delimiter = ',', comment='#')

data_strikers["titu"] = data_strikers["titu"].div(100)
data_strikers["goals"] = data_strikers["goals"].div(Ngames)
data_mids["titu"] = data_mids["titu"].div(100)
data_mids["goals"] = data_mids["goals"].div(Ngames)
data_defs["titu"] = data_defs["titu"].div(100)
data_defs["goals"] = data_defs["goals"].div(Ngames)
data_keepers["titu"] = data_keepers["titu"].div(100)
data_keepers["goals"] = data_keepers["goals"].div(Ngames)

In [3]:
def KNR_predict(Xplayers,pos, nr):
    """ K-neighbors regressor """
    np.random.seed(nr)
    if pos=="A":
        X = data_strikers[['price','av','goals','titu']].copy()
        Y = data_strikers[["buy"]].copy()
    if pos=="M":
        X = data_mids[['price','av','goals','titu']].copy()
        Y = data_mids[["buy"]].copy()
    if pos=="D":
        X = data_defs[['price','av','goals','titu']].copy()
        Y = data_defs[["buy"]].copy()
    if pos=="G":
        X = data_keepers[['price','av','titu']].copy()
        Y = data_keepers[["buy"]].copy()

    X['price'] = X['price'].astype('float')
    sc_X = StandardScaler()
    sc_Y = StandardScaler()
    X = sc_X.fit_transform(X)
    Y = sc_Y.fit_transform(Y)

    # Splitting data into train and test
    train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.15)

    knn = KNeighborsRegressor(n_neighbors=5)
    knn.fit(X, Y)

    predict_y = knn.predict(test_x)
    #print("r2 score KNN", r2_score(test_y, predict_y) )

    Xout = Xplayers[Xplayers.pos==pos].copy()
    if pos=="G":
        Xp = sc_X.fit_transform(Xout.drop(columns=['name','pos','played','team','goals','time']))
    else:
        Xp = sc_X.fit_transform(Xout.drop(columns=['name','pos','played','team','time']))
    Xout["buy"] = np.ceil(sc_Y.inverse_transform(knn.predict(Xp)))
    return Xout, r2_score(test_y, predict_y)

In [4]:
def get_data(league):
    """ Collect data """
    data_file=league+"_full.csv"
    data = pd.read_csv(data_file, delimiter = ',', skiprows=2)
    data = data.rename(columns={"Joueur": "name", "Poste": "pos", \
                                "Côte": "price", "Note": "av", \
                                "Nb match": "played", "But": "goals", \
                                "Temps": "time", "Club": "team"})
    n = int(data.columns[26][1:]) # Number of games
    data = data[['name','pos','price','av','goals','played','time','team']]
    data['titu'] = data['played'].div(n).round(2)
    data['goals'] = data['goals'].div(n).round(4)
    data['pos'] = data['pos'].str.replace('DC','D')
    data['pos'] = data['pos'].str.replace('DL','D')
    data['pos'] = data['pos'].str.replace('MD','M')
    data['pos'] = data['pos'].str.replace('MO','M')
    return data, n

In [33]:
def pick_team(predicted_priceG, predicted_priceD, predicted_priceM, predicted_priceS):
    """ Pick a team that fits into the budget while satistying other constraints """
    budget = 500
    minutes = data.time.mean() # minimum number of minutes played
    best_keeper=0.95*predicted_priceG.av.max() # Only look at the best keepers
    while True:
        team = pd.DataFrame(columns = predicted_priceG.columns)
        # Add main keeper
        team = team.append(predicted_priceG[(predicted_priceG["titu"]>0.7) & (predicted_priceG["av"]>best_keeper)].sample(n=1))
        # Add secondary keepers
        goal_team = team[team["pos"]=="G"]["team"].iloc[0]
        goal_name = team[team["pos"]=="G"]["name"].iloc[0]
        sub_keepers = data_full[(data_full["team"]==goal_team) & (data_full["pos"]=="G") & (data_full["name"] != goal_name)]
        sub_keepers = sub_keepers.sort_values(by='price', ascending=False)
        sub_keepers["buy"] = 10
        sub_keepers.iat[1,9]=5
        team = team.append(sub_keepers.head(2))
        # Add defenders
        team = team.append(predicted_priceD[(predicted_priceD["time"]>minutes)].sample(n=6))
        # Add midfielders
        team = team.append(predicted_priceM[(predicted_priceM["time"]>minutes)].sample(n=6))
        # Add strikers
        team = team.append(predicted_priceS[(predicted_priceS["time"]>minutes)].sample(n=4))
        
        #print(team.loc[(team["price"] <= 12) & (team["pos"]!="G"), "buy"])
        team.loc[(team["price"] <= 13) & (team["pos"]!="G"), "buy"] = 13 
        
        cost = team["buy"].sum()
        average = team.drop(team.index[1:3])["av"].mean()
        titu = team.drop(team.index[1:3])["titu"].mean()
        goals = team["goals"].sum()
        average_def = team[team.pos=="D"].av.mean()
        goals_mid = team[(team.pos=="M")].goals.sum()
        goals_str = team[(team.pos=="A")].goals.sum()
        #print(average_def, goals_mid, goals_str)
        if (cost<=budget) and (titu>0.7) and (average>4.5) and (goals>1.8) and (average_def>5) and (goals_mid>0.1) and (goals_str>0.9):
            print("Total cost:",cost)
            print("Average:",average)
            print("Starting frequency:",titu)
            print("Goals per game:",goals)
            print(team.to_string(index=False))
            break

In [6]:
league="ligue1_21b"
data, n = get_data(league)
data_full = data.copy()
data[data.pos=="G"]=data[(data.pos=="G") & (data.played>0) & (data.price>10)]

Find a K-neighbors regressor with a good r^2 for all positions:

In [7]:
k=0
while True:
    predicted_priceG, r2G = KNR_predict(data[['name','pos','price','av','goals','played','team','titu','time']], "G", k)
    predicted_priceD, r2D = KNR_predict(data[['name','pos','price','av','goals','played','team','titu','time']], "D", k)
    predicted_priceM, r2M = KNR_predict(data[['name','pos','price','av','goals','played','team','titu','time']], "M", k)
    predicted_priceS, r2S = KNR_predict(data[['name','pos','price','av','goals','played','team','titu','time']], "A", k)
    r2=np.array([r2G,r2D,r2M,r2S])
    k+=1
    if(np.amin(r2)>0.7):
        print(k,r2)
        break
    #print()

86 [0.73065089 0.76125833 0.77489183 0.74565925]


Now pick a team:

In [53]:
pick_team(predicted_priceG, predicted_priceD, predicted_priceM, predicted_priceS)

Total cost: 476.0
Average: 5.198235294117647
Starting frequency: 0.8470588235294116
Goals per game: 1.9000000000000001
            name pos  price   av  goals  played    team  titu  time  buy
  Benítez Walter   G   18.0 5.72    0.0     9.0    Nice   0.9 810.0 40.0
    Bulka Marcin   G    7.0 0.00    0.0     0.0    Nice   0.0   0.0 10.0
 Boulhendi Teddy   G    3.0 0.00    0.0     0.0    Nice   0.0   0.0  5.0
     Disasi Axel   D   15.0 4.67    0.0     9.0  Monaco   0.9 711.0 24.0
  Laporte Julien   D   14.0 5.10    0.1    10.0 Lorient   1.0 900.0 24.0
El Hajjam Oualid   D   12.0 4.90    0.1    10.0  Troyes   1.0 606.0 13.0
        Reinildo   D   12.0 4.38    0.0     8.0   Lille   0.8 650.0 13.0
     Bard Melvin   D   19.0 6.06    0.1     8.0    Nice   0.8 638.0 26.0
   Salmier Yoann   D   10.0 5.14    0.0     7.0  Troyes   0.7 617.0 13.0
      Jean Lucas   M    9.0 4.86    0.0     7.0  Monaco   0.7 407.0 13.0
 Boudaoui Hichem   M   13.0 5.64    0.2     7.0    Nice   0.7 427.0 13.0
  Iko

Print a list of interesting players:

In [9]:
print(predicted_priceG.sort_values(by='titu', ascending=False).drop(columns=["team"])[(predicted_priceG["titu"]>0.7) & (predicted_priceG["av"]>5)].to_string(index=False) )
print()
print(predicted_priceD.sort_values(by='titu', ascending=False).drop(columns=["team"])[(predicted_priceD["titu"]>0.7) & (predicted_priceD["av"]>5)].to_string(index=False) )
print()
print(predicted_priceM.sort_values(by='titu', ascending=False).drop(columns=["team"])[(predicted_priceM["titu"]>0.7) & (predicted_priceM["av"]>5)].to_string(index=False) )
print()
print(predicted_priceS.sort_values(by='titu', ascending=False).drop(columns=["team"])[(predicted_priceS["titu"]>0.7) & (predicted_priceS["av"]>5)].to_string(index=False) )

            name pos  price   av  goals  played  titu  time  buy
 Gallon Gauthier   G   14.0 5.30    0.0    10.0   1.0 900.0 23.0
Rajkovic Predrag   G   17.0 5.30    0.0    10.0   1.0 900.0 34.0
    Lafont Alban   G   18.0 5.30    0.0    10.0   1.0 900.0 36.0
       Sels Matz   G   18.0 5.35    0.0    10.0   1.0 900.0 36.0
 Bernardoni Paul   G   15.0 5.35    0.0    10.0   1.0 900.0 27.0
      Nardi Paul   G   15.0 5.50    0.0    10.0   1.0 900.0 32.0
 Leca Jean-Louis   G   16.0 5.55    0.0    10.0   1.0 900.0 32.0
   Anthony Lopes   G   19.0 5.39    0.0     9.0   0.9 794.0 35.0
  Benítez Walter   G   18.0 5.72    0.0     9.0   0.9 810.0 40.0
    Gomis Alfred   G   18.0 5.44    0.0     9.0   0.9 810.0 38.0

                    name pos  price   av  goals  played  titu  time  buy
           Hakimi Achraf   D   27.0 5.50    0.3    10.0   1.0 734.0 32.0
         Manceau Vincent   D   13.0 5.25    0.0    10.0   1.0 900.0 25.0
        Kimpembe Presnel   D   13.0 5.25    0.0    10.0   1.0 815

  print(predicted_priceG.sort_values(by='titu', ascending=False).drop(columns=["team"])[(predicted_priceG["titu"]>0.7) & (predicted_priceG["av"]>5)].to_string(index=False) )
  print(predicted_priceD.sort_values(by='titu', ascending=False).drop(columns=["team"])[(predicted_priceD["titu"]>0.7) & (predicted_priceD["av"]>5)].to_string(index=False) )
  print(predicted_priceM.sort_values(by='titu', ascending=False).drop(columns=["team"])[(predicted_priceM["titu"]>0.7) & (predicted_priceM["av"]>5)].to_string(index=False) )
  print(predicted_priceS.sort_values(by='titu', ascending=False).drop(columns=["team"])[(predicted_priceS["titu"]>0.7) & (predicted_priceS["av"]>5)].to_string(index=False) )
