In [1]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
# VIEWING OPTIONS IN THE NOTEBOOK
from sklearn import set_config; set_config(display='diagram')

In [2]:
numerical_features = ['games_2022', 
                      'minutes_played_2022', 
                      'goals_2022',
                      'assists_2022', 
                      'goals_against_2022', 
                      'goals_for_2022',
                      'clean_sheet_2022', 
                      'height_in_cm', 
                      'age', 
                      'club_value', 
                      'squad_size', 
                      'term_days_remaining', 
                      'yellow_cards_2022', 
                      'red_cards_2022']

categorical_features = ['foot']

ordinal_features = ['country_of_citizenship',
                    'current_club_domestic_competition_id',
                    'current_club_name']

target = 'market_value_in_eur'

In [3]:
def encode(frame, feature):
    '''
    function that custom encode a categorical feature ordered by the most impactful 
    sub-category on the target
    '''
    # create a temporary dataframe  
    ordering = pd.DataFrame()
    # create an index with the unique values of the selected feature
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    # create a spmean column with the mean value of the target 
    # for each category of the selected feature, sorted by value
    ordering['spmean'] = frame[[feature, target]]\
                         .groupby(feature).mean()[target]
    ordering = ordering.sort_values('spmean')
    # create a column ordering with the number from 1 to the number of
    # categories for the selected feature
    ordering['ordering'] = range(1, ordering.shape[0]+1)
    # extracting index and that column to a dictionary
    ordering = ordering['ordering'].to_dict()
    return ordering.keys()

In [4]:
for position in ['Attack', 'Defender', 'Goalkeeper', 'Midfield']:

    df = pd.read_pickle(f'../models/{position}_df.pickle')

    feat_ordinal_dict = {}
    for feature in ordinal_features:
        feat_ordinal_dict[feature] = list(encode(df,feature))

    numerical_transformer =  MinMaxScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
        
    feat_ordinal = sorted(feat_ordinal_dict.keys())
    feat_ordinal_values_sorted = [feat_ordinal_dict[i] for i in feat_ordinal]  
        
    ordinal_transformer = OrdinalEncoder(
        categories=feat_ordinal_values_sorted,
    )

    transformer = ColumnTransformer([
        ('num_tr', numerical_transformer, numerical_features),
        ('cat_tr', categorical_transformer, categorical_features),
        ('ord_tr', ordinal_transformer, ordinal_features)

    ])
    transformer.fit(df)
    transformed_df = transformer.transform(df)

    X, y = transformed_df, df[target]
    gbr = GradientBoostingRegressor(random_state=0, verbose = 0)
    gbr.fit(X, y)

    filename = f'../models/{position}_model.pickle'
    pickle.dump(gbr, open(filename, 'wb'))

    filename = f'../models/{position}_transformer.pickle'
    pickle.dump(transformer, open(filename, 'wb'))

    results = cross_validate (gbr, X, y)
    mean_score = np.mean(results['test_score'])
    print(f'The mean R2 score of position "{position}" cross validation is: {mean_score}')

    with open(f"../models/{position}_model.pickle", 'rb') as f:
        # load using pickle de-serializer
        model = pickle.load(f)
    with open(f"../models/{position}_transformer.pickle", 'rb') as f:
        # load using pickle de-serializer
        transformer = pickle.load(f)

print('Models and transformers files saved and checked.')

The mean R2 score of position "Attack" cross validation is: 0.7178998218621399
The mean R2 score of position "Defender" cross validation is: 0.6637816331933813
The mean R2 score of position "Goalkeeper" cross validation is: 0.7379623162603017
The mean R2 score of position "Midfield" cross validation is: 0.6737192937665119
Models and transformers files saved and checked.


In [5]:
player = pd.DataFrame()
player['games_2022'] =[20]
player['minutes_played_2022'] =[1800] 
player['goals_2022'] =[200]
player['assists_2022'] =[15] 
player['goals_against_2022'] =[20] 
player['goals_for_2022'] =[10]
player['clean_sheet_2022'] =[7] 
player['height_in_cm'] =[186] 
player['age'] =[31] 
player['club_value'] =[50000000] 
player['squad_size'] =[18] 
player['term_days_remaining'] =[200]
player['value_goals_for_2022'] =[50000000] 
player['yellow_cards_2022'] =[2] 
player['red_cards_2022'] =[0]
player['sub_position'] =['Left Winger'] 
player['foot'] =['Right']
player['country_of_citizenship'] =['Italy']
player['current_club_domestic_competition_id'] =['IT1'] 
player['current_club_name'] = ['Juventus Turin']

In [6]:
player_transformed = transformer.transform(player)
my_value = model.predict(player_transformed)
my_value

array([46456311.93334123])