In [2]:
import pandas
import numpy
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
import xgboost
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
import pickle
import joblib

# Regression Problem: FIFA

In [3]:
# Columns kept from both CSV files. The list is defined once so the training
# and testing frames are guaranteed to share the same schema and column order.
SELECTED_COLUMNS = [
    'player_url', 'short_name', 'long_name', 'player_positions', 'potential', 'value_eur', 'wage_eur',
    'age', 'dob', 'height_cm', 'weight_kg', 'club_team_id', 'club_name', 'league_name', 'league_level',
    'club_position', 'club_jersey_number', 'club_loaned_from', 'nationality_id', 'nationality_name',
    'nation_team_id', 'nation_position', 'nation_jersey_number', 'preferred_foot', 'weak_foot',
    'skill_moves', 'international_reputation', 'work_rate', 'body_type', 'real_face',
    'release_clause_eur', 'player_tags', 'player_traits',
    'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
    'attacking_short_passing', 'attacking_volleys',
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
    'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions',
    'movement_balance',
    'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
    'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision',
    'mentality_penalties', 'mentality_composure',
    'defending_marking_awareness', 'defending_standing_tackle', 'defending_sliding_tackle',
    'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning',
    'goalkeeping_reflexes', 'goalkeeping_speed',
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram',
    'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
    'player_face_url', 'overall',
]

# Read both raw files, then narrow each one to the shared column set.
training_data = pandas.read_csv("players_22.csv")
testing_data  = pandas.read_csv("male_players (legacy).csv")

training_data = training_data[SELECTED_COLUMNS]
testing_data  = testing_data[SELECTED_COLUMNS]

  training_data = pandas.read_csv("players_22.csv")
  testing_data  = pandas.read_csv("male_players (legacy).csv")


## Data Preprocessing

### cleaning the training data

Manually dropping columns with irrelevant data because, intuitively, they bear no correlation with a player's rating.

In [6]:
# Identifier, club, nationality and media columns that are removed before modelling.
irrelevant_columns = [
    'player_url', 'short_name', 'long_name', 'dob',
    'club_team_id', 'club_name', 'league_name', 'league_level',
    'club_position', 'club_jersey_number', 'club_loaned_from',
    'nationality_id', 'nationality_name',
    'nation_team_id', 'nation_position', 'nation_jersey_number',
    'real_face', 'release_clause_eur', 'player_tags', 'player_traits',
    'player_face_url',
]
training_data = training_data.drop(columns=irrelevant_columns)

drop columns that have over 30% null values

In [7]:
# Fraction of missing values per column (0.0 means complete, 1.0 means empty).
row_count = len(training_data)
percentage = training_data.isnull().sum() / row_count

# Labels of the columns whose missing fraction is above the 30% cut-off.
drop = percentage.loc[percentage > 0.3].index
training_data = training_data.drop(columns=drop)

In [8]:
# Position-rating columns; their entries are converted to integers in the next step.
position_columns = [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam',
    'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm',
    'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
]
# filter() keeps only those of the listed columns that exist in the frame.
filtered = training_data.filter(items=position_columns)

def convert_to_number(entry):
    """
    Convert a position rating to a single integer.

    "89+3" becomes 92, "64-1" becomes 63 and "22" becomes 22. The entry is
    coerced to text first, so a value that is already an integer (for example
    when the cell is re-run on converted data) is accepted instead of raising
    `TypeError` on the `in` check.

    entry: the rating, either a string such as "89+3" or an integer.
    returns: the rating with its modifier applied, as an int.
    raises: ValueError if a part of the entry is not an integer literal.
    """
    text = str(entry)  # same coercion as convert_to_int in clean_player_data
    if '+' in text:
        parts = text.split('+')
        return int(parts[0]) + int(parts[1])
    if '-' in text:
        parts = text.split('-')
        return int(parts[0]) - int(parts[1])
    return int(text)

# Element-wise conversion, done column by column with Series.map.
# DataFrame.applymap is deprecated in recent pandas releases; this form
# produces the same result and works on both old and new versions.
filtered = filtered.apply(lambda column: column.map(convert_to_number))

  filtered = filtered.applymap(convert_to_number)


In [9]:
# Overwrite the original rating strings with the converted values.
training_data.update(filtered)

# Position-rating columns that must end up with an integer dtype.
int_columns = [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam',
    'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm',
    'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
]

# These columns held strings before the update, so cast them explicitly.
ratings_as_int = training_data[int_columns].astype(int)
training_data[int_columns] = ratings_as_int

In [10]:
# Split the frame by dtype. Both halves are modified by the following cells
# (imputation and encoding), so take explicit copies: the assignments then
# never touch `training_data` and do not trigger copy/view warnings.
numeric      = training_data.select_dtypes(include=numpy.number).copy()
non_numeric  = training_data.select_dtypes(include=['object']).copy()

imputing the missing numerical values with the **mean**

In [11]:
# Numeric columns that contain at least one missing value.
numeric_with_NaN = numeric.columns[numeric.isnull().any()]

# Assign the filled column back instead of calling fillna(inplace=True) on
# `numeric[column]`: that chained in-place call acts on a temporary object and
# leaves `numeric` unchanged when pandas Copy-on-Write is active.
for column in numeric_with_NaN:
    numeric[column] = numeric[column].fillna(numeric[column].mean())

replacing NaN `object` values with the **mode**

In [12]:
# Object columns that contain at least one missing value.
non_numeric_with_NaN = non_numeric.columns[non_numeric.isnull().any()]

# Assign the filled column back instead of calling fillna(inplace=True) on
# `non_numeric[column]`: that chained in-place call acts on a temporary object
# and leaves `non_numeric` unchanged when pandas Copy-on-Write is active.
for column in non_numeric_with_NaN:
    # mode() returns a Series; its first entry is used as the fill value.
    non_numeric[column] = non_numeric[column].fillna(non_numeric[column].mode()[0])

**encoding** for non-numeric data

In [13]:
# One encoder instance is reused; fit_transform refits it for every column,
# so after the loop it holds the classes of the last column only.
label_encoder = LabelEncoder()

for column in non_numeric.columns:
    encoded_values = label_encoder.fit_transform(non_numeric[column])
    non_numeric[column] = encoded_values

concatenating to form the final dataset

In [14]:
# Encoded categorical columns come first, followed by the numeric columns.
training_data = pandas.concat(objs=[non_numeric, numeric], axis='columns')
training_data.head()

Unnamed: 0,player_positions,preferred_foot,work_rate,body_type,potential,value_eur,wage_eur,age,height_cm,weight_kg,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,overall
0,604,0,7,9,93,78000000.0,320000.0,34,170,72,...,67,67,69,64,53,53,53,64,22,93
1,635,1,2,9,92,119500000.0,270000.0,32,185,81,...,69,69,67,64,63,63,63,64,22,92
2,658,1,1,9,91,45000000.0,270000.0,36,187,83,...,62,62,66,63,56,56,56,63,23,91
3,372,1,2,9,91,129000000.0,270000.0,29,175,68,...,66,66,70,65,53,53,53,65,23,91
4,168,1,0,9,91,125500000.0,350000.0,30,181,70,...,83,83,82,78,72,72,72,78,24,91


### function for cleaning the data

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def clean_player_data(data):
    """
    Clean a FIFA players DataFrame so it has no missing values and only numeric columns.

    Steps, in order:
      1. drop the identifier / club / nationality / media columns;
      2. drop every remaining column with more than 30% missing values;
      3. convert position ratings such as "89+3" or "64-1" to plain integers;
      4. fill missing numeric values with the column mean;
      5. fill missing `object` values with the column mode;
      6. label-encode every `object` column.

    The input is expected to have column headings similar to the `players_22.csv`
    dataset. Every column removed in step 1 must be present (otherwise
    `DataFrame.drop` raises `KeyError`). Position columns that are absent, or that
    were removed in step 2, are skipped.

    NOTE(review): the 30% filter, the imputation values and the label encodings are
    all derived from `data` itself on every call, so the same category can receive a
    different code in two different DataFrames (e.g. training vs. testing) - confirm
    this is intended.

    data: the DataFrame with similar column headings to the `players_22.csv` dataset.
          It is not modified; all work happens on new frames.
    returns: a new DataFrame with the encoded `object` columns first, followed by
             the numeric columns.
    """
    irrelevant_columns = ['player_url', 'short_name', 'long_name', 'dob', 'club_team_id', 'club_name',
                          'league_name', 'league_level', 'club_position', 'club_jersey_number',
                          'club_loaned_from', 'nationality_id', 'nationality_name', 'nation_team_id',
                          'nation_position', 'nation_jersey_number', 'real_face', 'release_clause_eur',
                          'player_tags', 'player_traits', 'player_face_url']
    # drop() without inplace returns a new frame, so the caller's DataFrame stays intact
    # and the function can be called again on the same input
    data = data.drop(columns=irrelevant_columns)

    # drop columns with more than 30% missing values
    percentage = data.isnull().sum() / len(data)
    data = data.drop(columns=percentage[percentage > 0.3].index)

    int_columns = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram',
                   'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
                   'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk']
    # only convert the position columns that survived the steps above
    present_int_columns = [column for column in int_columns if column in data.columns]

    def convert_to_int(entry):
        """Turn "89+3" into 92, "64-1" into 63 and "22" (or 22) into 22."""
        entry = str(entry)  # entries may already be numbers
        if '+' in entry:
            parts = entry.split('+')
            return int(parts[0]) + int(parts[1])
        elif '-' in entry:
            parts = entry.split('-')
            return int(parts[0]) - int(parts[1])
        else:
            return int(entry)

    # Series.map is used per column because DataFrame.applymap is deprecated
    for column in present_int_columns:
        data[column] = data[column].map(convert_to_int).astype(int)

    # explicit copies: both halves are modified below
    numeric_data = data.select_dtypes(include=np.number).copy()
    non_numeric_data = data.select_dtypes(include=['object']).copy()

    # fill missing values in numeric columns with mean
    # (assigned back rather than fillna(inplace=True) on a column selection, which is
    # a no-op under pandas Copy-on-Write)
    numeric_NaN = [column for column in numeric_data.columns if numeric_data[column].isnull().any()]
    for column in numeric_NaN:
        numeric_data[column] = numeric_data[column].fillna(numeric_data[column].mean())

    # fill missing values in non-numeric columns with mode
    non_numeric_NaN = [column for column in non_numeric_data.columns if non_numeric_data[column].isnull().any()]
    for column in non_numeric_NaN:
        non_numeric_data[column] = non_numeric_data[column].fillna(non_numeric_data[column].mode()[0])

    # a fresh encoder per column; each one is fitted on that column's own values
    for column in non_numeric_data.columns:
        non_numeric_data[column] = LabelEncoder().fit_transform(non_numeric_data[column])

    return pd.concat([non_numeric_data, numeric_data], axis=1)

cleaning the testing data using the `clean_player_data` function

In [16]:
# Run the legacy (testing) frame through clean_player_data and rebind the name to the
# cleaned result. NOTE(review): the function derives its dropped columns, fill values
# and label codes from the frame it is given, not from the training data - confirm the
# two cleaned frames end up with matching columns and encodings.
testing_data = clean_player_data(testing_data)

  filtered_columns = filtered_columns.applymap(convert_to_int)


# Feature Engineering
Using the RandomForest classifier to decide which features are the most important, as opposed to the correlation matrix, which may be too simple

In [20]:
Xtrain, Xtest, Ytrain, Ytest = 

Index(['player_positions', 'preferred_foot', 'work_rate', 'body_type',
       'potential', 'value_eur', 'wage_eur', 'age', 'height_cm', 'weight_kg',
       'weak_foot', 'skill_moves', 'international_reputation', 'pace',
       'shooting', 'passing', 'dribbling', 'defending', 'physic',
       'attacking_crossing', 'attacking_finishing',
       'attacking_heading_accuracy', 'attacking_short_passing',
       'attacking_volleys', 'skill_dribbling', 'skill_curve',
       'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
       'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
       'movement_reactions', 'movement_balance', 'power_shot_power',
       'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
       'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentality_penalties',
       'mentality_composure', 'defending_marking_awareness',
       'defending_standing_tackle', 'defendin