In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows',102)


In [2]:
def clean_columns (data):
    
    data = data.rename(columns={'ID':'ID',
                                'Name':'Name',
                                'Age':'Age',
                                'Nationality':'Nationality',
                                'Club':'Club',
                                'BP':'Backup Position',
                                'Position':'Position',                                
                                'Team & Contract':'Team & Contract',
                                'Height':'Height',
                                'Weight':'Weight',
                                'foot':'Dominant Foot',   
                                'Growth':'Growth',
                                'Joined':'Joined',
                                'Loan Date End Value':'Loan Date End Value',
                                'Wage':'Wage',
                                'Release Clause':'Release Clause',
                                'Contract':'Contract',
                                'Attacking':'Attacking',
                                'Crossing':'Crossing',
                                'Finishing':'Finishing',
                                'Heading Accuracy':'Heading Accuracy',
                                'Volleys Passing':'Volleys Passing',
                                'Skill':'Skill',
                                'Dribbling':'Dribbling',
                                'Curve':'Curve',
                                'FK Accuracy':'Free Kick Accuracy',
                                'Long Passing':'Long Passing',
                                'Ball Control':'Ball Control',
                                'Movement':'Movement',
                                'Acceleration':'Acceleration',
                                'Sprint Speed':'Sprint Speed',
                                'Agility':'Agility',
                                'Reactions':'Reactions',
                                'Balance':'Balance',
                                'Power':'Power',
                                'Shot Power':'Shot Power',
                                'Jumping':'Jumping',
                                'Stamina':'Stamina',
                                'Strength':'Strength',
                                'Long Shots':'Long Shots',
                                'Mentality':'Mentality',
                                'Aggression':'Aggression',
                                'Interceptions':'Interceptions',
                                'Positioning':'Positioning',
                                'Vision':'Vision',
                                'Penalties':'Penalties',
                                'Composure':'Composure',
                                'Defending':'Defending',
                                'Marking':'Marking',
                                'Standing Tackle':'Standing Tackle',
                                'Sliding Tackle':'Sliding Tackle',
                                'Goalkeeping':'Goalkeeping',
                                'GK Diving':'Goalkeeper Diving',
                                'GK Handling':'Goalkeeper Handling',
                                'GK Kicking':'Goalkeeper Kicking',
                                'GK Positioning':'Goalkeeper Positioning',
                                'GK Reflexes':'Goalkeeper Reflexes',
                                'Total Stats':'Total Stats',
                                'Base Stats':'Base Stats',
                                'W/F':'Weak Foot',
                                'SM':'Skill Moves',
                                'A/W':'Attacking Work Rate',                                
                                'D/W':'Defensive Work Rate',
                                'IR':'Internal Reputation',
                                'PAC':'Pace',
                                'SHO':'Shooting',
                                'PAS':'Passing',
                                'DRI':'Dribbling',
                                'DEF':'Defense',
                                'PHY':'Physical',
                                'Hits':'Hits',
                                'LS':'Left Striker',
                                'ST':'Striker',
                                'RS':'Right Striker',
                                'LW':'Left Wing',
                                'LF':'Left Forward',
                                'CF':'Center Forward',
                                'RF':'Right Forward',
                                'RW':'Right Wing',
                                'LAM':'Left Attacking Midfielder',
                                'CAM':'Center Attacking Midfielder',
                                'RAM':'Right Attacking Midfielder',
                                'LM':'Left Midfielder',
                                'LCM':'Left Center Midfielder',
                                'CM':'Center Midfielder',
                                'RCM':'Right Center Midfielder',
                                'RM':'Right Midfielder',
                                'LWB':'Left Wing Back',
                                'LDM':'Left Defending Midfielder',
                                'CDM':'Center Defending Midfielder',
                                'RDM':'Right Defending Midfielder',
                                'RWB':'Right Wing Back',
                                'LB':'Left Back',
                                'LCB':'Left Center Back',
                                'CB':'Center Back',
                                'RCB':'Right Center Back',
                                'RB':'Right Back',
                                'GK':'Goal-Keeper',
                                'OVA':'Overall Rating'})
    
    
    # Transform some categorical columns into numerical
            
    data['Internal Reputation'] = pd.to_numeric(list(map(lambda x: x.replace(' ★', ''), data['Internal Reputation'])), errors='coerce')
    data['Skill Moves'] = pd.to_numeric(list(map(lambda x: x.replace('★', ''), data['Skill Moves'])), errors='coerce')
    data['Weak Foot'] = pd.to_numeric(list(map(lambda x: x.replace(' ★', ''), data['Weak Foot'])), errors='coerce')

    def values(amount):
        amount = amount.replace('€', '')
        if amount.find('K')>0:
            return int(float(amount.replace('K', ''))*1000)
        elif amount.find('M')>0:
            return int(float(amount.replace('M', ''))*1000000)
        else:
            return int((amount))

    data['Hits'] = pd.to_numeric(list(map(values, data['Hits'])), errors='coerce')    
    data['Value'] = pd.to_numeric(list(map(values, data['Value'])), errors='coerce')
    data['Wage'] = pd.to_numeric(list(map(values, data['Wage'])), errors='coerce')
    data['Release Clause'] = pd.to_numeric(list(map(values, data['Release Clause'])), errors='coerce')

    def conv_height(x):
        x = x.split("'")
        return (int(x[0])*30.48 + int(x[1].replace('"', '')))

    data['Height'] = pd.to_numeric(list(map(conv_height, data['Height'])), errors='coerce')

    def conv_weight(x):
        x = x.split("lbs")
        x = int(x[0])*0.45359237
        return round(x,2)

    data['Weight'] = pd.to_numeric(list(map(conv_weight, data['Weight'])), errors='coerce')
              
    
    # Reworking NAN values

    # Club - use mode for NAN
    data['Club'] = data['Club'].fillna('Free Agent')
    # Attacking Work Rate - use mode for NAN
    data['Attacking Work Rate'] = data['Attacking Work Rate'].fillna('Medium')
    # Defensive Work Rate - use mode for NAN
    data['Defensive Work Rate'] = data['Defensive Work Rate'].fillna('Medium')

    # Position - substitute NAN for 'U' (for Unknown)
    data['Position'] = data['Position'].fillna('U')

    # Joined - turn into "Year Joined" and drop OG - use mean for NAN
    data[['Joined','Year Joined']] = data['Joined'].str.split(',',expand=True)
    data['Year Joined'] = pd.to_numeric(data['Year Joined'])
    data = data.drop(['Joined'], axis=1)
    mean_yearjoin = data['Year Joined'].mean()
    data['Year Joined'] = data['Year Joined'].fillna(mean_yearjoin)
    data['Year Joined'] = data['Year Joined'].astype(int)

    # Volleys - use mean for NAN
    mean_volleys = data['Volleys'].mean()
    data['Volleys'] = data['Volleys'].fillna(mean_volleys)
    # Curve - use mean for NAN
    mean_curve = data['Curve'].mean()
    data['Curve'] = data['Curve'].fillna(mean_curve)
    # Agility - use mean for NAN
    mean_agility = data['Agility'].mean()
    data['Agility'] = data['Agility'].fillna(mean_agility)
    # Balance - use mean for NAN
    mean_balance = data['Balance'].mean()
    data['Balance'] = data['Balance'].fillna(mean_balance)
    # Jumping - use mean for NAN
    mean_jumping = data['Jumping'].mean()
    data['Jumping'] = data['Jumping'].fillna(mean_jumping)
    # Interceptions - use mean for NAN
    mean_interceptions = data['Interceptions'].mean()
    data['Interceptions'] = data['Interceptions'].fillna(mean_interceptions)
    # Positioning - use mean for NAN
    mean_positioning = data['Positioning'].mean()
    data['Positioning'] = data['Positioning'].fillna(mean_positioning)
    # Vision - use mean for NAN
    mean_vision = data['Vision'].mean()
    data['Vision'] = data['Vision'].fillna(mean_vision)
    # Composure - use mean for NAN
    mean_composure = data['Composure'].mean()
    data['Composure'] = data['Composure'].fillna(mean_composure)
    # Sliding Tackle - use mean for NAN
    mean_slidingt = data['Sliding Tackle'].mean()
    data['Sliding Tackle'] = data['Sliding Tackle'].fillna(mean_slidingt)
    # Hits - use mean for NAN
    mean_hits = data['Hits'].mean()
    data['Hits'] = data['Hits'].fillna(mean_hits)
  
    
    # Numerical variables
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numerical = data.select_dtypes(include=numerics)
    # Drop columns that dont affect the Score
    numerical = numerical.drop(['ID', 'Age', 'Growth'], axis=1)
    # Drop columns that are a Summation of other columns
    numerical = numerical.drop(['Attacking', 'Skill', 'Movement', 'Power', 'Mentality', 'Defending', 'Goalkeeping'], axis=1)
    numerical = numerical.drop(['Pace', 'Shooting', 'Passing', 'Dribbling', 'Defense', 'Physical', 'Total Stats', 'Base Stats'], axis=1)
    
    # Categorical variables
    categorical = data.select_dtypes(include=object)
    # Drop columns that no affect the Score
    categorical = categorical.drop(['Name', 'Team & Contract', 'Contract'], axis=1)

    position = categorical[['Left Striker', 'Striker', 'Right Striker', 'Left Wing', 'Left Forward', 'Center Forward', 'Right Forward', 'Right Wing', 'Left Attacking Midfielder', 'Center Attacking Midfielder', 'Right Attacking Midfielder', 'Left Midfielder', 'Left Center Midfielder', 'Center Midfielder', 'Right Center Midfielder', 'Right Midfielder', 'Left Wing Back', 'Left Defending Midfielder', 'Center Defending Midfielder', 'Right Defending Midfielder', 'Right Wing Back', 'Left Back', 'Left Center Back', 'Center Back', 'Right Center Back', 'Right Back', 'Goal-Keeper']]
    for x in position.columns:
        categorical[['Actual '+x,'Potential '+x]] = categorical[x].str.split('+',expand=True)
        

    categorical = categorical.drop(['Loan Date End', 'Left Striker', 'Striker', 'Right Striker', 'Left Wing', 'Left Forward', 'Center Forward', 'Right Forward', 'Right Wing', 'Left Attacking Midfielder', 'Center Attacking Midfielder', 'Right Attacking Midfielder', 'Left Midfielder', 'Left Center Midfielder', 'Center Midfielder', 'Right Center Midfielder', 'Right Midfielder', 'Left Wing Back', 'Left Defending Midfielder', 'Center Defending Midfielder', 'Right Defending Midfielder', 'Right Wing Back', 'Left Back', 'Left Center Back', 'Center Back', 'Right Center Back', 'Right Back', 'Goal-Keeper',  'Potential Left Striker', 'Potential Striker', 'Potential Right Striker', 'Potential Left Wing', 'Potential Left Forward', 'Potential Center Forward', 'Potential Right Forward', 'Potential Right Wing', 'Potential Left Attacking Midfielder', 'Potential Center Attacking Midfielder', 'Potential Right Attacking Midfielder', 'Potential Left Midfielder', 'Potential Left Center Midfielder', 'Potential Center Midfielder', 'Potential Right Center Midfielder', 'Potential Right Midfielder', 'Potential Left Wing Back', 'Potential Left Defending Midfielder', 'Potential Center Defending Midfielder', 'Potential Right Defending Midfielder', 'Potential Right Wing Back', 'Potential Left Back', 'Potential Left Center Back', 'Potential Center Back', 'Potential Right Center Back', 'Potential Right Back', 'Potential Goal-Keeper'], axis=1)

    return (numerical, categorical)

In [3]:
# data = pd.read_csv('C:/Users/joana/Ironhack/FIFA21/fifa21_train.csv')
data = pd.read_csv('C:/Users/joana/Ironhack/FIFA21/fifa21_validate.csv')

numerical, categorical = clean_columns (data)

# display(numerical)
# display(categorical)

In [4]:
#Normalization

def normalized_columns (numerical):
    
    numerical_norm = numerical.drop(['Overall Rating'], axis=1)

    # Normalizing using MinMaxScaler
            
    transformer = MinMaxScaler().fit(numerical_norm)
    x_normalized = transformer.transform(numerical_norm)
    # print(x_normalized.shape)
    x_normalized=pd.DataFrame(x_normalized, columns=numerical_norm.columns)
    x_normalized.head()

    return (x_normalized)


In [5]:
# Encoding Categorical Data.

def encoded_columns (categorical):

    #label encoder

    label_encoded=categorical.apply(LabelEncoder().fit_transform) 
    label_encoded.head()
    
    return (label_encoded)


In [6]:
# X-y Split

x_normalized = normalized_columns (numerical)

label_encoded = encoded_columns (categorical)

X = pd.concat([x_normalized, label_encoded], axis=1)
X
y = numerical['Overall Rating']
y

0       67
1       68
2       54
3       55
4       70
        ..
1994    60
1995    59
1996    76
1997    63
1998    60
Name: Overall Rating, Length: 1999, dtype: int64

In [7]:
# # Splitting into train set and test set.
    
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# print(X_train.shape)
# print(X_test.shape)
# print(y_train.shape)
# print(y_test.shape)

In [8]:
# Training the model

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

lm = linear_model.LinearRegression()
# model = lm.fit(X_train,y_train)
model = lm.fit(X,y)

# Preliminary score

from sklearn.metrics import r2_score

# predictions = lm.predict(X_train)
# r2_score(y_train, predictions)

# Testing the model

# predictions = lm.predict(X_train)
predictions = lm.predict(X)


In [9]:
# Model Validation

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import linear_model
import math

# r2_score(y_test, predictions)
# print('R2 : ')
# print(r2_score(y_test, predictions))

# mse = mean_squared_error(y_test, predictions)
# print('Mean Squared Error : ')
# print(mse)

# rmse = math.sqrt(mse)
# print('Mean Error: ')
# print(rmse)

# mae = mean_absolute_error(y_test, predictions)
# print('Mean Absolute Error: ')
# print(mse)

r2_score(y, predictions)
print('R2 : ')
print(r2_score(y, predictions))

mse = mean_squared_error(y, predictions)
print('Mean Squared Error : ')
print(mse)

rmse = math.sqrt(mse)
print('Mean Error: ')
print(rmse)

mae = mean_absolute_error(y, predictions)
print('Mean Absolute Error: ')
print(mse)

R2 : 
0.8947665453389425
Mean Squared Error : 
4.81563874031623
Mean Error: 
2.194456365553034
Mean Absolute Error: 
4.81563874031623
