In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Relative path of the transfers CSV that this notebook reads.
dataset_file_path = 'data/transfers.csv'

In [3]:
# Parse the CSV at `dataset_file_path` into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=dataset_file_path)

## Review Data

In [4]:
# Column labels and their dtypes, kept as separate names for the listing below.
columns, types = df.columns, df.dtypes

# Header line followed by one "<column> : <dtype>" line per column, in one print call.
print("Columns:", *(f"{label} : {kind}" for label, kind in zip(columns, types)), sep="\n")

Columns:
id : int64
playerName : object
age : int64
position : object
marketValue : object
previousClub : object
newClub : object
price : object
season : object
date : object
cleanSheets : int64
savesPercentage : int64
goalieErrors : int64
tacklesWon : int64
interceptions : int64
cleaners : int64
goals : int64
assists : int64
chancesCreated : int64
successRate : float64


## Prepare

### Delete unimportant columns

In [5]:
# Columns removed from the frame before any further preparation.
columns_to_delete = [
    'id',
    'playerName',
    'season',
    'date',
    'previousClub',
    'newClub',
]
df = df.drop(columns_to_delete, axis='columns')

### Convert string `marketValue` and `price` columns into numbers

In [6]:
def convert_price(price_str):
    """Convert a price / market-value cell into a plain numeric amount.

    Accepted inputs:
      * strings such as ``"€1.5m"``, ``"€500k"`` or ``"€1,000"`` (the ``€``
        sign and thousands separators are optional; ``m`` multiplies by
        1,000,000 and ``k`` by 1,000; matching is case-insensitive and
        surrounding whitespace is ignored);
      * the markers ``"free transfer"``, ``"?"``, ``"-"`` and the empty
        string, which all map to ``0``;
      * missing values (``None`` / ``NaN``), which also map to ``0``;
      * values that are already numeric, which are returned as ``float``
        so the conversion can safely be applied more than once.

    Parameters
    ----------
    price_str : str, number or missing value
        Raw cell content.

    Returns
    -------
    int or float
        ``0`` for the "no amount" markers and missing values, otherwise the
        amount as a ``float``.

    Raises
    ------
    ValueError
        If a string cannot be parsed as a number after stripping the
        currency sign, separators and magnitude suffix.
    """
    if not isinstance(price_str, str):
        # Missing cells are not strings; `'m' in <float>` would raise TypeError.
        if pd.isna(price_str):
            return 0
        # Already numeric (e.g. the conversion cell was executed a second time).
        return float(price_str)

    text = price_str.strip().lower()
    if text in ('free transfer', '?', '-', ''):
        return 0

    digits = text.replace('€', '').replace(',', '')
    # 'm' is tested before 'k', in the same order as the original branches.
    for suffix, factor in (('m', 1_000_000), ('k', 1_000)):
        if suffix in digits:
            return float(digits.replace(suffix, '')) * factor
    return float(digits)
    
# Run both monetary columns through convert_price, in the same order as before.
for money_column in ('price', 'marketValue'):
    df[money_column] = df[money_column].apply(convert_price)

### One-Hot Encoding on `position`

In [7]:
# Replace the categorical `position` column with one indicator column per category.
df = pd.get_dummies(data=df, columns=['position'])

In [8]:
# Display the frame as it stands after the preparation steps above
# (the bare last expression of a cell is rendered as its output).
df

Unnamed: 0,age,marketValue,price,cleanSheets,savesPercentage,goalieErrors,tacklesWon,interceptions,cleaners,goals,...,position_CB,position_CF,position_CM,position_DM,position_GK,position_LB,position_LW,position_RB,position_RM,position_RW
0,24,90000000.0,116600000.0,0,0,0,6,5,5,0,...,False,False,False,True,False,False,False,False,False,False
1,24,55000000.0,75000000.0,0,0,0,0,0,0,23,...,False,False,False,False,False,False,False,False,False,False
2,22,42000000.0,40000000.0,0,0,0,4,4,6,0,...,True,False,False,False,False,False,False,False,False,False
3,22,30000000.0,30000000.0,0,0,0,0,0,0,22,...,False,True,False,False,False,False,False,False,False,False
4,30,20000000.0,15000000.0,0,0,0,6,4,5,0,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,27,4500000.0,1500000.0,0,0,0,3,3,2,0,...,False,False,True,False,False,False,False,False,False,False
277,26,1500000.0,1400000.0,1,50,13,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
278,34,1600000.0,0.0,0,0,0,0,0,0,5,...,False,True,False,False,False,False,False,False,False,False
279,36,800000.0,0.0,0,0,0,1,1,0,0,...,False,False,True,False,False,False,False,False,False,False


In [9]:
# Re-read labels and dtypes after preprocessing.
columns = df.columns
types = df.dtypes

# Build the whole listing first, then print it once: header plus "<column> : <dtype>" rows.
listing = ["Columns:", *(f"{label} : {kind}" for label, kind in types.items())]
print("\n".join(listing))

Columns:
age : int64
marketValue : float64
price : float64
cleanSheets : int64
savesPercentage : int64
goalieErrors : int64
tacklesWon : int64
interceptions : int64
cleaners : int64
goals : int64
assists : int64
chancesCreated : int64
successRate : float64
position_AM : bool
position_CB : bool
position_CF : bool
position_CM : bool
position_DM : bool
position_GK : bool
position_LB : bool
position_LW : bool
position_RB : bool
position_RM : bool
position_RW : bool


## Model

In [10]:
# Target is `successRate`; every remaining column is used as a feature.
y = df.loc[:, 'successRate']
X = df.drop('successRate', axis='columns')

In [11]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [12]:
# Linear regression fitted on the training split only.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [13]:
# Predict on the held-out rows and score the predictions against the true targets.
y_pred = model.predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# NOTE(review): the recorded run reports R^2 of about 1.0; check whether
# `successRate` is computed from the feature columns (target leakage) - TODO confirm.
print(f'Mean Squared Error: {mse}', f'R^2 Score: {r2}', sep='\n')

Mean Squared Error: 9.764391922203543e-06
R^2 Score: 0.999999990837515
