# Load Data

In [1]:
import pandas as pd

# Load the La Liga training table into a DataFrame.
laliga_train = pd.read_csv("../csv/LaLiga/train_La.csv")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
laliga_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2908 entries, 0 to 2907
Data columns (total 48 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Rk       2908 non-null   int64  
 1   Player   2908 non-null   object 
 2   Nation   2908 non-null   object 
 3   Pos      2908 non-null   object 
 4   Squad    2908 non-null   object 
 5   Age      2908 non-null   int64  
 6   Born     2908 non-null   int64  
 7   90s      2908 non-null   float64
 8   Gls      2908 non-null   int64  
 9   Sh       2908 non-null   int64  
 10  SoT      2908 non-null   int64  
 11  SoT%     2404 non-null   float64
 12  Sh/90    2908 non-null   float64
 13  SoT/90   2908 non-null   float64
 14  G/Sh     2404 non-null   float64
 15  G/SoT    2018 non-null   float64
 16  Dist     2403 non-null   float64
 17  FK       2907 non-null   float64
 18  PK       2908 non-null   int64  
 19  PKatt    2908 non-null   int64  
 20  xG       2907 non-null   float64
 21  npxG     2907 

In [2]:
# Drop the 'Rk' index column. errors='ignore' keeps this cell re-runnable:
# once 'Rk' is gone, a second execution is a no-op instead of a KeyError.
laliga_train = laliga_train.drop(columns=['Rk'], errors='ignore')

### Real Assist Leader

In [3]:
# Row holding the highest assist count in the training table
# (idxmax returns the first such row when several rows tie).
top_assist_index = laliga_train['Ast'].idxmax()

# Highest assist count itself; named after what it holds.
top_assist_count = laliga_train.loc[top_assist_index, 'Ast']
print(top_assist_count)

# Name of the player on that row.
actual_top_assistant = laliga_train.loc[top_assist_index, 'Player']
print(actual_top_assistant)

21
Lionel Messi


In [4]:
import numpy as np
# Collapse the two spellings of each dual-position label into one, so that
# e.g. 'DF,FW' and 'FW,DF' end up as the same category.
replace_values = dict([
    ('DF,FW', 'FW,DF'),
    ('DF,MF', 'MF,DF'),
    ('MF,FW', 'FW,MF'),
])

normalised_positions = laliga_train['Pos'].replace(to_replace=replace_values)
laliga_train['Pos'] = normalised_positions

# 결측치 채우기

In [5]:
from sklearn.impute import SimpleImputer

# Fill missing values in every numeric column with that column's mean.
# NOTE(review): the numeric selection appears to include the target 'Ast', and
# the imputer is fitted on the full table before any train/validation/test
# split — confirm that this is intended.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

numeric_laliga_train = laliga_train.select_dtypes(include=[np.number])
column_names = numeric_laliga_train.columns

# Same as fit() followed by transform() on the same data.
numeric_laliga_train = imputer.fit_transform(numeric_laliga_train)

# Rebuild the frame on the original row index. The assignment below aligns on
# index labels, so a fresh 0..n-1 index would silently put NaN back whenever
# laliga_train is not indexed 0..n-1.
numeric_laliga_train_df = pd.DataFrame(
    numeric_laliga_train,
    columns=column_names,
    index=laliga_train.index,
)

laliga_train[column_names] = numeric_laliga_train_df.round(2)

# Check, column by column, whether any NaN is left
column_nan = laliga_train.isna().any()
print(len(column_nan))
print(column_nan)

47
Player     False
Nation     False
Pos        False
Squad      False
Age        False
Born       False
90s        False
Gls        False
Sh         False
SoT        False
SoT%       False
Sh/90      False
SoT/90     False
G/Sh       False
G/SoT      False
Dist       False
FK         False
PK         False
PKatt      False
xG         False
npxG       False
npxG/Sh    False
G-xG       False
np:G-xG    False
Matches    False
Cmp        False
Att        False
Cmp%       False
TotDist    False
PrgDist    False
Ast        False
xAG        False
xA         False
A-xAG      False
KP         False
1/3        False
PPA        False
CrsPA      False
PrgP       False
Won        False
Lost       False
Won%       False
Tkl        False
TklW       False
Def 3rd    False
Mid 3rd    False
Att 3rd    False
dtype: bool


# Define X, y & Encode

In [6]:
import category_encoders as ce

# Target is the assist count; features are everything except the target and
# the player name.
# NOTE(review): 'A-xAG' looks like (Ast - xAG); if so, together with 'xAG' it
# reconstructs the target exactly — confirm and consider dropping it here and
# everywhere features are built for these models.
y = laliga_train['Ast']
X = laliga_train.drop(columns=['Ast', 'Player'])

# NOTE(review): the encoder is fitted on the whole table, before the
# train/validation/test split further down — confirm this is acceptable.
categorical_columns = ['Nation', 'Pos', 'Squad', 'Matches']
target_encoder = ce.TargetEncoder(cols=categorical_columns)

# Fit the encoder and encode the categorical columns in one step
laliga_train_encoded = target_encoder.fit_transform(X, y)
# Put the target back next to the encoded features
laliga_train_encoded['Ast'] = laliga_train['Ast']

y_encoded_train = laliga_train_encoded['Ast']
X_encoded_train = laliga_train_encoded.drop(columns=['Ast'])

# Descriptive Statistics

In [7]:
# Summary statistics (count, mean, std, quartiles, max) of the assist target
y_encoded_train.describe()

count    2908.000000
mean        1.106946
std         1.844649
min         0.000000
25%         0.000000
50%         0.000000
75%         2.000000
max        21.000000
Name: Ast, dtype: float64

In [8]:
# Quantile-bin the target; the integer bin codes are used as the
# stratification key for the split. duplicates='drop' merges quantile edges
# that coincide, so fewer than the 3 requested bins can come out.
y_binned = pd.qcut(x=y_encoded_train, q=3, labels=False, duplicates='drop')

bin_sizes = pd.Series(y_binned).value_counts()
print(bin_sizes)

Ast
0    2148
1     760
Name: count, dtype: int64


# Train / Validate / Test Split

In [9]:
from sklearn.model_selection import train_test_split

# First cut: 70 % for training, 30 % held back for validation + test.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X_encoded_train,
    y_encoded_train,
    test_size=0.3,
    shuffle=True,
    stratify=y_binned,  # keep the bin proportions the same in both parts
    random_state=42,
)

# Second cut: halve the held-back part into validation and test.
# NOTE(review): unlike the first cut, this one is not stratified — confirm
# that is intended.
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test,
    y_val_test,
    random_state=42,
    test_size=0.5,
)

# 1st Random Forest Regressor

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from joblib import dump, load

# Shuffled 5-fold cross-validation with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

random_forest = RandomForestRegressor(n_jobs=-1)

# Hyper-parameter grid. random_state is a single-value entry, so every
# candidate forest is built with seed 42.
rf_params = dict(
    random_state=[42],
    n_estimators=[50, 100, 150, 200],
    max_depth=[3, 5, 7],
    max_features=['sqrt', 'log2', 0.7, 0.9, None],
    min_samples_leaf=[3, 5, 7, 9, 11],
    ccp_alpha=[0.3, 0.5, 0.7, 0.9],
)

# Exhaustive search over the grid, scored by negative RMSE
gs = GridSearchCV(random_forest, rf_params, scoring='neg_root_mean_squared_error', cv=kf)
gs.fit(X_train, y_train)

# Persist the best estimator found by the search
dump(gs.best_estimator_, 'assist_leader_model_ver1.joblib')

['assist_leader_model_ver1.joblib']

In [11]:
# Reload the persisted first model from disk (this is how to get it back later)
best_model_1st = load('assist_leader_model_ver1.joblib')

# Fit the reloaded best model on the training split to obtain the final model
best_model_1st.fit(X_train, y_train)

# Performance Evaluation

In [12]:
from sklearn.metrics import root_mean_squared_error

# Predictions of the first model on each split
train_preds = best_model_1st.predict(X_train)
val_preds = best_model_1st.predict(X_val)
test_preds = best_model_1st.predict(X_test)

# Root mean squared error per split
train_rmse = root_mean_squared_error(y_true=y_train, y_pred=train_preds)
val_rmse = root_mean_squared_error(y_true=y_val, y_pred=val_preds)
test_rmse = root_mean_squared_error(y_true=y_test, y_pred=test_preds)

# Score of the estimator per split (reported below as R^2)
train_score = best_model_1st.score(X_train, y_train)
val_score = best_model_1st.score(X_val, y_val)
test_score = best_model_1st.score(X_test, y_test)

# Report: training data, then validation data, then the final test data
print(f'Train RMSE: {train_rmse:.2f}')
print(f'Best model Train R^2: {train_score:.2f}')
print(f'Validation RMSE: {val_rmse:.2f}')
print(f'Best model Validation R^2: {val_score:.2f}')
print(f'Test RMSE: {test_rmse:.2f}')
print(f'Best model Test R^2: {test_score:.2f}')

Train RMSE: 0.88
Best model Train R^2: 0.78
Validation RMSE: 0.90
Best model Validation R^2: 0.78
Test RMSE: 0.78
Best model Test R^2: 0.78


In [13]:
# Player names of the validation rows, looked up through their row labels
val_players = X_val.index.map(laliga_train['Player'])

# Predicted Validation DataFrame
predicted_data = pd.DataFrame({'Player': val_players, 'Predicted_Ast': val_preds})
top_predicted_row = predicted_data['Predicted_Ast'].idxmax()
predicted_assist_leader = predicted_data.loc[top_predicted_row, 'Player']

# Real Validation DataFrame
real_validation_df = pd.DataFrame({'Player': val_players, 'Ast': y_val})
# Actual assist leader within the validation data
top_actual_row = real_validation_df['Ast'].idxmax()
actual_assist_leader = real_validation_df.loc[top_actual_row, 'Player']

# Report whether the predicted leader is the actual one
if actual_assist_leader != predicted_assist_leader:
    print(f"모델의 예측 도움왕: {predicted_assist_leader}, 실제 도움왕: {actual_assist_leader}")
else:
    print(f"모델이 정확하게 도움왕을 예측했습니다: {actual_assist_leader}")

모델이 정확하게 도움왕을 예측했습니다: Karim Benzema


# 2nd Random Forest Regressor

In [14]:
# Shuffled 5-fold cross-validation with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

random_forest = RandomForestRegressor(n_jobs=-1)

# Smaller grid than the first search: max_features and ccp_alpha are not
# listed, so the estimator's own defaults apply to them.
rf_params = dict(
    random_state=[42],
    n_estimators=[50, 100, 150, 200],
    max_depth=[3, 5, 7],
    min_samples_leaf=[3, 5, 7, 9, 11],
)

# Exhaustive search over the grid, scored by negative RMSE
gs = GridSearchCV(random_forest, rf_params, scoring='neg_root_mean_squared_error', cv=kf)
gs.fit(X_train, y_train)

# Persist the best estimator found by the search
dump(gs.best_estimator_, 'assist_leader_model_ver2.joblib')

['assist_leader_model_ver2.joblib']

In [33]:
# Reload the persisted second model from disk
best_model_2nd = load('assist_leader_model_ver2.joblib')

# Fit the reloaded model on the training split
best_model_2nd.fit(X_train, y_train)

# Performance Evaluation

In [16]:
# Predictions of the second model on each split
train_preds = best_model_2nd.predict(X_train)
val_preds = best_model_2nd.predict(X_val)
test_preds = best_model_2nd.predict(X_test)

# Root mean squared error per split
train_rmse = root_mean_squared_error(y_true=y_train, y_pred=train_preds)
val_rmse = root_mean_squared_error(y_true=y_val, y_pred=val_preds)
test_rmse = root_mean_squared_error(y_true=y_test, y_pred=test_preds)

# Score of the estimator per split (reported below as R^2)
train_score = best_model_2nd.score(X_train, y_train)
val_score = best_model_2nd.score(X_val, y_val)
test_score = best_model_2nd.score(X_test, y_test)

# Report: train, then validation, then test
print(f'Train RMSE: {train_rmse:.2f}')
print(f'Train R^2: {train_score:.2f}')
print(f'Validate RMSE: {val_rmse:.2f}')
print(f'Validate R^2: {val_score:.2f}')
print(f'Test RMSE: {test_rmse:.2f}')
print(f'Test R^2: {test_score:.2f}')

Train RMSE: 0.20
Train R^2: 0.99
Validate RMSE: 0.23
Validate R^2: 0.99
Test RMSE: 0.14
Test R^2: 0.99


In [17]:
# Player names of the validation rows, looked up through their row labels
val_players = X_val.index.map(laliga_train['Player'])

# Predicted Validation DataFrame
predicted_data = pd.DataFrame({
    'Player': val_players,
    'Predicted_Ast': val_preds
})
# Predicted assist leader: player on the row with the largest predicted value
predicted_assist_leader = predicted_data.loc[predicted_data['Predicted_Ast'].idxmax(), 'Player']

# Real Validation DataFrame
real_validation_df = pd.DataFrame({
    'Player': val_players,
    'Ast': y_val
})
# Actual assist leader within the validation data
actual_assist_leader = real_validation_df.loc[real_validation_df['Ast'].idxmax(), 'Player']

# Compare the assist leaders (the message uses the correct object particle,
# '도움왕을', matching the wording printed for the first model)
if actual_assist_leader == predicted_assist_leader:
    print(f"모델이 정확하게 도움왕을 예측했습니다: {actual_assist_leader}")
else:
    print(f"모델의 예측 도움왕: {predicted_assist_leader}, 실제 도움왕: {actual_assist_leader}")

모델이 정확하게 도움왕를 예측했습니다: Karim Benzema


## 1차 모델의 피처 중요도 확인

In [18]:
# Importance of each feature in the first model, largest first
feature_importances = best_model_1st.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

print(importance_df)

    Feature  Importance
29      xAG    0.727746
31    A-xAG    0.263238
30       xA    0.009016
0    Nation    0.000000
23  Matches    0.000000
25      Att    0.000000
26     Cmp%    0.000000
27  TotDist    0.000000
28  PrgDist    0.000000
32       KP    0.000000
33      1/3    0.000000
34      PPA    0.000000
35    CrsPA    0.000000
36     PrgP    0.000000
37      Won    0.000000
38     Lost    0.000000
39     Won%    0.000000
40      Tkl    0.000000
41     TklW    0.000000
42  Def 3rd    0.000000
43  Mid 3rd    0.000000
24      Cmp    0.000000
22  np:G-xG    0.000000
1       Pos    0.000000
21     G-xG    0.000000
2     Squad    0.000000
3       Age    0.000000
4      Born    0.000000
5       90s    0.000000
6       Gls    0.000000
7        Sh    0.000000
8       SoT    0.000000
9      SoT%    0.000000
10    Sh/90    0.000000
11   SoT/90    0.000000
12     G/Sh    0.000000
13    G/SoT    0.000000
14     Dist    0.000000
15       FK    0.000000
16       PK    0.000000
17    PKatt    0

## 2차 모델의 피처 중요도 확인

In [19]:
# Importance of each feature in the second model, largest first
feature_importances = best_model_2nd.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

print(importance_df)

    Feature  Importance
29      xAG    0.671093
31    A-xAG    0.297529
30       xA    0.019831
32       KP    0.001184
22  np:G-xG    0.000599
35    CrsPA    0.000586
36     PrgP    0.000535
5       90s    0.000509
14     Dist    0.000461
0    Nation    0.000459
34      PPA    0.000455
15       FK    0.000397
6       Gls    0.000377
39     Won%    0.000363
33      1/3    0.000342
44  Att 3rd    0.000338
16       PK    0.000327
13    G/SoT    0.000322
11   SoT/90    0.000319
3       Age    0.000305
2     Squad    0.000276
8       SoT    0.000261
7        Sh    0.000260
9      SoT%    0.000248
10    Sh/90    0.000247
26     Cmp%    0.000225
40      Tkl    0.000224
19     npxG    0.000220
21     G-xG    0.000218
38     Lost    0.000172
4      Born    0.000158
25      Att    0.000141
37      Won    0.000135
42  Def 3rd    0.000123
43  Mid 3rd    0.000112
20  npxG/Sh    0.000101
18       xG    0.000101
12     G/Sh    0.000092
1       Pos    0.000079
24      Cmp    0.000078
27  TotDist    0

# Apply Model to Test Data

## Load Test Data

In [20]:
# Read the 2023-2024 La Liga table; the file is decoded as cp949
laliga_test = pd.read_csv('../csv/LaLiga/2023-2024_La.csv', encoding='cp949')

# Print column dtypes and non-null counts
laliga_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 609 entries, 0 to 608
Data columns (total 48 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Rk       609 non-null    int64  
 1   Player   609 non-null    object 
 2   Nation   609 non-null    object 
 3   Pos      609 non-null    object 
 4   Squad    609 non-null    object 
 5   Age      607 non-null    object 
 6   Born     607 non-null    float64
 7   90s      609 non-null    float64
 8   Gls      609 non-null    int64  
 9   Sh       609 non-null    int64  
 10  SoT      609 non-null    int64  
 11  SoT%     493 non-null    float64
 12  Sh/90    609 non-null    float64
 13  SoT/90   609 non-null    float64
 14  G/Sh     493 non-null    float64
 15  G/SoT    413 non-null    float64
 16  Dist     493 non-null    float64
 17  FK       606 non-null    float64
 18  PK       609 non-null    int64  
 19  PKatt    609 non-null    int64  
 20  xG       606 non-null    float64
 21  npxG     606 non

In [21]:
# Drop the 'Rk' index column. errors='ignore' keeps this cell re-runnable:
# once 'Rk' is gone, a second execution is a no-op instead of a KeyError.
laliga_test = laliga_test.drop(columns=['Rk'], errors='ignore')

# Collapse the two spellings of each dual-position label into one, using the
# same mapping that is applied to the training table.
replace_values = {
    'DF,FW': 'FW,DF',
    'DF,MF': 'MF,DF',
    'MF,FW': 'FW,MF'
}

laliga_test['Pos'] = laliga_test['Pos'].replace(replace_values)

In [22]:
# Keep only the part of 'Age' that precedes the '-'.
# astype(str) first makes the cell safe to re-run: after one execution the
# column is numeric and the .str accessor would otherwise raise. Missing
# values become the text 'nan', which to_numeric below turns back into NaN.
age_prefix = laliga_test['Age'].astype(str).str.split('-').str[0]

# Convert to numbers; anything that cannot be parsed becomes NaN
laliga_test['Age'] = pd.to_numeric(age_prefix, errors='coerce')

## 결측치 채우기

In [23]:
# Fill missing values in every numeric column of the test table.
numeric_laliga_test = laliga_test.select_dtypes(include=[np.number])
column_names = numeric_laliga_test.columns

# NOTE(review): this refits the shared `imputer` on the test table, so the
# means come from the test data and the statistics learned from the training
# table are overwritten — confirm this is intended.
numeric_laliga_test = imputer.fit_transform(numeric_laliga_test)

# Rebuild the frame on the original row index. The assignment below aligns on
# index labels, so a fresh 0..n-1 index would silently put NaN back whenever
# laliga_test is not indexed 0..n-1.
numeric_laliga_test_df = pd.DataFrame(
    numeric_laliga_test,
    columns=column_names,
    index=laliga_test.index,
)

laliga_test[column_names] = numeric_laliga_test_df.round(2)

# Check, column by column, whether any NaN is left
column_nan = laliga_test.isna().any()
print(column_nan)

Player     False
Nation     False
Pos        False
Squad      False
Age        False
Born       False
90s        False
Gls        False
Sh         False
SoT        False
SoT%       False
Sh/90      False
SoT/90     False
G/Sh       False
G/SoT      False
Dist       False
FK         False
PK         False
PKatt      False
xG         False
npxG       False
npxG/Sh    False
G-xG       False
np:G-xG    False
Matches    False
Cmp        False
Att        False
Cmp%       False
TotDist    False
PrgDist    False
Ast        False
xAG        False
xA         False
A-xAG      False
KP         False
1/3        False
PPA        False
CrsPA      False
PrgP       False
Won        False
Lost       False
Won%       False
Tkl        False
TklW       False
Def 3rd    False
Mid 3rd    False
Att 3rd    False
dtype: bool


## Independent Variable, Dependent Variable

In [24]:
# Features / target of the 23-24 table, under names of their own so the
# held-out `X_test` / `y_test` produced by the train/validation/test split are
# not overwritten.
X_laliga_test = laliga_test.drop(columns=['Ast', 'Player'])
y_laliga_test = laliga_test['Ast']

# Encode with the encoder exactly as it was fitted on the training table.
# Refitting here (fit_transform with this season's assists) would build the
# encoded features from the very target values the models are scored against.
laliga_test_encoded = target_encoder.transform(X_laliga_test)

# Put the target back next to the encoded features
laliga_test_encoded['Ast'] = y_laliga_test

X_encoded_test = laliga_test_encoded.drop(columns=['Ast'])
y_encoded_test = laliga_test_encoded['Ast']

## 1st Model Test

In [25]:
# Score the first model on the encoded 23-24 La Liga table
test_preds = best_model_1st.predict(X_encoded_test)
test_coef = best_model_1st.score(X_encoded_test, y_encoded_test)
rmse = root_mean_squared_error(y_true=y_encoded_test, y_pred=test_preds)
print(f'1st Random Forest evaluation by RMSE: {rmse:.2f}')
print(f'1st Random Forest evaluation by coefficient: {test_coef:.2f}')

# Attach the predictions to a copy of the features, then read the name of the
# player on the row with the largest predicted value
copied_X_test = X_encoded_test.assign(Predicted_Ast=test_preds)
max_predicted_index = copied_X_test['Predicted_Ast'].idxmax()
predicted_assist_leader = laliga_test.loc[max_predicted_index, 'Player']

print(f'Assist Leader in the 23-24 season predicted by the model: {predicted_assist_leader}')

1st Random Forest evaluation by RMSE: 0.83
1st Random Forest evaluation by coefficient: 0.80
Assist Leader in the 23-24 season predicted by the model: Alex Baena


## 2nd Model Test

In [26]:
# Score the second model on the encoded 23-24 La Liga table
test_preds = best_model_2nd.predict(X_encoded_test)
test_coef = best_model_2nd.score(X_encoded_test, y_encoded_test)
rmse = root_mean_squared_error(y_true=y_encoded_test, y_pred=test_preds)
print(f'2nd Random Forest evaluation by RMSE: {rmse:.2f}')
print(f'2nd Random Forest evaluation by coefficient: {test_coef:.2f}')

# Attach the predictions to a copy of the features, then read the name of the
# player on the row with the largest predicted value
copied_X_test = X_encoded_test.assign(Predicted_Ast=test_preds)
max_predicted_index = copied_X_test['Predicted_Ast'].idxmax()
predicted_assist_leader = laliga_test.loc[max_predicted_index, 'Player']

print(f'Assist Leader in the 23-24 season predicted by the model: {predicted_assist_leader}')

2nd Random Forest evaluation by RMSE: 0.16
2nd Random Forest evaluation by coefficient: 0.99
Assist Leader in the 23-24 season predicted by the model: Alex Baena


## 2차 모델의 피처 중요도 확인

In [34]:
# Importance of each feature in the second model, largest first
feature_importances = best_model_2nd.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

print(importance_df)

    Feature  Importance
29      xAG    0.671093
31    A-xAG    0.297529
30       xA    0.019831
32       KP    0.001184
22  np:G-xG    0.000599
35    CrsPA    0.000586
36     PrgP    0.000535
5       90s    0.000509
14     Dist    0.000461
0    Nation    0.000459
34      PPA    0.000455
15       FK    0.000397
6       Gls    0.000377
39     Won%    0.000363
33      1/3    0.000342
44  Att 3rd    0.000338
16       PK    0.000327
13    G/SoT    0.000322
11   SoT/90    0.000319
3       Age    0.000305
2     Squad    0.000276
8       SoT    0.000261
7        Sh    0.000260
9      SoT%    0.000248
10    Sh/90    0.000247
26     Cmp%    0.000225
40      Tkl    0.000224
19     npxG    0.000220
21     G-xG    0.000218
38     Lost    0.000172
4      Born    0.000158
25      Att    0.000141
37      Won    0.000135
42  Def 3rd    0.000123
43  Mid 3rd    0.000112
20  npxG/Sh    0.000101
18       xG    0.000101
12     G/Sh    0.000092
1       Pos    0.000079
24      Cmp    0.000078
27  TotDist    0

# Premier League Test

In [30]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.metrics import mean_squared_error

class PremierLeagueDataProcessor:
    """Load a season CSV and turn it into model-ready features and target.

    Intended call order: ``load_data()`` -> ``impute_values()`` ->
    ``preprocess_data()`` -> ``evaluate_model(...)``.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    encoding : str, default 'cp949'
        Text encoding passed to ``pandas.read_csv``.
    imputer : object, optional
        An already fitted imputer exposing ``transform``. When given, it is
        applied as is and never refitted. When omitted, a mean
        ``SimpleImputer`` is created and fitted on the loaded file itself.
    target_encoder : object, optional
        An already fitted encoder exposing ``transform``. When given, it is
        applied as is and never refitted. When omitted, a ``TargetEncoder``
        for Nation/Pos/Squad/Matches is created and fitted on the loaded file,
        which means the encoded features are derived from that file's own
        ``Ast`` values; pass the encoder fitted on the training data to avoid
        that when the file is used for evaluation.
    """

    def __init__(self, file_path, encoding='cp949', imputer=None, target_encoder=None):
        self.file_path = file_path
        self.encoding = encoding

        # A transformer supplied by the caller is taken to be fitted already,
        # so it is only ever applied, never refitted.
        self._imputer_prefit = imputer is not None
        self._encoder_prefit = target_encoder is not None

        if imputer is None:
            imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        if target_encoder is None:
            target_encoder = ce.TargetEncoder(cols=['Nation', 'Pos', 'Squad', 'Matches'])
        self.imputer = imputer
        self.target_encoder = target_encoder

        self.df = None
        self.processed_data = None

    def _require_loaded(self):
        """Return the loaded frame; raise RuntimeError if nothing is loaded yet."""
        if self.df is None:
            raise RuntimeError("No data loaded: call load_data() first.")
        return self.df

    def load_data(self):
        """Read the CSV, print its summary, drop 'Rk' and normalise 'Pos'.

        Returns
        -------
        pandas.DataFrame
            The cleaned frame, which is also stored on ``self.df``.
        """
        self.df = pd.read_csv(self.file_path, encoding=self.encoding)
        self.df.info()
        self.df = self.df.drop(['Rk'], axis=1)

        # Collapse the two spellings of each dual-position label into one
        replace_values = {
            'DF,FW': 'FW,DF',
            'DF,MF': 'MF,DF',
            'MF,FW': 'FW,MF'
        }

        self.df['Pos'] = self.df['Pos'].replace(replace_values)

        return self.df

    def impute_values(self):
        """Fill missing values in the numeric columns and round them to 2 decimals.

        Returns
        -------
        pandas.DataFrame
            ``self.df`` with its numeric columns replaced by the imputed values.

        Raises
        ------
        RuntimeError
            If ``load_data()`` has not been called.
        """
        frame = self._require_loaded()
        numeric_data = frame.select_dtypes(include=[np.number])
        col_names = numeric_data.columns
        if len(col_names) == 0:
            # Nothing numeric to impute
            return self.df

        if not self._imputer_prefit:
            self.imputer.fit(numeric_data)
        imputed_data = self.imputer.transform(numeric_data)

        # Keep the original row index: the assignment aligns on index labels,
        # so a fresh 0..n-1 index would silently re-introduce NaN whenever the
        # frame is not indexed 0..n-1.
        self.df[col_names] = pd.DataFrame(
            imputed_data, columns=col_names, index=frame.index
        ).round(2)
        return self.df

    def preprocess_data(self):
        """Encode the categorical columns and split into features and target.

        'Player' and 'Ast' are removed from the features; 'Ast' is the target.
        The encoded frame, with 'Ast' appended, is kept on
        ``self.processed_data``.

        Returns
        -------
        tuple
            ``(X_test, y_test)``: encoded feature frame and 'Ast' series.

        Raises
        ------
        RuntimeError
            If ``load_data()`` has not been called.
        """
        frame = self._require_loaded()
        X = frame.drop(columns=['Player', 'Ast'])
        y = frame['Ast']

        if self._encoder_prefit:
            # Apply the caller's fitted encoder without looking at this target
            encoded = self.target_encoder.transform(X)
        else:
            encoded = self.target_encoder.fit_transform(X, y)

        self.processed_data = pd.DataFrame(encoded, columns=X.columns)
        self.processed_data['Ast'] = y.values

        X_test = self.processed_data.drop(columns=['Ast'])
        y_test = self.processed_data['Ast']
        return X_test, y_test

    def evaluate_model(self, model, X_test, y_test):
        """Score ``model`` on the given features and target.

        Returns
        -------
        tuple
            ``(rmse, score)``: the square root of the mean squared error of
            ``model.predict(X_test)`` against ``y_test``, and
            ``model.score(X_test, y_test)``.
        """
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        score = model.score(X_test, y_test)
        return rmse, score

# Usage example
# Path of the 2023-2024 Premier League table
file_path = "../csv/2023-2024_Pre.csv"
processor = PremierLeagueDataProcessor(file_path)

# Read and clean the file, then fill the missing numeric values
processor.load_data()
processor.impute_values()
print(">>> Preprocessing data...")
# Summary of the frame after imputation
processor.df.info()

# Encoded features and target for the model tests that follow
X_encoded_test, y_encoded_test = processor.preprocess_data()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 580 entries, 0 to 579
Data columns (total 48 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Rk       580 non-null    int64  
 1   Player   580 non-null    object 
 2   Nation   580 non-null    object 
 3   Pos      580 non-null    object 
 4   Squad    580 non-null    object 
 5   Age      580 non-null    int64  
 6   Born     580 non-null    int64  
 7   90s      580 non-null    float64
 8   Gls      580 non-null    int64  
 9   Sh       580 non-null    int64  
 10  SoT      580 non-null    int64  
 11  SoT%     465 non-null    float64
 12  Sh/90    580 non-null    float64
 13  SoT/90   580 non-null    float64
 14  G/Sh     465 non-null    float64
 15  G/SoT    402 non-null    float64
 16  Dist     465 non-null    float64
 17  FK       580 non-null    int64  
 18  PK       580 non-null    int64  
 19  PKatt    580 non-null    int64  
 20  xG       580 non-null    float64
 21  npxG     580 non

## 1st Model Test

In [31]:
# Metrics of the first model come from the processor helper; the predictions
# are computed separately because they are needed for the leader lookup below.
rmse, test_coef = processor.evaluate_model(best_model_1st, X_encoded_test, y_encoded_test)
test_preds = best_model_1st.predict(X_encoded_test)

print(f'1st Random Forest evaluation by RMSE: {rmse:.2f}')
print(f'1st Random Forest evaluation by coefficient: {test_coef:.2f}')

# Attach the predictions to a copy of the features, then read the name of the
# player on the row with the largest predicted value
copied_X_test = X_encoded_test.assign(Predicted_Ast=test_preds)
max_predicted_index = copied_X_test['Predicted_Ast'].idxmax()
predicted_assist_leader = processor.df.loc[max_predicted_index, 'Player']

print(f'Assist Leader in the 23-24 season predicted by the model: {predicted_assist_leader}')

1st Random Forest evaluation by RMSE: 1.05
1st Random Forest evaluation by coefficient: 0.80
Assist Leader in the 23-24 season predicted by the model: Kieran Trippier


## 2nd Model Test

In [32]:
# Metrics of the second model come from the processor helper; the predictions
# are computed separately because they are needed for the leader lookup below.
rmse, test_coef = processor.evaluate_model(best_model_2nd, X_encoded_test, y_encoded_test)
test_preds = best_model_2nd.predict(X_encoded_test)

print(f'2nd Random Forest evaluation by RMSE: {rmse:.2f}')
print(f'2nd Random Forest evaluation by coefficient: {test_coef:.2f}')

# Attach the predictions to a copy of the features, then read the name of the
# player on the row with the largest predicted value
copied_X_test = X_encoded_test.assign(Predicted_Ast=test_preds)
max_predicted_index = copied_X_test['Predicted_Ast'].idxmax()
predicted_assist_leader = processor.df.loc[max_predicted_index, 'Player']

print(f'Assist Leader in the 23-24 season predicted by the model: {predicted_assist_leader}')

2nd Random Forest evaluation by RMSE: 0.29
2nd Random Forest evaluation by coefficient: 0.98
Assist Leader in the 23-24 season predicted by the model: Pascal Gross
