In [1]:
from ml_model_eval import pred_proba_plot, plot_cross_val_confusion_matrix, plot_learning_curve
from data_processing import scale_df
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
import pandas as pd
from sklearn import preprocessing
df_saved_name = 'football_data.csv'
df_ml = pd.read_csv(df_saved_name)
df_ml=df_ml.fillna(0)
columns_to_keep = ['Time','Div','Date','HomeTeam','AwayTeam','FTR','Referee','HTR']
df_keep = df_ml[columns_to_keep]
columns_to_scale = [col for col in df_ml.columns if col not in columns_to_keep]
df_scale = df_ml[columns_to_scale]
scaled_np = preprocessing.scale(df_scale)
scaled_df = pd.DataFrame(scaled_np, columns=df_scale.columns)
data = pd.concat([df_keep, scaled_df], axis=1)
data=data.drop(['Div','Referee','HTR','Time','Date'],axis=1)
desired_columns = ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']
data = data.loc[:, desired_columns]

last_season = 'last_season.csv'
last_season = pd.read_csv(last_season)
last_season=last_season.fillna(0)


def calculate_rolling_averages_home(df, team, n=5):
    # Columns to exclude from rolling average calculation
    exclude_columns = ['FTR', 'HomeTeam', 'AwayTeam']
    
    # Filter DataFrame for rows where the specified team is either the HomeTeam or AwayTeam
    team_df = df[(df['HomeTeam'] == team)]
    
    # Columns to calculate rolling averages for
    columns_to_average = team_df.columns.difference(exclude_columns)
    # Calculate rolling averages for the specified columns
    rolling_stats = team_df[columns_to_average].rolling(window=n).mean().shift(1)
    # Concatenate the team information with the rolling statistics
    rolling_stats = pd.concat([team_df[exclude_columns], rolling_stats], axis=1)
    
    return rolling_stats
def calculate_rolling_averages_away(df, team, n=10):
    # Columns to exclude from rolling average calculation
    exclude_columns = ['FTR', 'HomeTeam', 'AwayTeam']
    
    # Filter DataFrame for rows where the specified team is either the HomeTeam or AwayTeam
    team_df = df[(df['AwayTeam'] == team)]
    
    # Columns to calculate rolling averages for
    columns_to_average = team_df.columns.difference(exclude_columns)
    # Calculate rolling averages for the specified columns
    rolling_stats = team_df[columns_to_average].rolling(window=n).mean().shift(1)
    # Concatenate the team information with the rolling statistics
    rolling_stats = pd.concat([team_df[exclude_columns], rolling_stats], axis=1)
    
    return rolling_stats


In [2]:
home_win = last_season.groupby('HomeTeam')['FTR'].apply(lambda x: (x == 'H').mean())
home_draw = last_season.groupby('HomeTeam')['FTR'].apply(lambda x: (x == 'D').mean())
away_win = last_season.groupby('AwayTeam')['FTR'].apply(lambda x: (x == 'A').mean())
away_draw = last_season.groupby('AwayTeam')['FTR'].apply(lambda x: (x == 'D').mean())

# Tạo DataFrame mới từ các Series trên
last_season_stats = pd.DataFrame({
    'LSHW': home_win,
    'LSHD': home_draw,
    'LSAW': away_win,
    'LSAD': away_draw
}).reset_index()

# Đổi tên cột 'index' thành 'Team' để dễ dàng merge
last_season_stats.rename(columns={'index': 'Team'}, inplace=True)

In [3]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np
# Prepare the dataset
def prepare_dataset_away(data, teams):
    all_data = []
    for team in teams:
        rolling_stats = calculate_rolling_averages_away(data, team)
        all_data.append(rolling_stats)
    return pd.concat(all_data)
def prepare_dataset_home(data, teams):
    all_data = []
    for team in teams:
        rolling_stats = calculate_rolling_averages_home(data, team)
        all_data.append(rolling_stats)
    return pd.concat(all_data)
# List of teams
teams = data['HomeTeam'].unique()

# Prepare the dataset
prepare_dataset_away = prepare_dataset_away(data, teams)
prepare_dataset_away = prepare_dataset_away.fillna(0)
prepare_dataset_home = prepare_dataset_home(data, teams)
prepare_dataset_home = prepare_dataset_home.fillna(0)
prepare_dataset_away['FTR'] = prepare_dataset_away['FTR'].astype('category').cat.codes
prepare_dataset_home['FTR'] = prepare_dataset_home['FTR'].astype('category').cat.codes

import pandas as pd

# Giả sử bạn đã có hai dataframe prepare_dataset_away và prepare_dataset_home

# Merge hai dataframe
merged_df = pd.merge(prepare_dataset_away, prepare_dataset_home, on=['FTR', 'HomeTeam', 'AwayTeam'], suffixes=('_away', '_home'))

# Chọn các cột cần thiết
columns_away = ['AC_away', 'AF_away', 'AR_away', 'AS_away', 'AST_away', 'AY_away', 'FTAG_away']
columns_home = ['FTHG_home', 'HC_home', 'HF_home', 'HR_home', 'HS_home', 'HST_home', 'HY_home']
columns_to_group = ['FTR', 'HomeTeam', 'AwayTeam'] + columns_away + columns_home

# Tạo dataframe mới và nhóm theo 'FTR', 'HomeTeam', 'AwayTeam'
new_df = merged_df[columns_to_group].groupby(['FTR', 'HomeTeam', 'AwayTeam']).mean().reset_index()

current_df_copy = new_df.copy()

# Tiếp theo, merge `current_df_copy` với `last_season_df` để lấy thông số cho đội nhà
current_df_copy = current_df_copy.merge(last_season_stats[['Team', 'LSHW', 'LSHD']], left_on='HomeTeam', right_on='Team', how='left')
current_df_copy.rename(columns={'LSHW': 'LSHW_home', 'LSHD': 'LSHD_home'}, inplace=True)

# Sau đó, merge `current_df_copy` với `last_season_df` một lần nữa để lấy thông số cho đội khách
current_df_copy = current_df_copy.merge(last_season_stats[['Team', 'LSAW', 'LSAD']], left_on='AwayTeam', right_on='Team', how='left', suffixes=('', '_away'))
current_df_copy.rename(columns={'LSAW': 'LSAW_away', 'LSAD': 'LSAD_away'}, inplace=True)

# Loại bỏ các cột 'Team' và 'Team_away' không cần thiết
current_df_copy.drop(['Team', 'Team_away'], axis=1, inplace=True)

# Tạo một bản đồ thay thế cho các đội đặc biệt
special_teams_map = {
    'Burnley': 'Leicester',
    'Sheffield United': 'Leeds',
    'Luton': 'Southampton'
}

# Thay thế thông số của các đội đặc biệt bằng thông số của đội tương ứng
for team, replace_with in special_teams_map.items():
    # Lấy thông số của đội thay thế
    replace_stats = last_season_stats.loc[last_season_stats['Team'] == replace_with, ['LSHW', 'LSHD', 'LSAW', 'LSAD']].iloc[0]
    
    # Cập nhật thông số cho đội cần thay thế trong DataFrame hiện tại
    current_df_copy.loc[current_df_copy['HomeTeam'] == team, ['LSHW_home', 'LSHD_home']] = replace_stats[['LSHW', 'LSHD']].values
    current_df_copy.loc[current_df_copy['AwayTeam'] == team, ['LSAW_away', 'LSAD_away']] = replace_stats[['LSAW', 'LSAD']].values

# Encode categorical variables
current_df_copy['HomeTeam'] = current_df_copy['HomeTeam'].astype('category').cat.codes
current_df_copy['AwayTeam'] = current_df_copy['AwayTeam'].astype('category').cat.codes
# Define features and target variable
X = current_df_copy.drop('FTR', axis=1)
y = current_df_copy['FTR']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
param_grid = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Initialize the classifier
rf = RandomForestClassifier(random_state=42)

# Initialize the GridSearchCV with error handling
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, error_score=np.nan)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Predict with the best model
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print the accuracy score
print("Accuracy Score:")
print(accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best parameters found:  {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.58      0.61        33
           1       0.00      0.00      0.00         7
           2       0.64      0.75      0.69        36

    accuracy                           0.61        76
   macro avg       0.43      0.44      0.44        76
weighted avg       0.59      0.61      0.59        76

Confusion Matrix:
[[19  2 12]
 [ 4  0  3]
 [ 6  3 27]]
Accuracy Score:
0.6052631578947368


In [5]:
# Import necessary libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_neighbors': [1,3, 5, 7, 9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'p': [1, 2]
}

# Initialize the KNN classifier
knn = KNeighborsClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Predict with the best model
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix 
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print the accuracy score
print("Accuracy Score:") 
print(accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 252 candidates, totalling 1260 fits
Best parameters found:  {'metric': 'manhattan', 'n_neighbors': 41, 'p': 1, 'weights': 'uniform'}
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.21      0.30        33
           1       0.00      0.00      0.00         7
           2       0.51      0.86      0.64        36

    accuracy                           0.50        76
   macro avg       0.34      0.36      0.31        76
weighted avg       0.46      0.50      0.43        76

Confusion Matrix:
[[ 7  0 26]
 [ 3  0  4]
 [ 4  1 31]]
Accuracy Score:
0.5


In [6]:
# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np

# Define the parameter grid for GridSearchCV specific to Decision Tree
# Define the parameter grid for GridSearchCV specific to Decision Tree
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]  # Removed 'auto' from the options
}

# Initialize the Decision Tree classifier
dt = DecisionTreeClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Predict with the best model
best_dt = grid_search.best_estimator_
y_pred = best_dt.predict(X_test)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print the accuracy score
print("Accuracy Score:")
print(accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
Best parameters found:  {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2}
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.55      0.60        33
           1       0.00      0.00      0.00         7
           2       0.66      0.75      0.70        36

    accuracy                           0.59        76
   macro avg       0.44      0.43      0.43        76
weighted avg       0.60      0.59      0.59        76

Confusion Matrix:
[[18  5 10]
 [ 3  0  4]
 [ 6  3 27]]
Accuracy Score:
0.5921052631578947


In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
# Initialize the Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Define the parameter grid for GridSearchCV
param_grid = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=gnb, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy', verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Predict with the best model
best_gnb = grid_search.best_estimator_
y_pred = best_gnb.predict(X_test)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print the accuracy score
print("Accuracy Score:")
print(accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'var_smoothing': 0.02848035868435802}
Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.30      0.40        33
           1       0.00      0.00      0.00         7
           2       0.53      0.86      0.65        36

    accuracy                           0.54        76
   macro avg       0.37      0.39      0.35        76
weighted avg       0.50      0.54      0.48        76

Confusion Matrix:
[[10  0 23]
 [ 2  0  5]
 [ 5  0 31]]
Accuracy Score:
0.5394736842105263


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

# Định nghĩa tham số grid cho GridSearchCV
param_grid = {
    'C': [0.1, 1, 10], 
    'gamma': [0.0001, 0.001, 0.01, 0.1],
    'kernel': ['rbf', 'linear']
}

# Khởi tạo mô hình SVM 
svm = SVC()

# Khởi tạo GridSearchCV với scoring phù hợp cho bài toán phân loại đa lớp
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Huấn luyện mô hình với GridSearchCV
grid_search.fit(X_train, y_train)

# Lấy siêu tham số tốt nhất
best_params = grid_search.best_params_
print("Siêu tham số tốt nhất: ", best_params)

# Đánh giá mô hình với siêu tham số tốt nhất
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test)

# In ra báo cáo phân loại
print("Báo cáo phân loại:")
print(classification_report(y_test, y_pred))

# In ra ma trận nhầm lẫn
print("Ma trận nhầm lẫn:")
print(confusion_matrix(y_test, y_pred))

# In ra độ chính xác
print("Độ chính xác:")
print(accuracy_score(y_test, y_pred))

Siêu tham số tốt nhất:  {'C': 1, 'gamma': 0.0001, 'kernel': 'linear'}
Báo cáo phân loại:
              precision    recall  f1-score   support

           0       0.62      0.45      0.53        33
           1       0.00      0.00      0.00         7
           2       0.59      0.83      0.69        36

    accuracy                           0.59        76
   macro avg       0.40      0.43      0.41        76
weighted avg       0.55      0.59      0.56        76

Ma trận nhầm lẫn:
[[15  1 17]
 [ 3  0  4]
 [ 6  0 30]]
Độ chính xác:
0.5921052631578947
