In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

In [2]:
# Read both match tables and discard the columns that are not carried forward.
# 'opp formation' is dropped from data.csv only.
df1 = pd.read_csv('data.csv').drop(
    columns=['Unnamed: 0', 'notes', 'match report', 'formation', 'opp formation', 'pk', 'pkatt']
)
df2 = pd.read_csv('matches.csv').drop(
    columns=['Unnamed: 0', 'notes', 'match report', 'formation', 'pk', 'pkatt']
)

# Stack the two tables row-wise; each part keeps its own row labels.
matches = pd.concat([df1, df2])

In [3]:
# Parse the date column; values that cannot be parsed become NaT instead of raising.
matches['date'] = pd.to_datetime(matches['date'], errors='coerce')

# dropna returns a new frame, so the result must be bound back to `matches`;
# otherwise the undated rows are only hidden in this cell's display and still
# reach every later cell. `.copy()` gives later column assignments an
# independent frame to write into.
matches = matches.dropna(subset=['date']).copy()

# Display the cleaned frame as the cell output.
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,poss,attendance,captain,referee,sh,sot,dist,fk,season,team
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,...,62.0,30014.0,Virgil van Dijk,Tim Robinson,18.0,5.0,14.8,0.0,2023.0,Liverpool
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,...,62.0,60017.0,Virgil van Dijk,Stuart Attwell,19.0,8.0,13.6,1.0,2023.0,Liverpool
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3.0,0.0,Manchester Utd,...,47.0,73738.0,Virgil van Dijk,Anthony Taylor,11.0,3.0,13.4,0.0,2023.0,Liverpool
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0.0,1.0,Nott'ham Forest,...,68.0,60344.0,Virgil van Dijk,Michael Oliver,14.0,5.0,14.9,0.0,2023.0,Liverpool
4,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3.0,0.0,Bournemouth,...,58.0,60347.0,Virgil van Dijk,Tony Harrington,19.0,12.0,16.6,0.0,2023.0,Liverpool
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,...,34.0,,John Egan,Andre Marriner,8.0,1.0,17.4,0.0,2021.0,Sheffield United
1385,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,...,50.0,,John Egan,Simon Hooper,7.0,0.0,11.4,1.0,2021.0,Sheffield United
1386,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,...,38.0,,John Egan,Jonathan Moss,10.0,3.0,17.0,0.0,2021.0,Sheffield United
1387,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,...,50.0,10000.0,John Egan,Robert Jones,11.0,1.0,16.0,1.0,2021.0,Sheffield United


In [4]:
from sklearn.preprocessing import LabelEncoder

# Numeric encodings of the categorical columns:
#   result_encoded: 0 for a win, 1 for a loss or a draw
#   venue_encoded:  0 for home, 1 for away
# Values outside the mappings become NaN.
matches = matches.assign(
    result_encoded=matches['result'].map({'W': 0, 'L': 1, 'D': 1}),
    venue_encoded=matches['venue'].map({'Home': 0, 'Away': 1}),
)

# Overwrite the referee column with integer labels; the fitted encoder stays
# available as `le`.
le = LabelEncoder()
referee_codes = le.fit_transform(matches['referee'])
matches['referee'] = referee_codes

In [5]:
def calculate_last_5_stats(team, date, data=None, window=5):
    """
    Summarise a team's form over its most recent matches before a given date.

    Parameters
    ----------
    team : value compared against the 'team' column.
    date : only rows whose 'date' is strictly earlier than this are considered.
    data : DataFrame with 'team', 'date', 'result', 'sh', 'sot', 'gf' and 'ga'
        columns. Defaults to the notebook-level ``matches`` frame.
    window : maximum number of most recent matches to use (default 5).

    Returns
    -------
    tuple
        (wins, avg_shots, avg_shots_on_target, avg_goals_for, avg_goals_against).
        When fewer than ``window`` earlier matches exist, the figures are based
        on the matches that are available; with none, every value is 0.
    """
    if data is None:
        data = matches

    earlier = data[(data['team'] == team) & (data['date'] < date)]

    # Sort explicitly so "most recent" does not depend on the row order of the
    # frame that was passed in; a stable sort keeps same-day rows in their
    # original relative order.
    past_matches = earlier.sort_values('date', kind='stable').tail(window)

    total_matches = len(past_matches)
    if total_matches == 0:
        return 0, 0, 0, 0, 0

    # Every remaining row already belongs to `team`, so only the result is checked.
    wins = int((past_matches['result'] == 'W').sum())

    # Shots and shots on target
    avg_shots_for = past_matches['sh'].sum() / total_matches
    avg_shots_on_target_for = past_matches['sot'].sum() / total_matches

    # Goals for / against
    avg_gf = past_matches['gf'].sum() / total_matches
    avg_ga = past_matches['ga'].sum() / total_matches

    return wins, avg_shots_for, avg_shots_on_target_for, avg_gf, avg_ga

# Order the rows by team and then by date, and renumber them from 0,
# discarding the previous row labels.
matches = matches.sort_values(by=['team', 'date'])
matches = matches.reset_index(drop=True)

# For each row, compute that team's form from the matches dated before the row,
# one output column per value returned by calculate_last_5_stats.
last5_stats = matches.apply(
    lambda fixture: pd.Series(calculate_last_5_stats(fixture['team'], fixture['date'])),
    axis=1,
)
matches[['Last5Wins', 'Last5AvgSh', 'Last5AvgSot', 'Last5AvgGf', 'Last5AvgGa']] = last5_stats

In [6]:
# Split the rows by venue, then pair each "Home" row with the "Away" row that
# shares the same date, kick-off time, referee, competition and weekday.
# Columns present on both sides get a _home / _away suffix.
home_rows = matches[matches["venue"] == "Home"]
away_rows = matches[matches["venue"] == "Away"]

merged_df = home_rows.merge(
    away_rows,
    on=["date", "time", "referee", "comp", "day"],
    suffixes=("_home", "_away"),
)

# Keep the identifying columns, the home side's encoded result, and the match
# and recent-form statistics: all home columns first, then all away columns.
merged_df = merged_df[
    ["date", "referee", "team_home", "team_away", "result_encoded_home"]
    + [
        f"{stat}_{side}"
        for side in ("home", "away")
        for stat in (
            "gf", "ga", "sh", "sot",
            'Last5Wins', 'Last5AvgSh', 'Last5AvgSot', 'Last5AvgGf', 'Last5AvgGa',
        )
    ]
]

In [7]:
# Display the merged home/away frame as this cell's output.
merged_df


Unnamed: 0,date,referee,team_home,team_away,result_encoded_home,gf_home,ga_home,sh_home,sot_home,Last5Wins_home,...,Last5AvgGa_home,gf_away,ga_away,sh_away,sot_away,Last5Wins_away,Last5AvgSh_away,Last5AvgSot_away,Last5AvgGf_away,Last5AvgGa_away
0,2020-09-19,18,Arsenal,West Ham United,0.0,2.0,1.0,6.0,3.0,1.0,...,0.000000,1.0,2.0,15.0,3.0,0.0,15.000000,3.000000,0.0,2.000000
1,2020-10-04,14,Arsenal,Sheffield United,0.0,2.0,1.0,6.0,5.0,2.0,...,1.333333,1.0,2.0,6.0,2.0,0.0,8.333333,1.333333,0.0,1.333333
2,2020-10-25,4,Arsenal,Leicester City,1.0,0.0,1.0,12.0,4.0,3.0,...,1.200000,1.0,0.0,6.0,3.0,3.0,8.400000,4.000000,2.4,1.600000
3,2020-11-08,16,Arsenal,Aston Villa,1.0,0.0,3.0,13.0,2.0,2.0,...,1.200000,3.0,0.0,17.0,5.0,3.0,14.000000,6.400000,2.8,1.800000
4,2020-11-29,18,Arsenal,Wolverhampton Wanderers,1.0,1.0,2.0,13.0,2.0,1.0,...,1.000000,2.0,1.0,11.0,5.0,2.0,13.000000,4.600000,1.0,0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
927,2024-10-20,3,Wolverhampton Wanderers,Manchester City,1.0,1.0,2.0,3.0,2.0,0.0,...,2.600000,2.0,1.0,22.0,7.0,3.0,22.000000,7.800000,2.2,1.400000
928,2024-11-02,2,Wolverhampton Wanderers,Crystal Palace,1.0,2.0,2.0,11.0,6.0,0.0,...,2.800000,2.0,2.0,19.0,6.0,1.0,13.800000,5.200000,0.4,0.800000
929,2024-11-09,31,Wolverhampton Wanderers,Southampton,0.0,2.0,0.0,8.0,4.0,0.0,...,2.600000,0.0,2.0,9.0,0.0,1.0,9.000000,3.200000,1.0,2.000000
930,2024-11-30,22,Wolverhampton Wanderers,Bournemouth,1.0,2.0,4.0,10.0,3.0,2.0,...,1.400000,4.0,2.0,9.0,5.0,2.0,13.600000,4.000000,1.6,1.400000


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Model inputs: the encoded referee followed by the five recent-form statistics
# of the home side and then of the away side.
feature_cols = ["referee"] + [
    f"{stat}_{side}"
    for side in ("home", "away")
    for stat in ('Last5Wins', 'Last5AvgSh', 'Last5AvgSot', 'Last5AvgGf', 'Last5AvgGa')
]

# Target column: the home side's encoded result
target_col = "result_encoded_home"

x, y = merged_df[feature_cols], merged_df[target_col]

# Hold out 20% of the rows for evaluation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=10,
    test_size=0.2,
)

# Initialise the random forest model
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=10,
)

# Train the model, then predict the held-out rows
y_pred = rf.fit(x_train, y_train).predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"测试集准确率: {accuracy:.2f}")

print("分类报告:")
print(classification_report(y_test, y_pred))

测试集准确率: 0.68
分类报告:
              precision    recall  f1-score   support

         0.0       0.52      0.44      0.48        61
         1.0       0.75      0.80      0.77       126

    accuracy                           0.68       187
   macro avg       0.63      0.62      0.63       187
weighted avg       0.67      0.68      0.68       187



In [10]:
# Disabled hyper-parameter search, kept for reference: every statement in this
# cell is commented out, so running the cell does nothing. If re-enabled, it
# rebinds `rf` and `y_pred`.
# NOTE(review): the output recorded under this cell lists max_features=None
# among the best parameters, while the RandomForestClassifier built in the
# training cell does not pass max_features - confirm whether that is intended.

# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],          # number of trees
#     'max_depth': [10, 20, None],             # maximum tree depth
#     'min_samples_split': [2, 5, 10],         # minimum samples required to split a node
#     'min_samples_leaf': [1, 2, 4],           # minimum samples required at a leaf
#     'max_features': ['sqrt', 'log2', None]   # maximum features considered per split
# }

# # Initialise the random forest model
# rf = RandomForestClassifier(random_state=10)

# # Initialise GridSearchCV
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# # Run the grid search
# grid_search.fit(x_train, y_train)

# # Print the best parameters
# print("最佳参数:", grid_search.best_params_)

# # Predict the test set with the best-parameter model
# best_rf = grid_search.best_estimator_
# y_pred = best_rf.predict(x_test)

# # Evaluate model performance
# accuracy = accuracy_score(y_test, y_pred)
# print(f"优化后测试集准确率: {accuracy:.2f}")

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
最佳参数: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
优化后测试集准确率: 0.67
