In [1]:
# 1. IMPORT LIBRARIES
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
# 2. DATA LOADING AND INITIAL PREPROCESSING
# Membaca dataset dari file CSV
matches = pd.read_csv("matches.csv", index_col=0)

# Menghapus kolom yang tidak relevan
del matches["comp"]  # Menghapus kolom 'comp' karena hanya satu kompetisi
del matches["notes"]  # Menghapus kolom 'notes' karena tidak relevan

# Mengubah kolom 'date' menjadi tipe datetime
matches["date"] = pd.to_datetime(matches["date"])

# Membuat kolom target untuk hasil pertandingan (1 untuk menang, 0 untuk tidak)
matches["target"] = (matches["result"] == "W").astype("int")

# Menambahkan kode untuk venue dan lawan
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

# Mengambil jam dari kolom 'time'
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

# Mengambil kode hari dari tanggal
matches["day_code"] = matches["date"].dt.dayofweek

matches = matches.reset_index(drop=True)

In [3]:
# 3. FEATURE ENGINEERING
# Add team performance metrics
matches['goals_scored'] = matches.groupby('team')['gf'].transform(lambda x: x.rolling(window=5).mean()).fillna(0)
matches['goals_conceded'] = matches.groupby('team')['ga'].transform(lambda x: x.rolling(window=5).mean()).fillna(0)
matches['shots_ratio'] = matches['sh'] / matches['sot'].replace(0, 1)

In [4]:
# 4. MODEL SETUP AND TRAINING
# Menyiapkan model Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=8, min_samples_split=5, min_samples_leaf=5, max_features='sqrt', random_state=1)

# Memisahkan data menjadi data pelatihan dan pengujian
train = matches[matches["date"] < '2022-01-01']
test = matches[matches["date"] > '2022-01-01']

# Mendefinisikan fitur yang digunakan untuk prediksi
predictors = ["venue_code", "opp_code", "hour", "day_code", "goals_scored", "goals_conceded", "shots_ratio"]

# Melatih model dengan data pelatihan
rf.fit(train[predictors], train["target"])


In [5]:
# 5. MODEL EVALUATION
# Menghitung akurasi pada data pengujian
accuracy = rf.score(test[predictors], test["target"])
print(f"Akurasi Model: {accuracy:.4f}")

# Melakukan prediksi pada data pengujian
preds = rf.predict(test[predictors])

# Menghitung akurasi model
error = accuracy_score(test["target"], preds)
print("Error (Accuracy):", error)

# Menggabungkan hasil aktual dan prediksi
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))

# Menampilkan tabel kontingensi untuk hasil aktual dan prediksi
print(pd.crosstab(index=combined["actual"], columns=combined["predicted"]))

# Menghitung precision score
precision = precision_score(test["target"], preds)
print("Precision Score:", precision)

# Mengelompokkan pertandingan berdasarkan tim
grouped_matches = matches.groupby("team")
group = grouped_matches.get_group("Manchester City").sort_values("date")

Akurasi Model: 0.7391
Error (Accuracy): 0.7391304347826086
predicted    0   1
actual            
0          149  23
1           49  55
Precision Score: 0.7051282051282052


In [6]:
# 6. ROLLING AVERAGES CALCULATION
# Fungsi untuk menghitung rolling averages
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

# Kolom yang digunakan untuk rolling averages
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

# Menghitung rolling averages untuk setiap tim
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team')  # Menghapus level multi-index
matches_rolling.index = range(matches_rolling.shape[0])  # Mengatur ulang index

In [7]:
# 7. PREDICTION FUNCTIONS
# Fungsi untuk membuat prediksi
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    error = precision_score(test["target"], preds)
    return combined, error

# Membuat prediksi dan menghitung error
combined, error = make_predictions(matches_rolling, predictors + new_cols)
print("Error (Precision):", error)

# Menggabungkan hasil prediksi dengan informasi tambahan dari matches_rolling
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

# Menampilkan 10 baris pertama dari hasil gabungan
print(combined.head(10))

Error (Precision): 0.7794117647058824
    actual  predicted       date     team        opponent result
55       0          1 2022-01-23  Arsenal         Burnley      D
56       1          0 2022-02-10  Arsenal          Wolves      W
57       1          1 2022-02-19  Arsenal       Brentford      W
58       1          0 2022-02-24  Arsenal          Wolves      W
59       1          0 2022-03-06  Arsenal         Watford      W
60       1          1 2022-03-13  Arsenal  Leicester City      W
61       0          0 2022-03-16  Arsenal       Liverpool      L
62       1          1 2022-03-19  Arsenal     Aston Villa      W
63       0          0 2022-04-04  Arsenal  Crystal Palace      L
64       0          0 2022-04-09  Arsenal        Brighton      L


In [8]:
# 8. TEAM MAPPING AND DATA HANDLING
# Mendefinisikan kelas untuk menangani nilai yang hilang
class MissingDict(dict):
    __missing__ = lambda self, key: key

# Pemetaan nama tim untuk konsistensi
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}
mapping = MissingDict(**map_values)

# Menambahkan kolom baru untuk nama tim yang dipetakan
combined["new_team"] = combined["team"].map(mapping)

# Menggabungkan hasil prediksi dengan informasi tim baru
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

# Menampilkan hasil analisis untuk prediksi yang benar dan salah
result_counts = merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()
print(result_counts)
print()


actual_x
1    50
0    10
Name: count, dtype: int64



In [9]:
# 9. MATCH PREDICTION IMPLEMENTATION
# First, prepare the data for both teams with rolling averages
def prepare_team_data(team_name, opponent_name, venue_code, hour, day_code):
    base_data = {
        'venue_code': [venue_code],
        'opp_code': [matches[matches['team'] == opponent_name]['opp_code'].iloc[0]],
        'hour': [hour],
        'day_code': [day_code],
        'goals_scored': [matches[matches['team'] == team_name]['goals_scored'].mean()],
        'goals_conceded': [matches[matches['team'] == team_name]['goals_conceded'].mean()],
        'shots_ratio': [matches[matches['team'] == team_name]['shots_ratio'].mean()]
    }
    
    # Add rolling averages
    for col in cols:
        base_data[f'{col}_rolling'] = [matches[matches['team'] == team_name][col].mean()]
    
    return pd.DataFrame(base_data)

In [10]:
# 10. ACCURACY ANALYSIS
# Menghitung jumlah total hasil yang sebenarnya (1 dan 0)
total_actual = result_counts.sum()

# Menghitung jumlah hasil yang diprediksi 1 dan sebenarnya 1
predicted_correct = result_counts.get(1, 0)  # Menggunakan get untuk menghindari KeyError jika 1 tidak ada

# Menghitung proporsi
proportion = predicted_correct / total_actual if total_actual > 0 else 0

# Mencetak hasil
print(f"Jumlah hasil yang diprediksi 1 dan sebenarnya 1: {predicted_correct}")
print(f"Total hasil yang sebenarnya (1 dan 0): {total_actual}")
print(f"Accuracy Model: {proportion:.2f}")

# First retrain the model with the correct feature set
predictors = ["venue_code", "opp_code", "hour", "day_code", "goals_scored", "goals_conceded", "shots_ratio"]

# Create new RandomForestClassifier instance
rf_new = RandomForestClassifier(n_estimators=100, max_depth=8, min_samples_split=5, min_samples_leaf=5, max_features='sqrt', random_state=1)

# Train with matching features
train_mask = matches_rolling["date"] < '2022-01-01'
test_mask = matches_rolling["date"] > '2022-01-01'

train_data = matches_rolling[train_mask]
test_data = matches_rolling[test_mask]

# Fit the model with the correct features
rf_new.fit(train_data[predictors], train_data['target'])

# Calculate performance metrics
train_preds = rf_new.predict(train_data[predictors])
train_accuracy = accuracy_score(train_data['target'], train_preds)

test_preds = rf_new.predict(test_data[predictors])
test_accuracy = accuracy_score(test_data['target'], test_preds)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Difference: {train_accuracy - test_accuracy:.4f}")

Jumlah hasil yang diprediksi 1 dan sebenarnya 1: 50
Total hasil yang sebenarnya (1 dan 0): 60
Accuracy Model: 0.83
Training Accuracy: 0.8377
Test Accuracy: 0.7500
Difference: 0.0877
