In [1]:
# Import pandas library
import pandas as pd

# Load match data from CSV, using the first column as the DataFrame index
matches = pd.read_csv("matches.csv", index_col=0)

# Display the first 5 rows to preview the dataset
matches.head()

  from pandas.core import (


Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
0,2024-04-14,17:00,Série A,Matchweek 1,Sun,Away,L,2,3,Cruzeiro,...,Match Report,,6,3,15.2,0.0,0,0,2024,Botafogo RJ
1,2024-04-18,21:30,Série A,Matchweek 2,Thu,Home,W,1,0,Atl Goianiense,...,Match Report,,13,4,20.1,1.0,0,0,2024,Botafogo RJ
2,2024-04-21,18:30,Série A,Matchweek 3,Sun,Home,W,5,1,Juventude,...,Match Report,,12,5,14.8,1.0,1,1,2024,Botafogo RJ
3,2024-04-28,11:00,Série A,Matchweek 4,Sun,Away,W,2,0,Flamengo,...,Match Report,,9,3,20.3,0.0,0,0,2024,Botafogo RJ
4,2024-05-05,18:30,Série A,Matchweek 5,Sun,Home,L,1,2,Bahia,...,Match Report,,16,5,22.4,1.0,0,0,2024,Botafogo RJ


In [2]:
# Convert all column names to lowercase for consistency
matches.columns = [c.lower() for c in matches.columns]

# Display the first 5 rows again to verify the column name changes
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2024-04-14,17:00,Série A,Matchweek 1,Sun,Away,L,2,3,Cruzeiro,...,Match Report,,6,3,15.2,0.0,0,0,2024,Botafogo RJ
1,2024-04-18,21:30,Série A,Matchweek 2,Thu,Home,W,1,0,Atl Goianiense,...,Match Report,,13,4,20.1,1.0,0,0,2024,Botafogo RJ
2,2024-04-21,18:30,Série A,Matchweek 3,Sun,Home,W,5,1,Juventude,...,Match Report,,12,5,14.8,1.0,1,1,2024,Botafogo RJ
3,2024-04-28,11:00,Série A,Matchweek 4,Sun,Away,W,2,0,Flamengo,...,Match Report,,9,3,20.3,0.0,0,0,2024,Botafogo RJ
4,2024-05-05,18:30,Série A,Matchweek 5,Sun,Home,L,1,2,Bahia,...,Match Report,,16,5,22.4,1.0,0,0,2024,Botafogo RJ


In [3]:
# Get the dimensions of the matches DataFrame
matches.shape

# Expected rows based on typical league structure: 2 seasons x 20 teams x 38 matches per team
# 1520 (Theoretical total if data is complete)

(1520, 28)

In [4]:
matches["team"].value_counts()

team
Botafogo RJ             76
Vasco da Gama           76
Cuiaba                  76
Athletico Paranaense    76
Red Bull Bragantino     76
Gremio                  76
Palmeiras               76
Atletico Mineiro        76
Fluminense              76
Cruzeiro                76
Bahia                   76
Corinthians             76
Sao Paulo               76
Internacional           76
Fortaleza               76
Flamengo                76
Vitoria                 38
Juventude               38
Criciuma                38
Atletico Goianiense     38
Santos                  38
Goias                   38
Coritiba                38
America MG              38
Name: count, dtype: int64

In [5]:
matches["round"].value_counts()

round
Matchweek 1     40
Matchweek 29    40
Matchweek 22    40
Matchweek 23    40
Matchweek 24    40
Matchweek 25    40
Matchweek 26    40
Matchweek 27    40
Matchweek 28    40
Matchweek 30    40
Matchweek 2     40
Matchweek 31    40
Matchweek 32    40
Matchweek 33    40
Matchweek 34    40
Matchweek 35    40
Matchweek 36    40
Matchweek 37    40
Matchweek 21    40
Matchweek 20    40
Matchweek 19    40
Matchweek 18    40
Matchweek 3     40
Matchweek 4     40
Matchweek 5     40
Matchweek 6     40
Matchweek 7     40
Matchweek 8     40
Matchweek 9     40
Matchweek 10    40
Matchweek 11    40
Matchweek 12    40
Matchweek 13    40
Matchweek 14    40
Matchweek 15    40
Matchweek 16    40
Matchweek 17    40
Matchweek 38    40
Name: count, dtype: int64

In [6]:
# Check the data types of all columns in the DataFrame
matches.dtypes

date              object
time              object
comp              object
round             object
day               object
venue             object
result            object
gf                 int64
ga                 int64
opponent          object
xg               float64
xga              float64
poss               int64
attendance       float64
captain           object
formation         object
opp formation     object
referee           object
match report      object
notes            float64
sh                 int64
sot                int64
dist             float64
fk               float64
pk                 int64
pkatt              int64
season             int64
team              object
dtype: object

In [7]:
# Convert 'date' column to datetime objects for time-based analysis
matches["date"] = pd.to_datetime(matches["date"])

# Create 'day_code' feature representing the day of the week (0=Monday, 6=Sunday)
matches["day_code"] = matches["date"].dt.dayofweek

# Create 'hour' feature by extracting the hour from the 'time' string and converting to integer
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

# Create 'venue_code' by converting the categorical 'venue' (e.g., 'Home'/'Away') into numerical codes
matches["venue_code"] = matches["venue"].astype("category").cat.codes

# Create 'opp_code' by converting the categorical 'opponent' names into unique numerical codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

# Create the binary 'target' variable for prediction: 1 if the team won ('W'), 0 otherwise
matches["target"] = (matches["result"] == "W").astype("int")

# Delete columns 'comp' (competition) and 'notes' as they are not needed for the model
if 'comp' in matches.columns:
    del matches["comp"]
if 'notes' in matches.columns:
    del matches["notes"]

In [8]:
# Import the Random Forest classification algorithm from the scikit-learn library
from sklearn.ensemble import RandomForestClassifier

In [9]:
# Initialize the Random Forest Classifier model
# n_estimators: Number of decision trees in the forest 
# min_samples_split: Minimum samples required to split an internal node - helps prevent overfitting
# random_state: Ensures reproducibility of results
rf = RandomForestClassifier(n_estimators=900, min_samples_split=30, random_state=1)

In [10]:
# Define the specific cutoff date for separating the data chronologically
split_date = pd.to_datetime('2024-06-06')

# Create the training set ('train') containing all matches with dates BEFORE the split_date
train = matches[matches['date'] < split_date]

# Create the test set ('test') containing all matches with dates ON or AFTER the split_date
# This ensures the model is trained on older data and tested on newer data, respecting time order
test = matches[matches['date'] >= split_date]

In [11]:
# Define the list of column names to be used as input features (predictors) for the initial model
# These are the basic, non-rolling features selected
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [12]:
# Train the Random Forest model ('rf')
# The model learns the relationship between the selected 'predictors' 
# and the 'target' variable using the data in the 'train' set
rf.fit(train[predictors], train["target"])

In [13]:
# Use the trained Random Forest model ('rf') to make predictions on the test data ('test')
preds = rf.predict(test[predictors])

In [14]:
from sklearn.metrics import accuracy_score

In [15]:
# Calculate the accuracy of the predictions on the test set
# Accuracy = (Number of correct predictions) / (Total number of predictions)
error = accuracy_score(test["target"], preds)
error

0.6234177215189873

In [16]:
# Create a new DataFrame ('combined') to easily compare the actual results
# from the test set ('test["target"]') side-by-side with the model's predictions ('preds')
# The DataFrame will have two columns: 'actual' and 'predicted'
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))

combined.head()

Unnamed: 0,actual,predicted
7,1,0
8,1,0
9,0,0
10,0,0
11,1,0


In [17]:
# Generate a confusion matrix using pandas crosstab
# This table shows the counts of actual outcomes (rows) versus the model's predicted outcomes (columns)
# Helps visualize True Positives, True Negatives, False Positives, and False Negatives
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,332,67
1,171,62


In [18]:
from sklearn.metrics import precision_score

# Calculate the precision score for the positive class (label '1', which represents 'Win')
# Precision = True Positives / (True Positives + False Positives)
# Measures: Of all the matches the model predicted as a 'Win', how many were actually 'Wins'?
precision_score(test["target"], preds)

0.4806201550387597

In [19]:
# Group the 'matches' DataFrame by the 'team' column.
# This creates a DataFrameGroupBy object ('grouped_matches') which allows applying
# operations (like calculating rolling averages) independently to each team's subset of data.
grouped_matches = matches.groupby("team")
group = grouped_matches.get_group("Sao Paulo").sort_values("date")

In [20]:
# Define a function to calculate rolling averages for a given group (e.g., one team's matches)
def rolling_averages(group, cols, new_cols):
    # Ensure the group's data is sorted by date for correct time-series calculation
    group = group.sort_values("date")
    # Calculate the rolling mean over a window of the previous 3 matches (closed='left')
    # 'closed=left' prevents using the current match's data in the average for that same match
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    # Assign the calculated rolling statistics to the new columns in the group DataFrame
    group[new_cols] = rolling_stats
    # Drop rows where rolling averages couldn't be calculated (first few matches lack history)
    group = group.dropna(subset=new_cols)
    # Return the updated group DataFrame with rolling average features
    return group

In [21]:
# Define the list of specific statistic columns for which rolling averages will be calculated
# gf: goals for, ga: goals against, sh: shots, sot: shots on target, dist: avg shot distance,
# fk: free kicks, pk: penalty kicks scored, pkatt: penalty kicks attempted
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]

# Automatically create a list of new column names by appending '_rolling' to each original column name
# Example: "gf" becomes "gf_rolling"
new_cols = [f"{c}_rolling" for c in cols]

# Call the rolling_averages function on a 'group' DataFrame (which should be defined previously
# likely holding data for a single team
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,opp_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
1143,2023-05-07,16:00,Matchweek 4,Sun,Home,W,2,0,Internacional,1.9,...,16,1,1.666667,1.000000,18.666667,6.666667,18.766667,0.666667,0.0,0.000000
1144,2023-05-11,20:00,Matchweek 5,Thu,Away,D,0,0,Fortaleza,0.8,...,13,0,2.000000,0.333333,15.333333,5.000000,19.433333,0.666667,0.0,0.333333
1145,2023-05-14,16:00,Matchweek 6,Sun,Away,D,1,1,Corinthians,0.9,...,6,0,1.000000,0.333333,14.000000,3.333333,20.133333,0.666667,0.0,0.333333
1146,2023-05-20,18:30,Matchweek 7,Sat,Home,W,4,2,Vasco da Gama,2.3,...,22,1,1.000000,0.333333,8.666667,1.666667,18.500000,0.000000,0.0,0.333333
1147,2023-05-27,21:00,Matchweek 8,Sat,Home,W,2,1,Goiás,0.9,...,14,1,1.666667,1.000000,11.666667,3.000000,17.566667,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,2024-11-20,16:30,Matchweek 34,Wed,Away,D,1,1,RB Bragantino,0.4,...,19,0,2.000000,0.666667,12.000000,3.333333,19.733333,0.333333,0.0,0.000000
224,2024-11-23,21:30,Matchweek 35,Sat,Home,D,2,2,Atlético Mineiro,1.0,...,3,0,2.000000,0.666667,10.000000,3.000000,19.466667,0.333333,0.0,0.000000
225,2024-12-01,16:00,Matchweek 36,Sun,Away,L,1,2,Grêmio,0.3,...,15,0,1.666667,1.333333,11.000000,3.000000,18.933333,0.666667,0.0,0.000000
226,2024-12-04,20:00,Matchweek 37,Wed,Home,L,1,2,Juventude,0.5,...,17,0,1.333333,1.666667,10.000000,2.666667,19.300000,0.333333,0.0,0.000000


In [22]:
# Apply the 'rolling_averages' function to each group (each team's data) within the 'matches' DataFram
# 'groupby("team")' creates the groups based on team name
# '.apply()' executes the provided lambda function for each team's subset ('x')
# The lambda function calls 'rolling_averages' to compute stats for that team
# The results from all teams are automatically combined back into the 'matches_rolling' DataFrame
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [23]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,opp_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
America MG,1485,2023-05-07,11:00,Matchweek 4,Sun,Home,L,1,2,Cuiabá,1.4,...,10,0,0.666667,3.000000,19.333333,6.000000,20.066667,0.666667,0.000000,0.000000
America MG,1486,2023-05-10,19:00,Matchweek 5,Wed,Away,D,2,2,RB Bragantino,2.0,...,19,0,1.000000,2.666667,18.666667,6.666667,20.066667,0.666667,0.000000,0.000000
America MG,1487,2023-05-14,18:30,Matchweek 6,Sun,Home,L,0,4,Cruzeiro,1.3,...,9,0,1.666667,2.333333,14.000000,4.333333,19.566667,1.000000,0.666667,0.666667
America MG,1488,2023-05-20,18:30,Matchweek 7,Sat,Home,W,2,1,Fortaleza,0.8,...,13,1,1.000000,2.666667,12.666667,2.333333,19.433333,0.666667,0.666667,0.666667
America MG,1489,2023-05-28,19:00,Matchweek 8,Sun,Away,L,0,2,Botafogo (RJ),0.4,...,5,0,1.333333,2.333333,10.666667,2.000000,20.566667,0.666667,0.666667,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vitoria,413,2024-11-20,16:30,Matchweek 34,Wed,Away,W,1,0,Criciúma,1.1,...,8,1,1.666667,1.333333,10.333333,3.000000,20.333333,0.666667,0.333333,0.333333
Vitoria,414,2024-11-23,19:30,Matchweek 35,Sat,Away,D,1,1,Botafogo (RJ),0.5,...,5,0,1.333333,1.000000,11.333333,3.333333,19.533333,0.333333,0.000000,0.000000
Vitoria,415,2024-12-01,18:30,Matchweek 36,Sun,Home,W,2,0,Fortaleza,2.7,...,13,1,1.000000,1.000000,11.000000,3.000000,20.300000,1.000000,0.000000,0.000000
Vitoria,416,2024-12-04,20:00,Matchweek 37,Wed,Home,D,1,1,Grêmio,1.5,...,15,0,1.333333,0.333333,14.333333,5.000000,19.333333,1.333333,0.000000,0.000000


In [24]:
# Remove the top level ('team') from the DataFrame's index
matches_rolling = matches_rolling.droplevel('team')

# The index might still not be unique or sequential after this step.

In [25]:
# It replaces the existing index (which might be non-sequential after previous steps)
# with a simple range of numbers starting from 0 up to the number of rows.
matches_rolling.index = range(matches_rolling.shape[0])

In [26]:
# Define a function to perform the train-predict-evaluate cycle
def make_predictions(data, predictors):
  # Split the data into training (before split_date) and testing (after split_date) sets
  train = data[data["date"] < split_date]
  test = data[data["date"] > split_date] # Use >= if matches on the split date should be tested
  # Check if train or test sets are empty after splitting (can happen with certain date ranges)
  if train.empty or test.empty:
      print(f"Warning: Training set empty ({train.empty}) or Test set empty ({test.empty}). Returning empty results.")
      return pd.DataFrame(), 0.0 # Return empty results to avoid errors
  # Train the (globally defined) RandomForest model 'rf' on the training data
  rf.fit(train[predictors], train["target"])
  # Make predictions on the test set using the trained model
  preds = rf.predict(test[predictors])
  # Create a DataFrame comparing actual values ('target') and predictions ('preds') for the test set
  combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
  # Calculate the precision score (how many predicted Wins were actual Wins)
  # The variable is named 'error', but stores precision. zero_division=0 avoids warnings.
  error = precision_score(test["target"], preds, zero_division=0)
  # Return the comparison DataFrame and the precision score
  return combined, error

In [27]:
# Call the prediction function using the 'matches_rolling' DataFrame
combined, error = make_predictions(matches_rolling, predictors + new_cols)

In [28]:
error

0.5606060606060606

In [29]:
# Merge the 'combined' DataFrame (containing 'actual' and 'predicted' columns)
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

# Display the combined DataFrame with added context (optional)
combined.head()

Unnamed: 0,actual,predicted,date,team,opponent,result
77,1,0,2024-06-13,Athletico Paranaense,Criciúma,W
78,0,0,2024-06-16,Athletico Paranaense,Flamengo,D
79,0,0,2024-06-19,Athletico Paranaense,Botafogo (RJ),D
80,0,1,2024-06-23,Athletico Paranaense,Corinthians,D
81,0,0,2024-06-26,Athletico Paranaense,Cruzeiro,L


In [30]:
merged = combined.merge(combined, left_on=["date", "team"], right_on=["date", "opponent"])

In [31]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,actual_y,predicted_y,team_y,opponent_y,result_y
0,1,0,2024-06-13,Bahia,Fortaleza,W,0,0,Fortaleza,Bahia,L
1,0,0,2024-06-16,Bahia,Criciúma,D,0,0,Criciuma,Bahia,D
2,0,0,2024-06-20,Bahia,Flamengo,L,1,0,Flamengo,Bahia,W
3,1,0,2024-06-23,Bahia,Cruzeiro,W,0,0,Cruzeiro,Bahia,L
4,1,0,2024-06-26,Bahia,Vasco da Gama,W,0,0,Vasco da Gama,Bahia,L
...,...,...,...,...,...,...,...,...,...,...,...
310,0,0,2024-11-21,Vasco da Gama,Internacional,L,1,0,Internacional,Vasco da Gama,W
311,0,0,2024-11-24,Vasco da Gama,Corinthians,L,1,1,Corinthians,Vasco da Gama,W
312,0,0,2024-11-30,Vasco da Gama,Atl Goianiense,D,0,0,Atletico Goianiense,Vasco da Gama,D
313,1,0,2024-12-04,Vasco da Gama,Atlético Mineiro,W,0,1,Atletico Mineiro,Vasco da Gama,L


In [32]:
# Filter the 'merged' DataFrame to find matches where the model predicted:
#  - Team X (left side of merge) would Win ('predicted_x' == 1)
#  - AND Team Y (right side of merge, opponent of X) would NOT Win ('predicted_y' == 0)
# Then, for these specific matches, count the actual outcomes for Team X ('actual_x')
# This shows how often Team X actually Won (1) or Didn't Win (0) when the model predicted this Win/Non-Win scenario
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()

actual_x
1    26
0    14
Name: count, dtype: int64