In [2]:
import pandas as pd

# Load matches data from a CSV file
matches = pd.read_csv("matches.csv", index_col=0)

# Display the first few rows of the DataFrame
print(matches.head())

# Display the shape of the DataFrame (number of rows, number of columns)
print(matches.shape)

# Count the number of matches for each team
print(matches["team"].value_counts())

# Sort matches for Liverpool by date
print(matches[matches["team"] == "Liverpool"].sort_values("date"))

# Count the number of matches for each round
print(matches["round"].value_counts())

# Display the data types of each column in the DataFrame
print(matches.dtypes)

# Delete unnecessary columns
del matches["comp"]
del matches["notes"]

# Convert the date column to datetime format
matches["date"] = pd.to_datetime(matches["date"])

# Create a target column indicating whether the team won the match
matches["target"] = (matches["result"] == "W").astype("int")

# Display the updated DataFrame
print(matches)

# Create categorical codes for venue and opponent columns
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

# Extract the hour from the time column and convert it to integer
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

# Create a day of week code for the date column
matches["day_code"] = matches["date"].dt.dayofweek

# Display the DataFrame with new columns
print(matches)

# Import RandomForestClassifier from sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier

# Create a RandomForestClassifier model
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

# Split the data into training and testing sets
train = matches[matches["date"] < '2022-01-01']
test = matches[matches["date"] > '2022-01-01']

# Define the predictors
predictors = ["venue_code", "opp_code", "hour", "day_code"]

# Fit the model to the training data
rf.fit(train[predictors], train["target"])

# Predict the target for the testing data
preds = rf.predict(test[predictors])

# Import accuracy_score from sklearn.metrics
from sklearn.metrics import accuracy_score

# Calculate the accuracy of the model
error = accuracy_score(test["target"], preds)
print(error)

# Create a DataFrame with actual and predicted values
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))

# Create a confusion matrix
print(pd.crosstab(index=combined["actual"], columns=combined["predicted"]))

# Import precision_score from sklearn.metrics
from sklearn.metrics import precision_score

# Calculate the precision of the model
print(precision_score(test["target"], preds))

# Group matches by team
grouped_matches = matches.groupby("team")

# Get matches for Manchester City and sort by date
group = grouped_matches.get_group("Manchester City").sort_values("date")

# Define a function to calculate rolling averages
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

# Define columns for rolling averages
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

# Calculate rolling averages for Manchester City
print(rolling_averages(group, cols, new_cols))

# Apply rolling averages to all teams
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

# Display the DataFrame with rolling averages
print(matches_rolling)

# Drop the team level from the index
matches_rolling = matches_rolling.droplevel('team')

# Display the final DataFrame
print(matches_rolling)


         date   time            comp        round  day venue result   gf   ga  \
1  2023-08-15  16:30  Premier League  Matchweek 1  Sun  Away      L  0.0  1.0   
2  2023-08-21  15:00  Premier League  Matchweek 2  Sat  Home      W  5.0  0.0   
3  2023-08-28  12:30  Premier League  Matchweek 3  Sat  Home      W  5.0  0.0   
4  2023-09-11  15:00  Premier League  Matchweek 4  Sat  Away      W  1.0  0.0   
6  2023-09-18  15:00  Premier League  Matchweek 5  Sat  Home      D  0.0  0.0   

         opponent  ...  match report  notes    sh   sot  dist   fk   pk pkatt  \
1       Tottenham  ...  Match Report    NaN  18.0   4.0  16.9  1.0  0.0   0.0   
2    Norwich City  ...  Match Report    NaN  16.0   4.0  17.3  1.0  0.0   0.0   
3         Arsenal  ...  Match Report    NaN  25.0  10.0  14.3  0.0  0.0   0.0   
4  Leicester City  ...  Match Report    NaN  25.0   8.0  14.0  0.0  0.0   0.0   
6     Southampton  ...  Match Report    NaN  16.0   1.0  15.7  1.0  0.0   0.0   

   season             team

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
