In [23]:
# Importing libraries
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [24]:
# Loading datasets
train_df = pd.read_csv('/Users/jamille.ghazaleh/Downloads/CEUB/1-avaliacao-pratica-2-2024-ml-uniceub/train_futebol.csv')
test_df = pd.read_csv('/Users/jamille.ghazaleh/Downloads/CEUB/1-avaliacao-pratica-2-2024-ml-uniceub/test_futebol.csv')
submission_df = pd.read_csv('/Users/jamille.ghazaleh/Downloads/CEUB/1-avaliacao-pratica-2-2024-ml-uniceub/sample_submission.csv')

print(train_df.head())

         Home      Away  Round        Date   Time   WIN  minuto       evento  \
0  Birmingham  West Ham     38  11.05.2003  16:00  Draw      27  Yellow_Away   
1  Birmingham  West Ham     38  11.05.2003  16:00  Draw      57  Yellow_Home   
2  Birmingham  West Ham     38  11.05.2003  16:00  Draw      66    Goal_Away   
3  Birmingham  West Ham     38  11.05.2003  16:00  Draw      80    Goal_Home   
4  Birmingham  West Ham     38  11.05.2003  16:00  Draw      88    Goal_Home   

   Yellow_Away  Yellow_Home  Goal_Away  Goal_Home  Red_Card_Away  \
0            1            0          0          0              0   
1            1            1          0          0              0   
2            1            1          1          0              0   
3            1            1          1          1              0   
4            1            1          1          2              0   

   Red_Card_Home  Own_Home  Own_Away  Penalty_Missed_Home  \
0              0         0         0             

-----------------------------------------------------------

EDA

In [25]:
# Understanding dimensions
print(train_df.shape)

(40330, 20)


In [26]:
# Statistics
print(train_df.describe())

              Round        minuto   Yellow_Away   Yellow_Home     Goal_Away  \
count  40330.000000  40330.000000  40330.000000  40330.000000  40330.000000   
mean      18.978031     54.656534      1.157575      0.926878      0.799207   
std       10.954710     25.333753      1.136995      1.026580      0.938274   
min        1.000000      0.000000      0.000000      0.000000      0.000000   
25%        9.000000     35.000000      0.000000      0.000000      0.000000   
50%       19.000000     57.000000      1.000000      1.000000      1.000000   
75%       28.000000     77.000000      2.000000      1.000000      1.000000   
max       38.000000    101.000000      9.000000      7.000000      9.000000   

          Goal_Home  Red_Card_Away  Red_Card_Home      Own_Home      Own_Away  \
count  40330.000000   40330.000000   40330.000000  40330.000000  40330.000000   
mean       1.026382       0.054947       0.038978      0.040863      0.029259   
std        1.063413       0.235162       0.20

In [27]:
# Data cleaning
print(train_df.isnull().sum())

Home                   0
Away                   0
Round                  0
Date                   0
Time                   0
WIN                    0
minuto                 0
evento                 0
Yellow_Away            0
Yellow_Home            0
Goal_Away              0
Goal_Home              0
Red_Card_Away          0
Red_Card_Home          0
Own_Home               0
Own_Away               0
Penalty_Missed_Home    0
Penalty_Missed_Away    0
Var_Home               0
Var_Away               0
dtype: int64


In [28]:
# Understanding type of data 
print(train_df.dtypes)

Home                   object
Away                   object
Round                   int64
Date                   object
Time                   object
WIN                    object
minuto                  int64
evento                 object
Yellow_Away             int64
Yellow_Home             int64
Goal_Away               int64
Goal_Home               int64
Red_Card_Away           int64
Red_Card_Home           int64
Own_Home                int64
Own_Away                int64
Penalty_Missed_Home     int64
Penalty_Missed_Away     int64
Var_Home                int64
Var_Away                int64
dtype: object


-----------------------------------------------------------

In [29]:
# Setting numeric columns
numeric_columns = ['minuto', 'Yellow_Away', 'Yellow_Home', 'Goal_Away', 'Goal_Home', 
                   'Red_Card_Away', 'Red_Card_Home', 'Own_Home', 'Own_Away', 
                   'Penalty_Missed_Home', 'Penalty_Missed_Away', 'Var_Home', 'Var_Away']


In [30]:
# Import the MinMaxScaler
scaler = MinMaxScaler()

# Apply the scaler to the numerical columns
train_df[numeric_columns] = scaler.fit_transform(train_df[numeric_columns])
test_df[numeric_columns] = scaler.transform(test_df[numeric_columns])

print(train_df[numeric_columns].head())


     minuto  Yellow_Away  Yellow_Home  Goal_Away  Goal_Home  Red_Card_Away  \
0  0.267327     0.111111     0.000000   0.000000   0.000000            0.0   
1  0.564356     0.111111     0.142857   0.000000   0.000000            0.0   
2  0.653465     0.111111     0.142857   0.111111   0.000000            0.0   
3  0.792079     0.111111     0.142857   0.111111   0.111111            0.0   
4  0.871287     0.111111     0.142857   0.111111   0.222222            0.0   

   Red_Card_Home  Own_Home  Own_Away  Penalty_Missed_Home  \
0            0.0       0.0       0.0                  0.0   
1            0.0       0.0       0.0                  0.0   
2            0.0       0.0       0.0                  0.0   
3            0.0       0.0       0.0                  0.0   
4            0.0       0.0       0.0                  0.0   

   Penalty_Missed_Away  Var_Home  Var_Away  
0                  0.0       0.0       0.0  
1                  0.0       0.0       0.0  
2                  0.0       

-------------------

In [31]:
# Defining the target variable 'Result'
train_df['Result'] = train_df.apply(lambda row: 1 if row['Goal_Home'] > row['Goal_Away'] else (-1 if row['Goal_Home'] < row['Goal_Away'] else 0), axis=1)

# Defining the features (X) and target (y)
X = train_df.drop(columns=['Result'])
y = train_df['Result']

# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
# Remove columns Date and Time 
X_train = X_train.drop(columns=['Date', 'Time'], errors='ignore')
X_val = X_val.drop(columns=['Date', 'Time'], errors='ignore')

In [33]:
# Scaling the numerical columns
scaler = StandardScaler()
numeric_columns = X_train.select_dtypes(include=['float64', 'int64']).columns

# Applying the scaler to the training data
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_val[numeric_columns] = scaler.transform(X_val[numeric_columns])

print(X_train.dtypes)

Home                    object
Away                    object
Round                  float64
WIN                     object
minuto                 float64
evento                  object
Yellow_Away            float64
Yellow_Home            float64
Goal_Away              float64
Goal_Home              float64
Red_Card_Away          float64
Red_Card_Home          float64
Own_Home               float64
Own_Away               float64
Penalty_Missed_Home    float64
Penalty_Missed_Away    float64
Var_Home               float64
Var_Away               float64
dtype: object


In [34]:
# Check for the presence of categorical or boolean columns
categorical_columns = X_train.select_dtypes(include=['object', 'bool']).columns
if len(categorical_columns) > 0:
    X_train = pd.get_dummies(X_train, columns=categorical_columns)
    X_val = pd.get_dummies(X_val, columns=categorical_columns)

# Ensure that the columns match between the training and validation datasets
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

In [35]:
# Map the values of 'Result' to numbers
y_train = y_train.map({1: 1, 0: 0, -1: -1})
y_val = y_val.map({1: 1, 0: 0, -1: -1})

# Check if all columns in X_train and X_val are numeric
print(X_train.dtypes)

Round                   float64
minuto                  float64
Yellow_Away             float64
Yellow_Home             float64
Goal_Away               float64
                         ...   
evento_Red_Card_Home       bool
evento_Var_Away            bool
evento_Var_Home            bool
evento_Yellow_Away         bool
evento_Yellow_Home         bool
Length: 109, dtype: object


In [36]:
# Initialize the RandomForestClassifier with a fixed random state for reproducibility.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluation on the validation set
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy}')
print(classification_report(y_val, y_val_pred))

Validation Accuracy: 0.9884701214976445
              precision    recall  f1-score   support

          -1       0.99      0.99      0.99      2332
           0       0.98      0.98      0.98      2467
           1       0.99      0.99      0.99      3267

    accuracy                           0.99      8066
   macro avg       0.99      0.99      0.99      8066
weighted avg       0.99      0.99      0.99      8066



In [37]:
# Check if the 'WIN' column exists in test_df and map it to numeric values
if 'WIN' in test_df.columns:
    test_df['WIN'] = test_df['WIN'].map({'Win': 3, 'Lose': 0, 'Draw': 1})

# Define the categorical columns expected to be in the test DataFrame
categorical_columns_test = ['Home', 'Away', 'WIN', 'evento']

# Identify which of these columns actually exist in test_df
existing_categorical_columns = [col for col in categorical_columns_test if col in test_df.columns]

# Apply pd.get_dummies only to the categorical columns that actually exist in test_df
if len(existing_categorical_columns) > 0:
    test_df = pd.get_dummies(test_df, columns=existing_categorical_columns)

# Ensure that the columns in X_test match those in X_train
X_test = test_df.reindex(columns=X_train.columns, fill_value=0)


In [38]:
# Make predictions on the test set
test_predictions = model.predict(X_test)

# Save the predictions for submission
submission_df['Result'] = test_predictions
submission_df.to_csv('submission.csv', index=False)

--------

In [39]:
# Check the accuracy on the training set
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy}')

Training Accuracy: 1.0


In [40]:
# Print the count of each unique value in the target variable 'y'
print(y.value_counts())

Result
 1    16517
 0    12416
-1    11397
Name: count, dtype: int64


In [41]:
# Ensure that all categorical columns are converted to dummy variables
X_train = pd.get_dummies(X_train)
X_val = pd.get_dummies(X_val)

# Ensure that the columns in X_val match those in X_train
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

# Check for any non-numeric columns
non_numeric_columns = X_train.select_dtypes(include=['object']).columns
if len(non_numeric_columns) > 0:
    print(f"Non-numeric columns found: {non_numeric_columns}")


In [42]:
# Apply pd.get_dummies to X_test
X_test = pd.get_dummies(test_df)

# Ensure that the columns in X_test match those in X_train
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Check for any non-numeric columns in X_test
non_numeric_columns_test = X_test.select_dtypes(include=['object']).columns
if len(non_numeric_columns_test) > 0:
    print(f"Non-numeric columns found in X_test: {non_numeric_columns_test}")


In [43]:
# Perform cross-validation on the model using the training data
scores = cross_val_score(model, X_train, y_train, cv=5)
print(f'Average Accuracy with Cross-Validation: {scores.mean()}')

Average Accuracy with Cross-Validation: 0.9870754328426952


In [44]:
# Initialize a DecisionTreeClassifier with a maximum depth of 3
model = DecisionTreeClassifier(max_depth=3)
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy with Decision Tree: {accuracy}')

NameError: name 'DecisionTreeClassifier' is not defined

-------------

Refining Data

In [None]:
# Print the classification report for the validation set predictions
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

          -1       1.00      0.71      0.83      2332
           0       0.80      0.90      0.84      2467
           1       0.90      1.00      0.95      3267

    accuracy                           0.89      8066
   macro avg       0.90      0.87      0.88      8066
weighted avg       0.90      0.89      0.88      8066



In [None]:
scores = cross_val_score(model, X_train, y_train, cv=5)
print(f'Average Accuracy with Cross-Validation: {scores.mean()}')

Average Accuracy with Cross-Validation: 0.8928835802472337


---------

In [None]:
# Load the test data and the submission file containing predictions
test_df = pd.read_csv('/Users/jamille.ghazaleh/Downloads/CEUB/1-avaliacao-pratica-2-2024-ml-uniceub/test_futebol.csv')
submission_df = pd.read_csv('submission.csv')

# Add the predictions to the test DataFrame
test_df['Predicted_Result'] = submission_df['Result']

# Check for duplicates after grouping by match identifiers
unique_matches_check = test_df.drop_duplicates(subset=['Round', 'Home', 'Away'])

print(f"Total unique matches after removing duplicates: {unique_matches_check.shape[0]}")
print(f"Total rows in the original dataset: {test_df.shape[0]}")

Total unique matches after removing duplicates: 931
Total rows in the original dataset: 5902


In [None]:
"""# Group by 'Round', 'Home', 'Away' to ensure each match is counted once
unique_matches = test_df.groupby(['Round', 'Home', 'Away']).agg({'Predicted_Result': 'first'}).reset_index()

# Initialize a dictionary to store points for each team
team_points = {}

# Iterate through each unique match and assign points
for index, row in unique_matches.iterrows():
    home_team = row['Home']
    away_team = row['Away']
    result = row['Predicted_Result']
    
    # Initialize the points for the teams if they are not yet in the dictionary
    if home_team not in team_points:
        team_points[home_team] = 0
    if away_team not in team_points:
        team_points[away_team] = 0
    
    # Assign points based on the predicted result
    if result == 1:
        # Home team wins
        team_points[home_team] += 3
    elif result == -1:
        # Away team wins
        team_points[away_team] += 3
    else:
        # Draw
        team_points[home_team] += 1
        team_points[away_team] += 1"""


"# Group by 'Round', 'Home', 'Away' to ensure each match is counted once\nunique_matches = test_df.groupby(['Round', 'Home', 'Away']).agg({'Predicted_Result': 'first'}).reset_index()\n\n# Initialize a dictionary to store points for each team\nteam_points = {}\n\n# Iterate through each unique match and assign points\nfor index, row in unique_matches.iterrows():\n    home_team = row['Home']\n    away_team = row['Away']\n    result = row['Predicted_Result']\n    \n    # Initialize the points for the teams if they are not yet in the dictionary\n    if home_team not in team_points:\n        team_points[home_team] = 0\n    if away_team not in team_points:\n        team_points[away_team] = 0\n    \n    # Assign points based on the predicted result\n    if result == 1:\n        # Home team wins\n        team_points[home_team] += 3\n    elif result == -1:\n        # Away team wins\n        team_points[away_team] += 3\n    else:\n        # Draw\n        team_points[home_team] += 1\n        team_

In [None]:
# Group by 'Round', 'Home', 'Away' to ensure each match is counted once
# Take the first occurrence of goals for home and away teams to avoid double counting
unique_matches = test_df.groupby(['Round', 'Home', 'Away']).agg({
    'Predicted_Result': 'first',
    'Goal_Home': 'max',
    'Goal_Away': 'max'
}).reset_index()

# Initialize a dictionary to store points, goal difference, and goals scored for each team
team_stats = {}

# Iterate through each unique match and update team statistics
for index, row in unique_matches.iterrows():
    home_team = row['Home']
    away_team = row['Away']
    result = row['Predicted_Result']
    goals_home = row['Goal_Home']
    goals_away = row['Goal_Away']
    
    # Initialize the stats for the teams if they are not yet in the dictionary
    if home_team not in team_stats:
        team_stats[home_team] = {'Points': 0, 'GD': 0, 'GF': 0}
    if away_team not in team_stats:
        team_stats[away_team] = {'Points': 0, 'GD': 0, 'GF': 0}
    
    # Assign points and update goal statistics based on the predicted result
    if result == 1:
        # Home team wins
        team_stats[home_team]['Points'] += 3
    elif result == -1:
        # Away team wins
        team_stats[away_team]['Points'] += 3
    else:
        # Draw
        team_stats[home_team]['Points'] += 1
        team_stats[away_team]['Points'] += 1
    
    # Update goal difference (GD) and goals scored (GF)
    team_stats[home_team]['GD'] += (goals_home - goals_away)
    team_stats[away_team]['GD'] += (goals_away - goals_home)
    team_stats[home_team]['GF'] += goals_home
    team_stats[away_team]['GF'] += goals_away


In [None]:
# Convert the stats dictionary into a DataFrame for better visualization
points_table = pd.DataFrame.from_dict(team_stats, orient='index').reset_index()
points_table.rename(columns={'index': 'Team'}, inplace=True)

# Sort teams by Points, then by Goal Difference (GD), and then by Goals Scored (GF)
points_table = points_table.sort_values(by=['Points', 'GD', 'GF'], ascending=False).reset_index(drop=True)

print("Final Points Table:")
print(points_table)

# Determine the champion
champion = points_table.iloc[0]
print(f"\nThe predicted champion is: {champion['Team']} with {champion['Points']} points, "
      f"{champion['GD']} goal difference, and {champion['GF']} goals scored.")


Final Points Table:
               Team  Points   GD   GF
0         Liverpool      94  114  197
1         Tottenham      94   52  150
2          West Ham      94   28  146
3       Aston Villa      94  -11  117
4           Everton      94  -25  108
5       Southampton      94  -38  115
6           Burnley      94  -40   83
7           Chelsea      93   73  165
8           Arsenal      93   43  143
9    Crystal Palace      93  -34  101
10        Newcastle      93  -43  103
11  Manchester City      92  152  220
12        Leicester      92   22  148
13         Brighton      92  -21   91
14   Manchester Utd      91   51  160
15           Wolves      91  -15   90
16            Leeds      76  -28  102
17    Sheffield Utd      56  -46   34
18          Norwich      56  -90   26
19          Watford      55  -51   53
20        Brentford      38   -8   46
21           Fulham      38  -26   26
22        West Brom      37  -40   33
23      Bournemouth      18  -19   18

The predicted champion is: Li