In [None]:
import pandas as pd
import numpy as np 
import plotly.express as px

In [None]:
# Load the match results from the CSV file and preview the first five rows

df = pd.read_csv("soccer_results_02_21.csv")
print(df.head(n=5))

In [None]:
# Report the dataset dimensions: one row per result, one column per category

print("Total results: ", len(df))  # number of rows
print("Total categories: ", len(df.columns))  # number of columns (categories)

In [None]:
# List every column (category) together with its data type

print(df.dtypes)

In [None]:
# Remove the two AET score columns (only league games are wanted)

df = df.drop(columns=['Home_Score_AET', 'Away_Score_AET'])

In [None]:
# List the distinct competitions to see whether anything other than league games is present

print(df.Competition.unique())

In [None]:
# Keep national-league matches only by removing the cup and European competitions.
# Equivalent spellings: one `df = df[df.Competition != '<name>']` filter per competition, or
# df = df.query("Competition not in ['fa-cup', 'uefa-europa-league', 'uefa-champions-league']")

df = df[~df['Competition'].isin(['fa-cup', 'uefa-europa-league', 'uefa-champions-league'])]

In [None]:
# List the distinct seasons to see whether season 2022 is present

print(df.season.unique())

In [None]:
# Exclude season 2022 because it is incomplete

df = df.loc[df['season'].ne(2022)]

In [None]:
# Rearrange the columns into reading order (match info, teams and scores, penalties, points).
# Only the columns listed here are kept.

df = df.loc[:, [
    'season', 'Country', 'Competition', 'Round', 'Date', 'Time',
    'Home_Team', 'Home_Score', 'Away_Score', 'Away_Team',
    'Home_Penalties', 'Away_Penalties',
    'Home_Points', 'Away_Points',
]]

print(df.columns)

In [None]:
# Fill NaNs with 0 and convert points, scores, and penalties to integers.
# NOTE(review): after this step a missing value is indistinguishable from a real 0 -
# confirm that is intended for scores and points.

columns_to_fill = ['Home_Points', 'Away_Points', 'Home_Score', 'Away_Score',
                   'Home_Penalties', 'Away_Penalties']

df = (
    df.fillna({name: 0 for name in columns_to_fill})
      .astype({name: int for name in columns_to_fill})
)

print(df.dtypes)

In [None]:
# Count the missing values remaining in each column

c = df.isna().sum()
print(c)

In [None]:
# Add boolean outcome columns derived from the points awarded for each match:
#   Home_Win - the home side earned 3 points
#   Draw     - the home side earned 1 point
#   Away_Win - the away side earned 3 points
# The comparison already yields a boolean column, so no np.where wrapper is needed.

df['Home_Win'] = df['Home_Points'] == 3
df['Draw'] = df['Home_Points'] == 1
df['Away_Win'] = df['Away_Points'] == 3

# Preview the first rows to check the new columns

print(df.head())

In [None]:
# Rank teams by number of home wins (most first).
# `grouped_h` is kept as its own name because the goals cell reuses it.

grouped_h = df.groupby('Home_Team')
home_wins = grouped_h['Home_Win'].sum().sort_values(ascending=False)

print(home_wins)


In [None]:
# Rank teams by number of away wins (most first).
# `grouped_a` is kept as its own name because the goals cell reuses it.

grouped_a = df.groupby('Away_Team')
away_wins = grouped_a['Away_Win'].sum().sort_values(ascending=False)

print(away_wins)

In [None]:
# Rank teams by total wins: home wins plus away wins.
# fill_value=0 counts a team that is missing from one of the two series as 0 there.

total_wins = home_wins.add(away_wins, fill_value=0).sort_values(ascending=False)

print(total_wins.head(10))

In [None]:
# Rank teams by goals scored: goals as the home side plus goals as the away side.
# Reuses the `grouped_h` / `grouped_a` groupings built in the win-count cells.

home_score = grouped_h['Home_Score'].sum()
away_score = grouped_a['Away_Score'].sum()

# fill_value=0 counts a team that is missing from one of the two series as 0 there
total_score = home_score.add(away_score, fill_value=0).sort_values(ascending=False)

print(total_score.head(10))

In [None]:
# Create function to count the percentage of home wins, draws and away wins 

def calculate_percentage(group):
    """Return the share of home wins, draws and away wins in ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to summarise; must contain the flag columns ``Home_Win``,
        ``Draw`` and ``Away_Win``.

    Returns
    -------
    pandas.Series
        Percentage (0-100 scale) of rows in which each flag equals True,
        labelled 'Home Wins', 'Draws' and 'Away Wins'.
    """
    n_games = len(group)
    labelled_flags = (('Home Wins', 'Home_Win'),
                      ('Draws', 'Draw'),
                      ('Away Wins', 'Away_Win'))

    # Count the True entries of each flag column and express the count
    # as a percentage of all rows in the group.
    return pd.Series({
        label: (group[flag].eq(True).sum() / n_games) * 100
        for label, flag in labelled_flags
    })

In [None]:
# Group by season and compute the outcome percentages for each season.
# The three outcome columns are selected explicitly so calculate_percentage is
# applied to them only and not to the grouping column; pandas 2.2 deprecates
# applying over the grouping column and warns when it is left in.

per_season = (
    df.groupby('season')[['Home_Win', 'Draw', 'Away_Win']]
      .apply(calculate_percentage)
      .reset_index()  # turn the 'season' index back into a regular column
)

print(per_season)

In [None]:
# Plot a bar chart to compare the outcome percentages across seasons.
# `labels` replaces the placeholder titles "value" and "variable" that
# Plotly Express uses for wide-form input (a list of columns passed as y).

fig = px.bar(
    per_season,
    x='season',
    y=["Home Wins", "Draws", "Away Wins"],
    title="Home Wins, Draws, and Away Wins from 2002 - 2021 by Season",
    labels={'season': 'Season', 'value': 'Share of matches (%)', 'variable': 'Result'},
)
fig.show()

In [None]:
# Plot a line graph to see how the outcome percentages trend over the seasons.
# `labels` replaces the placeholder titles "value" and "variable" that
# Plotly Express uses for wide-form input (a list of columns passed as y).

fig = px.line(
    per_season,
    x="season",
    y=["Home Wins", "Draws", "Away Wins"],
    title="Home Wins, Draws, and Away Wins from 2002 - 2021 by Season",
    labels={'season': 'Season', 'value': 'Share of matches (%)', 'variable': 'Result'},
)
fig.show()

In [None]:
# Group by competition and compute the outcome percentages for each league.
# The three outcome columns are selected explicitly so calculate_percentage is
# applied to them only and not to the grouping column; pandas 2.2 deprecates
# applying over the grouping column and warns when it is left in.

per_comp = (
    df.groupby('Competition')[['Home_Win', 'Draw', 'Away_Win']]
      .apply(calculate_percentage)
      .reset_index()  # turn the 'Competition' index back into a regular column
)

print(per_comp)

In [None]:
# Plot a bar chart to compare the outcome percentages across national leagues.
# `labels` replaces the placeholder titles "value" and "variable" that
# Plotly Express uses for wide-form input (a list of columns passed as y).

fig = px.bar(
    per_comp,
    x='Competition',
    y=["Home Wins", "Draws", "Away Wins"],
    title="Home Wins, Draws, and Away Wins from 2002 - 2021 by Competition",
    labels={'value': 'Share of matches (%)', 'variable': 'Result'},
)
fig.show()

In [None]:
# df_grouped = df.groupby(['season', 'Competition']) # create new data frame grouped by season and comp.
# df_grouped = df_grouped.apply(calculate_percentage)  # apply percentage function
# df_grouped.reset_index(level=['season', 'Competition'], inplace=True)
