## Data Exploration of Understat data

In [127]:
from math import factorial as fac

In [27]:
import os

import matplotlib.pyplot as plt
import nbformat
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Read every league-table CSV in the data folder and stack them into one table,
# indexed by the 'position' column.
league_tables_dir = 'Project_data/league_tables'
# os.listdir returns entries in arbitrary order and also lists non-CSV entries
# (hidden files, checkpoint folders), so filter on the extension and sort to
# get a reproducible row order.
league_table_files = sorted(
    entry for entry in os.listdir(league_tables_dir)
    if entry.lower().endswith('.csv')
)
df_list = [
    pd.read_csv(os.path.join(league_tables_dir, entry))
    for entry in league_table_files
]
df = pd.concat(df_list).set_index('position')

Initial feature engineering: create 'per game' versions of the season-total stats by dividing each one by the number of matches played.

In [3]:
# Columns holding totals that are turned into per-game rates.
list_of_per_game_metrics = [
    'wins', 'draws', 'loses', 'scored', 'conceded', 'pts',
    'xG', 'npxG', 'xGA', 'npxGA', 'npxGD', 'xpts',
]
# For each metric add a "<metric>_PG" column: the metric divided by df['matches'].
matches_played = df['matches']
for metric in list_of_per_game_metrics:
    df[metric + '_PG'] = df[metric].div(matches_played)


Graph 1 - How many points, goals scored, and goals conceded per game does the league winner normally record?

In [97]:
# Select the rows whose index value is 1 (the index is the 'position' column)
# and keep only the per-game points / scored / conceded columns.
# NOTE(review): 'position.1' is presumably a second 'position' column from the
# CSVs that pandas renamed on read - confirm what it contains.
is_first_place = df.index == 1
winner_columns = ['pts_PG', 'scored_PG', 'conceded_PG', 'position.1']
winners_goals_conceded_pts = df.loc[is_first_place, winner_columns]

In [95]:
# Histograms of the first-place rows' per-game points, goals scored and goals
# conceded. Points and goals scored share the left panel; goals conceded is
# drawn on its own in the right panel.
hist_1 = go.Histogram(x=winners_goals_conceded_pts['pts_PG'], name="Points per game", nbinsx=20)
hist_2 = go.Histogram(x=winners_goals_conceded_pts['scored_PG'], name='Scored per game', nbinsx=20)
hist_3 = go.Histogram(x=winners_goals_conceded_pts['conceded_PG'], name='Conceded per game', nbinsx=20)

# make_subplots is imported explicitly from plotly.subplots: a bare
# `import plotly` does not guarantee that the `subplots` submodule is loaded.
fig = make_subplots(rows=1, cols=2, subplot_titles=['Points and scored', 'Conceded'])
fig.add_trace(hist_1, row=1, col=1)
fig.add_trace(hist_2, row=1, col=1)
fig.add_trace(hist_3, row=1, col=2)

fig.update_layout(height=500, width=1200, title='Winners per game metrics', title_x=0.5)
# The x ranges are pinned by hand; values outside them are not shown.
fig.update_xaxes(title_text='goals and pts', showgrid=True, row=1, col=1, range=[1.6, 3], dtick=0.2)
fig.update_xaxes(title_text='conceded', showgrid=True, row=1, col=2, range=[0.2, 1.6], dtick=0.2)
# Label the y axes so each panel can be read on its own.
fig.update_yaxes(title_text='count')
fig.show()

Let's draw scatter plots of goals scored per game versus league position, and goals conceded per game versus league position.

In [116]:
# Two side-by-side scatter plots, one marker per row of df: goals scored per
# game (left) and goals conceded per game (right) on the x axis, against the
# index of df (the 'position' column) on the y axis.
scatter_plot_1 = go.Scatter(x=df['scored_PG'], y=df.index, name='Scored per game versus position', mode='markers')
scatter_plot_2 = go.Scatter(x=df['conceded_PG'], y=df.index, name='Conceded per game versus position', mode='markers')

# make_subplots is imported explicitly from plotly.subplots: a bare
# `import plotly` does not guarantee that the `subplots` submodule is loaded.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Scored per game v position', 'conceded per game versus position'],
)
fig.add_trace(scatter_plot_1, row=1, col=1)
fig.add_trace(scatter_plot_2, row=1, col=2)

fig.update_layout(height=500, width=1500)
fig.update_xaxes(title_text='goals per game', row=1, col=1)
fig.update_xaxes(title_text='conceded per game', row=1, col=2)
# No row/col given: the same y-axis title is applied to both panels.
fig.update_yaxes(title_text='position')
fig.show()

Let's compare expected goals (xG) to actual goals scored, and expected goals against (xGA) to actual goals conceded.

In [120]:
# Two side-by-side scatter plots, one marker per row of df: xG per game against
# goals scored per game (left), and xGA per game against goals conceded per
# game (right).
# The traces are named so the legend shows meaningful labels instead of the
# default "trace 0" / "trace 1".
xg_v_g = go.Scatter(x=df['xG_PG'], y=df['scored_PG'], mode='markers', name='xG v G')
xa_v_a = go.Scatter(x=df['xGA_PG'], y=df['conceded_PG'], mode='markers', name='xGA v A')

# make_subplots is imported explicitly from plotly.subplots: a bare
# `import plotly` does not guarantee that the `subplots` submodule is loaded.
fig = make_subplots(rows=1, cols=2, subplot_titles=['xG v G', 'xGA v A'])
fig.add_trace(xg_v_g, row=1, col=1)
fig.add_trace(xa_v_a, row=1, col=2)

fig.update_layout(
    height=500,
    width=1500,
    title_text='Comparing performance versus expected performance',
    title_x=0.5,
)
fig.update_xaxes(title_text='expected goals per game', row=1, col=1)
fig.update_xaxes(title_text='expected conceded per game', row=1, col=2)
# Axis label typo fixed: was 'goalsscored'.
fig.update_yaxes(title_text='goals scored', row=1, col=1)
fig.update_yaxes(title_text='conceded', row=1, col=2)
fig.show()

