<a href="https://www.kaggle.com/code/mahmoudelshabrawy/arsenal-eda?scriptVersionId=193403010" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)


In [None]:
# Load the match CSV from the Kaggle input directory into `df`, the frame every
# later cell reads and extends, then display it.
df = pd.read_csv("/kaggle/input/arsenal-epl-2019-2021/prem_arsenal.csv")
df

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Parse the match date into a real datetime column.
df['date_col'] = pd.to_datetime(df['date_col'])

# `score` is formatted "<home goals>–<away goals>" with an en dash (U+2013), not a hyphen.
score_parts = df['score'].str.split('–', expand=True)
df['home_score'] = score_parts[0].map(int)
df['away_score'] = score_parts[1].map(int)

# `result` is reported from Arsenal's point of view. Arsenal appears in both the
# `home` and the `away` column, so comparing home_score with away_score directly
# would label every Arsenal away win as a 'Loss' (and vice versa).
# Rows where `home` is not 'arsenal' are treated as Arsenal away games.
arsenal_is_home = df['home'] == 'arsenal'
arsenal_goals = df['home_score'].where(arsenal_is_home, df['away_score'])
opponent_goals = df['away_score'].where(arsenal_is_home, df['home_score'])
df['result'] = np.select(
    [arsenal_goals > opponent_goals, arsenal_goals == opponent_goals],
    ['Win', 'Draw'],
    default='Loss',
)

In [None]:
# Display the frame with the newly derived score and result columns.
df

In [None]:
# Mean and total of both score columns plus total points, computed separately
# for the rows where Arsenal is the home side and where it is the away side.
score_and_points_aggs = {
    'home_score': ['mean', 'sum'],
    'away_score': ['mean', 'sum'],
    'points': 'sum',
}
arsenal_at_home = df['home'] == 'arsenal'
arsenal_away = df['away'] == 'arsenal'

home_stats = df[arsenal_at_home].agg(score_and_points_aggs)
away_stats = df[arsenal_away].agg(score_and_points_aggs)

print("Home Stats:\n", home_stats)
print("Away Stats:\n", away_stats)

In [None]:
# Home Advantage: True on rows where Arsenal is the home side.
df['home_advantage'] = df['home'].eq('arsenal')


def _home_side_outcome(row):
    """Encode a match from the home side's view: 1 = home win, 0 = draw, -1 = home loss."""
    if row['home_score'] > row['away_score']:
        return 1
    if row['home_score'] == row['away_score']:
        return 0
    return -1


# Match Outcome
df['match_outcome'] = df.apply(_home_side_outcome, axis=1)

df.head()


In [None]:
import seaborn as sns

# Overlay two histograms (with KDE): home_score on Arsenal home rows and
# away_score on Arsenal away rows.
plt.figure(figsize=(12, 6))
score_layers = (
    (df.loc[df['home'] == 'arsenal', 'home_score'], 'Home Score', 'blue'),
    (df.loc[df['away'] == 'arsenal', 'away_score'], 'Away Score', 'red'),
)
for layer_scores, layer_label, layer_color in score_layers:
    sns.histplot(layer_scores, kde=True, label=layer_label, color=layer_color, alpha=0.6)

plt.title('Distribution of Scores (Home vs Away)')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Index the frame by match date so the time-based operations below
# (resample, rolling windows, cumulative sums) can use it.
df['game_date'] = pd.to_datetime(df['date_col'])
df.set_index('game_date', inplace=True)
# Rolling and cumulative statistics in later cells walk the rows in order, so
# make the chronological order explicit instead of relying on the CSV row order.
# A stable sort keeps the original order of any rows sharing a date.
df.sort_index(inplace=True, kind='mergesort')

# resample('M') buckets matches by calendar month, so each plotted value is a
# monthly total, not a per-game figure; the title says so.
df[['points']].resample('M').sum().plot(figsize=(12, 6), title='Points Earned per Month Over Time')
plt.xlabel('Date')
plt.ylabel('Points')
plt.grid(True)
plt.show()


In [None]:
# Win-Loss Ratio by Referee
# Per-referee share of each result (rows: referee, columns: result value);
# results a referee never had are filled with 0.
result_share = df.groupby('referee')['result'].value_counts(normalize=True)
referee_performance = result_share.unstack().fillna(0)

# Avoid division by zero: a zero loss share is replaced by 1, so for those
# referees the "ratio" is simply the win share.
loss_share = referee_performance['Loss'].replace(0, 1)
referee_performance['win_loss_ratio'] = referee_performance['Win'] / loss_share

plt.figure(figsize=(12, 8))
ratio_ascending = referee_performance['win_loss_ratio'].sort_values()
ratio_ascending.plot(kind='barh', color='purple')
plt.title('Win-Loss Ratio by Referee')
plt.xlabel('Win-Loss Ratio')
plt.ylabel('Referee')
plt.grid(True)
plt.show()


In [None]:
# Total points grouped by match result.
points_by_result = df.groupby('result')['points'].sum()

# groupby sorts its keys alphabetically (Draw, Loss, Win), so a positional
# colour list would paint draws green and wins red. Look the colour up by
# label instead; unknown labels fall back to a neutral colour.
result_colors = {'Win': 'green', 'Draw': 'grey', 'Loss': 'red'}
bar_colors = [result_colors.get(label, 'tab:blue') for label in points_by_result.index]

plt.figure(figsize=(10, 6))
points_by_result.plot(kind='bar', color=bar_colors)
plt.title('Total Points Earned by Match Result')
plt.xlabel('Match Result')
plt.ylabel('Total Points')
plt.grid(True)
plt.show()


In [None]:

# Calendar month and year are read from the datetime index.
df['month'] = df.index.month
df['year'] = df.index.year

# Mean points for every (month, year) cell; months become rows, years columns.
performance_pivot = pd.pivot_table(
    df,
    values='points',
    index='month',
    columns='year',
    aggfunc='mean',
)

plt.figure(figsize=(12, 8))
sns.heatmap(performance_pivot, annot=True, fmt='.1f', cmap='YlGnBu')
plt.title('Average Points per Month and Year')
plt.xlabel('Year')
plt.ylabel('Month')
plt.show()


In [None]:
# 5-row rolling mean of points; the first four values are NaN because the
# window is not yet full.
rolling_avg = df['points'].rolling(5).mean()

plt.figure(figsize=(12, 6))
line_specs = (
    (df['points'], {'alpha': 0.5, 'label': 'Points'}),
    (rolling_avg, {'color': 'red', 'label': 'Rolling Average (Window=5)'}),
)
for series, line_kwargs in line_specs:
    plt.plot(df.index, series, **line_kwargs)
plt.title('Points with Rolling Average')
plt.xlabel('Date')
plt.ylabel('Points')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Build the correlation matrix; both score columns are cast to float first
# (the cast is written back into df).
for score_col in ('home_score', 'away_score'):
    df[score_col] = df[score_col].astype(float)
metric_cols = ['home_score', 'away_score', 'points']
correlation_matrix = df[metric_cols].corr()

# Draw it as an annotated heatmap.
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt='.2f', linewidths=.5)
plt.title('Correlation Matrix of Performance Metrics')
plt.show()


In [None]:
# One box of `points` per distinct `result` value.
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='result', y='points', palette='Set2')
plt.title('Distribution of Points by Match Result')
plt.xlabel('Match Result')
plt.ylabel('Points')
plt.grid(True)
plt.show()


In [None]:
def _share_of_wins(results):
    """Return the fraction of entries equal to 'Win', or 0 if 'Win' never occurs."""
    return results.value_counts(normalize=True).get('Win', 0)


# Win rate on rows flagged as home_advantage versus all remaining rows.
played_at_home = df['home_advantage']
win_rate_home = _share_of_wins(df.loc[played_at_home, 'result'])
win_rate_away = _share_of_wins(df.loc[~played_at_home, 'result'])

plt.figure(figsize=(10, 6))
scenario_labels = ['Home Advantage', 'Away']
scenario_rates = [win_rate_home, win_rate_away]
plt.bar(scenario_labels, scenario_rates, color=['blue', 'red'])
plt.title('Win Rate with and without Home Advantage')
plt.xlabel('Scenario')
plt.ylabel('Win Rate')
plt.grid(True)
plt.show()


In [None]:
# Home goals against away goals, one semi-transparent marker per match.
plt.figure(figsize=(10, 6))
x_home, y_away = df['home_score'], df['away_score']
plt.scatter(x_home, y_away, alpha=0.5)
plt.title('Home Score vs. Away Score')
plt.xlabel('Home Score')
plt.ylabel('Away Score')
plt.grid(True)
plt.show()


In [None]:
# Running total of points, accumulated in the frame's current row order.
df['cumulative_points'] = df['points'].cumsum()

plt.figure(figsize=(12, 6))
plt.plot(df.index, df['cumulative_points'], color='purple')
for apply_label, label_text in (
    (plt.title, 'Cumulative Points Over Time'),
    (plt.xlabel, 'Date'),
    (plt.ylabel, 'Cumulative Points'),
):
    apply_label(label_text)
plt.grid(True)
plt.show()


In [None]:
# Pie chart of how often each result occurs.
outcome_counts = df['result'].value_counts()

# value_counts orders by frequency, so a positional colour list would attach
# green/grey/red to whichever results happen to be most common. Pick the colour
# by label instead; unknown labels fall back to a neutral colour.
result_colors = {'Win': 'green', 'Draw': 'grey', 'Loss': 'red'}
pie_colors = [result_colors.get(label, 'tab:blue') for label in outcome_counts.index]

plt.figure(figsize=(8, 8))
plt.pie(outcome_counts, labels=outcome_counts.index, autopct='%1.1f%%', colors=pie_colors)
plt.title('Proportion of Match Outcomes')
plt.show()


In [None]:
# Running count of wins and of non-draw games. The rate below therefore
# ignores draws: it is wins / (wins + losses) so far.
is_win = df['result'] == 'Win'
is_not_draw = df['result'] != 'Draw'
df['cumulative_wins'] = is_win.cumsum()
df['cumulative_games'] = is_not_draw.cumsum()  # Count non-draw games
df['cumulative_win_rate'] = df['cumulative_wins'].div(df['cumulative_games'])

plt.figure(figsize=(12, 6))
plt.plot(df.index, df['cumulative_win_rate'], color='blue')
plt.title('Cumulative Win Rate Over Time')
plt.xlabel('Date')
plt.ylabel('Cumulative Win Rate')
plt.grid(True)
plt.show()

In [None]:
# Mean and standard deviation of points over a 10-row rolling window.
points_window = df['points'].rolling(window=10)
df['rolling_mean'] = points_window.mean()
df['rolling_std'] = points_window.std()

plt.figure(figsize=(12, 6))
curve_specs = (
    ('points', {'alpha': 0.5, 'label': 'Points'}),
    ('rolling_mean', {'color': 'red', 'label': 'Rolling Mean (Window=10)'}),
    ('rolling_std', {'color': 'orange', 'label': 'Rolling Std Dev (Window=10)'}),
)
for column, curve_kwargs in curve_specs:
    plt.plot(df.index, df[column], **curve_kwargs)
plt.title('Rolling Mean and Standard Deviation of Points')
plt.xlabel('Date')
plt.ylabel('Points')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of (home_score, away_score, points); marker colour also encodes points.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
point_values = df['points']
ax.scatter(df['home_score'], df['away_score'], point_values, c=point_values, cmap='viridis', alpha=0.5)

ax.set_xlabel('Home Score')
ax.set_ylabel('Away Score')
ax.set_zlabel('Points')
ax.set_title('3D Scatter Plot of Home Score, Away Score, and Points')

plt.show()


In [None]:
# Weekday number and calendar month are read from the datetime index.
df['day_of_week'] = df.index.dayofweek
df['month'] = df.index.month

# Number of rows with a `result` for each (weekday, month) cell; empty cells become 0.
pivot_spec = dict(index='day_of_week', columns='month', values='result', aggfunc='count', fill_value=0)
heatmap_data = df.pivot_table(**pivot_spec)

# Draw the counts as an annotated heatmap.
plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_data, annot=True, fmt='d', cmap='YlGnBu')
plt.title('Heatmap of Match Results by Day of Week and Month')
plt.xlabel('Month')
plt.ylabel('Day of Week')
plt.show()


In [None]:
import plotly.express as px

# Interactive scatter: colour encodes result, marker size encodes points,
# and the hover title is the frame's index value.
scatter_options = dict(
    x='home_score',
    y='away_score',
    color='result',
    size='points',
    hover_name=df.index,
    title='Interactive Scatter Plot of Home vs Away Scores',
)
fig = px.scatter(df, **scatter_options)
fig.show()


In [None]:
# Display the frame with all columns derived so far.
df

In [None]:
# Filled 2D kernel density estimate of home_score against away_score.
plt.figure(figsize=(12, 6))
kde_options = {'x': 'home_score', 'y': 'away_score', 'cmap': 'Blues', 'fill': True}
sns.kdeplot(data=df, **kde_options)
plt.title('Bivariate KDE Plot of Home and Away Scores')
plt.xlabel('Home Score')
plt.ylabel('Away Score')
plt.grid(True)
plt.show()


In [None]:
# Placeholder column of standard-normal noise.  # Replace with actual data
# A fixed seed keeps the placeholder, and therefore the heatmap below,
# identical across kernel restarts.
rng = np.random.default_rng(42)
df['external_factor'] = rng.standard_normal(len(df))

# Pairwise correlations between points, both scores and the placeholder factor.
correlation_matrix = df[['points', 'home_score', 'away_score', 'external_factor']].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5)
plt.title('Correlation Matrix with External Factors')
plt.show()


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Keep only rows with all three features present. The same mask is reused for
# the colours below: taking colours from the full frame would give a different
# length than the PCA output whenever a row is dropped.
feature_cols = ['home_score', 'away_score', 'points']
complete_rows = df[feature_cols].notna().all(axis=1)
features = df.loc[complete_rows, feature_cols]

# Standardise each feature, then project onto the first two principal components.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_features)

# Integer code per result category, aligned row-for-row with pca_result.
result_codes = df.loc[complete_rows, 'result'].astype('category').cat.codes

plt.figure(figsize=(10, 6))
plt.scatter(pca_result[:, 0], pca_result[:, 1], c=result_codes, cmap='viridis', alpha=0.5)
plt.colorbar(label='Match Result')
plt.title('PCA of Match Scores and Points')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)
plt.show()


In [None]:
# Install the dtaidistance package through a shell command.
# NOTE(review): no visible cell imports dtaidistance — confirm it is still needed.
!pip install dtaidistance

In [None]:
from sklearn.manifold import TSNE

# Feature matrix for the embedding; missing values are replaced with 0.
feature_names = ['home_score', 'away_score', 'points']
features = df[feature_names].fillna(0)

# Two-dimensional t-SNE embedding with a fixed random_state.
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(features)

# Scatter the embedding, coloured by the integer code of each result category.
result_codes = df['result'].astype('category').cat.codes
plt.figure(figsize=(10, 8))
plt.scatter(tsne_result[:, 0], tsne_result[:, 1], c=result_codes, cmap='viridis', alpha=0.7)
plt.colorbar(label='Match Result')
plt.title('t-SNE Plot of Features')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.grid(True)
plt.show()


In [None]:
from sklearn.manifold import MDS
from sklearn.metrics import pairwise_distances

# Pairwise distances between rows in (home_score, away_score) space; NaNs become 0.
score_matrix = df[['home_score', 'away_score']].fillna(0)
distances = pairwise_distances(score_matrix)

# Embed the precomputed distance matrix into two dimensions.
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
mds_result = mds.fit_transform(distances)

# Scatter the embedding, coloured by points.
plt.figure(figsize=(10, 8))
mds_x, mds_y = mds_result[:, 0], mds_result[:, 1]
plt.scatter(mds_x, mds_y, c=df['points'], cmap='viridis', alpha=0.7)
plt.colorbar(label='Points')
plt.title('MDS Plot of Teams')
plt.xlabel('MDS Dimension 1')
plt.ylabel('MDS Dimension 2')
plt.grid(True)
plt.show()


In [None]:
from sklearn.manifold import TSNE

# Scores and points with missing values zero-filled, embedded into two
# dimensions by t-SNE (fixed random_state), then plotted with one colour per
# result category code.
features = df[['home_score', 'away_score', 'points']].fillna(0)

tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(features)

plt.figure(figsize=(10, 8))
embed_x = tsne_result[:, 0]
embed_y = tsne_result[:, 1]
category_codes = df['result'].astype('category').cat.codes
plt.scatter(embed_x, embed_y, c=category_codes, cmap='viridis', alpha=0.7)
plt.colorbar(label='Match Result')
plt.title('t-SNE Plot of Features')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.grid(True)
plt.show()


In [None]:
# Box plot of points per result, annotated with each group's max and min.
# The category order is fixed once and shared by the boxes and the labels so
# that label i is guaranteed to sit on box i.
result_order = list(df['result'].dropna().unique())

plt.figure(figsize=(12, 6))
sns.boxplot(x='result', y='points', data=df, palette='Set2', order=result_order)
plt.title('Box Plot of Points by Match Result with Annotations')
plt.xlabel('Match Result')
plt.ylabel('Points')

for i, result in enumerate(result_order):
    y = df.loc[df['result'] == result, 'points']
    # Anchor "Max" above its value and "Min" below it; with identical alignment
    # the two labels are drawn on top of each other whenever max == min.
    plt.text(i, y.max(), f'Max: {y.max():.2f}', horizontalalignment='center', verticalalignment='bottom')
    plt.text(i, y.min(), f'Min: {y.min():.2f}', horizontalalignment='center', verticalalignment='top')

plt.grid(True)
plt.show()


In [None]:
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
home_scores = df['home_score'].fillna(0)
away_scores = df['away_score'].fillna(0)
points = df['points'].fillna(0)

# Distinct score values along each axis, computed once.
home_values = np.unique(home_scores)
away_values = np.unique(away_scores)

# indexing='ij' makes X, Y and Z all shaped (len(home_values), len(away_values)).
# The default 'xy' indexing returns the transposed shape, which misplaces the
# surface on a square grid and fails on a non-square one.
X, Y = np.meshgrid(home_values, away_values, indexing='ij')
# Z[i, j] is the mean points of matches that ended home_values[i]–away_values[j];
# scorelines that never occurred yield NaN.
Z = np.array([
    [points[(home_scores == x) & (away_scores == y)].mean() for y in away_values]
    for x in home_values
])

ax.plot_surface(X, Y, Z, cmap='viridis', edgecolor='none')
ax.set_title('3D Surface Plot of Home Score vs. Away Score vs. Points')
ax.set_xlabel('Home Score')
ax.set_ylabel('Away Score')
ax.set_zlabel('Points')

plt.show()


In [None]:
from pandas.plotting import lag_plot

# Lag-1 plot of the points series with missing values removed.
plt.figure(figsize=(10, 6))
points_observed = df['points'].dropna()
lag_plot(points_observed, lag=1)
plt.title('Lag Plot of Points')
plt.xlabel('Current Value')
plt.ylabel('Lagged Value')
plt.grid(True)
plt.show()


In [None]:
# Display the final frame, including every column added by the cells above.
df