# Exploratory Data Analysis (EDA)  
The provided script conducts an exploratory data analysis on a dataset of players. Below is a detailed breakdown of each step, followed by the relevant code snippets:

## 1. Dataset Overview  
This section provides a high-level summary of the dataset, including:

- Total observations and variables.
- Data types of variables.
- Missing value details.

In [None]:
import pandas as pd

# Load the dataset
file_path = '../data/raw/players_complete_raw.csv'
df = pd.read_csv(file_path)

# Dataset Summary
# Count nulls per column once; the grand total, the percentage and the
# per-column detail further down are all derived from this single pass.
missing_values_by_column = df.isnull().sum()
total_observations, total_variables = df.shape
missing_values_total = missing_values_by_column.sum()
# df.size is rows * columns. Guard the division so a header-only file
# reports 0% instead of dividing by zero.
missing_values_percentage = (
    (missing_values_total / df.size) * 100 if df.size else 0.0
)

# Display summary (a one-row table)
summary = pd.DataFrame({
    "Total Observations": [total_observations],
    "Total Variables": [total_variables],
    "Missing Values (Total)": [missing_values_total],
    "Missing Values (%)": [missing_values_percentage]
})
print(summary)

# Missing Values Details: only the columns that contain at least one null
missing_values_detail = missing_values_by_column[missing_values_by_column > 0]
print("Missing Values Details:", missing_values_detail)


## 2. Heatmap of Missing Values  
Visualizes missing values in the dataset to identify patterns or issues.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Missing-data heatmap: the boolean null mask of df is drawn as a grid
# (rows are observations, columns are variables) without a colour bar.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title("Missing Data Heatmap")
ax.set_xlabel("Variables")
ax.set_ylabel("Observations")
plt.show()


## 3. Distribution of Player Age  
Displays the distribution of player ages with a histogram to identify common age ranges.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Coerce the age column to numbers; entries that cannot be parsed become NaN
age_column = 'Age in Season'
df[age_column] = pd.to_numeric(df[age_column], errors='coerce')

# Histogram of ages (20 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df[age_column], bins=20, kde=True, color='blue', ax=ax)
ax.set_title('Age Distribution of Players')
ax.set_xlabel('Age in Season')
ax.set_ylabel('Frequency')
# Horizontal dashed guide lines only
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


## 4. Relationship Between Games Played and Market Value  
Explores the relationship between games played and market value using a scatter plot.

In [None]:
# Ensure numeric conversion (entries that cannot be parsed become NaN)
df['Market_Value'] = pd.to_numeric(df['Market_Value'], errors='coerce')
df['Games Played'] = pd.to_numeric(df['Games Played'], errors='coerce')

# Scatter Plot
# The y-axis is labelled "in millions", so plot the value divided by 1e6
# (the same scaling the player-trend plot in this file applies) rather than
# the raw column. The stored column itself is left unscaled.
# NOTE(review): assumes Market_Value is stored in raw currency units -
# confirm against the CSV.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['Games Played'], y=df['Market_Value'] / 1e6, alpha=0.7)
plt.title('Market Value vs. Games Played')
plt.xlabel('Games Played')
plt.ylabel('Market Value (in millions)')
plt.grid(linestyle='--', alpha=0.7)
plt.show()


## 5. Correlation Matrix of Numerical Variables  
Examines relationships between numerical variables.

In [None]:
# Labels of the columns whose dtype is float64 or int64
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Pairwise correlation between those columns (pandas' default method)
correlation_matrix = df.loc[:, numerical_columns].corr()

# Annotated heatmap: each cell shows its coefficient to two decimals
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    cbar=True,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Variables')
plt.show()


## 6. Market Value Distribution by Age  
Illustrates how market value varies across player ages, with one box per age.

In [None]:
# Boxplot of Market Value by Age
# The y-axis is labelled "in millions", so plot the value divided by 1e6
# (the same scaling the player-trend plot in this file applies) rather than
# the raw column. The stored column itself is left unscaled.
# NOTE(review): assumes Market_Value is numeric and stored in raw currency
# units - confirm against the CSV.
market_value_millions = df['Market_Value'] / 1e6
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['Age in Season'], y=market_value_millions)
plt.title('Market Value Distribution by Age in Season')
plt.xlabel('Age in Season')
plt.ylabel('Market Value (in millions)')
plt.grid(axis='y')
plt.show()


## 7. Average Goals by Age Group  
Analyzes the average number of goals scored by players within different age ranges.

In [None]:
# Create age groups
# Left-closed 5-year bins: [15, 20), [20, 25), ..., [35, 40). Ages outside
# that span fall in no bin and get NaN.
df['Age Group'] = pd.cut(df['Age in Season'], bins=range(15, 41, 5), right=False)

# 'Goals' is coerced like the other plotted columns in this file so that
# non-numeric entries become NaN instead of breaking the mean.
# NOTE(review): a no-op if the column is already numeric.
df['Goals'] = pd.to_numeric(df['Goals'], errors='coerce')

# Calculate average goals per age group
# observed=False is spelled out: it keeps empty bins in the result (the
# behaviour this cell has always relied on) and avoids the FutureWarning
# newer pandas raises when a categorical key is grouped without it.
avg_goals = df.groupby('Age Group', observed=False)['Goals'].mean()

# Bar Chart
plt.figure(figsize=(10, 6))
avg_goals.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Average Goals Scored by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Average Goals Scored')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()


## 8. Player-Specific Trends  
Tracks the market value progression of selected players over their careers.

In [None]:
# Filter for selected players
selected_players = ['Granit Xhaka', 'Taulant Xhaka']
player_data = df[df['Name'].isin(selected_players)]

# Plot market value progression (one line per player)
plt.figure(figsize=(12, 6))
for player in selected_players:
    # plt.plot joins points in row order, so sort by age first; otherwise
    # rows that are not already ordered by age in the file make the line
    # zigzag. A stable sort keeps the file order of rows with the same age.
    subset = (
        player_data[player_data['Name'] == player]
        .sort_values('Age in Season', kind='stable')
    )
    # Scale to millions to match the y-axis label
    plt.plot(subset['Age in Season'], subset['Market_Value'] / 1e6, marker='o', label=player)

plt.title('Market Value Progression for Selected Players')
plt.xlabel('Age in Season')
plt.ylabel('Market Value (in millions)')
plt.legend(title='Player Name')
plt.grid()
plt.show()


## 9. League Representation by Player Position  
Displays the number of players in each position across various leagues using a stacked bar chart.

In [None]:
# Count rows per (Competition, Position) pair, then pivot the positions into
# columns; combinations that never occur are filled with 0
position_counts = df.groupby(['Competition', 'Position']).size()
league_position_data = position_counts.unstack(fill_value=0)

# Stacked bar chart: one bar per competition, one segment per position
ax = league_position_data.plot(
    kind='bar',
    stacked=True,
    figsize=(12, 6),
    colormap='viridis',
)
ax.set_title('League Representation by Player Position')
ax.set_xlabel('League')
ax.set_ylabel('Number of Players')
plt.xticks(rotation=45)
ax.legend(title='Position')
plt.show()


## 10. Heatmap of Goals vs. Games Played  
Shows the density of goals scored relative to games played.

In [None]:
# Filled two-dimensional KDE of goals against games played
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=df, x='Games Played', y='Goals', cmap='Blues', fill=True, ax=ax)
ax.set_title('Density Heatmap of Goals Scored vs. Games Played')
ax.set_xlabel('Games Played')
ax.set_ylabel('Goals Scored')
ax.grid()
plt.show()
