In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

In [3]:
# players_21 data files
players_data_path = "data/players_21.csv"

# Read the players data
df = pd.read_csv(players_data_path)

# Display the data table for preview
df.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club_name,...,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,33,1987-06-24,170,72,Argentina,FC Barcelona,...,66+3,65+3,65+3,65+3,66+3,62+3,52+3,52+3,52+3,62+3
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,35,1985-02-05,187,83,Portugal,Juventus,...,65+3,61+3,61+3,61+3,65+3,61+3,54+3,54+3,54+3,61+3
2,200389,https://sofifa.com/player/200389/jan-oblak/210002,J. Oblak,Jan Oblak,27,1993-01-07,188,87,Slovenia,Atlético Madrid,...,32+3,36+3,36+3,36+3,32+3,32+3,33+3,33+3,33+3,32+3
3,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,31,1988-08-21,184,80,Poland,FC Bayern München,...,64+3,65+3,65+3,65+3,64+3,61+3,60+3,60+3,60+3,61+3
4,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,28,1992-02-05,175,68,Brazil,Paris Saint-Germain,...,67+3,62+3,62+3,62+3,67+3,62+3,49+3,49+3,49+3,62+3


In [4]:
# Check height and weight columns if they have the appropriate data type (should be in integers int64)
df[['height_cm', 'weight_kg']].dtypes

height_cm    int64
weight_kg    int64
dtype: object

In [5]:
# Check datatype of 'joined' column
df['joined'].dtypes

dtype('O')

In [6]:
# Convert 'joined' column into datatime, 
df['joined'] = pd.to_datetime(df['joined'], errors='coerce')

# then extract the year, month, and day into 3 separate cols
df['joined_year'] = df['joined'].dt.year
df['joined_month'] = df['joined'].dt.month
df['joined_day'] = df['joined'].dt.day

# Display the results
df[['joined', 'joined_year', 'joined_month', 'joined_day']].head()

Unnamed: 0,joined,joined_year,joined_month,joined_day
0,2004-07-01,2004.0,7.0,1.0
1,2018-07-10,2018.0,7.0,10.0
2,2014-07-16,2014.0,7.0,16.0
3,2014-07-01,2014.0,7.0,1.0
4,2017-08-03,2017.0,8.0,3.0


In [7]:
# Check datatype of 'value, wage, release clause' cols
df[['value_eur', 'wage_eur', 'release_clause_eur']].dtypes

value_eur               int64
wage_eur                int64
release_clause_eur    float64
dtype: object

In [8]:
# Convert float to int
df['release_clause_eur'] = df['release_clause_eur'].fillna(0).astype(int)

# Verify the change
df[['release_clause_eur']].dtypes

release_clause_eur    int32
dtype: object

In [9]:
# Display the results of the 3 cols
df[['value_eur', 'wage_eur', 'release_clause_eur']].head()

Unnamed: 0,value_eur,wage_eur,release_clause_eur
0,67500000,560000,138400000
1,46000000,220000,75900000
2,75000000,125000,159400000
3,80000000,240000,132000000
4,90000000,270000,166500000


In [10]:
df['contract_valid_until'].dtypes

dtype('float64')

In [11]:
# Convert to int
df['contract_valid_until'] = df['contract_valid_until'].fillna(0).astype(int)

In [12]:
# Check if there's missing values in the dataset
missing_values = df.isnull().sum()
missing_values

sofifa_id         0
player_url        0
short_name        0
long_name         0
age               0
               ... 
rcb               0
rb                0
joined_year     983
joined_month    983
joined_day      983
Length: 109, dtype: int64

## Analyzing Contract Expiration Trend
- Explore trends in contract expiration dates across clubs or nationalities

In [13]:
# 

## Player Performance vs Wage
- Analyze if younger players tend to have higher potential values compared to older players.

In [14]:
#


## Position Analysis
- Analyze which positions (forwards, midfielders, etc) have the highest overall ratings or wage

In [15]:
#