# 1. Import Data

In [None]:
import pandas as pd

# Relative locations of the two raw CSV files used throughout this notebook.
players_path = '../data/players.csv'
players_valuations_path = '../data/player_valuations.csv'

# Read each CSV into its own DataFrame (same order as the paths above).
players, players_valuations = (
    pd.read_csv(csv_path)
    for csv_path in (players_path, players_valuations_path)
)

# 2. Dataset Information

## 2-1. players

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
players.info()
# First rows of the table, rendered by the notebook.
display(players.head())
# Column labels of the players table.
print(players.columns)

## 2-2. players_valuations

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
players_valuations.info()
# First rows of the table, rendered by the notebook.
display(players_valuations.head())
# Column labels of the valuations table.
print(players_valuations.columns)

## 결측치 확인

In [None]:
# Missing-value count per column, first for players and then for
# players_valuations; the first report is followed by an extra blank line.
print(f"{players.isnull().sum()} \n")
print(players_valuations.isnull().sum())

## 기초 통계량

In [None]:
# Helper that reports how many distinct values a column holds.
def count_value(df, column):
    """Print and return the number of distinct non-missing values in a column.

    Missing values (NaN/None) are not counted as a value of their own, so a
    column with gaps is not reported as having one extra category.

    Args:
        df: DataFrame that contains ``column``.
        column: Label of the column to inspect.

    Returns:
        The number of distinct non-missing values in ``df[column]``.
    """
    # nunique() skips missing values by default, unlike len(Series.unique()).
    count = df[column].nunique()
    print(f'Total {column}: {count}')
    return count

# Columns whose number of unique values is reported:
# player id, current club, and country of citizenship.
columns = [
    'player_id',
    'current_club_id',
    'country_of_citizenship',
]

for column in columns:
    count_value(players, column)

In [None]:
# Summary statistics of the player market value (market_value_in_eur).
players_valuations.loc[:, 'market_value_in_eur'].describe()

In [None]:
# Render floats with no decimal places from here on, then show the same
# market-value summary again under the new display format.
pd.set_option('display.float_format', '{:.0f}'.format)
players_valuations.loc[:, 'market_value_in_eur'].describe()

In [None]:
# Share of valuation records whose market value is strictly above the mean.
mean_ = players_valuations['market_value_in_eur'].mean()
# Count the rows where the comparison holds.
over_mean = int(players_valuations['market_value_in_eur'].gt(mean_).sum())
# Denominator is the number of rows in the valuations table.
total = players_valuations.shape[0]
print(f"percentile of player over mean value: {over_mean/total*100:.2f}%")

# 3. Handling DataFrame

In [None]:
# Join players with their valuation records on player_id (pandas' default
# inner join). Columns that exist in both tables get the default suffixes:
# _x for the players side and _y for the valuations side.
players_with_val = players.merge(players_valuations, on='player_id')
# Peek at the last merged rows whose last_name is 'Son'.
players_with_val.loc[players_with_val['last_name'] == 'Son'].tail()

## 3-1. 태어난 연도 기준으로 각 시장가치가 평가된 시점의 나이를 계산하여 age 컬럼 추가

In [None]:
# "YYYY-MM-DD” 형식의 date에서 연도만 뽑아내서 dateyear 컬럼 추가
# Take the leading four characters (the year) with the vectorised .str accessor
# and convert them with pd.to_numeric(errors='coerce'). A missing or malformed
# value becomes NaN instead of raising, which slicing a float NaN with x[:4]
# inside apply() would do.
players_with_val['dateyear'] = pd.to_numeric(
    players_with_val['date'].str[:4], errors='coerce'
)
# age is the plain difference of calendar years (valuation year minus birth
# year); month and day are ignored. It is NaN where either year is unknown.
players_with_val['age'] = players_with_val['dateyear'] - pd.to_numeric(
    players_with_val['date_of_birth'].str[:4], errors='coerce'
)

In [None]:
# Keep one row per player per year: the latest valuation of that year.
# keep='last' only honours row order, so the rows are first ordered by date
# with a stable sort. This assumes `date` holds "YYYY-MM-DD" strings, which
# sort chronologically as text. Missing dates are placed first so they are
# never the row that is kept. sort_index() then puts the surviving rows back
# in their original order.
players_with_val = (
    players_with_val
    .sort_values('date', kind='stable', na_position='first')
    .drop_duplicates(['player_id', 'dateyear'], keep='last')
    .sort_index()
)
# Inspect the first remaining rows whose last_name is 'Son'.
players_with_val[players_with_val['last_name']=='Son'].head()

In [None]:
# Columns kept for the analysis. For the names that carry a merge suffix,
# the _x / _y variant listed here is the one that is retained.
columns = [
    'player_id',
    'current_club_id_y',
    'first_name',
    'last_name',
    'name',
    'last_season_x',
    'country_of_citizenship',
    'city_of_birth',
    'position',
    'sub_position',
    'dateyear',
    'age',
    'market_value_in_eur_y',
]

# Select the subset and strip the merge suffixes in a single expression.
players_with_val = players_with_val[columns].rename(
    columns={
        "current_club_id_y": "current_club_id",
        "last_season_x": "last_season",
        "market_value_in_eur_y": "market_value_in_eur",
    }
)

players_with_val

In [None]:
# Summary statistics of market_value_in_eur in the merged table.
players_with_val.loc[:, 'market_value_in_eur'].describe()

In [None]:
# Keep only the rows where both dateyear and last_season equal 2022.
# .copy() makes the result an independent DataFrame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
players_with_val_2022 = players_with_val[
    (players_with_val['dateyear'] == 2022) & (players_with_val['last_season'] == 2022)
].copy()
players_with_val_2022

In [None]:
# Rank the 2022 rows by market value: rank 1 is the highest value, and ties
# share the smallest rank of their group (method="min").
# assign() returns a new DataFrame instead of writing into a filtered slice,
# and the sorted result is bound back to the name. sort_values() is not
# in-place, so calling it without using its result has no effect.
players_with_val_2022 = players_with_val_2022.assign(
    market_value_rank=players_with_val_2022['market_value_in_eur'].rank(
        method="min", ascending=False
    )
).sort_values(by='market_value_rank')

# Show the ranked rows whose last_name is 'Son'.
players_with_val_2022[players_with_val_2022['last_name'] == 'Son']