# 1. Import Data

In [None]:
from pathlib import Path
import pandas as pd

# Locate the data directory: look for ./data first, then ../data, so the code
# works both from the directory that contains data/ and from a sibling of it.
cwd = Path.cwd()
for candidate in (cwd / 'data', cwd.parent / 'data'):
    if candidate.exists():
        DATA_DIR = candidate
        break
else:
    # for/else: this branch runs only when no candidate hit `break`.
    raise FileNotFoundError(f'Could not locate data directory from {cwd}')

players_path = DATA_DIR / 'players.csv'
players_valuations_path = DATA_DIR / 'player_valuations.csv'

# Load both CSV files into DataFrames.
players = pd.read_csv(players_path)
players_valuations = pd.read_csv(players_valuations_path)


# 2. Dataset Information

## 2-1. players

In [None]:
print(players.info())
display(players.head())
print(players.columns)

## 2-2. players_valuations

In [None]:
print(players_valuations.info())
display(players_valuations.head())
print(players_valuations.columns)

## 결측치 확인

In [None]:
print(players.isna().sum(),'\n')
print(players_valuations.isna().sum())

## 기초 통계량

In [None]:
# Helper that reports the number of distinct values in one column.
def count_value(df, column):
    """Print how many distinct values ``column`` holds in ``df``.

    Missing values are not dropped, so they contribute to the total.

    Args:
        df: DataFrame to inspect.
        column: Label of the column whose distinct values are counted.
    """
    distinct_total = df[column].nunique(dropna=False)
    print(f'Total {column}: {distinct_total}')

columns = ['player_id', 'current_club_id', 'country_of_citizenship'] # player_id, 소속 구단, 국적에 대한 unique 개수 확인

for column in columns:
    count_value(players, column)

In [None]:
# 선수 시장가치(market_value_in_eur)의 기초 통계량
players_valuations['market_value_in_eur'].describe()

In [None]:
pd.options.display.float_format = '{:.0f}'.format
players_valuations['market_value_in_eur'].describe()

In [None]:
# 평균 시장가치 이상인 선수 비율 계산
mean_ = players_valuations['market_value_in_eur'].mean()
over_mean = len(players_valuations[players_valuations['market_value_in_eur'] > mean_])
total = len(players_valuations)
print(f"percentile of player over mean value: {over_mean/total*100:.2f}%")

# 3. Handling DataFrame

In [None]:
players_with_val = pd.merge(players, players_valuations, on='player_id')
players_with_val[players_with_val['last_name']=='Son'].tail()

## 3-1. 출생 연도를 기준으로 각 시장가치 평가 시점의 나이를 계산하여 age 컬럼 추가

In [None]:
# "YYYY-MM-DD” 형식의 date에서 연도만 뽑아내서 dateyear 컬럼 추가
players_with_val['dateyear'] = players_with_val['date'].apply(lambda x: int(str(x)[:4]) if pd.notna(x) else None)
players_with_val['age'] = players_with_val['dateyear'] - players_with_val['date_of_birth'].apply(
    lambda x: int(str(x)[:4]) if pd.notna(x) else None
)

In [None]:
# 선수별 동일 연도 데이터 중복 제거 → 가장 마지막 기록만 남김
players_with_val.drop_duplicates(['player_id','dateyear'], keep='last',inplace=True)
players_with_val[players_with_val['last_name']=='Son'].head()

In [None]:
# 분석에 필요한 컬럼만 선별
columns = [
    'player_id', 'current_club_id_y', 'first_name', 'last_name', 'name', 'last_season_x', 'country_of_citizenship', 'city_of_birth', 'position', 'sub_position', 'dateyear', 'age', 'market_value_in_eur_y']
players_with_val = players_with_val[columns]

players_with_val.rename(
    columns={
    "current_club_id_y": "current_club_id",
    "last_season_x": "last_season",
    "market_value_in_eur_y":"market_value_in_eur"
    },
    inplace=True
)

players_with_val

In [None]:
players_with_val['market_value_in_eur'].describe()

In [None]:
# 2022년 기준 데이터만 추출
players_with_val_2022 = players_with_val[(players_with_val['dateyear'] == 2022) & (players_with_val['last_season'] == 2022)]
players_with_val_2022

In [None]:
# Rank every 2022 row by market value (1 = most valuable; ties share the
# lowest rank). `assign` builds a new DataFrame rather than writing a column
# into a filtered slice, which avoids pandas' SettingWithCopyWarning.
players_with_val_2022 = players_with_val_2022.assign(
    market_value_rank=players_with_val_2022['market_value_in_eur'].rank(method="min", ascending=False)
)

# Sort by rank, then look up the player(s) whose last name is 'Son'.
# sort_values returns a new frame, so it is bound to its own name; the row
# order of players_with_val_2022 itself is left untouched.
ranked_2022 = players_with_val_2022.sort_values(by='market_value_rank')
ranked_2022[ranked_2022['last_name'] == 'Son']

# 4. Visualization

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl

# 한글 폰트 설정
mpl.rc('font', family='AppleGothic')
# 음수 기호 깨짐 방지
mpl.rcParams['axes.unicode_minus'] = False

## 4-1. 연도별 선수가치 분포도

In [None]:
import matplotlib.pyplot as plt


plt.figure(figsize=(8,6))
plt.boxplot(players_with_val['market_value_in_eur'])
plt.title('선수들의 시장가치 분포도')
plt.show()

대부분의 선수들의 시장가치는 낮은 구간에 몰려있고
극소수의 스타 선수들만 박스플롯의 이상치로 표시되어있음을 볼 수 있다

In [None]:
# 선수 가치별로 그룹화하여 같은 시장가치를 가진 선수 수를 카운트
plt.figure(figsize=(8,6))
players_with_val.groupby('market_value_in_eur')['player_id'].count().plot()
plt.title('가치별 선수 수 분포도')
plt.show()

* x축: market_value_in_eur (시장가치, 보통 유로 단위).
* y축: 같은 시장가치를 가진 선수 수.
결과적으로 “특정 시장가치를 가진 선수가 몇 명 있는지”를 보여줌.

**문제점**
* 시장가치 데이터는 연속형(금액이 다양함)인데, 그대로 groupby 하면 x축에 값이 너무 많아 복잡하게 나옴.
* 예: 시장가치가 500,000 유로, 510,000 유로, 520,000 유로… 이런 값들이 전부 개별 x축으로 들어감 → 해석 어려움.

In [None]:
sum_per_year = players_with_val.groupby('dateyear')['market_value_in_eur'].sum()
x = sum_per_year.index
y = sum_per_year.values

plt.figure(figsize=(8,6))
plt.plot(x, y)
plt.xticks(rotation=45)
plt.title('연도별 전체 선수 시장가치 총합의 변화 추세')
plt.show()

In [None]:
max_per_year = players_with_val.groupby('dateyear')['market_value_in_eur'].max()
x = max_per_year.index
y = max_per_year.values

plt.figure(figsize=(8,6))
plt.plot(x,y)
plt.xticks(rotation=45)
plt.title('연도별 선수가치 max 값의 변화 추세')
plt.show()

In [None]:
mean_per_year = players_with_val.groupby('dateyear')['market_value_in_eur'].mean()
x = mean_per_year.index
y = mean_per_year.values

plt.figure(figsize=(8,6))
plt.plot(x, y)
plt.xticks(rotation=45)
plt.title('연도별 선수가치 평균값의 변화 추세')
plt.show()

In [None]:
# Number of rows (player_id entries) per year. The x positions come from this
# Series' own index so the cell does not depend on an `x` left behind by
# whichever cell happened to run before it.
y = players_with_val.groupby('dateyear')['player_id'].count()
x = y.index

plt.figure(figsize=(8,6))
plt.bar(x, y)
plt.xticks(rotation=45)
plt.title('연도별 선수가치별 선수 수의 변화 추세')
plt.show()

In [None]:

# Recompute the per-year counts here so the cell is self-contained instead of
# relying on `x` / `y` globals from earlier cells.
y = players_with_val.groupby('dateyear')['player_id'].count()
x = y.index

# Highlight years with more than 5,000 rows in blue, the rest in green.
colors = ['blue' if count > 5000 else 'green' for count in y]

plt.figure(figsize=(8,6))
plt.bar(x, y, color=colors)
plt.xticks(rotation=45)
# The bars show a count of rows, not a sum of market values, so the title
# names the quantity that is actually plotted.
plt.title("연도별 선수 수 (5,000명 초과 연도: 파란색)")
plt.show()


In [None]:
players_with_val = players_with_val[(players_with_val['dateyear'] >= 2013) & (players_with_val['dateyear'] < 2023)]

In [None]:
years = sorted(players_with_val['dateyear'].unique())

num_plots = len(years)
num_rows = 4
num_cols = (num_plots + 3) // 4

fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 8))

axes = axes.flatten()

for i, year in enumerate(years):
    market_values = players_with_val[players_with_val['dateyear'] == year]['market_value_in_eur'].values

    ax = axes[i]
    ax.boxplot(market_values)
    ax.set_title(year)

for j in range(num_plots, num_rows * num_cols):
    fig.delaxes(axes[j])

fig.suptitle("연도별 선수 시장가치 분포", fontsize=16)

plt.tight_layout()

plt.show()

## 4-2. 나이별 시장가치 분포도

In [None]:
players_with_val.info()

In [None]:
age_market_values = players_with_val.groupby('age')['market_value_in_eur'].mean()

plt.figure(figsize=(8,6))
age_market_values.plot(kind='bar')
plt.xlabel('Age')
plt.ylabel('Mean Market Value (EUR)')
plt.xticks(rotation=0)
plt.title('나이별 평균가치 분포도')
plt.show()

In [None]:
filtered_data = players_with_val[players_with_val['age'] <= 35]

age_market_values = filtered_data.groupby('age')['market_value_in_eur'].mean()

sorted_values = age_market_values.sort_values(ascending=False)

top_5_intervals = sorted_values.head(5).index

colors = ['blue' if age in top_5_intervals else 'green' for age in age_market_values.index]

plt.figure(figsize=(8, 6))
age_market_values.plot(kind='bar', color=colors)
plt.xlabel('Age')
plt.ylabel('Mean Market Value (EUR)')
plt.title('나이별 평균가치 분포(~35 세) 중 top 5 ')
plt.xticks(rotation=45)
plt.show()

In [None]:

import numpy as np

def get_top_name(g):
    """Return the ``name`` of the row with the highest market value in ``g``.

    Args:
        g: DataFrame (one group) with ``market_value_in_eur`` and ``name``
            columns.

    Returns:
        The ``name`` at the index label of the maximum market value, or NaN
        when the group has no non-missing market value at all.
    """
    values = g['market_value_in_eur']
    if values.isna().all():
        # Nothing to rank in this group.
        return np.nan
    best_label = values.idxmax()
    return g.loc[best_label, 'name']

top_players = (
    filtered_data
    .groupby('age', group_keys=False)
    .apply(get_top_name)
)


plt.figure(figsize=(12, 6))
ax = age_market_values.plot(kind='bar', color=colors)
ax.set_xlabel('Age')
ax.set_ylabel('Mean Market Value (EUR)')
ax.set_title('나이별 최고가 선수 (Up to 35)')

for i, (age, value) in enumerate(zip(age_market_values.index, age_market_values.values)):
    name = top_players.get(age, None)
    if pd.notna(name):
        ax.text(i, value * 1.01, name,
                ha='center', va='bottom',
                fontsize=9,
                fontweight='bold' if age in sorted_values.head().index else 'normal',
                rotation=0)

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 4-3. 포지션별 가치 분포

In [None]:
position_market_values = players_with_val.groupby('position')['market_value_in_eur'].mean()

plt.figure(figsize=(8,6))
position_market_values.plot(kind='bar')
plt.xlabel('Position')
plt.ylabel('Mean Market Value (EUR)')
plt.title('포지션별 평균 가치')
plt.xticks(rotation=45)
plt.show()


In [None]:
position_market_values = players_with_val.groupby(['position', 'dateyear'])['market_value_in_eur'].mean()

position_market_values = position_market_values.reset_index()

plt.figure(figsize=(12, 8))
for position in position_market_values['position'].unique():
    position_data = position_market_values[position_market_values['position'] == position]
    plt.plot(position_data['dateyear'], position_data['market_value_in_eur'], label=position)

plt.xlabel('Year')
plt.ylabel('Mean Market Value (EUR)')
plt.title('포지션별 평균 시장가치 추이 (연도별)')
plt.legend()
plt.xticks(rotation=45)
plt.show()

In [None]:
positions = players_with_val['position'].unique()

fig, axes = plt.subplots(len(positions), figsize=(10, 20))

for i, position in enumerate(positions):
    ax = axes[i]
    ax.set_title(position)

    position_data = players_with_val[players_with_val['position'] == position]
    sub_positions = position_data['sub_position'].unique()

    for sub_position in sub_positions:
        sub_position_data = position_data[position_data['sub_position'] == sub_position]
        sub_position_value = sub_position_data.groupby('dateyear')['market_value_in_eur'].mean()

        ax.plot(sub_position_value.index, sub_position_value.values, label=sub_position)

    ax.legend()

plt.tight_layout()
plt.show()


In [None]:
positions = players_with_val['position'].unique()

fig, axes = plt.subplots(len(positions), figsize=(10, 20))

for i, position in enumerate(positions):
    ax = axes[i]
    ax.set_title(position)

    position_data = players_with_val[players_with_val['position'] == position]
    sub_positions = position_data['sub_position'].unique()

    for sub_position in sub_positions:
        sub_position_data = position_data[position_data['sub_position'] == sub_position]
        sub_position_value = sub_position_data.groupby('dateyear')['market_value_in_eur'].mean()

        ax.plot(sub_position_value.index, sub_position_value.values, label=sub_position)

        top_players = sub_position_data.loc[sub_position_data.groupby('dateyear')['market_value_in_eur'].idxmax(), 'name']

        for year, player in zip(sub_position_value.index, top_players):
            ax.text(year, sub_position_value[year], player, ha='center', va='bottom')

    ax.legend()

plt.tight_layout()
plt.show()


## 4-4. 국가별 가치 분포

In [None]:
# Count each player once (first row per player_id) and tally by nationality.
# value_counts() already returns the counts in descending order, so no extra
# sort is needed.
country_player_counts = players_with_val.drop_duplicates('player_id')['country_of_citizenship'].value_counts()

top_10_countries = country_player_counts.head(10)

plt.figure(figsize=(12, 8))
top_10_countries.plot(kind='bar')
plt.xlabel('Country of Citizenship')
plt.ylabel('Number of Players')
plt.title('Top 10 국적별 선수 수 분포')

# Write the exact count on top of each bar.
for i, value in enumerate(top_10_countries):
    plt.text(i, value, str(value), ha='center', va='bottom')

plt.xticks(rotation=45)
plt.show()

In [None]:
# Same tally as the all-years chart, restricted to the 2022 subset.
# value_counts() already returns the counts in descending order.
country_player_counts_2022 = players_with_val_2022.drop_duplicates('player_id')['country_of_citizenship'].value_counts()

top_10_countries = country_player_counts_2022.head(10)

plt.figure(figsize=(12, 8))
top_10_countries.plot(kind='bar')
plt.xlabel('Country of Citizenship')
plt.ylabel('Number of Players')
plt.title('Top 10 국적별 선수 수 분포 - 2022')

# Write the exact count on top of each bar.
for i, value in enumerate(top_10_countries):
    plt.text(i, value, str(value), ha='center', va='bottom')

plt.xticks(rotation=45)
plt.show()

## 4-5. 국가별 선수 수 분포도

In [None]:
from geopy.geocoders import Nominatim
import time

# (country, latitude, longitude) for every country that geocoded successfully.
locations = []

# Countries whose lookup raised an error or returned no match.
errors = []

geolocator = Nominatim(user_agent="my-app")

countries = country_player_counts.index

for country in countries:
    try:
        location = geolocator.geocode(country)
    except Exception as e:
        # Best effort: record the failure and move on to the next country.
        print(f"지오코딩 오류 - 나라: {country}. 오류: {e}")
        errors.append(country)
        continue
    finally:
        # Throttle after every request (success or failure); the public
        # Nominatim service allows at most one request per second.
        time.sleep(1)

    # geocode() returns None when the query has no match; without this guard
    # the attribute access below would raise AttributeError.
    if location is None:
        print(f"지오코딩 결과 없음 - 나라: {country}")
        errors.append(country)
        continue

    latitude = location.latitude
    longitude = location.longitude

    locations.append((country, latitude, longitude))
    print("나라:", country)
    print("위도:", latitude)
    print("경도:", longitude)
    print("-"*20)



In [None]:
locations_df_2022 = pd.DataFrame(locations)
locations_df_2022.rename(columns={
    0: 'country',
    1: 'Latitude',
    2: 'Longitude'
}, inplace=True)

locations_df_2022['player_count'] = locations_df_2022.country.apply(lambda x: country_player_counts_2022.get(x, 0))

In [None]:
import plotly.express as px


fig = px.density_mapbox(
    locations_df_2022,
    lat="Latitude",
    lon="Longitude",
    z="player_count",
    radius=15,
    center=dict(lat=20, lon=0),
    zoom=1,
    mapbox_style="open-street-map"
)

fig.update_layout(
    title="Density Map of 2022 Players"
)

fig.show(renderer="browser")

In [None]:
# Birth cities of players with English citizenship, one entry per player.
cities_in_england = players_with_val[players_with_val['country_of_citizenship'] == 'England'].drop_duplicates('player_id')['city_of_birth']
print(f"Number of cities in England: {len(cities_in_england.unique())}")

# Players per birth city, most common first.
england_players_count = cities_in_england.value_counts()

top_10_city_in_england = england_players_count.head(10)

plt.figure(figsize=(12, 8))
top_10_city_in_england.plot(kind='bar')
plt.xlabel('City of England')
plt.ylabel('Number of Players')
plt.title('영국 도시별 선수 수 분포')

# Write the exact count on top of each bar.
for i, value in enumerate(top_10_city_in_england):
    plt.text(i, value, str(value), ha='center', va='bottom')

plt.xticks(rotation=45)
plt.show()

In [None]:
top_50_city_in_england = england_players_count.head(50)

# (city, latitude, longitude) for every city that geocoded successfully.
enlgand_city_locations = []

# Cities whose lookup raised an error or returned no match.
errors = []

geolocator = Nominatim(user_agent="my-app")

cities = top_50_city_in_england.index

for city in cities:
    try:
        location = geolocator.geocode(city)
    except Exception as e:
        # Best effort: record the failure and move on to the next city.
        print(f"지오코딩 오류 - 도시: {city}. 오류: {e}")
        errors.append(city)
        continue
    finally:
        # Throttle after every request (success or failure); the public
        # Nominatim service allows at most one request per second.
        time.sleep(1)

    # geocode() returns None when the query has no match; without this guard
    # the attribute access below would raise AttributeError.
    if location is None:
        print(f"지오코딩 결과 없음 - 도시: {city}")
        errors.append(city)
        continue

    latitude = location.latitude
    longitude = location.longitude

    enlgand_city_locations.append((city, latitude, longitude))


In [None]:
england_locations_df = pd.DataFrame(enlgand_city_locations)
england_locations_df.rename(columns={
    0: 'city',
    1: 'Latitude',
    2: 'Longitude'
}, inplace=True)

england_locations_df['player_count'] = england_locations_df.city.apply(lambda x: england_players_count[x])
england_locations_df.head()

In [None]:
import folium

# Initial map centre and zoom.
# NOTE(review): these coordinates look like they are near Glasgow rather than
# inside England -- confirm that this is the intended centre.
england_location = [55.8670, -4.2621]
england_map = folium.Map(location=england_location, zoom_start=5)

# One circle per city: the radius scales with the player count and the colour
# marks the count band (>100, >50, otherwise).
for _, row in england_locations_df.iterrows():
    city = row['city']
    latitude = row['Latitude']
    longitude = row['Longitude']
    player_count = row['player_count']

    radius = player_count / 5
    # CircleMarker colours are handed to the browser as CSS colours, so only
    # valid CSS names work; 'lightred' is not one, hence 'lightcoral'.
    if player_count > 100:
        color = 'darkred'
    elif player_count > 50:
        color = 'red'
    else:
        color = 'lightcoral'

    folium.CircleMarker(
        location=[latitude, longitude],
        radius=radius,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.6,
        tooltip=f"<b>{city}</b><br>Player Count: {player_count}"
    ).add_to(england_map)

england_map

In [None]:
df_for_animation = pd.DataFrame(
    players_with_val[players_with_val['country_of_citizenship'] == 'England']
    .groupby(['dateyear', 'city_of_birth'])['player_id']
    .count()
    )

df_for_animation.reset_index(inplace=True)
df_for_animation.rename(
    columns={
        'city_of_birth': 'city',
        'player_id': 'player_count'
        },
    inplace=True
    )

df_for_animation = pd.merge(
    df_for_animation, england_locations_df[['city', 'Latitude', 'Longitude']],
    on='city', how='left')

df_for_animation.dropna(inplace=True)
df_for_animation

In [None]:
import plotly.express as px

fig = px.scatter_geo(
    df_for_animation, lat="Latitude", lon="Longitude",
    color="player_count", size="player_count", hover_name="city",
    animation_frame="dateyear", projection="natural earth"
)

fig.update_geos(
    projection_scale=3.5,
    scope="europe",
    center=dict(lat=55.8670, lon=-4.2621),
)

fig.update_layout(
    title="England Football Player Count by City Over Years",
    height=600, width=800,
)

fig.show(renderer="browser")

## 4-6. 클럽 단위 총 가치

In [None]:

clubs_path = DATA_DIR / 'clubs.csv'
competitions_path = DATA_DIR / 'competitions.csv'

clubs = pd.read_csv(clubs_path)
competitions = pd.read_csv(competitions_path)

print(clubs.head())
print(competitions.head())


In [None]:
clubs.info()

In [None]:
clubs_2022 = clubs[clubs['last_season'] == 2022]

In [None]:
# Sum of 2022 player market values per club id.
total_market_values = players_with_val_2022.groupby('current_club_id')['market_value_in_eur'].sum()

# Series.map looks each club_id up in the totals; a club with no 2022 player
# rows yields NaN instead of a KeyError and is then counted as 0.
# `assign` returns a new DataFrame, so no column is written into the filtered
# slice of `clubs` (avoids SettingWithCopyWarning).
clubs_2022 = clubs_2022.assign(
    total_market_value=clubs_2022['club_id'].map(total_market_values).fillna(0)
)
clubs_2022

In [None]:
columns = ['competition_id', 'name', 'country_name']
competitions = competitions[columns].rename(columns={'competition_id': 'domestic_competition_id'})

In [None]:
clubs_2022 = pd.merge(clubs_2022, competitions, on='domestic_competition_id')

display(clubs_2022.head())
print(clubs_2022.info())

In [None]:
columns = ['club_id', 'club_code', 'name_x', 'total_market_value', 'squad_size', 'average_age', 'foreigners_number', 'foreigners_percentage', 'name_y', 'country_name']

clubs_2022 = clubs_2022[columns]
clubs_2022.rename(columns={
    "name_x": "club_name",
    "name_y": "competition_name"
}, inplace=True)

clubs_2022

In [None]:
clubs_2022

In [None]:
import pandas as pd, plotly
print(pd.__version__, plotly.__version__)

In [None]:
import plotly.express as px

sorted_clubs_2022 = clubs_2022.sort_values('total_market_value', ascending=False)

fig = px.treemap(
    sorted_clubs_2022,
    path=['club_name'],  # 'name'이 club_name임
    values='total_market_value',
    labels={'club_name': 'Club Name', 'total_market_value': 'Total Market Value'},
    title='Clubs in 2022 by Total Market Value'
)
fig.show()


In [None]:
sorted_clubs_2022 = clubs_2022.sort_values('total_market_value', ascending=False)

fig = px.treemap(sorted_clubs_2022, path=['country_name', 'club_name'], values='total_market_value',
                 labels={'club_name': 'Club Name', 'total_market_value': 'Total Market Value'},
                 title='Clubs in 2022 by Total Market Value')
fig.show(renderer="browser")

In [None]:
fig = px.bar(sorted_clubs_2022, x='club_name', y='total_market_value', color='country_name',
             labels={'club_name': 'Club Name', 'total_market_value': 'Total Market Value'},
             title='Clubs in 2022 by Total Market Value')
fig.show(renderer="browser")

In [None]:
fig = px.box(clubs_2022, x='competition_name', y='total_market_value', color='competition_name',
             title='Market Value Distribution by Competition(Domestic League)',
             labels={'competition_name': 'Competition', 'total_market_value': 'Total Market Value'})
fig.show(renderer="browser")

In [None]:
fig = px.box(clubs_2022, x='competition_name', y='average_age', color='competition_name',
             title='Age Distribution by Competition(Domestic League)',
             labels={'competition_name': 'Competition', 'average_age': 'Average Age'})
fig.show(renderer="browser")

# 5. Data Scraping

## 5-1. 리그/대회 이름 별 그룹핑

In [None]:
# 리그(competition_name)별로 클럽들의 total_market_value 합계를 구함
top_5_competition = clubs_2022.groupby('competition_name')['total_market_value'].sum()

# 합계 기준 내림차순 정렬 후 상위 5개 리그만 추출 → 리그 이름만(Index 형태로) 가져옴
top_5_competition = top_5_competition.sort_values(ascending=False).head(5).index

# clubs_2022에서 'competition_name'이 상위 5개 리그에 속하는 클럽들만 필터링
# 해당 클럽들의 'club_id'만 뽑아서 unique()로 중복 제거
club_ids = clubs_2022[clubs_2022['competition_name'].isin(top_5_competition)]['club_id'].unique()

# 결과: 상위 5개 리그에 속하는 클럽들의 고유한 club_id 리스트
club_ids

## 5-2. 상위 5개 리그 소속의 골키퍼 제외한 모든 선수들 포지션,리그명

In [None]:
# 분석에 필요한 컬럼만 선택
columns = ['player_id', 'current_club_id', 'first_name', 'last_name',
           'name', 'position', 'sub_position', 'age', 'market_value_in_eur']

# 1) 상위 5개 리그(club_ids에 속한 클럽) 선수만 필터링 → 지정된 컬럼만 추출
players_with_stats = players_with_val_2022[players_with_val_2022['current_club_id'].isin(club_ids)][columns]

# 2) 포지션이 'Goalkeeper'인 선수 제외
players_with_stats = players_with_stats[players_with_stats['position'] != 'Goalkeeper']

# 3) 클럽 ID → 클럽 이름 매핑 (clubs_2022에서 club_id와 club_name 매칭)
players_with_stats['current_club_name'] = players_with_stats['current_club_id'].map(
    clubs_2022.set_index('club_id')['club_name']
)

# 4) 클럽 ID → 소속 리그 이름 매핑 (clubs_2022에서 club_id와 competition_name 매칭)
players_with_stats['competition_name'] = players_with_stats['current_club_id'].map(
    clubs_2022.set_index('club_id')['competition_name']
)

# 5) 시장가치(market_value_in_eur)를 기준으로 내림차순 정렬 (가장 비싼 선수부터 위에 오도록)
players_with_stats = players_with_stats.sort_values(by='market_value_in_eur', ascending=False)

# 최종 DataFrame: 상위 5개 리그 소속의 골키퍼 제외한 모든 선수들의 스탯 + 클럽/리그 이름
players_with_stats

In [None]:
attack_df = players_with_stats[players_with_stats['position'] == 'Attack'].reset_index(drop=True)
display(attack_df.head())

midfield_df = players_with_stats[players_with_stats['position'] == 'Midfield'].reset_index(drop=True)
display(midfield_df.head())

defender_df = players_with_stats[players_with_stats['position'] == 'Defender'].reset_index(drop=True)
display(defender_df.head())

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup

import time

In [None]:
urls = [
    "https://1xbet.whoscored.com/Regions/74/Tournaments/22/Seasons/9129/Stages/21037/PlayerStatistics/France-Ligue-1-2022-2023",
    "https://1xbet.whoscored.com/Regions/252/Tournaments/2/Seasons/9075/Stages/20934/PlayerStatistics/England-Premier-League-2022-2023",
    "https://1xbet.whoscored.com/Regions/81/Tournaments/3/Seasons/9120/Stages/21026/PlayerStatistics/Germany-Bundesliga-2022-2023",
    "https://1xbet.whoscored.com/Regions/206/Tournaments/4/Seasons/9149/Stages/21073/PlayerStatistics/Spain-LaLiga-2022-2023",
    "https://1xbet.whoscored.com/Regions/108/Tournaments/5/Seasons/9159/Stages/21087/PlayerStatistics/Italy-Serie-A-2022-2023",
]

In [None]:
position_df_dict = {
    "Offensive": attack_df,
    "Passing": midfield_df,
    "Defensive": defender_df
}

크롬 브라우저를 자동으로 켜는 WebDriver 생성하여 urls 페이지들 열어 전체 선수보이는 페이지 띄우기

In [None]:
driver = webdriver.Chrome()

url = urls[0]
driver.get(url)

driver.maximize_window()

driver.execute_script("window.scrollTo(0,700)")

apps = driver.find_elements(By.ID, 'apps')[-1]
all_players = apps.find_elements(By.CLASS_NAME, 'option')[-1]
all_players.click()

In [None]:
# 크롤링하려는 페이지 -> 전체 페이지 수(totalPages)
total_pages = int(driver.find_elements(By.ID, "totalPages")[-1].get_attribute('value'))
total_pages

In [None]:
html = driver.page_source # 현재 크롬 드라이버(driver)가 띄운 웹페이지의 HTML 원본을 가져옴
soup = BeautifulSoup(html, 'html.parser') # HTML을 다루기 쉽게 객체 구조로 바꿈
table = soup.findAll('table') # 페이지 안에 있는 <table> 태그들을 전부 찾아 리스트로 반환
stats_table = pd.read_html(str(table[1]))[0].iloc[:,1:] # 첫 번째 컬럼을 제외하고 나머지 열만 선택 -> 첫 열은 인덱스/번호라서 버림
stats_table

## 5-3. 웹에서 긁어온 스탯 테이블을 defender_df 선수 데이터프레임에 매핑

In [None]:
stats_table

In [None]:
# stats_table에서 선수 이름을 추출 → defender_df와 매칭 → 스탯 값 채워 넣기 → 다음 페이지로 이동
player_names = defender_df['name'].unique()

for idx, row in stats_table.iterrows():
    player_info = row['Player.1'].split(' ')
    player_name = ' '.join(player_info[:2])
    if player_name in player_names:
        print(player_name)
        player_row = defender_df[defender_df['name'] == player_name].iloc[0]
        defender_df.loc[player_row.name, stats_table.columns[1:]] = row[1:]

In [None]:
next_button = driver.find_elements(By.ID, "next")
next_button[-1].click()

In [None]:
driver.quit()

각 리그 URL(선수 통계 페이지)을 열고 → 스탯 탭(3개)을 순서대로 클릭 → “All players”로 바꾼 뒤 → 페이지네이션을 돌며 표를 읽어와 → position_df_dict(Offensive/Passing/Defensive → attack_df/midfield_df/defender_df)에 스탯을 채워 넣기

In [None]:
# 	URL 하나당 드라이버를 새로 띄우고 끝에 quit()로 닫아주는 구조
for url in urls:
    driver = webdriver.Chrome()
    # try/finally guarantees the browser is closed even when a lookup, click
    # or table parse raises part-way through a league.
    try:
        driver.get(url)

        driver.maximize_window()

        driver.execute_script("window.scrollTo(0,700)")

        # Click the first three "in-squad-detailed-view" elements in turn (the
        # last one found is dropped); the clicked element's text is the key
        # into position_df_dict ("Offensive" / "Passing" / "Defensive").
        for i in range(3):
            # Re-query on every pass because the page is re-rendered after
            # each click.
            positions = driver.find_elements(By.CLASS_NAME, "in-squad-detailed-view")[:-1]
            position = positions[i]
            position.click()
            time.sleep(1.5)  # fixed wait for the tab content to load

            key = position.text.strip()
            df = position_df_dict[key]
            player_names = df['name'].unique()

            # Pick the last option of the last id="apps" element
            # (presumably "All players" -- confirm against the page).
            apps = driver.find_elements(By.ID, 'apps')[-1]
            all_players = apps.find_elements(By.CLASS_NAME, 'option')[-1]
            all_players.click()
            time.sleep(1.5)

            # Walk through every result page.
            total_pages = int(driver.find_elements(By.ID, "totalPages")[-1].get_attribute('value'))
            for _ in range(total_pages):
                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')

                # Use the second-to-last <table> on the page and drop its
                # first column.
                tables = soup.findAll('table')[:-1]
                table = str(tables[-1])
                stats_table = pd.read_html(table)[0].iloc[:,1:]

                for idx, row in stats_table.iterrows():
                    # The first two space-separated tokens of the 'Player.1'
                    # cell are taken as the player's name.
                    player_info = row['Player.1'].split(' ')
                    player_name = ' '.join(player_info[:2])

                    if player_name in player_names:
                        # Copy the scraped stats onto the first row in df
                        # that carries this name.
                        player_row = df[df['name'] == player_name].iloc[0]
                        df.loc[player_row.name, stats_table.columns[1:]] = row[1:]

                next_button = driver.find_elements(By.ID, "next")
                next_button[-1].click()
                time.sleep(1.5)
    finally:
        driver.quit()

    time.sleep(2)

In [None]:
attack_df_with_stats = position_df_dict['Offensive']
midfield_df_with_stats = position_df_dict['Passing']
defender_df_with_stats = position_df_dict['Defensive']

display(attack_df_with_stats)
display(midfield_df_with_stats)
display(defender_df_with_stats)

In [None]:
attack_df_with_stats_isna = attack_df_with_stats.isna().sum() / len(attack_df_with_stats) * 100
midfield_df_with_stats_isna = midfield_df_with_stats.isna().sum() / len(midfield_df_with_stats) * 100
defender_df_with_stats_isna = defender_df_with_stats.isna().sum() / len(defender_df_with_stats) * 100

print("Attack DataFrame NaN Ratio:")
print(attack_df_with_stats_isna)

print("\nMidfield DataFrame NaN Ratio:")
print(midfield_df_with_stats_isna)

print("\nDefender DataFrame NaN Ratio:")
print(defender_df_with_stats_isna)

In [None]:
attack_df_with_stats.dropna(inplace=True)
midfield_df_with_stats.dropna(inplace=True)
defender_df_with_stats.dropna(inplace=True)

In [None]:
display(attack_df_with_stats)
display(midfield_df_with_stats)
display(defender_df_with_stats)

## Correlation between football player's value and stats

In [None]:
attack_df = pd.read_csv(DATA_DIR / 'stats_attack_df.csv')
midfield_df = pd.read_csv(DATA_DIR / 'stats_midfield_df.csv')
defender_df = pd.read_csv(DATA_DIR / 'stats_defender_df.csv')

In [None]:
defender_df.info()

In [None]:
def convert_apps_to_number(apps_str):
    """Return the total of all integers embedded in an appearances value.

    Every run of digits in the text is summed, so ``"25(3)"`` becomes ``28``.
    Text without digits (for example ``"-"``) gives ``0``.

    Args:
        apps_str: Usually a string such as ``"25(3)"``. Missing values
            (``None`` / NaN) and plain numbers are accepted as well, so the
            function can be applied to a CSV column that pandas has already
            parsed as numeric or that contains gaps.

    Returns:
        int: The summed appearances; ``0`` for missing values.
    """
    import numbers
    import re

    if apps_str is None:
        return 0
    if isinstance(apps_str, numbers.Real):
        # NaN is the only value that differs from itself.
        if apps_str != apps_str:
            return 0
        return int(apps_str)

    # Find every run of digits in the text and add them up.
    matches = re.findall(r'\d+', str(apps_str))
    return sum(int(match) for match in matches)

# 각 데이터프레임의 Apps 컬럼에 위 함수를 적용해서 정수 합으로 바꾼 뒤, 마지막에 .astype(float)로 부동소수(float) 타입으로 캐스팅
attack_df['Apps'] = attack_df['Apps'].apply(convert_apps_to_number).astype(float)
midfield_df['Apps'] = midfield_df['Apps'].apply(convert_apps_to_number).astype(float)
defender_df['Apps'] = defender_df['Apps'].apply(convert_apps_to_number).astype(float)

In [None]:
attack_stat_columns = attack_df.columns[11:]
midfield_stat_columns = midfield_df.columns[11:]
defender_stat_columns = defender_df.columns[11:]

print(attack_stat_columns)
print(midfield_stat_columns)
print(defender_stat_columns)

In [None]:
# In every stat column, replace '-' entries (presumably a "no value"
# placeholder -- confirm against the CSVs) with 0, then cast to float.
for frame, stat_cols in (
    (attack_df, attack_stat_columns),
    (midfield_df, midfield_stat_columns),
    (defender_df, defender_stat_columns),
):
    for col in stat_cols:
        cleaned = frame[col].map(lambda value: 0 if value == '-' else value)
        frame[col] = cleaned.astype(float)

In [None]:
columns = ['name', 'position', 'sub_position', 'age', 'market_value_in_eur']

attack_df = attack_df[columns + list(attack_df.columns[11:])]
midfield_df = midfield_df[columns + list(midfield_df.columns[11:])]
defender_df = defender_df[columns + list(defender_df.columns[11:])]

In [None]:
# 여러 스탯(stat_columns)과 선수 시장가치(market_value_in_eur)의 관계를 산점도 + 추세선으로 시각화하는 함수
import seaborn as sns
import matplotlib.pyplot as plt

def plot_scatter_with_trendline(df, position_name, stat_columns):
    """Draw market value against each stat as a scatter plot with a fit line.

    The plots are laid out three per row; unused cells of the last row are
    removed from the figure.

    Args:
        df: DataFrame holding ``market_value_in_eur`` and every column named
            in ``stat_columns``.
        position_name: Label used in each subplot title.
        stat_columns: Iterable of column labels to plot on the x axis.
    """
    stat_columns = list(stat_columns)
    num_stats = len(stat_columns)
    if num_stats == 0:
        # Nothing to plot; a 0-row subplot grid would raise.
        return

    n_cols = 3
    rows = (num_stats - 1) // n_cols + 1

    # squeeze=False keeps `axes` two-dimensional even when there is only one
    # row, so the same indexing works for any number of stats.
    fig, axes = plt.subplots(rows, n_cols, figsize=(15, 5 * rows), squeeze=False)
    flat_axes = axes.ravel()  # row-major: left to right, top to bottom

    # One scatter plot plus regression line per stat.
    for ax, stat in zip(flat_axes, stat_columns):
        sns.regplot(x=stat, y='market_value_in_eur', data=df, ax=ax, scatter_kws={'alpha': 0.7}, line_kws={'color': 'red'})
        ax.set_title(f"{position_name} Position - Market Value vs. {stat}")
        ax.set_xlabel(stat)
        ax.set_ylabel("Market Value (in EUR)")

    # Remove the leftover empty axes in the last row.
    for ax in flat_axes[num_stats:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()


In [None]:
plot_scatter_with_trendline(attack_df, "Attack", attack_stat_columns)

In [None]:
plot_scatter_with_trendline(midfield_df, "Midfield", midfield_stat_columns)

In [None]:
plot_scatter_with_trendline(defender_df, "Defender", defender_stat_columns)

수치형 변수들 간의 상관계수(Correlation Coefficient)

1. 색상 의미
	- 진한 붉은색: +1에 가까움 → 두 변수는 강한 양의 상관관계 (같이 커짐)
	- 진한 파란색: -1에 가까움 → 두 변수는 강한 음의 상관관계 (한쪽이 커질수록 다른 쪽은 작아짐)
	- 0 근처: 상관관계 거의 없음

2. 주요 해석
	- Rating 과 SpG (경기당 슈팅 수, Shots per Game) = 0.85 → 평점이 높은 선수일수록 경기당 슈팅 수가 많음.
	- Rating 과 KeyP (키패스) = 0.79 → 평점이 높을수록 공격 기여도(키패스)도 높음.
	- Goals 와 Assists = 0.62 → 골을 많이 넣는 선수는 어시스트도 많은 경향이 있음.
	- market_value_in_eur 와 Goals = 0.54 → 선수 가치가 높을수록 골을 많이 넣는 경향이 있음.
	- market_value_in_eur 와 Rating = 0.49 → 선수 가치와 평점은 중간 정도 상관.
	- Apps (출전 경기 수) 와 Mins (출전 시간) = 0.89 → 당연히 출전 수가 많으면 출전 시간도 많음.

3. 요약
	- 선수 가치는 골, 평점, 출전 시간과 관련이 있음.
	- 평점(Rating) 은 경기 내 주요 공격 지표(SpG, KeyP, Goals)와 매우 높은 상관관계.
	- 나이(Age) 는 이 데이터셋에서 시장가치나 경기력과는 큰 관련이 없음.

In [None]:
attack_corr = attack_df.select_dtypes(include='number').corr()
midfield_corr = midfield_df.select_dtypes(include='number').corr()
defender_corr = defender_df.select_dtypes(include='number').corr()


In [None]:
def plot_corr_heatmap(position_name, corr_df):
    """Show an annotated heatmap of a correlation matrix.

    Args:
        position_name: Label placed at the start of the plot title.
        corr_df: Square correlation DataFrame to draw.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    plt.figure(figsize=(10, 8))
    # Cells are annotated with two decimals on a blue-to-red colour scale.
    heatmap_ax = sns.heatmap(
        corr_df,
        annot=True,
        cmap='coolwarm',
        fmt='.2f',
        linewidths=0.5,
    )
    heatmap_ax.set_title(f"{position_name} Position - Correlation Heatmap")
    plt.show()

In [None]:
plot_corr_heatmap("Attack", attack_corr)

plot_corr_heatmap("Midfield", midfield_corr)

plot_corr_heatmap("Defender", defender_corr)

## Only Top 5 stats

In [None]:
def top_n_corr(positon_name, corr_df, n, is_print=False):
    """Pick the ``n`` stats most strongly correlated with market value.

    Strength is the absolute value of the correlation, so strong negative
    correlations rank as high as strong positive ones (and the printed values
    are absolute values).

    Args:
        positon_name: Position label used in the printed header.
        corr_df: Correlation matrix that contains a ``market_value_in_eur``
            row and column.
        n: Number of stats to keep.
        is_print: When True, print the ranking instead of returning it.

    Returns:
        The index of the top ``n`` stat names, or ``None`` when ``is_print``
        is True.
    """
    correlation_with_market_value = (
        corr_df['market_value_in_eur']
        .drop('market_value_in_eur')  # a variable's correlation with itself is always 1
        .abs()
        .nlargest(n)
    )

    if is_print:
        # The header reports the requested n rather than a fixed "5".
        print(f"{positon_name} Position - Top {n} Stats Correlation with Market Value:")
        print(correlation_with_market_value)
        return None

    return correlation_with_market_value.index

top_n_corr -> 상관행렬에서 시장가치(market_value_in_eur)와 가장 상관이 큰 통계를 뽑아주는 도우미

In [None]:
n = 5
top_n_corr("Attack", attack_corr, n, is_print=True)
top_n_corr("Midfield", midfield_corr, n, is_print=True)
top_n_corr("Defender", defender_corr, n, is_print=True)

In [None]:
top_5_attack_stats = top_n_corr("Attack", attack_corr, n)
top_5_midfield_stats = top_n_corr("Midfield", midfield_corr, n)
top_5_defender_stats = top_n_corr("Defender", defender_corr, n)

포지션별로 시장가치와 상위 5개 통계 지표의 관계를 Plotly로 시각화

In [None]:
import plotly.graph_objects as go
import plotly.subplots as sp
import numpy as np


# Builds a one-row grid of subplots, one per stat in stat_columns. Each
# subplot shows a scatter of the players plus a first-degree trendline of
# market value on that stat. The player name is attached to every marker so
# it shows on hover; the trendline is a red dashed line.

def plot_line_graphs_with_trendline(position_name, df, stat_columns):
    """Plot market value against each stat with a linear trendline (Plotly).

    Args:
        position_name: Label used in the figure title.
        df: DataFrame with ``name``, ``market_value_in_eur`` and every column
            named in ``stat_columns``.
        stat_columns: Iterable of stat column labels; one subplot is created
            per entry.
    """
    stat_columns = list(stat_columns)
    if not stat_columns:
        # Nothing to plot; a 0-column subplot grid would raise.
        return

    # One column per stat instead of a fixed 5, so any number of stats works.
    fig = sp.make_subplots(rows=1, cols=len(stat_columns), subplot_titles=stat_columns)

    for i, stat in enumerate(stat_columns, start=1):
        fig.add_trace(go.Scatter(
            x=df[stat],
            y=df['market_value_in_eur'],
            mode='markers',
            name=stat,
            text=df['name'],
            marker=dict(size=8, opacity=0.7)
        ), row=1, col=i)

        # Fit only on rows where both values are present: np.polyfit cannot
        # handle NaN. A line also needs at least two distinct x values.
        complete = df[[stat, 'market_value_in_eur']].dropna()
        if len(complete) < 2 or complete[stat].nunique() < 2:
            continue

        z = np.polyfit(complete[stat], complete['market_value_in_eur'], 1)
        p = np.poly1d(z)
        # Two end points are enough to draw a straight line.
        trendline_x = [complete[stat].min(), complete[stat].max()]
        trendline_y = p(trendline_x)

        fig.add_trace(go.Scatter(
            x=trendline_x,
            y=trendline_y,
            mode='lines',
            name='',
            line=dict(color='red', width=2, dash='dash')
        ), row=1, col=i)

    fig.update_layout(
        title=f"{position_name} Position - Market Value vs. Top {len(stat_columns)} Stats with Trendline",
        xaxis_title="Stat Value",
        yaxis_title="Market Value (in EUR)",
        hovermode='closest'
    )

    fig.show()

plot_line_graphs_with_trendline("Attack", attack_df, top_5_attack_stats)
plot_line_graphs_with_trendline("Midfield", midfield_df, top_5_midfield_stats)
plot_line_graphs_with_trendline("Defender", defender_df, top_5_defender_stats)


특정 선수를 기준으로 비슷한 시장가치를 가진 선수들과 주요 스탯을 비교하는 시각화

In [None]:
def evaluate_player_value(player_name, df, threshold, stat_columns):
    """Compare one player with others of similar market value, stat by stat.

    Players whose market value lies within ``±threshold`` (as a fraction) of
    the chosen player's value form the comparison set. For every stat a
    scatter plot shows that set (green), its average (red, with dashed guide
    lines) and the chosen player (blue).

    Args:
        player_name: Value to look up in ``df['name']``.
        df: DataFrame with ``name``, ``market_value_in_eur`` and every column
            named in ``stat_columns``.
        threshold: Relative tolerance around the player's market value,
            e.g. ``0.2`` for ±20 %.
        stat_columns: Iterable of stat column labels; one subplot per entry.

    Raises:
        IndexError: If no row in ``df`` has ``name == player_name``.
    """
    player_data = df[df['name'] == player_name]
    if player_data.empty:
        # Same exception type as indexing an empty array, with a clear message.
        raise IndexError(f"player {player_name!r} not found in df['name']")
    # If several rows share the name, the first one sets the reference value.
    player_value = player_data['market_value_in_eur'].values[0]

    lower_bound = player_value * (1 - threshold)
    upper_bound = player_value * (1 + threshold)
    filtered_data = df[(df['market_value_in_eur'] >= lower_bound) & (df['market_value_in_eur'] <= upper_bound)]

    stat_columns = list(stat_columns)
    if not stat_columns:
        # Nothing to plot.
        return

    # Single-row frame holding the comparison set's mean stats and mean value.
    average_stats = filtered_data[stat_columns].mean().to_frame().T
    average_value = filtered_data['market_value_in_eur'].mean()
    average_stats['market_value_in_eur'] = average_value

    # One subplot per stat (4 inches wide each); squeeze=False keeps `axes`
    # two-dimensional even for a single stat.
    num_stats = len(stat_columns)
    _, axes = plt.subplots(1, num_stats, figsize=(4 * num_stats, 4), squeeze=False)
    for ax, stat in zip(axes[0], stat_columns):
        sns.scatterplot(x=stat, y='market_value_in_eur', data=filtered_data, ax=ax, color='green', alpha=0.7, label='Others')
        sns.scatterplot(x=stat, y='market_value_in_eur', data=average_stats, ax=ax, color='red', s=100, label="Average")
        sns.scatterplot(x=stat, y='market_value_in_eur', data=player_data, ax=ax, color='blue', s=100, label=f'{player_name}')
        ax.set_title(f"vs. {stat}")
        ax.set_xlabel(stat)
        ax.set_ylabel("Market Value (in EUR)")

        # Dashed guides through the comparison set's average point.
        ax.axvline(x=average_stats[stat].values[0], color='red', linestyle='--', linewidth=1)
        ax.axhline(y=average_value, color='red', linestyle='--', linewidth=1)

    plt.show()

player_name = "Kylian Mbappé"
df = attack_df
threshold = 0.2
stat_columns = top_5_attack_stats
evaluate_player_value(player_name, df, threshold, stat_columns)
