In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt
import gym
import torch
from stable_baselines3.common.callbacks import BaseCallback
import os

In [2]:
class GradientClippingCallback(BaseCallback):
    """Callback that clips the gradient norm of the model's policy parameters.

    Args:
        clip_value: Maximum gradient norm passed to ``clip_grad_norm_``.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, clip_value=0.5, verbose=0):
        super().__init__(verbose)
        self.clip_value = clip_value

    def _on_step(self):
        """Clip the policy gradients in place and tell the trainer to continue."""
        # NOTE(review): this runs whenever the callback's step hook fires; confirm
        # that gradients are actually populated at that point in the training loop.
        policy_parameters = self.model.policy.parameters()
        torch.nn.utils.clip_grad_norm_(policy_parameters, self.clip_value)
        return True

In [3]:
def evaluate_sensitivity(model, environment, feature_indices):
    """Measure how much the model's action changes when single features are nudged.

    The environment is reset once; each requested feature of that initial state
    is increased by 1% of the state's value range and the absolute difference
    between the baseline action and the perturbed action is recorded.

    Args:
        model: Object exposing ``predict(observation, deterministic=True)`` that
            returns an ``(action, state)`` pair.
        environment: Object whose ``reset()`` returns the initial observation
            as an array-like.
        feature_indices: Iterable of positions in the observation to perturb.

    Returns:
        list: One ``abs(action - perturbed_action)`` entry per in-bounds index,
        in the order the indices were given. Out-of-bounds indices are reported
        on stdout and skipped.
    """
    original_state = np.asarray(environment.reset())
    # An integer state would silently truncate the fractional perturbation to 0
    # on the in-place add below, so promote non-float states to float first.
    if not np.issubdtype(original_state.dtype, np.floating):
        original_state = original_state.astype(float)
    print("Original state shape:", original_state.shape)

    # Both values are the same for every index, so compute them once.
    perturbation = 0.01 * (np.max(original_state) - np.min(original_state))
    action, _states = model.predict(original_state, deterministic=True)

    sensitivities = []
    for idx in feature_indices:
        if idx >= original_state.shape[0]:
            print(f"Index {idx} is out of bounds for the state with shape {original_state.shape}")
            continue
        perturbed_state = np.copy(original_state)
        perturbed_state[idx] += perturbation
        perturbed_action, _ = model.predict(perturbed_state, deterministic=True)
        sensitivities.append(np.abs(action - perturbed_action))
    return sensitivities

In [4]:
class TransferEnv(gym.Env):
    """Gym environment that walks through the rows of a DataFrame one step at a time.

    Each observation is one row of ``data``; the reward for a step is the change
    in ``marketValue`` between the previous row and the row just reached. The
    action space has one discrete action per distinct value in ``data['team']``.

    Args:
        data: DataFrame with at least the columns ``team`` and ``marketValue``.
            Every column must be convertible to float, because rows are emitted
            as float32 observations.
    """

    metadata = {'render.modes': ['human']}

    def __init__(self, data):
        super(TransferEnv, self).__init__()
        self.data = data
        self.action_space = gym.spaces.Discrete(len(data['team'].unique()))
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(len(data.columns),), dtype=np.float32)
        self.state = self.reset()

    def validate_observations(self, obs):
        """Return ``obs`` as a float32 array with NaNs replaced by zeros.

        The cast matches the dtype declared in ``observation_space`` and lets
        ``np.isnan`` work on rows that pandas hands back with object dtype.
        """
        obs = np.asarray(obs, dtype=np.float32)
        if np.isnan(obs).any():
            print("NaN detected in observations")
            obs = np.nan_to_num(obs)
        return obs

    def reset(self):
        """Rewind to the first row and return it as the initial observation."""
        self.current_step = 0
        obs = self.data.iloc[self.current_step].values
        return self.validate_observations(obs)

    def step(self, action):
        """Advance one row.

        Returns:
            tuple: ``(observation, reward, done, info)``. Once the data is
            exhausted ``done`` is True, the observation is all zeros and the
            reward is 0.
        """
        self.current_step += 1
        done = self.current_step >= len(self.data)
        reward = self.calculate_reward(action)
        if not done:
            next_state = self.data.iloc[self.current_step].values
        else:
            next_state = np.zeros(len(self.data.columns), dtype=np.float32)
        return self.validate_observations(next_state), reward, done, {}

    def calculate_reward(self, action):
        """Return the market-value change between the previous and current row.

        ``action`` does not influence the reward. The reward is 0 when there is
        no previous row (step 0) or no current row (past the end of the data);
        the latter guard prevents an out-of-range ``iloc`` on the final step.
        """
        if self.current_step == 0 or self.current_step >= len(self.data):
            return 0
        previous_value = self.data.iloc[self.current_step - 1]['marketValue']
        current_value = self.data.iloc[self.current_step]['marketValue']
        return current_value - previous_value

    def render(self, mode='human', close=False):
        """Rendering is not implemented; this is a no-op."""
        pass

In [5]:
def convert_season(season, pivot=25):
    """Expand a short season label such as ``'23/24'`` to ``'2023/2024'``.

    The start year decides the century: two-digit values at or above ``pivot``
    are read as 19xx, lower values as 20xx. The end year is then placed in the
    first year that is not before the start year, so ``'99/00'`` becomes
    ``'1999/2000'`` and ``'24/25'`` becomes ``'2024/2025'`` (expanding the two
    halves independently would turn the latter into ``'2024/1925'``).

    Years that already have four digits are kept as they are, which makes the
    function safe to apply twice.

    Args:
        season: String of the form ``'<start>/<end>'``.
        pivot: Two-digit start years greater than or equal to this value are
            placed in the 1900s.

    Returns:
        str: ``'<start year>/<end year>'`` with four-digit years.

    Raises:
        ValueError: If ``season`` does not contain exactly one ``'/'`` or a part
            is not an integer.
    """
    start_text, end_text = (part.strip() for part in season.split('/'))

    if len(start_text) >= 4:
        start_year = int(start_text)
    else:
        short_start = int(start_text)
        start_year = short_start + (1900 if short_start >= pivot else 2000)

    if len(end_text) >= 4:
        end_year = int(end_text)
    else:
        # Start in the same century as the season start, then roll over if that
        # would put the end before the start (e.g. 99/00).
        end_year = start_year - start_year % 100 + int(end_text)
        if end_year < start_year:
            end_year += 100

    return f"{start_year}/{end_year}"

def assign_season(date, season_start_month=8):
    """Return the ``'YYYY/YYYY'`` season a date falls into.

    Dates before ``season_start_month`` belong to the season that started in
    the previous calendar year; later dates belong to the season starting in
    their own year.

    Args:
        date: Object with ``year`` and ``month`` attributes, or a missing value
            (``NaT`` / ``None`` / ``NaN``).
        season_start_month: First month (1-12) of a new season.

    Returns:
        str or None: The season label, or ``None`` for a missing date. Without
        this guard ``NaT`` would be formatted into the literal string
        ``'nan/nan'``.
    """
    if pd.isna(date):
        return None
    first_year = date.year - 1 if date.month < season_start_month else date.year
    return f"{first_year}/{first_year + 1}"

def generate_value_season(df_player_values):
    """Parse the ``date`` column and add a matching ``season`` column.

    The frame is modified in place and also returned. Dates are expected in
    ``DD.MM.YYYY`` form; values that do not parse become ``NaT``.
    """
    parsed_dates = pd.to_datetime(df_player_values['date'], format='%d.%m.%Y', errors='coerce')
    df_player_values['date'] = parsed_dates
    df_player_values['season'] = parsed_dates.apply(assign_season)
    return df_player_values

In [6]:
def prepare_label_encoders(df, columns):
    """Fit one ``LabelEncoder`` per column on the string form of its values.

    Each listed column of ``df`` is converted to ``str`` in place and an encoder
    is fitted on it. The columns are NOT replaced by their integer codes here;
    only the fitted encoders are produced.

    Args:
        df: DataFrame containing every name in ``columns``.
        columns: Iterable of column names to fit encoders for.

    Returns:
        tuple: ``(df, label_encoders)`` where ``label_encoders`` maps each
        column name to its fitted encoder.
    """
    fitted = {}
    for name in columns:
        as_text = df[name].astype(str)
        df[name] = as_text
        fitted[name] = LabelEncoder().fit(as_text)
    return df, fitted

In [8]:
# Resolve the working directory of the notebook and the project root, which is
# taken to be two directory levels above it.
script_dir = os.path.abspath(os.getcwd())
two_levels_up = os.path.join(script_dir, os.pardir, os.pardir)
project_dir = os.path.abspath(two_levels_up)
print(script_dir)
print(project_dir)

C:\Users\Klaudia\PycharmProjects\TalentFinder\ml\clasification
C:\Users\Klaudia\PycharmProjects\TalentFinder


In [238]:
def load_data():
    """Read the project's CSV inputs from ``<project_dir>/data``.

    Relies on the module-level ``project_dir`` being set before the call.

    Returns:
        tuple: ``(df_player_seasons, df_player_values, df_player_flags,
        df_player_transfers, df_player_players)`` as DataFrames.
    """
    transfermarkt_dir = os.path.join(project_dir, 'data', 'transfermarkt')
    flags_dir = os.path.join(project_dir, 'data', 'flags')

    # Explicit string dtypes for the flags file.
    # NOTE(review): 'column_3'..'column_5' look like placeholder names; confirm
    # they match real columns in player_flags_FINAL.csv.
    flag_dtypes = {
        'column_3': str,
        'column_4': str,
        'column_5': str,
    }

    df_player_seasons = pd.read_csv(os.path.join(transfermarkt_dir, 'player_seasons.csv'))
    df_player_values = pd.read_csv(os.path.join(transfermarkt_dir, 'player_value.csv'))
    df_player_flags = pd.read_csv(
        os.path.join(flags_dir, 'player_flags_FINAL.csv'),
        low_memory=False,
        dtype=flag_dtypes,
    )
    df_player_transfers = pd.read_csv(
        os.path.join(transfermarkt_dir, 'transfers.csv'),
        dtype={'transferType': str},
    )
    df_player_players = pd.read_csv(os.path.join(transfermarkt_dir, 'players.csv'))

    return (
        df_player_seasons,
        df_player_values,
        df_player_flags,
        df_player_transfers,
        df_player_players,
    )

In [239]:
# Load the raw tables, bring both season columns to the 'YYYY/YYYY' form, and
# inner-join flags -> players -> values -> transfers into one frame.
df_player_seasons, df_player_values, df_player_flags, df_player_transfers, df_player_players = load_data()
df_player_values = generate_value_season(df_player_values)
df_player_transfers['season'] = df_player_transfers['season'].map(convert_season)
result = (
    df_player_flags
    .merge(df_player_players, on='id')
    .merge(df_player_values, on=['id', 'season'])
    .merge(df_player_transfers, on=['id', 'season'])
)

In [240]:
# Club Elo table. The path is built from project_dir, like every other data
# file in this notebook, instead of a '../../' path relative to the current
# working directory. The 'club' column is renamed to 'team' so it can be joined
# against the player data.
file = (
    pd.read_csv(os.path.join(project_dir, 'data', 'transfermarkt', 'elo_combined.csv'))
    .drop(columns=['elo_ranking_club'])
    .rename(columns={"club": "team"})
)
# Hand-entered Elo ratings keyed by club name. The following cell writes these
# values into the 'elo' column of `file` for rows whose 'team' equals the key,
# overriding whatever was loaded from the CSV.
manual_elo_values = {
    'Legia Warszawa': 2057.30,
    'Rakow Czestochowa': 2056.73,
    'Lech Poznan': 2028.27,
    'Jagiellonia Bialystok': 2010.47,
    'Pogon Szczecin': 1979.33,
    'Slask Wroclaw': 1936.03,
    'Piast Gliwice': 1934.58,
    'Cracovia Krakow': 1908.82,
    'Zaglebie Lubin': 1898.70,
    'Gornik Zabrze': 1891.41,
    'Korona Kielce': 1886.56,
    'Radomiak Radom': 1852.95,
    'Widzew Lodz': 1851.43,
    'Ruch Chorzow': 1829.66,
    'Warta Poznan': 1813.30,
    'Puszcza Niepolomice': 1804.84,
    'Stal Mielec': 1764.82,
    'LKS Lodz': 1729.92,
    'Arka Gdynia': 1874.70,
    'Lechia Gdansk': 1836.87,
    'Wisla Krakow': 1834.11,
    'GKS Katowice': 1785.84,
    'Wisla Plock': 1769.47,
    'Motor Lublin': 1766.02,
    'GKS Tychy': 1737.78,
    'Miedz Legnica': 1734.90,
    'LKS Nieciecza': 1722.74,
    'Gornik Leczna': 1696.46,
    'Odra Opole': 1691.37,
    'Polonia Warszawa': 1652.76,
    'Znicz Pruszkow': 1649.86,
    'Resovia Rzeszow': 1646.33,
    'Podbeskidzie Bielsko Biala': 1637.50,
    'Chrobry Glogow': 1636.25,
    'Stal Rzeszow': 1606.55,
    'Zaglebie Sosnowiec': 1572.21,
    'Pogon Siedlce': 1628.91,
    'Kotwica Kolobrzeg': 1618.69,
    'Stal Stalowa Wola': 1613.80,
    'KKS 1925 Kalisz': 1587.47,
    'Olimpia Grudziadz': 1578.32,
    'Chojniczanka Chojnice': 1573.93,
    'Hutnik Krakow': 1566.19,
    'SKRA Czestochowa': 1565.43,
    'Polonia Bytom': 1561.88,
    'Wisla Pulawy': 1548.34,
    'Olimpia Elblag': 1541.09,
    'Stomil Olsztyn': 1511.45,
    'GKS Jastrzebie': 1509.57,
    'Lech Poznan II': 1505.25,
    'Sandecja Nowy Sacz': 1501.80,
    'Lks Lodz II': 1493.72,
    'Zaglebie Lubin II': 1493.26,
    'Radunia Stezyca': 1489.67
}

In [241]:
# Overwrite the loaded Elo rating with the manual value for every listed club
# that appears in the table; clubs absent from the table are left out.
for club, elo in manual_elo_values.items():
    club_rows = file['team'] == club
    file.loc[club_rows, 'elo'] = elo

# Attach the Elo rating to each player row via the team name (inner join).
result = result.merge(file, on=['team'])

columns_to_encode = [
    'season',
    'team',
    'place_of_birth',
    'birth_date',
    'nationality',
    'position',
    'manager',
    'club',
    'fee',
    'clubName1',
    'clubName2',
    'transferType',
]

result, label_encoders = prepare_label_encoders(result, columns_to_encode)

In [242]:
# Previous market value per player: the value from the preceding row of the
# same id, falling back to the row's own value when there is no preceding row.
# NOTE(review): rows are not sorted here, so "preceding" follows the current
# row order of `result` - confirm that order is chronological.
lagged_market_value = result.groupby('id')['marketValue'].shift(1)
result['previous_market_value'] = lagged_market_value.fillna(result['marketValue'])

target = result['marketValue']

# Missing heights are replaced with a fixed 1.80.
result['height'] = result['height'].fillna(1.80)

features = result.drop(
    columns=[
        'marketValue', 'slug', 'name', 'date_x', 'date_y', 'place_of_birth',
        'manager', 'clubName1', 'clubName2', 'value', 'club',
    ]
)

In [243]:
# Split by player id rather than by row, so that all rows of one player land
# on the same side of the train/test boundary.
unique_ids = result['id'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.25, random_state=42)

in_train = result['id'].isin(train_ids)
in_test = result['id'].isin(test_ids)
train_data = result.loc[in_train]
test_data = result.loc[in_test]

In [244]:
# Columns removed before building the feature matrices (includes the target).
non_feature_columns = [
    'marketValue', 'slug', 'name', 'date_x', 'date_y', 'place_of_birth',
    'manager', 'clubName1', 'clubName2', 'value', 'club',
]

X_train = train_data.drop(columns=non_feature_columns)
y_train = train_data['marketValue']
X_test = test_data.drop(columns=non_feature_columns)
y_test = test_data['marketValue']

# Coerce everything to numbers (non-numeric values become NaN), then drop every
# column that contains a NaN.
X_train = X_train.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='any')
X_test = X_test.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='any')

# The two dropna calls run independently, so a column can survive in one split
# and not the other. Keep only the columns present in both, in training order,
# so the model is fitted and evaluated on the same feature set.
shared_columns = [column for column in X_train.columns if column in X_test.columns]
X_train = X_train[shared_columns]
X_test = X_test[shared_columns]

In [245]:
# Show the column dtypes of both feature matrices after numeric conversion.
for heading, frame in (
    ("Typy danych X_train po konwersji:", X_train),
    ("Typy danych X_test po konwersji:", X_test),
):
    print(heading)
    print(frame.dtypes)

Typy danych X_train po konwersji:
id                         int64
height                   float64
age                        int64
elo                      float64
previous_market_value    float64
dtype: object
Typy danych X_test po konwersji:
id                         int64
height                   float64
age                        int64
elo                      float64
previous_market_value    float64
dtype: object


In [None]:
# Fit a 100-tree random forest classifier on the training split and predict
# the test split.
# NOTE(review): the target is `marketValue`, so each distinct value is treated
# as its own class - confirm classification (not regression) is intended.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
# Overall accuracy plus support-weighted precision, recall and F1 on the test split.
accuracy = accuracy_score(y_test, predictions)
precision, recall, f1 = (
    scorer(y_test, predictions, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value}')

In [None]:
# Classification report
classification_rep = classification_report(y_test, predictions)
print("Classification Report:", classification_rep, sep="\n")

In [None]:
# Summary statistics for the merged data set, leaving out the 'id' column.
data_without_id = result.drop(columns='id')
data_description = data_without_id.describe()
print("Statystyki opisowe dla zbioru danych bez kolumny 'id':", data_description, sep="\n")

In [None]:
# Scatter of true vs. predicted values with the y = x reference line in red.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, predictions, alpha=0.3)
ax.set_xlabel('Prawdziwe wartości')
ax.set_ylabel('Przewidywane wartości')
ax.set_title('Prawdziwe wartości vs. Przewidywane wartości')
value_range = [min(y_test), max(y_test)]
ax.plot(value_range, value_range, color='red')
plt.show()

In [None]:
def _prepare_transfer_candidate(candidate, features, label_encoders, columns_to_ignore, expected_columns):
    """Turn one raw candidate frame into the numeric frame handed to the model."""
    # Feature columns the caller did not supply get the placeholder '0'.
    for column in features.columns:
        if column not in candidate.columns and column not in columns_to_ignore:
            candidate[column] = '0'

    # Remove ignored columns and columns that carry no information at all.
    uninformative = [
        column for column in candidate.columns
        if column in columns_to_ignore
        or candidate[column].isnull().all()
        or (candidate[column] == '').all()
    ]
    candidate = candidate.drop(columns=uninformative)

    # Encode categorical columns; labels the encoder has not seen fall back to
    # the code of its first class.
    for column, le in label_encoders.items():
        if column not in candidate.columns:
            continue
        known_labels = set(le.classes_)
        default_code = le.transform([le.classes_[0]])[0]
        candidate[column] = candidate[column].map(
            lambda x: le.transform([x])[0] if x in known_labels else default_code
        )

    candidate = candidate.apply(pd.to_numeric, errors='coerce')

    # Present exactly the expected columns, in order. Anything still missing is
    # filled with 0, the same placeholder used for absent feature columns above.
    return candidate.reindex(columns=expected_columns, fill_value=0)


def predict_best_transfer(potential_transfers, model, features, label_encoders):
    """Find the team for which the model predicts the highest value.

    For every distinct value of ``features['team']`` the candidate is re-scored
    with that team substituted in. Each team is scored on a fresh copy of
    ``potential_transfers``, so the caller's frame is never modified and one
    team's encoding cannot leak into the next.

    Args:
        potential_transfers: DataFrame describing the candidate; only the
            prediction for its first row is used.
        model: Fitted estimator with ``predict``. If it exposes
            ``feature_names_in_``, those columns (in that order) are passed to
            it; otherwise ``features.columns`` is used.
        features: DataFrame that supplies the list of teams and the fallback
            column list.
        label_encoders: Mapping of column name to a fitted encoder exposing
            ``classes_`` and ``transform``.

    Returns:
        tuple: ``(best_team, best_value)``; ``(None, -inf)`` when ``features``
        contains no teams.
    """
    columns_to_ignore = ['RED_CARD_PLAYER', 'YELLOW_CARD_PLAYER', 'VALUE_FLAG', 'ZERO_EXP', 'ALWAYS_ON_BENCH', 'PROGRESS_SINCE_NEW_TEAM', 'SAME_RESULTS_FOR_LONG_TIME']

    expected_columns = getattr(model, 'feature_names_in_', None)
    if expected_columns is None:
        expected_columns = features.columns
    expected_columns = list(expected_columns)

    best_team = None
    best_value = -np.inf

    for team in features['team'].unique():
        candidate = potential_transfers.copy()
        candidate['team'] = team
        model_input = _prepare_transfer_candidate(
            candidate, features, label_encoders, columns_to_ignore, expected_columns
        )

        predicted_value = model.predict(model_input)[0]
        if predicted_value > best_value:
            best_value = predicted_value
            best_team = team

    return best_team, best_value


In [None]:

# Example usage of the function: one hypothetical player described by a
# single-row frame.
example_player = {
    'season': ['2023/2024'],
    'team': ['Legia Warszawa'],
    'place_of_birth': ['Warszawa'],
    'birth_date': ['1995-01-01'],
    'nationality': ['Poland'],
    'position': ['Midfielder'],
    'manager': ['John Doe'],
    'club': ['Legia Warszawa'],
    'fee': [0],
    'clubName1': ['Legia Warszawa'],
    'clubName2': ['Lech Poznan'],
    'height': [1.80],
    'previous_market_value': [10000],
    'transferType': ['Bez odstepnego'],
}
potential_transfers = pd.DataFrame(example_player)

best_team, best_value = predict_best_transfer(
    potential_transfers, model, features, label_encoders
)
print(f"Best Team: {best_team}, Predicted Market Value: {best_value}")
