In [41]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [42]:
# Load the test fixtures and the games table from disk.
df = pd.read_csv('data/test.csv')
games = pd.read_csv('data/games.csv')

# Keep the goal columns aside so they can be re-attached to the games rows
# later, then stack test rows on top of the (goal-less) games rows under a
# fresh 0..n-1 index.
goal_columns = games[["home_team_goal", "away_team_goal"]]
games = games.drop(columns=list(goal_columns.columns))
stacked_df = pd.concat([df, games], ignore_index=True)

In [43]:
def team_id_knn(test_df, is_home_team = True, n_neighbors=1):
    """Fill missing ``home_team_id`` / ``away_team_id`` values via KNN imputation.

    The imputer is fit on the team-id column of the chosen side together with
    every column whose name starts with ``{side}_player_``. Only the team-id
    column is written back, rounded to the nearest integer.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame containing ``{side}_team_id`` and the ``{side}_player_*``
        columns. It is not modified; a copy is returned.
    is_home_team : bool, default True
        ``True`` imputes the home side, ``False`` the away side.
    n_neighbors : int, default 1
        Forwarded to ``KNNImputer``. NOTE(review): with more than one
        neighbour the imputed id is presumably an average of neighbour ids and
        may not be a real team id - confirm before raising it.

    Returns
    -------
    pandas.DataFrame
        Copy of ``test_df`` whose ``{side}_team_id`` column is an integer
        column produced by the imputer.
    """
    side = 'home' if is_home_team else 'away'
    team_col = f'{side}_team_id'

    df = test_df.copy()
    player_columns = [col for col in df.columns if col.startswith(f'{side}_player_')]
    features = df[[team_col] + player_columns]

    imputer = KNNImputer(n_neighbors=n_neighbors)
    # Re-attach the caller's index: the imputer returns a bare array, and a
    # default RangeIndex here would make the assignment below align on the
    # wrong labels (yielding NaN) whenever test_df is not indexed 0..n-1.
    imputed = pd.DataFrame(
        imputer.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )

    df[team_col] = np.round(imputed[team_col]).astype(int)
    return df

def player_id_knn(test_df, is_home_team = True, n_neighbors=1):
    """Fill missing ``{side}_player_*`` values via KNN imputation.

    The imputer is fit on the team-id column of the chosen side together with
    every column whose name starts with ``{side}_player_``. All of those
    player-prefixed columns are written back, rounded to the nearest integer;
    the team-id column itself is left as it was in the input.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame containing ``{side}_team_id`` and the ``{side}_player_*``
        columns. It is not modified; a copy is returned.
    is_home_team : bool, default True
        ``True`` imputes the home side, ``False`` the away side.
    n_neighbors : int, default 1
        Forwarded to ``KNNImputer``. NOTE(review): with more than one
        neighbour the imputed values are presumably averages of neighbour
        values and may not be real player ids - confirm before raising it.

    Returns
    -------
    pandas.DataFrame
        Copy of ``test_df`` whose ``{side}_player_*`` columns are integer
        columns produced by the imputer.
    """
    side = 'home' if is_home_team else 'away'
    team_col = f'{side}_team_id'

    df = test_df.copy()
    player_columns = [col for col in df.columns if col.startswith(f'{side}_player_')]
    features = df[[team_col] + player_columns]

    imputer = KNNImputer(n_neighbors=n_neighbors)
    # Re-attach the caller's index: the imputer returns a bare array, and a
    # default RangeIndex here would make the assignment below align on the
    # wrong labels (yielding NaN) whenever test_df is not indexed 0..n-1.
    imputed = pd.DataFrame(
        imputer.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )

    df[player_columns] = np.round(imputed[player_columns]).astype(int)
    return df

# Impute the stacked (test + games) frame: team ids first, then the player
# columns, handling the home side before the away side in each pass.
cleaned_df = (
    stacked_df
    .pipe(team_id_knn, is_home_team=True)
    .pipe(team_id_knn, is_home_team=False)
    .pipe(player_id_knn, is_home_team=True)
    .pipe(player_id_knn, is_home_team=False)
)


In [44]:
player_columns = [
    col for col in cleaned_df.columns
    if col.startswith(('home_player_', 'away_player_'))
]
imputed_columns = ['home_team_id', 'away_team_id'] + player_columns

# Check the number of missing values before and after cleaning
print("Missing values before cleaning:")
print(stacked_df[imputed_columns].isna().sum())
print("\nMissing values after cleaning:")
print(cleaned_df[imputed_columns].isna().sum())

# Split the stacked frame back into its two sources. The test rows were
# stacked first, so the boundary is the length of the test frame rather than
# a hard-coded row count that silently breaks when test.csv changes size.
n_test_rows = len(df)
test_df = cleaned_df.iloc[:n_test_rows]
games_df = cleaned_df.iloc[n_test_rows:].reset_index(drop=True)

# Re-attach the goal columns that were set aside before imputation; both
# sides must have the same number of rows for the positional concat to line up.
assert len(games_df) == len(goal_columns), "games rows and goal rows are out of sync"
games_df = pd.concat([games_df, goal_columns], axis=1)

# Save the cleaned data
test_df.to_csv('processed_data/knn_test_df.csv', index=False)
games_df.to_csv('processed_data/knn_games_df.csv', index=False)


Missing values before cleaning:
home_team_id        71
away_team_id        82
home_player_1      493
home_player_2      579
home_player_3      550
                  ... 
away_player_Y7     960
away_player_Y8     960
away_player_Y9     961
away_player_Y10    961
away_player_Y11    964
Length: 68, dtype: int64

Missing values after cleaning:
home_team_id       0
away_team_id       0
home_player_1      0
home_player_2      0
home_player_3      0
                  ..
away_player_Y7     0
away_player_Y8     0
away_player_Y9     0
away_player_Y10    0
away_player_Y11    0
Length: 68, dtype: int64
