# Data Preprocessing

In [69]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the raw training set; 'train.csv' is resolved relative to the
# notebook's working directory.
df_train = pd.read_csv('train.csv')
# Print each column's dtype and non-null count to show where values are missing.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99872 entries, 0 to 99871
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       99872 non-null  object 
 1   player_id                99872 non-null  object 
 2   kill_death_ratio         97521 non-null  float64
 3   headshot_percentage      84716 non-null  float64
 4   win_rate                 90015 non-null  float64
 5   accuracy_score           96848 non-null  float64
 6   kill_consistency         97281 non-null  float64
 7   reaction_time_ms         73815 non-null  float64
 8   account_age_days         81323 non-null  float64
 9   level                    75795 non-null  float64
 10  level_progression_speed  80614 non-null  float64
 11  friend_network_size      91713 non-null  float64
 12  reports_received         75722 non-null  float64
 13  device_changes_count     90774 non-null  float64
 14  input_consistency_scor

In [70]:
numeric_cols = df_train.select_dtypes(include=['float64', 'int64']).columns

# Never cull on the label: for a binary target whose minority class is below
# 25% the IQR is 0, so every positive row would be flagged as an "outlier".
# NOTE(review): assumes the target column is named 'is_cheater' (as the
# correlation comment further down states); if it is absent this is a no-op.
outlier_cols = numeric_cols.drop('is_cheater', errors='ignore')

# Cull outliers based on IQR method (Tukey fences at 1.5 * IQR per column).
Q1 = df_train[outlier_cols].quantile(0.25)
Q3 = df_train[outlier_cols].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Comparisons against NaN evaluate to False, so a missing value never marks
# its row as an outlier; a row is removed if ANY checked column is outside
# its fences.
features = df_train[outlier_cols]
is_outlier_row = ((features < lower_fence) | (features > upper_fence)).any(axis=1)

# .copy() makes `df` an independent frame, so later column assignments do not
# raise SettingWithCopyWarning or act on a temporary slice of df_train.
df = df_train.loc[~is_outlier_row].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82171 entries, 0 to 99869
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       82171 non-null  object 
 1   player_id                82171 non-null  object 
 2   kill_death_ratio         80187 non-null  float64
 3   headshot_percentage      69608 non-null  float64
 4   win_rate                 74119 non-null  float64
 5   accuracy_score           79668 non-null  float64
 6   kill_consistency         80053 non-null  float64
 7   reaction_time_ms         60455 non-null  float64
 8   account_age_days         66927 non-null  float64
 9   level                    62447 non-null  float64
 10  level_progression_speed  64460 non-null  float64
 11  friend_network_size      75429 non-null  float64
 12  reports_received         62045 non-null  float64
 13  device_changes_count     74654 non-null  float64
 14  input_consistency_score  67

In [71]:
# Columns with high correlation to target 'is_cheater'
high_corr_cols = [
    'reports_received',
    'crosshair_placement',
    'headshot_percentage',
    'kill_death_ratio',
    'game_sense_score',
    'account_age_days',
    'accuracy_score',
    'level',
    'spray_control_score',
    'friend_network_size',
    'win_rate',
    'aiming_smoothness',
    'level_progression_speed',
    'kill_consistency',
    'reaction_time_ms',
]

# Drop rows that are almost entirely empty: a row is kept only when at least
# 10% of its columns hold a non-null value. The count is taken over ALL
# columns, not only those outside high_corr_cols.
# `thresh` is documented as an integer count; rounding the fraction up yields
# exactly the same cut-off as comparing against the fractional value.
min_non_null = int(np.ceil(len(df.columns) * 0.1))

# .copy() gives later cells an independent frame to assign into.
df_clean = df.dropna(thresh=min_non_null).copy()
print(f"Dropped {len(df) - len(df_clean)} rows with missing data.")

Dropped 0 rows with missing data.


In [72]:
# Impute high correlation columns using KNN Imputer
from sklearn.impute import KNNImputer

# Work on an independent copy so the assignments below modify this frame only
# and do not raise SettingWithCopyWarning.
df_clean = df_clean.copy()

# Each missing entry is filled from the 5 nearest rows, with distances
# computed over high_corr_cols.
# NOTE(review): the features are not scaled before the distance computation,
# so wide-range columns may dominate the neighbour search — consider scaling.
imputer = KNNImputer(n_neighbors=5)
df_clean[high_corr_cols] = imputer.fit_transform(df_clean[high_corr_cols])

# Impute remaining columns with random sampling from each column's own
# observed values (this preserves the column's marginal distribution).
# A single seeded generator is shared by all columns: passing the same integer
# seed to every .sample() call restarts the identical random stream per
# column, so columns with equally sized donor pools would draw the same donor
# positions.
# NOTE(review): this loop also visits identifier columns and the target if
# they contain missing values — confirm that imputing those is intended.
rng = np.random.RandomState(42)
for col in df_clean.columns.difference(high_corr_cols):
    missing_mask = df_clean[col].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        continue

    observed = df_clean.loc[~missing_mask, col]
    if observed.empty:
        # Nothing to sample from: sampling from an empty pool would raise,
        # so the all-missing column is left untouched.
        continue

    donors = observed.sample(n_missing, replace=True, random_state=rng)
    # Assign raw values (not the Series) so pandas does not align on the
    # donors' index labels.
    df_clean.loc[missing_mask, col] = donors.to_numpy()

In [73]:
# Re-inspect non-null counts after imputation to confirm the gaps were filled.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82171 entries, 0 to 99869
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       82171 non-null  object 
 1   player_id                82171 non-null  object 
 2   kill_death_ratio         82171 non-null  float64
 3   headshot_percentage      82171 non-null  float64
 4   win_rate                 82171 non-null  float64
 5   accuracy_score           82171 non-null  float64
 6   kill_consistency         82171 non-null  float64
 7   reaction_time_ms         82171 non-null  float64
 8   account_age_days         82171 non-null  float64
 9   level                    82171 non-null  float64
 10  level_progression_speed  82171 non-null  float64
 11  friend_network_size      82171 non-null  float64
 12  reports_received         82171 non-null  float64
 13  device_changes_count     82171 non-null  float64
 14  input_consistency_score  82

In [74]:
# Persist the imputed frame to the working directory; index=False omits the
# row index, so the original row labels are not written to the file.
df_clean.to_csv('train_imputed.csv', index=False)