In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Load the Pokemon dataset. A forward slash is used so the relative path
# resolves on Windows, Linux and macOS alike (a backslash separator only
# works on Windows).
df = pd.read_csv('assets/pokemon.csv')

# Working frame for modelling, without the '#' and 'Name' columns.
# DataFrame.drop returns a new frame, so `df` itself is left untouched.
df_tmp = df.drop(columns=['#', 'Name'])

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Print the column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals.
df_eda = df.describe().round(2)

# Extra per-column diagnostics, appended as rows in this order.
extra_rows = {
    'unique': lambda series: series.nunique(),
    'duplicate': lambda series: series.duplicated().sum(),
    'missing': lambda series: series.isna().sum(),
    'type': lambda series: series.dtype,
}
for row_label, describe_col in extra_rows.items():
    df_eda.loc[row_label] = [describe_col(df[name]) for name in df_eda.columns]

# Transpose so every described column becomes one row of the report.
df_eda.T

In [None]:
# One histogram per numeric column of the raw frame, 8 bins each.
df.hist(
    bins=8,
    color='lightblue',
    figsize=(20, 10),
);

In [None]:
# Attack vs. Defense, drawn in two passes so each class gets its own colour.
# The legend entries below follow the order of these scatter calls.
for flag, colour in ((1, "lightblue"), (0, "salmon")):
    subset = df[df["Legendary"] == flag]
    plt.scatter(subset["Attack"], subset["Defense"], c=colour)

plt.xlabel("Attack")
plt.ylabel("Defense")
plt.legend(["Legendary", "Not Legendary"]);

In [None]:
# Share of all rows falling in each (HP < 100, Legendary) combination.
hp_share = pd.crosstab(df['HP'] < 100, df["Legendary"], normalize=True)

# Build the tick labels from the crosstab's own index so every bar is labelled
# by the group it actually shows. The previous version took tick positions from
# Series.unique(), whose order depends on which value appears first in the data.
# The False group is HP >= 100, so it is labelled '100 or above'.
hp_labels = ['Below 100' if below else '100 or above' for below in hp_share.index]

hp_share.plot(kind="bar", color=["salmon", "lightblue"])
plt.xticks(range(len(hp_labels)), hp_labels, rotation='horizontal');

In [None]:
# Number of distinct values in each column of the working frame.
col_di = {name: df_tmp[name].nunique() for name in df_tmp.columns}

# (column, distinct-count) pairs, lowest count first.
sorted(col_di.items(), key=lambda pair: pair[1])

In [None]:
# Columns handled as categorical features.
cat_cols = ['Type 1', 'Type 2', 'Generation']

# Every remaining column except 'Legendary' is handled as numeric;
# the original column order of df_tmp is preserved.
non_numeric = cat_cols + ['Legendary']
num_cols = [name for name in df_tmp.columns if name not in non_numeric]

In [None]:
# Positional indices of the numeric columns within the working frame.
num_cols_i = [df_tmp.columns.get_loc(col) for col in df_tmp.columns if col in num_cols]

# Column-wise z-scores of the numeric features. The result is labelled with
# df_tmp's own index so that the row labels dropped below always refer to the
# same rows, even when df_tmp does not carry a default 0..n-1 index.
numeric_values = df_tmp.iloc[:, num_cols_i].to_numpy(dtype=float)
z_scores = pd.DataFrame(stats.zscore(numeric_values), index=df_tmp.index)

# Rows in which at least one numeric feature has |z| > 3.
outliers = z_scores[(np.abs(z_scores) > 3).any(axis=1)]

# Note: despite its name, df_outliers is the working frame with those rows removed.
df_outliers = df_tmp.drop(outliers.index)
df_outliers.info()

In [None]:
# Reproducibly shuffle the outlier-filtered rows.
df_shuffled = df_outliers.sample(frac=1, random_state=42)

# Label column and feature frame.
y = df_shuffled["Legendary"]
X = df_shuffled.drop("Legendary", axis=1)

# First split: 70 % training, 30 % hold-out, stratified on the label.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Second split: the hold-out is halved into validation and test parts,
# again stratified on the label.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
)

In [None]:
# Preprocessing for the categorical columns: missing values are replaced by
# the literal 'Missing', then each column is one-hot encoded with its first
# level dropped.
# handle_unknown='ignore' keeps transform() from raising when a later split
# contains a category that was absent from the data the encoder was fitted on;
# such values are encoded as all zeros for that feature. Combining it with
# drop='first' requires scikit-learn >= 1.0, which this notebook already needs
# for get_feature_names_out().
cat_feat = Pipeline(steps=[
    ('test1', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('test2', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

In [None]:
# Standardise the numeric columns and run the categorical pipeline on the
# categorical ones; sparse_threshold=0 forces a dense output array.
transformer = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', cat_feat, cat_cols),
    ],
    sparse_threshold=0,
)

# Fit on the training split only, then reuse the fitted transformer
# (and its output column names) for the validation split.
train_matrix = transformer.fit_transform(X_train)
feature_names = transformer.get_feature_names_out()

X_train_tr = pd.DataFrame(train_matrix, columns=feature_names)
X_valid_tr = pd.DataFrame(transformer.transform(X_valid), columns=feature_names)

In [None]:
# Cast the training and validation labels to float before fitting.
y_train, y_valid = (labels.astype(float) for labels in (y_train, y_valid))

In [None]:
# Gradient-boosted classifier with early stopping after 20 rounds without
# improvement.
# NOTE(review): scale_pos_weight=400 weights the positive class very heavily;
# the XGBoost docs suggest sum(negative) / sum(positive) as a starting point.
# Confirm this value is intentional.
model = XGBClassifier(
    learning_rate=0.01,
    scale_pos_weight=400,
    early_stopping_rounds=20,
    eval_metric=["error", "logloss"],
    random_state=42,
)

# Both metrics are recorded on the training split (first entry) and the
# validation split (second entry) at every boosting round.
eval_set = [
    (X_train_tr, y_train),
    (X_valid_tr, y_valid),
]
model.fit(X_train_tr, y_train, eval_set=eval_set, verbose=False)

In [None]:
# Predicted class labels for the transformed validation features.
y_pred = model.predict(X_valid_tr)

In [None]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_true=y_valid, y_pred=y_pred))

In [None]:
# Confusion matrix for the validation predictions, with the model's class
# values as axis labels.
cm = confusion_matrix(y_true=y_valid, y_pred=y_pred)
ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=model.classes_,
).plot(cmap='GnBu');

In [None]:
# Per-round evaluation history recorded during fit. 'validation_0' is the
# first eval_set entry (training split) and 'validation_1' the second
# (validation split).
results = model.evals_result()
epochs = len(results['validation_0']['logloss'])
x_axis = range(0, epochs)

# One figure per recorded metric. The second curve comes from the validation
# split, so it is labelled 'Validation' rather than 'Test' to avoid confusion
# with the separate held-out test split.
for metric, y_label in (('logloss', 'Log Loss'), ('error', 'Classification Error')):
    fig, ax = plt.subplots()
    ax.plot(x_axis, results['validation_0'][metric], label='Train')
    ax.plot(x_axis, results['validation_1'][metric], label='Validation')
    ax.set_xlabel('Boosting round')
    ax.set_ylabel(y_label)
    ax.legend()

In [None]:
# Apply the already-fitted preprocessing to the test split. No manual fillna
# is needed here: the categorical pipeline's imputer already replaces missing
# values with 'Missing'. Skipping the in-place assignment also leaves X_test
# unmodified and avoids pandas' SettingWithCopyWarning on the split slice.
X_test_tr = pd.DataFrame(transformer.transform(X_test), columns=transformer.get_feature_names_out())

# Cast the test labels to float, matching the training and validation labels.
y_test = y_test.astype(float)

In [None]:
# Predicted class labels for the transformed test features.
test_pred = model.predict(X_test_tr)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true=y_test, y_pred=test_pred))

In [None]:
# Confusion matrix for the test predictions, with the model's class values
# as axis labels.
cm_test = confusion_matrix(y_true=y_test, y_pred=test_pred)
ConfusionMatrixDisplay(
    confusion_matrix=cm_test,
    display_labels=model.classes_,
).plot(cmap='GnBu');

In [None]:
# Show the row labels the test labels currently carry.
y_test.index

In [None]:
# Replace the test labels' index with a fresh 0..n-1 one so it lines up with
# the transformed feature frame, which was built without an explicit index.
y_corr = y_test.reset_index(drop=True)

# Attach the label column to the transformed test features.
df_corr = X_test_tr.join(other=y_corr, how='left')

In [None]:
# Correlation of every column with 'Legendary', largest value first.
legendary_corr = df_corr.corr()['Legendary'].sort_values(ascending=False)

# Skip the first entry (presumably 'Legendary' itself, with correlation 1)
# and keep the next five.
top_corr = legendary_corr.iloc[1:6].to_frame()

plt.figure(figsize=(6, 4))
sns.heatmap(top_corr, annot=True, cmap="YlGnBu", cbar=False);