# Lab: Pokémon Data Classification

## Objective
This notebook demonstrates how to classify Pokémon as legendary or not using two machine learning models:
- Logistic Regression
- Multi-Layer Perceptron (MLP)

We will compare the performance of both models using metrics such as accuracy, precision, recall, F1-score, and AUC-ROC.

## Step 1: Load and Explore the Dataset

In [2]:
# Your import 
import pandas as pd

# Load the Pokémon dataset
pokemon_data = pd.read_csv("pokemon.csv")

# Display the first few rows
pokemon_data.head(5)

Unnamed: 0,name,deck_no,attack,defense,sp_attack,sp_defense,speed,capture_rt,total_bs,type,gen,legendary
0,Bulbasaur,1,49,49,65,65,45,45,318,grass,1,0
1,Ivysaur,2,62,63,80,80,60,45,405,grass,1,0
2,Venusaur,3,100,123,122,120,80,45,625,grass,1,0
3,Charmander,4,52,43,60,50,65,45,309,fire,1,0
4,Charmeleon,5,64,58,80,65,80,45,405,fire,1,0


## Step 2: Data Preprocessing
### 2.1 Feature Engineering

### <span style="color:red">Please create a derived feature: sp_attack_to_sp_defense_ratio, Target is Legendary</span>. 

In [3]:
# Create a derived feature: sp_attack_to_sp_defense_ratio

# your code
pokemon_data["sp_attack_to_sp_defense_ratio"] = pokemon_data["sp_attack"] / pokemon_data["sp_defense"]

# Select all features + derived  
# target: Legendary 
# Your code
features = ['attack', 'defense', 'sp_attack', 'sp_defense', 'speed', 'capture_rt', 'total_bs', 'type', 'gen', 'sp_attack_to_sp_defense_ratio']
target = 'legendary' 

X = pokemon_data[features] # Your code
y = pokemon_data[target] # Your code

### 2.2 Data Transformation Using ColumnTransformer

In [4]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline


# Categorical and numerical columns
# your code
numerical_columns = ['attack', 'defense', 'sp_attack', 'sp_defense', 'speed', 'capture_rt', 'total_bs', 'gen', 'sp_attack_to_sp_defense_ratio']
categorical_columns = ['type'] 

# Preprocessing pipeline
# please choose one suitable method as your scaling method
# please use OneHot as your Encoder
# Your code

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

preprocessor

### Prepare the training data

In [5]:
from sklearn.model_selection import train_test_split

# Transform the data
X_transformed = preprocessor.fit_transform(X)

# Train-test split
# Step 1: First train-test split to create training and temporary sets
# X_transformed: Preprocessed feature data
# y: Target labels
# test_size=0.3: Reserve 30% of the data for validation and test sets
# random_state=42: Ensures reproducibility of the splits
# stratify=y: Maintains the class distribution in the split

#Your code
X_train, X_temp, y_train, y_temp = train_test_split(
    X_transformed, y, 
    test_size=0.3,      
    random_state=42,    
    stratify=y         
)



In [6]:
# Your import 
from sklearn.model_selection import train_test_split

# Please use train_test_split to prepare your data
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, 
    test_size=0.3,      
    random_state=42,    
    stratify=y         
)


# Your code

#Print your X_train and X_test
X_train[:2], X_test[:2]

(array([[-0.40006789, -0.26044725, -0.968214  , -1.10700876, -1.77001294,
          1.19130465, -0.9517153 , -1.39455066, -0.13843446,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  1.        ],
        [ 0.19111895, -0.09784534, -0.19502508, -0.03220389, -0.52510226,
         -0.70940526, -0.1542614 , -0.87620492, -0.30764966,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
          0.        ,  0.        ]]),
 array([[ 1.46683792, -0.42304916,  0.73280163, -0.39047218,  1.34226377,
         -0.70940526,  0.68516375,  0.67883232,  1.24348967,  0.        ,
          0.        ,  0.        ,  0.

## Step 3: Logistic Regression

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import accuracy_score

# Please build your Logistic Regression model



# Train Logistic Regression





# Please use AUC and accuracy score to evaluate your model
# By trying different methods, your accuracy can reach over 90%.






print("<Your name> + Logistic Regression - Validation Performance:")
print(classification_report(y_val, y_val_pred_logistic))
print(f"Validation AUC: {auc_val_logistic:.4f}")
print(f"Validation accuracy: {accuracy_score_logistic:.4f}")

## Step 4: Multi-Layer Perceptron (MLP)

In [None]:
from sklearn.neural_network import MLPClassifier

# Please build your Multi-Layer Perceptron model

# Hidden layer: 8 weights and 4 biases, activation: relu, iteration : 500
# Your code


# Please use AUC and accuracy score to evalue your model
# By trying different methods, your accuracy can reach over 90%.
# Your code





# Train MLP Classifier


# Evaluate MLP Classifier


print("<Your name> + MLP Classifier - Validation Performance:")
print(classification_report(y_val, y_val_pred_mlp))
print(f"Validation AUC: {auc_val_mlp:.4f}")
print(f"Validation accuracy: {accuracy_score_mlp:.4f}")

## Step 5: Comparison of Models - Training Loss Curve

### Manually record the loss of logistic regression

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
# Please record your loss of logistic regression with 500 iterations
# Your code


# Initialize the model with warm_start to fit incrementally


# To store the loss values
loss_curve = []

# Train the model incrementally
# Your code



In [None]:
# Extract loss during MLP training
# Your code
 

# Plot the loss difference during MLP training and Logistic
# please set color of Logistic as green, set MLP as red.
# Your code



In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
# Plot ROC Curve Comparison


# Plot ROC curves
# Your code 
 
# please set color of Logistic as green, set MLP as red.
# Your code 
 

In [None]:
# Your idea:



## Step 6: Visualization of Predictions and Confusion Matrices

### 6.1 Actual vs Predicted Results

In [None]:
# Please plot the Actual data and Predicted for Logistic and MLP model
# Your code


### Confusion Matrices

In [None]:
# plot Logistic Regression
from sklearn.metrics import confusion_matrix
import seaborn as sns
# Logistic Regression

# your code

In [None]:
# plot MLP
# Your code



## Conclusion

In [None]:
# Please compare the difference and common between Logistic results and MLP results
# Your idea :
