In [399]:
import pandas as pd
import numpy as np
import warnings
import numpy.typing as npt
from sklearn.model_selection import train_test_split, cross_val_score

# Globally silence every warning category for the remainder of the session.
# NOTE(review): this also hides NumPy RuntimeWarnings (e.g. divide-by-zero or
# invalid-value warnings) — confirm that suppressing them is intended.
warnings.filterwarnings("ignore")

In [400]:
# Experiment-wide constants.
# The annotations name the builtin types these literals actually have: the
# values are plain Python objects, not NumPy scalars.

TEST_SIZE: float = 0.3  # fraction of rows held out, passed as test_size to train_test_split
CROSS_VALIDATION_K: int = 5  # presumably the number of cross-validation folds — not used in the visible cells
RANDOM_STATE: int = 42  # seed passed as random_state to train_test_split
SQUARE_VALUES: list[str] = ["x", "o", "b"]  # marks a board square can hold ("b" presumably means blank)

In [401]:
# "ttt" abbreviates tic-tac-toe. The first nine names label the board squares
# (presumably top/middle/bottom row x left/middle/right column); the last one
# is the outcome label as stored in the raw file.
COLUMNS: list[str] = ["tl", "tm", "tr", "ml", "mm", "mr", "bl", "bm", "br", "class"]

# Read the header-less file, then swap the textual "class" column for a
# boolean "positive" column that stays in the last position.
ttt_df: pd.DataFrame = (
    pd.read_csv("../data/tictactoe/tic-tac-toe.data", header=None, names=COLUMNS)
    .assign(positive=lambda frame: frame["class"].eq("positive"))
    .drop(columns="class")
)
ttt_df.head(5)

Unnamed: 0,tl,tm,tr,ml,mm,mr,bl,bm,br,positive
0,x,x,x,x,o,o,x,o,o,True
1,x,x,x,x,o,o,o,x,o,True
2,x,x,x,x,o,o,o,o,x,True
3,x,x,x,x,o,o,o,b,b,True
4,x,x,x,x,o,o,b,o,b,True


In [402]:
# Integer-encode the board squares in place: "x" -> 0, "o" -> 1, and any other
# mark -> 2. The boolean "positive" label column is left untouched.
for col in ttt_df.columns:
    if col == "positive":
        continue
    ttt_df[col] = ttt_df[col].map(lambda mark: {"x": 0, "o": 1}.get(mark, 2))
ttt_df.head()

Unnamed: 0,tl,tm,tr,ml,mm,mr,bl,bm,br,positive
0,0,0,0,0,1,1,0,1,1,True
1,0,0,0,0,1,1,1,0,1,True
2,0,0,0,0,1,1,1,1,0,True
3,0,0,0,0,1,1,1,2,2,True
4,0,0,0,0,1,1,2,1,2,True


In [403]:
# Split the encoded frame into features and target, then hold out a test set.
#
# The annotations describe the real pandas objects. The previous
# `npt.NDArray[np.string_]` annotations were inaccurate and, being evaluated at
# module level, raise AttributeError on NumPy >= 2.0 where `np.string_` no
# longer exists.

data: pd.DataFrame = ttt_df
# Every column except the last one is a feature.
ttt_X: pd.DataFrame = data.iloc[:, :-1]
# The last column is the boolean label; cast it to 0/1 integers.
ttt_y: pd.Series = data.iloc[:, -1].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    ttt_X, ttt_y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [404]:
# Definition of boosting parameters.
# Both values are builtin ints (a literal and an element of DataFrame.shape),
# so they are annotated as such rather than as fixed-width NumPy scalars.

NUMBER_OF_STUMPS: int = 3  # presumably the number of boosting rounds to run — confirm against the caller
NUMBER_OF_INSTANCES: int = ttt_df.shape[0]  # row count of the tic-tac-toe frame

In [405]:
class AdaBoost:
    """Generate stumps for a binary classification using AdaBoost.

    The label column is interpreted by truthiness (True / False). Every
    candidate stump is a ``(column, value)`` pair that predicts the positive
    class for the rows where ``data[column] == value``. Pairs taken from the
    label column itself act as constant stumps: a truthy value predicts
    positive for every row, a falsy value predicts negative for every row.

    Attributes:
        number_of_stumps: maximum number of boosting rounds.
        estimators: ``(column, value)`` stumps picked by the last ``fit``
            call, in selection order.
        estimators_errors: weighted error of each picked stump, measured with
            the sample weights in effect when it was picked.
        alphas: vote weight (alpha) of each picked stump.
    """

    # Weighted errors are clipped into [_EPSILON, 1 - _EPSILON] before alpha
    # is computed, so a perfect (or perfectly wrong) stump yields a large but
    # finite alpha instead of +/-inf, which would turn the weights into NaN.
    _EPSILON: float = 1e-10

    def __init__(self, number_of_stumps: int = 5):
        """Create an unfitted booster.

        Args:
            number_of_stumps: maximum number of stumps ``fit`` will select.
        """
        self.number_of_stumps: int = number_of_stumps
        self.estimators_errors: npt.NDArray[np.float64] = np.zeros(
            shape=number_of_stumps, dtype=np.float64
        )
        self.alphas: npt.NDArray[np.float64] = np.zeros(
            shape=number_of_stumps, dtype=np.float64
        )
        self.estimators: list = []

    @staticmethod
    def _candidate_stumps(data: pd.DataFrame, label_col: str) -> list:
        """List every ``((column, value), predictions)`` candidate once, in column/value order."""
        n_rows = data.shape[0]
        candidates = []
        for col in data.columns:
            column = data[col]
            for val in column.unique():
                if col == label_col:
                    # Constant stump: always positive or always negative.
                    predictions = np.full(n_rows, bool(val))
                else:
                    predictions = (column == val).to_numpy()
                candidates.append(((col, val), predictions))
        return candidates

    def fit(self, data: pd.DataFrame, label_col: str) -> dict:
        """Select up to ``number_of_stumps`` stumps by AdaBoost re-weighting.

        Each round picks the unused stump with the lowest weighted error
        (the first one wins ties), computes its alpha, and re-weights the rows
        so that misclassified rows gain weight. Selection stops early when no
        unused stump is left; the unused tail of ``alphas`` and
        ``estimators_errors`` then stays at zero.

        Args:
            data: frame holding the feature columns and the label column.
            label_col: name of the label column in ``data``.

        Returns:
            Dict mapping each selected ``(column, value)`` stump to its alpha,
            in selection order.

        Raises:
            KeyError: if ``label_col`` is not a column of ``data``.
        """
        n_rows = data.shape[0]
        is_positive = data[label_col].astype(bool).to_numpy()
        true_labels = np.where(is_positive, 1, -1)
        # Start from uniform weights; max() keeps an empty frame from dividing by zero.
        weights = np.ones(shape=n_rows, dtype=np.float64) / max(n_rows, 1)

        # Candidate predictions never change between rounds, so build them once.
        candidates = self._candidate_stumps(data, label_col)
        used: set = set()

        stumps_used: dict = {}
        estimators: list = []
        alphas = np.zeros(shape=self.number_of_stumps, dtype=np.float64)
        errors = np.zeros(shape=self.number_of_stumps, dtype=np.float64)

        for round_idx in range(self.number_of_stumps):
            best_idx = None
            best_error = np.inf
            for idx, (_, predictions) in enumerate(candidates):
                if idx in used:
                    continue
                stump_error = ((predictions != is_positive) * weights).sum()
                if stump_error < best_error:
                    best_idx = idx
                    best_error = stump_error
            if best_idx is None:
                # Every candidate has been used already; nothing left to add.
                break

            used.add(best_idx)
            stump, predictions = candidates[best_idx]
            clipped = min(max(best_error, self._EPSILON), 1.0 - self._EPSILON)
            alpha = np.log((1.0 - clipped) / clipped) / 2

            errors[round_idx] = best_error
            alphas[round_idx] = alpha
            stumps_used[stump] = alpha
            estimators.append(stump)

            # Rows the stump got right shrink, rows it got wrong grow.
            signed_predictions = np.where(predictions, 1, -1)
            weights *= np.exp(-alpha * signed_predictions * true_labels)
            weights /= np.sum(weights)

        # Keep the fitted state on the instance, not only in the return value.
        self.alphas = alphas
        self.estimators_errors = errors
        self.estimators = estimators

        # Kept so the cell output still shows the per-round errors.
        print(errors)
        return stumps_used


# Load the vampire CSV (path relative to the notebook) into a DataFrame.
# NOTE(review): presumably the dataset used to exercise AdaBoost in later
# cells — those cells are not visible here; confirm.
vampire_df = pd.read_csv("../data/vampire/vampire.csv")