In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

## Install libraries

```bash
conda create -n edu4 python=3.11 jupyter matplotlib
```

```bash 
! pip install -U -r requirements.txt
```

```bash
! pip install -U numpy
! pip install -U scikit-learn
```

## Update repository

In [None]:
! git pull

## Add import path

In [None]:
import os
import sys
import gc

In [None]:
# Make the parent directory importable so that the `src.*` packages resolve.
module_path = os.path.abspath('..')
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)
del already_registered

In [None]:
# The name was only needed while extending sys.path; remove it from the namespace.
del module_path

## Organize imports

In [None]:
import multiprocessing
from pathlib import Path

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
import plotly.express as px

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis, 
    QuadraticDiscriminantAnalysis
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.datasets import (
    load_iris,
    load_wine,
    load_breast_cancer
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    StandardScaler,
    LabelEncoder, 
    OneHotEncoder,
)
from sklearn.metrics import (
    precision_score, 
    recall_score, 
    f1_score,
    classification_report,
    confusion_matrix
)
from sklearn.compose import ColumnTransformer, make_column_transformer

In [None]:
from scipy import stats
from scipy.interpolate import interp1d

In [None]:
from src.lattmc.fca.utils import *
from src.lattmc.fca.data_utils import *
from src.lattmc.fca.image_utils import *
from src.lattmc.fca.models import *
from src.lattmc.fca.fca_utils import *
from src.lattmc.fca.image_gens import *

#### Number of CPU cores

In [None]:
# CPU count as reported by multiprocessing.
# NOTE(review): `workers` is not referenced again in the visible cells.
workers = multiprocessing.cpu_count()
workers

In [None]:
# Seed constant for the notebook.
# NOTE(review): no visible cell reads SEED; the train/test split in fit_model uses 42.
SEED = 2024

## Initialize Path

In [None]:
# Root folder for all input data, relative to the working directory.
PATH = Path('data')

# Folder for images; created on demand (together with PATH if it is missing).
images_path = PATH.joinpath('images')
images_path.mkdir(parents=True, exist_ok=True)

# Expected locations of the tabular datasets.
pumpkin_path = PATH.joinpath('Pumpkin_Seeds_Dataset.xlsx')
ad_click_path = PATH.joinpath('advertising.csv')

## Prepare the model

In [None]:
def fit_model(
    X,
    y,
    transforms=None,
    verbose=0,
    test_size=0.2,
    random_state=42,
    cv=5,
    average='binary',
    n_jobs=None,
):
    """Split the data, tune a logistic regression with a grid search and report test metrics.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix. It is passed unchanged to ``train_test_split`` and, when
        given, to ``transforms``.
    y : array-like
        Target labels. With the default ``average='binary'`` the metrics require a
        two-class target.
    transforms : object with ``fit_transform`` / ``transform``, optional
        Preprocessing fitted on the training split only and then applied to the
        test split. ``None`` leaves the features untouched.
    verbose : int, default 0
        Forwarded to both ``LogisticRegression`` and ``GridSearchCV``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed for the train/test split and for the ``liblinear`` solver.
    cv : int or CV splitter, default 5
        Cross-validation strategy for the grid search.
    average : str or None, default 'binary'
        Averaging mode forwarded to precision/recall/F1.
    n_jobs : int or None, default None
        Parallelism of the grid search.

    Returns
    -------
    tuple
        ``(grid_search, X_train, y_train, X_test, y_test)`` where ``grid_search`` is
        the fitted ``GridSearchCV`` and the feature splits are the (possibly
        transformed) arrays the model was trained and evaluated on.

    Notes
    -----
    Best parameters, the classification report and the three metrics are printed.
    """
    # Hold out a test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the preprocessing on the training split only, then reuse it for the test split.
    # NOTE(review): the transform is fitted before cross-validation, so every CV fold
    # sees statistics computed on the whole training split.
    if transforms is not None:
        X_train = transforms.fit_transform(X_train)
        X_test = transforms.transform(X_test)

    # Search over regularisation strength and penalty type; liblinear supports both
    # the l1 and the l2 penalty.
    param_grid = {
        'C': np.logspace(-4, 4, 20),
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    }
    grid_search = GridSearchCV(
        LogisticRegression(
            max_iter=10000,
            verbose=verbose,
            # liblinear shuffles the data internally; seeding it makes reruns repeatable.
            random_state=random_state,
        ),
        param_grid,
        cv=cv,
        verbose=verbose,
        n_jobs=n_jobs,
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")

    # Evaluate the refitted best estimator on the held-out split.
    y_pred = grid_search.predict(X_test)
    precision = precision_score(y_test, y_pred, average=average)
    recall = recall_score(y_test, y_pred, average=average)
    f1 = f1_score(y_test, y_pred, average=average)

    print('\nClassification Report:')
    print(classification_report(y_test, y_pred))
    print(f'Precision: {precision:.3f}')
    print(f'Recall: {recall:.3f}')
    print(f'F1 Score: {f1:.3f}')

    return grid_search, X_train, y_train, X_test, y_test

#### Breast Cancer data

In [None]:
# Step 1: Load the Breast Cancer Wisconsin Dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Optional: Convert to DataFrame for easier handling
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y

# Step 2: Data Preprocessing
# Standardize the features
scaler = StandardScaler()

# Step 3: Split the dataset into training and testing sets
grid_search, X_train, y_train, X_test, y_test = fit_model(X, y, transforms=scaler)

#### Banknote authentication

In [None]:
# Banknote Authentication data from the UCI repository; the file has no header row.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt'
columns = ['Variance', 'Skewness', 'Curtosis', 'Entropy', 'Class']
data = pd.read_csv(url, names=columns, header=None)

# 'Class' is the label; every other column is a feature.
X = data.drop(columns='Class').to_numpy()
y = data['Class'].to_numpy()

# The scaler is handed to fit_model, which fits it on the training split only.
scaler = StandardScaler()

# Split, scale, tune and evaluate in one call.
grid_search, X_train, y_train, X_test, y_test = fit_model(X, y, transforms=scaler)

#### Mushrooms dataset

In [None]:
# Mushroom data from the UCI repository; the file has no header row.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
columns = [
    'class',
    'cap_shape', 'cap_surface', 'cap_color',
    'bruises', 'odor',
    'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
    'stalk_shape', 'stalk_root',
    'stalk_surface_above_ring', 'stalk_surface_below_ring',
    'stalk_color_above_ring', 'stalk_color_below_ring',
    'veil_type', 'veil_color',
    'ring_number', 'ring_type',
    'spore_print_color', 'population', 'habitat'
]

# '?' marks a missing value; rows containing one are dropped (imputation would be the alternative).
data = (
    pd.read_csv(url, header=None, names=columns)
    .replace('?', np.nan)
    .dropna()
)

# 'class' is the label; every other column is a categorical feature.
X = data.drop(columns='class')
y = data['class']

# Integer-encode the label (original note: 'e' -> 0 edible, 'p' -> 1 poisonous).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# One indicator column per category value.
X_encoded = pd.get_dummies(X).to_numpy()

# No scaling is applied to the indicator columns.
grid_search, X_train, y_train, X_test, y_test = fit_model(X_encoded, y_encoded, transforms=None)

#### Ad click dataset

In [None]:
import shutil

import kagglehub

# The second positional argument of kagglehub.dataset_download is the name of a
# single file inside the dataset, not a destination directory, so PATH must not be
# passed there. Download the whole dataset into kagglehub's cache instead.
path = kagglehub.dataset_download('bumba5341/advertisingcsv')

# Copy the downloaded files into PATH so that `ad_click_path` can be read later.
# NOTE(review): assumes the dataset contains a file named 'advertising.csv' — confirm.
PATH.mkdir(parents=True, exist_ok=True)
for downloaded in Path(path).iterdir():
    if downloaded.is_file():
        shutil.copy2(downloaded, PATH / downloaded.name)

print("Path to dataset files:", path)

In [None]:
ad_click_path

In [None]:
# Load the ad-click CSV; this rebinds `data`, which earlier cells used for other datasets.
data = pd.read_csv(ad_click_path)

In [None]:
data

In [None]:
# Columns excluded from the features: the label itself plus timestamp/text/location fields.
# Only those actually present in `data` are dropped.
candidates = ['Timestamp', 'Clicked on Ad', 'Ad Topic Line', 'Country', 'City']
X = data.drop(columns=candidates, errors='ignore')
y = data['Clicked on Ad'].to_numpy()

num_columns = ['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage', 'Male']

# NOTE(review): both scalers are applied to the same columns, so each numeric column
# appears twice in the output (min-max scaled, then standardised) — confirm this is intended.
ct = make_column_transformer(
    (MinMaxScaler(), num_columns),
    (StandardScaler(), num_columns),
    remainder='passthrough'
)

# fit_model fits `ct` on the training split and applies it to the test split.
grid_search, X_train, y_train, X_test, y_test = fit_model(X, y, transforms=ct)

## GV build

In [None]:
# Best estimator from the most recent fit_model call (the ad-click data when cells run in order).
mlc = grid_search.best_estimator_

In [None]:
mlc.coef_

In [None]:
# Column indices of the coefficients by sign; zero counts as non-negative.
# coef_ is 2-D, so element [1] of the nonzero result holds the column (feature) indices.
pos_idx = np.nonzero(mlc.coef_ >= 0)[1]
neg_idx = np.nonzero(mlc.coef_ < 0)[1]
neg_idx, pos_idx

In [None]:
# Labels the tuned classifier predicts for the training split returned by fit_model.
y_predt = mlc.predict(X_train)

In [None]:
# Training rows grouped by the label the classifier predicts for them.
X_trains = {
    label: X_train[np.where(y_predt == label)]
    for label in (0, 1)
}

In [None]:
# Labels the tuned classifier predicts for the test split returned by fit_model.
y_preds = mlc.predict(X_test)

In [None]:
# Test rows grouped by the label the classifier predicts for them.
X_tests = {
    label: X_test[np.where(y_preds == label)]
    for label in (0, 1)
}

In [None]:
np.where(y_preds == 1)

In [None]:
y_preds[np.where(y_preds == 1)]

In [None]:
# Positions of the test rows predicted as class 1, as a plain Python list.
idxs = np.flatnonzero(y_preds == 1).tolist()

In [None]:
# Check on one test row: wrap it as a one-row batch, predict, and test whether the
# predicted label exceeds 0.5 (i.e. is 1 for 0/1 labels).
x = X_test[0]
np.all(mlc.predict(np.array([x])) > 0.5)

In [None]:
# Earlier variant, kept for reference: used the true labels instead of the model.
# v_A = find_v_A(X_train, np.where(y_train == 1)[0], pos_idx=pos_idx, neg_idx=neg_idx)
#
# find_v_A_model is not defined in this notebook; presumably it comes from one of the
# star-imported src.lattmc.fca modules — confirm its contract there.
# Inputs passed here: the training rows, the positions of the rows predicted as 1, the
# coefficient-sign index sets, a per-row predicate returning 1 when the classifier
# predicts 1 for that row (else 0), and the target label y=1.
v_A, clusters = find_v_A_model(
    X_train, 
    np.where(y_predt == 1)[0], 
    pos_idx=pos_idx, 
    neg_idx=neg_idx,
    model = lambda x: 1 if np.all(mlc.predict(np.array([x])) > 0.5) else 0,
    y=1,
)

In [None]:
v_A, clusters

In [None]:
X_train.shape, v_A.shape

In [None]:
# find_G_xs is a project helper not defined in this notebook.
# Presumably returns one collection of X_test row indices per entry of v_A — later
# cells iterate G_As and index X_test with each element; confirm against the helper.
G_As = find_G_xs(X_test, v_A, pos_idx=pos_idx, neg_idx=neg_idx)

In [None]:
G_As

In [None]:
X_test.shape

In [None]:
# Flatten every group in G_As into one list, then keep the distinct members.
ga_ls = [member for group in G_As for member in group]
ga_st = set(ga_ls)

In [None]:
len(ga_st) , ga_st

In [None]:
# For each group in G_As, print the labels the classifier predicts for the test rows it indexes.
for G_x in G_As:
    print(mlc.predict(X_test[np.array(G_x)]))

In [None]:
# Count the (test row, collected index) pairs for which `le` holds between the indexed
# test row and the test row under consideration.
i = sum(
    1
    for x in X_test
    for v in ga_st
    if le(X_test[v], x, pos_idx=pos_idx, neg_idx=neg_idx)
)
print(i)

In [None]:
# Same call as the earlier `y_preds = mlc.predict(X_test)`, stored under a second name.
y_pred = mlc.predict(X_test)

In [None]:
np.count_nonzero(y_pred == 1)

In [None]:
# find_G_x is a project helper not defined in this notebook.
# Its result is used below to index y_train, so it presumably holds training-row indices — confirm.
G_A = find_G_x(X_train, v_A, pos_idx=pos_idx, neg_idx=neg_idx)

In [None]:
v_A

In [None]:
# Number of G_A-indexed training labels equal to 1, divided by the number of all training labels equal to 1.
np.count_nonzero(y_train[G_A] == 1) / np.count_nonzero(y_train == 1)

In [None]:
# Number of G_A-indexed training labels equal to 0, divided by the number of all training labels equal to 1.
# Note the denominator counts label 1, not label 0.
np.count_nonzero(y_train[G_A] == 0) / np.count_nonzero(y_train == 1)

In [None]:
# Difference of the two previous ratios: (label-1 count minus label-0 count within G_A)
# divided by the number of all training labels equal to 1.
(np.count_nonzero(y_train[G_A] == 1) - np.count_nonzero(y_train[G_A] == 0)) / np.count_nonzero(y_train == 1)

In [None]:
# np.where returns a tuple of index arrays; take element [0] so that find_v_A receives
# a plain index array, as in the other call sites in this notebook.
v_B = find_v_A(X_test, np.where(y_test == 1)[0], pos_idx=pos_idx, neg_idx=neg_idx)

# NOTE(review): G_B is built from v_A (derived from the training split), not from v_B.
# That evaluates the training-derived v_A on the test rows; confirm this is intended
# and not a copy-paste slip.
G_B = find_G_x(X_test, v_A, pos_idx=pos_idx, neg_idx=neg_idx)

In [None]:
v_B

In [None]:
# Number of G_B-indexed test labels equal to 1, divided by the number of all test labels equal to 1.
np.count_nonzero(y_test[G_B] == 1) / np.count_nonzero(y_test == 1)

In [None]:
# Number of G_B-indexed test labels equal to 0, divided by the number of all test labels equal to 1.
# Note the denominator counts label 1, not label 0.
np.count_nonzero(y_test[G_B] == 0) / np.count_nonzero(y_test == 1)

In [None]:
# Difference of the two previous ratios: (label-1 count minus label-0 count within G_B)
# divided by the number of all test labels equal to 1.
(np.count_nonzero(y_test[G_B] == 1) - np.count_nonzero(y_test[G_B] == 0)) / np.count_nonzero(y_test == 1)