In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the cell output.
%matplotlib inline

## Install libraries

```bash
conda create -n edu4 python=3.11 jupyter matplotlib
```

```bash 
! pip install -U -r requirements.txt
```

```bash
! pip install -U numpy
! pip install -U scikit-learn
```

## Update repository

In [None]:
# Shell escape: pull the latest commits for the git repository that contains
# the kernel's working directory.
! git pull

## Add import path

In [None]:
import os
import sys
import gc

In [None]:
# Put the parent of the current working directory on the import path (once),
# so packages living one level above the notebook can be imported.
# NOTE(review): presumably the parent is the repository root holding `src/` — confirm.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
# Remove the temporary variable from the notebook namespace; sys.path keeps the entry.
del module_path

## Organize imports

In [None]:
import multiprocessing
from pathlib import Path

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
import plotly.express as px

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis, 
    QuadraticDiscriminantAnalysis
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.datasets import (
    load_iris,
    load_wine,
    load_breast_cancer
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    StandardScaler,
    LabelEncoder, 
    OneHotEncoder,
)
from sklearn.metrics import (
    precision_score, 
    recall_score, 
    f1_score,
    classification_report,
    confusion_matrix
)
from sklearn.compose import ColumnTransformer, make_column_transformer

In [None]:
from scipy import stats
from scipy.interpolate import interp1d

In [None]:
from src.lattmc.fca.utils import *
from src.lattmc.fca.data_utils import *
from src.lattmc.fca.image_utils import *
from src.lattmc.fca.models import *
from src.lattmc.fca.fca_utils import *
from src.lattmc.fca.image_gens import *

#### Number of CPU cores

In [None]:
# Number of CPU cores reported by the system; the bare name displays it as cell output.
workers = multiprocessing.cpu_count()
workers

In [None]:
# Notebook-wide random seed constant.
# NOTE(review): presumably intended for `random_state` arguments — confirm where it is consumed.
SEED = 2024

## Initialize Path

In [None]:
# Root data directory, relative to the kernel's working directory.
PATH = Path('data')

# Directory for images; created (with parents) if it does not exist yet.
images_path = PATH.joinpath('images')
images_path.mkdir(parents=True, exist_ok=True)

# Locations of the input datasets; nothing is read here.
pumpkin_path = PATH.joinpath('Pumpkin_Seeds_Dataset.xlsx')
ad_click_path = PATH.joinpath('advertising.csv')

## Prepare the model

#### IRIS data

In [None]:
# Step 1: Load the Iris dataset (feature matrix X, integer class labels y)
X, y = load_iris(return_X_y=True)

# Step 2: Data Preprocessing
# Standardize the features
# NOTE(review): the scaler is fitted on the full dataset before the split, so
# test-set statistics influence the training features — consider fitting it on
# the training portion only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Split the dataset into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 4: Hyperparameter Tuning with GridSearchCV
# 20 log-spaced values of C in [1e-4, 1e4], both penalties, liblinear solver.
param_grid = {
    'C': np.logspace(-4, 4, 20),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}
# liblinear uses a random number generator while fitting; seed the estimator
# with the notebook-wide SEED so repeated runs give the same coefficients.
grid_search = GridSearchCV(
    LogisticRegression(max_iter=10000, random_state=SEED),
    param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Best parameters
print(f"Best parameters: {grid_search.best_params_}")

# Step 5: Make predictions on the test set
y_pred = grid_search.predict(X_test)

# Step 6: Evaluate the model (macro average: unweighted mean over the classes)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print evaluation metrics
print('\nClassification Report:')
print(classification_report(y_test, y_pred))
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 Score: {f1:.3f}')

## GV build

In [None]:
# Best estimator found by the grid search; the following cells read its coef_.
mlc = grid_search.best_estimator_

In [None]:
# Display the fitted coefficient array.
mlc.coef_

In [None]:
# Display the coefficient array's shape; the next cell branches on its first dimension.
mlc.coef_.shape

In [None]:
# Derive, from the sign of the fitted coefficients, the column indices with
# non-negative (pos_idx) and negative (neg_idx) weights, together with the
# feature matrices (X_traic / X_tesc) those indices refer to.
if len(mlc.coef_.shape) == 2 and mlc.coef_.shape[0] > 1:
    # Several coefficient rows: every sample is widened to n_rows copies of its
    # features laid side by side, and row i of coef_ addresses columns
    # [i * st, (i + 1) * st) of the widened matrix.
    n_rows, st = mlc.coef_.shape
    pos_list = list()
    neg_list = list()
    for i, cf in enumerate(mlc.coef_):
        # The temporaries are deliberately not called `pd`: at notebook scope
        # that name would overwrite the pandas alias imported earlier.
        pos_cols = i * st + np.where(cf >= 0)[0]
        neg_cols = i * st + np.where(cf < 0)[0]
        pos_list.append(pos_cols)
        neg_list.append(neg_cols)
    pos_idx = np.concatenate(pos_list)
    neg_idx = np.concatenate(neg_list)
    # Repeat each row's features n_rows times along the column axis.
    X_traic = np.tile(X_train, (1, n_rows))
    X_tesc = np.tile(X_test, (1, n_rows))
else:
    # Single coefficient row: use the column indices of the 2-D coef_ directly
    # and keep the feature matrices unchanged.
    pos_idx = np.where(mlc.coef_ >= 0)[1]
    neg_idx = np.where(mlc.coef_ < 0)[1]
    X_traic = X_train
    X_tesc = X_test

In [None]:
# Display the non-negative- and negative-coefficient column index arrays.
pos_idx, neg_idx

In [None]:
# Group the training rows by class label: X_trains[k] holds the rows of
# X_traic whose label in y_train equals k, for k = 0 .. largest label.
X_trains = {}
for i in range(int(y_train.max()) + 1):
    X_trains[i] = X_traic[y_train == i]

In [None]:
# Group the test rows by class label: X_tests[k] holds the rows of X_tesc
# whose label in y_test equals k, for k = 0 .. largest label present.
X_tests = {}
for i in range(int(y_test.max()) + 1):
    X_tests[i] = X_tesc[y_test == i]

In [None]:
# Display the positions of the test samples labelled 1.
np.where(y_test == 1)

In [None]:
# Sanity check: the labels at those positions (all equal to 1 by construction).
y_test[np.where(y_test == 1)]

In [None]:
# find_v_A / find_G_x are not defined in this notebook (they arrive through the
# wildcard project imports), so their exact semantics are not visible here.
# NOTE(review): presumably v_A is a descriptor computed from the class-1
# training rows and G_A selects the training rows matching it — confirm.
v_A = find_v_A(X_traic, np.where(y_train == 1), pos_idx=pos_idx, neg_idx=neg_idx)
G_A = find_G_x(X_traic, v_A, pos_idx=pos_idx, neg_idx=neg_idx)

In [None]:
# Display the value returned by find_v_A for the training data.
v_A

In [None]:
# Count of label-1 entries among y_train[G_A], relative to the number of
# label-1 training samples.
np.count_nonzero(y_train[G_A] == 1) / np.count_nonzero(y_train == 1)

In [None]:
# Count of label-0 entries among y_train[G_A], relative to the number of
# label-1 training samples.
# NOTE(review): Iris has three classes; label-2 entries selected by G_A are not
# counted here — confirm whether `!= 1` was intended.
np.count_nonzero(y_train[G_A] == 0) / np.count_nonzero(y_train == 1)

In [None]:
# Difference of the two counts above (label-1 minus label-0 among y_train[G_A]),
# relative to the number of label-1 training samples.
(np.count_nonzero(y_train[G_A] == 1) - np.count_nonzero(y_train[G_A] == 0)) / np.count_nonzero(y_train == 1)

In [None]:
# Same two helper calls on the test data.
# NOTE(review): G_B is computed from v_A (derived from the training set), not
# from v_B, which is only displayed below. That may be deliberate (evaluating
# the training-derived descriptor on unseen data) — confirm it is not a typo.
v_B = find_v_A(X_tesc, np.where(y_test == 1), pos_idx=pos_idx, neg_idx=neg_idx)
G_B = find_G_x(X_tesc, v_A, pos_idx=pos_idx, neg_idx=neg_idx)

In [None]:
# Display the value returned by find_v_A for the test data.
v_B

In [None]:
# Count of label-1 entries among y_test[G_B], relative to the number of
# label-1 test samples.
np.count_nonzero(y_test[G_B] == 1) / np.count_nonzero(y_test == 1)

In [None]:
# Count of label-0 entries among y_test[G_B], relative to the number of
# label-1 test samples.
# NOTE(review): Iris has three classes; label-2 entries selected by G_B are not
# counted here — confirm whether `!= 1` was intended.
np.count_nonzero(y_test[G_B] == 0) / np.count_nonzero(y_test == 1)

In [None]:
# Difference of the two counts above (label-1 minus label-0 among y_test[G_B]),
# relative to the number of label-1 test samples.
(np.count_nonzero(y_test[G_B] == 1) - np.count_nonzero(y_test[G_B] == 0)) / np.count_nonzero(y_test == 1)