## Name: Tensor decomposition for classification
### Date: 29/10/2024
### Status: Works out of the box! Interesting idea, as it is a likelihood-based model.
### Idea: 
Fit a PCA on the concat(X, Y) matrix, i.e. the feature matrix augmented with the class label of each sample, which gives a matrix of shape $(Samples \times (Features + 1))$.

Then, at test time create two matrices $Y_1 = concat(X_{test}, Ones)$, and $Y_0 = concat(X_{test}, Zeros)$ and score the log-likelihood of each sample, given the PCA-fitted model.

The resulting label is $1$ if the log-likelihood of the sample's row in $Y_1$ is greater than that of its row in $Y_0$, else $0$ (i.e. we keep the label of the most probable configuration for each sample).

### Results:
34/64 (53%) wins over generic Decision Tree. Not bad.

Details:
- The number of PCA components is set to 90% of the number of features (rounded down).
- Also experimented to find the optimal number of components by:
  - First fitting a PCA with as many components as features, finding the number of components that reaches 90% cumulative explained variance, and then re-fitting the PCA with that number of components. This gave worse results.

In [None]:
import time

import cached_path
import numpy as np
import pandas as pd
from pmlb import fetch_data
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.tree import DecisionTreeClassifier


# URL of the PMLB summary table; it is resolved through cached_path before
# being parsed as a tab-separated file.
path_to_data_summary = "https://raw.githubusercontent.com/EpistasisLab/pmlb/master/pmlb/all_summary_stats.tsv"
dataset_df = pd.read_csv(cached_path.cached_path(path_to_data_summary), sep="\t")

# Selection criteria: binary classification tasks with at most 100 features
# and at most 1000 instances.
# (An extra "all features are binary" filter,
#  n_binary_features == n_features, is left disabled.)
is_classification = dataset_df["task"] == "classification"
has_two_classes = dataset_df["n_classes"] == 2
is_small = (dataset_df["n_features"] <= 100) & (dataset_df["n_instances"] <= 1000)

# Names of the datasets that satisfy every criterion.
classification_datasets = dataset_df.loc[
    is_classification & has_two_classes & is_small, "dataset"
]

print(len(classification_datasets))

# Identifiers of the models compared on every dataset; the benchmark loop
# dispatches on these names ('TD' = PCA likelihood model, 'DT' = decision tree).
models = ['TD', 'DT']

# Cross-validation settings shared by all models so that every model is
# evaluated on the same shuffled, stratified folds.
random_state = 42
number_of_cv_folds = 5

cv = StratifiedKFold(
    n_splits=number_of_cv_folds,
    shuffle=True,
    random_state=random_state,
)

# Benchmark loop: for every selected dataset, obtain cross-validated
# predictions from each model in `models` and record one result row per
# (dataset, model) pair in `res`.
res = []
n_datasets = len(classification_datasets)
for dataset_index, classification_dataset in enumerate(classification_datasets[::-1]):

    print(f"{classification_dataset} ({dataset_index + 1}/{n_datasets})")
    X, y = fetch_data(classification_dataset, return_X_y=True)
    if y.max() != 1 or y.min() != 0:
        # Re-encode the labels as 0..k-1 in sorted order. Using the inverse
        # index avoids the collisions an in-place, value-by-value rewrite can
        # produce (e.g. labels {-1, 0} would all end up as 1).
        _, y = np.unique(y, return_inverse=True)

    class_counts = np.bincount(y)
    imb_ratio = class_counts.max() / class_counts.min()
    print(f"{X.shape} with ratio : {imb_ratio:.4f}\n")

    for model_name in models:
        time_s = time.perf_counter()
        if model_name == 'DT':
            # Baseline: decision tree evaluated on the shared CV folds.
            clf = DecisionTreeClassifier(random_state=random_state)
            y_pred = cross_val_predict(clf, X, y, cv=cv).astype(int)

        elif model_name == 'TD':
            # PCA likelihood classifier: fit PCA on [X | y], then for each
            # test sample pick the label whose augmented row is more likely.
            y_pred = np.empty_like(y)
            for train_indices, test_indices in cv.split(X, y):
                X_train, y_train = X[train_indices], y[train_indices]
                X_test = X[test_indices]
                n_features = X_train.shape[1]
                n_test = X_test.shape[0]

                # Training matrix with the label appended as the last column,
                # and the two candidate test matrices (label 0 / label 1).
                ext = np.hstack((X_train, y_train.reshape(-1, 1)))
                ext_zeros = np.hstack((X_test, np.zeros((n_test, 1))))
                ext_ones = np.hstack((X_test, np.ones((n_test, 1))))

                # Number of components: 90% of the feature count, rounded down.
                tr = PCA(n_components=int(0.9 * n_features))
                tr.fit(ext)

                print(f"Will fit on {tr.n_components_}/{n_features} (reduction: {100*(n_features - tr.n_components_)/n_features:.2f} %)")

                # Compare log-likelihoods directly. Exponentiating first can
                # underflow to 0 (or overflow to inf) for both candidates,
                # which would silently turn every prediction into class 0.
                ll_zeros = tr.score_samples(ext_zeros)
                ll_ones = tr.score_samples(ext_ones)
                y_pred[test_indices] = (ll_ones > ll_zeros).astype(int)

        else:
            # Fail loudly instead of scoring a stale y_pred from a previous model.
            raise ValueError(f"Unknown model name: {model_name!r}")

        # Stop the clock before metrics/reporting so only prediction is timed.
        time_end = time.perf_counter() - time_s

        acc = accuracy_score(y, y_pred)
        (prec, rec, f1, sup) = precision_recall_fscore_support(
            y, y_pred, average="binary"
        )

        print(model_name)
        print(classification_report(y, y_pred))
        res.append((classification_dataset, imb_ratio, model_name, time_end, acc, prec, rec, f1))
        
# Turn the collected result tuples into a table, one row per (dataset, model).
res = pd.DataFrame(res, columns=['dataset', 'dataset_class_imb', 'model', 'time', 'acc', 'pr', 'rec', 'f1'])

# Order rows by dataset, best F1 first within each dataset. A plain multi-key
# sort replaces groupby(...).apply(sort_values), which depends on the grouping
# column being passed through to the applied function.
sorted_df = res.sort_values(by=['dataset', 'f1'], ascending=[True, False]).reset_index(drop=True)

# Rank the models within each dataset by F1 (1 = best). Tied models share the
# average rank, so a tie does not favour whichever row happens to come first.
sorted_df['rank'] = sorted_df.groupby('dataset')['f1'].rank(method='average', ascending=False)

# Mean rank of each model across all datasets, best (lowest) first.
mean_ranks = sorted_df.groupby('model')['rank'].mean().reset_index().sort_values(by='rank')

print(mean_ranks)
            
    #break

In [117]:
# Pairwise win matrix: wins_score[a, b] counts the datasets on which model a
# scored strictly higher than model b on `metric_to_score`. Ties count for
# neither model.
wins_score = np.zeros((len(models), len(models)))

metric_to_score = 'f1'
for classification_dataset in res['dataset'].unique():
    cur_df = res[res['dataset'] == classification_dataset].set_index('model')
    # Metric value of every model on this dataset, indexed by model name.
    score_metric = cur_df[metric_to_score]
    for i, m1 in enumerate(models):
        # Visit each unordered pair once; a model is never compared to itself.
        for j, m2 in enumerate(models[i + 1:], start=i + 1):
            if score_metric.loc[m1] > score_metric.loc[m2]:
                wins_score[i, j] += 1
            elif score_metric.loc[m1] < score_metric.loc[m2]:
                wins_score[j, i] += 1

# Reorder rows and columns so the model with the most wins on average comes first.
order_of_models = wins_score.mean(axis=1).argsort()[::-1]
wins_score = wins_score[order_of_models, :][:, order_of_models]
ordered_names = np.array(models)[order_of_models]
print('WINS')
print(pd.DataFrame(wins_score, columns=ordered_names, index=ordered_names))

WINS
      TD    DT
TD   0.0  34.0
DT  29.0   0.0
