In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import product
import json
from collections import defaultdict

# call ../src/FoldSplitter.py
import sys

sys.path.append("../src")

from FoldSplitter import FoldSplitter

from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import NMF, PCA

## Constants

In [2]:
DISTANCE_MATRIX_PATH = "../data/raw/distances_duration_norm.csv"
LABELS_PATH = "../data/raw/dis.csv"

VALIDATIONS_ORDER = ["new_item", "new_subject", "new_item_and_subject"]
FOLD_INDICES = [0, 2, 4, 6, 8]
NUM_FOLDS = 10
FOLD_SPLITTER = FoldSplitter(
    item_columns=["batch", "article_id"],
    subject_column="subject_id",
    groupby_columns=[],
    num_folds=NUM_FOLDS,
)

TARGET_COLUMN = "has_preview"
if TARGET_COLUMN == "has_preview":  # determine positive label
    POS_LABEL = "Hunting"
else:
    raise ValueError(f"Unsupported target column - {TARGET_COLUMN}")

SCORER = {
    "accuracy": lambda y, y_pred: np.mean(y == y_pred),
    "precision": lambda y, y_pred: np.sum((y == POS_LABEL) & (y_pred == POS_LABEL))
    / np.sum(y_pred == POS_LABEL),
    "recall": lambda y, y_pred: np.sum((y == POS_LABEL) & (y_pred == POS_LABEL))
    / np.sum(y == POS_LABEL),
}

## Helper Functions

### Load Data

In [3]:
def load_distance_matrix(path=DISTANCE_MATRIX_PATH):
    distances_duration_norm = pd.read_csv(path)
    return distances_duration_norm


def load_labels(path=LABELS_PATH):
    labels = pd.read_csv(path)
    return labels

### Preprocess Data

In [4]:
def preprocess_labels(labels):
    labels[["batch", "article_id", "level", "paragraph_id"]] = labels[
        "unique_paragraph_id"
    ].str.split("_", expand=True)
    labels["unique_id"] = (
        labels["has_preview"]
        + "_"
        + labels["subject_id"]
        + "_"
        + labels["unique_paragraph_id"]
    )

    # remove unwanted rows and remember which rows were removed
    rows_removed = labels["reread"] == 1

    ## remove reread
    labels = labels[labels["reread"] == 0]

    # reset index
    labels = labels.reset_index(drop=True)

    return labels, rows_removed

In [5]:
def preprocess_distances(distances, labels, rows_removed):
    df = distances.copy()
    # remove 'Unnamed: 0' column
    df = df.drop(columns="Unnamed: 0")

    # remove rows and columns that were removed in the labels preprocessing
    index = df.index
    columns = df.columns
    assert len(index) == len(columns) == len(rows_removed), print(
        f"{len(index)}, {len(columns)}, {len(rows_removed)}"
    )

    df = df.loc[df.index[~rows_removed], df.columns[~rows_removed]]

    # set lables['unique_id'] as distances_duration_norm index and columns
    df = df.set_index(labels["unique_id"])

    df.columns = labels["unique_id"]

    return df

### Side Functions

In [6]:
def element_wise_mean(list_of_lists):
    return [np.mean(x) for x in zip(*list_of_lists)]

In [7]:
def get_hyperparameters_combinations(
    hyperparameters: dict[str, list], filter_func=None
):
    """
    Get all possible hyperparameters combinations
    """
    keys = hyperparameters.keys()
    values = hyperparameters.values()

    ret = [defaultdict(lambda: None, dict(zip(keys, v))) for v in product(*values)]
    if filter_func is not None:
        ret = list(filter(filter_func, ret))

    return ret

In [8]:
def train_val_test_generator(labels, distances):
    for fold_index in FOLD_INDICES:
        train_keys, val_keys_list, test_keys_list = (
            FOLD_SPLITTER._train_val_test_splits(
                group_keys=labels, fold_index=fold_index
            )
        )

        # distances from train_keys to train_keys
        train_distances = distances.loc[train_keys.unique_id, train_keys.unique_id]

        # distances from val_keys to train_keys
        val_list_distances = [
            distances.loc[val_keys.unique_id, train_keys.unique_id]
            for val_keys in val_keys_list
        ]

        # distances from test_keys to train_keys
        test_list_distances = [
            distances.loc[test_keys.unique_id, train_keys.unique_id]
            for test_keys in test_keys_list
        ]

        yield (
            fold_index,
            train_distances,
            val_list_distances,
            test_list_distances,
            train_keys,
            val_keys_list,
            test_keys_list,
        )

In [9]:
def get_classifier(hyperparameters):
    """return none if hyperparameters are invalid"""
    if hyperparameters["classifier"] == "knn":
        if (
            hyperparameters["dim_reduction"] is not None
            and hyperparameters["metric"] == "precomputed"
        ):
            return None
        if (
            hyperparameters["dim_reduction"] is None
            and hyperparameters["metric"] != "precomputed"
        ):
            return None
        clf = make_pipeline(
            KNeighborsClassifier(
                n_neighbors=hyperparameters["n_neighbors"],
                weights=hyperparameters["weights"],
                metric=hyperparameters["metric"],
            )
        )
        ## add dimensionality reduction
        if hyperparameters["dim_reduction"] is None:
            pass
        elif hyperparameters["dim_reduction"] == "pca":
            clf.steps.insert(
                0, ("pca", PCA(n_components=hyperparameters["n_components"]))
            )
        elif hyperparameters["dim_reduction"] == "nmf":
            clf.steps.insert(
                0, ("nmf", NMF(n_components=hyperparameters["n_components"]))
            )
        else:
            raise ValueError(
                f"Unsupported dimensionality reduction - {hyperparameters['dim_reduction']}"
            )

        return clf

    elif hyperparameters["classifier"] == "dummy":
        return DummyClassifier(strategy=hyperparameters["strategy"])
    else:
        return None

### Training

In [10]:
def _train_clf(
    clf,
    distances,
    labels,
    fold_index,
    train_distances,
    val_list_distances,
    test_list_distances,
    train_keys,
    val_keys_list,
    test_keys_list,
):
    labels_with_unique_id_as_index = labels.set_index("unique_id")

    clf.fit(
        train_distances,
        labels_with_unique_id_as_index.loc[train_distances.index, TARGET_COLUMN],
    )

    # predict val and test
    val_preds = [clf.predict(val_distances) for val_distances in val_list_distances]
    test_preds = [clf.predict(test_distances) for test_distances in test_list_distances]

    # calculate scores
    val_scores = {
        scorer_name: [
            scorer(
                labels_with_unique_id_as_index.loc[val_keys.unique_id, TARGET_COLUMN],
                val_pred,
            )
            for val_keys, val_pred in zip(val_keys_list, val_preds)
        ]
        for scorer_name, scorer in SCORER.items()
    }

    test_scores = {
        scorer_name: [
            scorer(
                labels_with_unique_id_as_index.loc[test_keys.unique_id, TARGET_COLUMN],
                test_pred,
            )
            for test_keys, test_pred in zip(test_keys_list, test_preds)
        ]
        for scorer_name, scorer in SCORER.items()
    }

    return val_scores, test_scores

In [11]:
def train_clf(hyperparameters, labels, distances):
    """
    @param hyperparameters: dict (name, values_list)
    """
    clfs = [
        get_classifier(hp) for hp in get_hyperparameters_combinations(hyperparameters)
    ]

    # filter None values
    clfs = [clf for clf in clfs if clf is not None]

    # print number of classifiers and hyperparameters
    for i, clf in enumerate(clfs):
        print(f"Classifier {i}: {clf}")

    scores = {}

    for (
        fold_index,
        train_distances,
        val_list_distances,
        test_list_distances,
        train_keys,
        val_keys_list,
        test_keys_list,
    ) in train_val_test_generator(labels, distances):
        for clf in (pbar := tqdm(clfs)):
            # update description of tqdm to str(clf)
            pbar.set_description(str(clf))

            # train and evaluate classifier
            val_scores, test_scores = _train_clf(
                clf,
                distances,
                labels,
                fold_index,
                train_distances,
                val_list_distances,
                test_list_distances,
                train_keys,
                val_keys_list,
                test_keys_list,
            )

            # save scores
            scores[fold_index] = scores.get(fold_index, {})  # default dict
            scores[fold_index][str(clf)] = {
                "val": val_scores,
                "test": test_scores,
            }

    return scores

In [12]:
def save_scores(scores, path):
    with open(path, "w") as f:
        json.dump(scores, f)

### Interperate Scores

In [13]:
def interprate_scores(scores):
    # interpret results

    ## for each fold, get the test scores for the best hyperparameters
    ## where the best hyperparameters are the ones that maximize the average of the scores
    best_hyperparameters_per_fold = {}
    for fold_index in scores:
        best_hyperparameters_per_fold[fold_index] = max(
            scores[fold_index].items(),
            key=lambda x: np.mean(x[1]["val"]["accuracy"]),
        )

    ## get the average of the scores for the best hyperparameters for each fold and average over all folds
    val_scores = {
        scorer_name: element_wise_mean(
            [
                scores[fold_index][best_hyperparameters_per_fold[fold_index][0]]["val"][
                    scorer_name
                ]
                for fold_index in scores
            ]
        )
        for scorer_name in SCORER.keys()
    }
    test_scores = {
        scorer_name: element_wise_mean(
            [
                scores[fold_index][best_hyperparameters_per_fold[fold_index][0]][
                    "test"
                ][scorer_name]
                for fold_index in scores
            ]
        )
        for scorer_name in SCORER.keys()
    }

    return best_hyperparameters_per_fold, val_scores, test_scores

In [14]:
def print_inter_scores_pretty(pretext, scores):
    print(pretext)
    """
    scores is a dict with the following structure:
    {
        scorer_name: [score_new-item, score_new-subject, score_new-item-and-subject]
    }
    print it as a table where columns are the val_order and rows are the scorer_name
    """
    df = pd.DataFrame(scores)

    # transpose
    df = df.T

    df.index = scores.keys()
    df.columns = VALIDATIONS_ORDER
    print(df, end="\n\n")

## Training

In [15]:
print("Loading labels and distances...")
distances = load_distance_matrix()

### kNN

In [16]:
print("Preprocessing labels and distances...")
labels = load_labels()
labels, rows_removed = preprocess_labels(labels)
processed_distances = preprocess_distances(distances, labels, rows_removed)

FOLD_SPLITTER.create_folds(group_keys=labels)

# train kNN classifier
hyperparameters = {
    "n_neighbors": [1, 2, 3, 5, 7, 9],
    "weights": ["uniform", "distance"],
    "metric": ["l2", "precomputed"],
    "classifier": ["knn"],
    "dim_reduction": [None, "pca", "nmf"],
    "n_components": [0.1, 0.3, 0.5, 0.7, 0.9],
}

print("Training classifiers...")
scores = train_clf(hyperparameters, labels, processed_distances)

HydraConfig not found. Using default path.
../data/folds
Classifier 0: Pipeline(steps=[('pca', PCA(n_components=0.1)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(metric='l1', n_neighbors=1))])
Classifier 1: Pipeline(steps=[('pca', PCA(n_components=0.3)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(metric='l1', n_neighbors=1))])
Classifier 2: Pipeline(steps=[('pca', PCA(n_components=0.5)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(metric='l1', n_neighbors=1))])
Classifier 3: Pipeline(steps=[('pca', PCA(n_components=0.7)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(metric='l1', n_neighbors=1))])
Classifier 4: Pipeline(steps=[('pca', PCA(n_components=0.9)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(metric='l1', n_neighbors=1))])
Classifier 5: Pipeline(steps=[('nmf', NMF(n_components=0.1)),
                ('kne

  0%|          | 0/300 [00:00<?, ?it/s]

In [None]:
# save scores
save_scores(scores, "scores.json")

# interpret results
best_hyperparameters_per_fold, val_scores, test_scores = interprate_scores(scores)

# print results
print_inter_scores_pretty("Validation scores:", val_scores)
print_inter_scores_pretty("Test scores:", test_scores)
print(best_hyperparameters_per_fold)