In [None]:
!pip install wandb

In [None]:
import wandb
wandb.login()

In [None]:
"""
This file is a modified version of the one found here:
    https://github.com/pan-webis-de/muttenthaler19/blob/master/AuthorshipAttribution.ipynb
"""
import os
import pickle
import re
import json
import argparse
import time
import logging
import numpy as np
import wandb
import csv
import pandas as pd
from typing import List, Callable, Tuple, Union
import argparse
import copy

from sklearn import preprocessing
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD
# from sklearn.svm import SVC  # used in the original implementation but very slow on large datasets
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import roc_auc_score, f1_score, brier_score_loss
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Get the root logger; its level applies to every `logging.*` call made
# through the module-level helpers used throughout this notebook.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)  # Set the logging level to DEBUG

# Create a console handler that accepts records down to DEBUG level.
# NOTE(review): in this cell the handler is only created, it is not attached
# with `logger.addHandler(console_handler)` — confirm it is attached elsewhere,
# otherwise it has no effect on what gets printed.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)

# dataset

In [None]:
def aa_as_pandas(data: List[List[Union[int, str]]]) -> pd.DataFrame:
    """Wrap authorship-attribution rows in a DataFrame.

    Parameters
    ----------
    data : rows of two values each; the first becomes the ``labels`` column
        and the second the ``text`` column.

    Returns
    -------
    pd.DataFrame with the columns ``labels`` and ``text``.
    """
    column_names = ['labels', 'text']
    frame = pd.DataFrame(data, columns=column_names)
    return frame


def av_as_pandas(data: List[List[Union[int, str]]]) -> pd.DataFrame:
    """Wrap authorship-verification rows in a DataFrame.

    Parameters
    ----------
    data : rows of three values each, mapped in order onto the columns
        ``same/diff``, ``text0`` and ``text1``.

    Returns
    -------
    pd.DataFrame with the columns ``same/diff``, ``text0`` and ``text1``.
    """
    column_names = ['same/diff', 'text0', 'text1']
    frame = pd.DataFrame(data, columns=column_names)
    return frame


# def get_aa_dataset(dataset_path: str) -> List[List[Union[int, str]]]:
#     data = []
#     with open(dataset_path, 'r', errors='ignore') as f:
#         reader = csv.reader(f)
#         for i, line in enumerate(reader):
#             if i > 0:  # skip header
#                 data.append([int(line[0]), str(line[1])])
#     return data


def get_aa_dataset(dataset_path: str, label_col: int = 9, text_col: int = 1) -> List[List[Union[int, str]]]:
    """Read an authorship-attribution CSV file into ``[label, text]`` rows.

    The first row of the file is treated as a header and skipped, and blank
    lines are ignored.

    Parameters
    ----------
    dataset_path : path to the CSV file.
    label_col : index of the column holding the integer label
        (default 9, the index that was previously hard-coded).
    text_col : index of the column holding the text
        (default 1, the index that was previously hard-coded).

    Returns
    -------
    A list of ``[int(label), str(text)]`` pairs, in file order.

    Raises
    ------
    IndexError if a non-empty row has fewer columns than requested.
    ValueError if a label cannot be converted to ``int``.
    """
    data = []
    # newline='' is what the csv module requires so that line breaks embedded
    # in quoted fields are handed to the parser untouched.
    with open(dataset_path, 'r', errors='ignore', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header
        for line in reader:
            if not line:  # csv.reader yields [] for blank lines
                continue
            data.append([int(line[label_col]), str(line[text_col])])
    return data


def get_av_dataset(dataset_path: str) -> List[List[Union[int, str, str]]]:
    """Read an authorship-verification CSV file into ``[label, text0, text1]`` rows.

    The first row of the file is treated as a header and skipped. Blank lines
    are ignored (they used to raise ``IndexError``), which matches what
    ``get_aa_dataset`` does.

    Parameters
    ----------
    dataset_path : path to the CSV file; columns 0, 1 and 2 are read as the
        integer label and the two texts.

    Returns
    -------
    A list of ``[int(label), str(text0), str(text1)]`` rows, in file order.
    """
    data = []
    # newline='' is what the csv module requires so that line breaks embedded
    # in quoted fields are handed to the parser untouched.
    with open(dataset_path, 'r', errors='ignore', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header
        for line in reader:
            if not line:  # csv.reader yields [] for blank lines
                continue
            data.append([int(line[0]), str(line[1]), str(line[2])])
    return data


def get_aa_as_pandas(dataset_path: str) -> pd.DataFrame:
    """Load an authorship-attribution CSV straight into a DataFrame.

    The first row of the file is consumed as the header (``header=0``) and its
    column names are replaced by ``labels`` and ``text``.
    """
    read_options = {'header': 0, 'names': ['labels', 'text']}
    return pd.read_csv(dataset_path, **read_options)


def get_av_as_pandas(dataset_path: str) -> pd.DataFrame:
    """Load an authorship-verification CSV straight into a DataFrame.

    The columns are named ``same/diff``, ``text0`` and ``text1``.

    NOTE(review): unlike ``get_aa_as_pandas`` no ``header=0`` is passed, so the
    first line of the file is read as a data row. Confirm that the AV files
    read through this function really have no header line.
    """
    column_names = ['same/diff', 'text0', 'text1']
    frame = pd.read_csv(dataset_path, names=column_names)
    return frame

# evaluation

In [None]:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
# Evaluation script for the Cross-Domain Authorship Verification task @PAN2020.

## Measures
The following evaluation measures are provided:
    - F1-score [Pedregosa et al. 2011]
    - Area-Under-the-Curve [Pedregosa et al. 2011]
    - c@1 [Peñas and Rodrigo 2011; Stamatatos 2014]
    - f_05_u_score [Bevendorff et al. 2019]
    - the complement of the Brier score loss [Pedregosa et al. 2011]

Systems will be evaluated, taking all of the measures into account.

## Formats
The script requires two files, one for the ground truth (gold standard)
and one for the system predictions. These files should be formatted using
the `jsonl`-convention, whereby each line should contain a valid
json-string: e.g.

``` json
    {"id": "1", "value": 0.123}
    {"id": "2", "value": 0.5}
    {"id": "3", "value": 0.888}
```

Only files will be considered that:
- have the `.jsonl` extension
- are properly encoded as UTF-8.

Please note:
    * For the c@1, all scores will be binarized using
      the conventional thresholds:
        * score < 0.5 -> 0
        * score > 0.5 -> 1
    * A score of *exactly* 0.5, will be considered a non-decision.
    * All problems which are present in the ground truth, but which
      are *not* provided an answer to by the system, will automatically
      be set to 0.5.
    * Non-answers are removed for the F1 score calculation below, but they
      are taken into account by the AUC and Brier score.

## Dependencies:
- Python 3.6+ (we recommend the Anaconda Python distribution)
- scikit-learn

## Usage

From the command line:

>>> python pan20-verif-evaluator.py -i COLLECTION -a ANSWERS -o OUTPUT

where
    COLLECTION is the path to the file with the ground truth
    ANSWERS is the path to the answers file for a submitted method
    OUTPUT is the path to the folder where the results of the evaluation will be saved

Example:

>>> python pan20_verif_evaluator.py -i "datasets/test_truth/truth.jsonl" \
        -a "out/answers.jsonl" \
        -o "pan20-evaluation"

## References
- E. Stamatatos, et al. Overview of the Author Identification
  Task at PAN 2014. CLEF Working Notes (2014): 877-897.
- Pedregosa, F. et al. Scikit-learn: Machine Learning in Python,
  Journal of Machine Learning Research 12 (2011), 2825--2830.
- A. Peñas and A. Rodrigo. A Simple Measure to Assess Nonresponse.
  In Proc. of the 49th Annual Meeting of the Association for
  Computational Linguistics, Vol. 1, pages 1415-1424, 2011.
- Bevendorff et al. Generalizing Unmasking for Short Texts,
  Proceedings of NAACL (2019), 654-659.

"""


def binarize(y, threshold=0.5, triple_valued=False):
    """Map scores to 0/1 around ``threshold``.

    Parameters
    ----------
    y : array-like of scores.
    threshold : decision boundary (default 0.5).
    triple_valued : if False, scores ``>= threshold`` become 1; if True, only
        scores strictly ``> threshold`` become 1 and scores exactly equal to
        ``threshold`` are left untouched (non-decisions).

    Returns
    -------
    A new numpy array; the input is not modified because ``np.array`` copies.
    """
    y = np.array(y)
    # y = np.ma.fix_invalid(y, fill_value=threshold)
    # Both masks are computed from the original scores *before* anything is
    # written. Writing the 1s first and then testing `y < threshold` would
    # re-classify those fresh 1s as negatives whenever threshold > 1.
    if triple_valued:
        positive = y > threshold
    else:
        positive = y >= threshold
    negative = y < threshold
    y[positive] = 1
    y[negative] = 0
    return y


def auc(true_y, pred_y):
    """
    Area Under the ROC Curve for a set of verification scores.

    Unanswered problems (score = 0.5) are simply part of ``pred_y`` and are
    therefore taken into account.

    Parameters
    ----------
    true_y : array [n_problems]
        The gold annotations provided for each problem
        (typically `0` or `1`).

    pred_y : array [n_problems]
        The predictions outputted by a verification system
        (assumed to satisfy `0 <= prediction <= 1`).

    Returns
    ----------
    auc = the value returned by scikit-learn's `roc_auc_score`, or `0.0`
        when that call raises a `ValueError`.

    References
    ----------
        E. Stamatatos, et al. Overview of the Author Identification
        Task at PAN 2014. CLEF (Working Notes) 2014: 877-897.

    """
    try:
        score = roc_auc_score(true_y, pred_y)
    except ValueError:
        # deliberate best effort: an un-computable AUC counts as 0.0
        score = 0.0
    return score


def c_at_1(true_y, pred_y, threshold=0.5):
    """
    Calculates the c@1 score, an evaluation method specific to the
    PAN competition. This method rewards predictions which leave
    some problems unanswered (score == threshold). See:

        A. Peñas and A. Rodrigo. A Simple Measure to Assess Nonresponse.
        In Proc. of the 49th Annual Meeting of the Association for
        Computational Linguistics, Vol. 1, pages 1415-1424, 2011.

    Parameters
    ----------
    true_y : array [n_problems]
        The gold annotations provided for each problem.
        Expected to be `0` or `1`; they are always split at 0.5.

    pred_y : array [n_problems]
        The predictions outputted by a verification system.

    threshold : float
        Decision boundary applied to `pred_y` (default 0.5). A prediction
        exactly equal to `threshold` is a non-answer, a prediction above
        it is a positive decision. This argument used to be ignored in
        favour of a hard-coded 0.5.

    Returns
    ----------
    c@1 = the c@1 measure (which accounts for unanswered
        problems), or `0.0` when `pred_y` is empty.


    References
    ----------
        - E. Stamatatos, et al. Overview of the Author Identification
        Task at PAN 2014. CLEF (Working Notes) 2014: 877-897.
        - A. Peñas and A. Rodrigo. A Simple Measure to Assess Nonresponse.
        In Proc. of the 49th Annual Meeting of the Association for
        Computational Linguistics, Vol. 1, pages 1415-1424, 2011.

    """

    n = float(len(pred_y))
    if n == 0:
        # nothing to score; avoids a ZeroDivisionError below
        return 0.0

    nc, nu = 0.0, 0.0  # number of correct answers / unanswered problems

    for gt_score, pred_score in zip(true_y, pred_y):
        if pred_score == threshold:
            nu += 1
        elif (pred_score > threshold) == (gt_score > 0.5):
            nc += 1.0

    # unanswered problems are credited with the accuracy achieved on the rest
    return (1 / n) * (nc + (nu * nc / n))


def f1(true_y, pred_y, threshold=0.5):
    """
    F1 score of the decisions obtained by thresholding `pred_y`.

    Every problem whose score is exactly `threshold` is a non-decision
    and is dropped (together with its gold label) before scoring. The
    remaining scores are binarized with `binarize` (scores
    `>= threshold` become 1) and handed to scikit-learn's `f1_score`.

    Parameters
    ----------
    true_y : array [n_problems]
        The gold annotations provided for each problem
        (typically `0` or `1`).

    pred_y : array [n_problems]
        The predictions outputted by a verification system.

    threshold : float
        Decision boundary and non-decision marker (default 0.5).

    Returns
    ----------
    f1 = the F1 score computed over the answered problems.

    References
    ----------
        E. Stamatatos, et al. Overview of the Author Identification
        Task at PAN 2014. CLEF (Working Notes) 2014: 877-897.
    """
    answered = [(gold, score) for gold, score in zip(true_y, pred_y) if score != threshold]
    gold_labels = [gold for gold, _ in answered]
    kept_scores = [score for _, score in answered]

    decisions = binarize(kept_scores, threshold=threshold)

    return f1_score(gold_labels, decisions)


def f_05_u_score(true_y, pred_y, pos_label=1, threshold=0.5):
    """
    Return F0.5u score of prediction.

    Scores above `threshold` count as a decision for class 1, scores below
    it as a decision for class 0, and scores exactly equal to `threshold`
    as non-decisions. Previously the scores were always split at 0.5, no
    matter which `threshold` was passed in.

    The score is `1.25 * tp / (1.25 * tp + 0.25 * (fn + u) + fp)`, i.e.
    non-decisions are penalised like false negatives.

    :param true_y: true labels
    :param pred_y: predicted scores
    :param threshold: decision boundary and indication for non-decisions
                      (default = 0.5)
    :param pos_label: positive class label (default = 1)
    :return: F0.5u score; 0.0 when there are no true positives, false
             positives, false negatives or non-decisions at all
    """

    n_tp = 0
    n_fn = 0
    n_fp = 0
    n_u = 0

    for i, score in enumerate(pred_y):
        if score == threshold:
            n_u += 1
            continue

        # three-valued binarisation around the requested threshold
        if score > threshold:
            pred = 1
        elif score < threshold:
            pred = 0
        else:
            pred = score  # only reachable for NaN; left untouched

        if pred == pos_label and pred == true_y[i]:
            n_tp += 1
        elif pred == pos_label and pred != true_y[i]:
            n_fp += 1
        elif true_y[i] == pos_label and pred != true_y[i]:
            n_fn += 1

    denominator = 1.25 * n_tp + 0.25 * (n_fn + n_u) + n_fp
    if denominator == 0:
        # e.g. only true negatives: the ratio is undefined, report 0.0
        return 0.0
    return (1.25 * n_tp) / denominator

def brier_score(true_y, pred_y):
    """
    Complement of the Brier score loss, so that higher is better.

    The loss itself comes from scikit-learn's `brier_score_loss`
    [Pedregosa et al. 2011]; this function returns `1 - loss`.
    Unanswered problems (score = 0.5) are part of `pred_y` and are
    therefore taken into account.

    Parameters
    ----------
    true_y : array [n_problems]
        The gold annotations provided for each problem
        (typically `0` or `1`).

    pred_y : array [n_problems]
        The predictions outputted by a verification system
        (assumed to satisfy `0 <= prediction <= 1`).

    Returns
    ----------
    brier = float
        the complement of the Brier score, or `0.0` when
        `brier_score_loss` raises a `ValueError`.

    References
    ----------
    - Pedregosa, F. et al. Scikit-learn: Machine Learning in Python,
      Journal of Machine Learning Research 12 (2011), 2825--2830.

    """
    try:
        loss = brier_score_loss(true_y, pred_y)
    except ValueError:
        # deliberate best effort: an un-computable score counts as 0.0
        return 0.0
    return 1 - loss


def load_file(fn):
    """
    Load a jsonl file of problems into a dict keyed by problem id.

    Each non-blank line must be a JSON object with an `id` key. If the
    object has a `value` key, that value is stored as is; otherwise its
    `same` key is converted with `int()` and stored.

    Parameters
    ----------
    fn : path to the jsonl file (read as UTF-8).

    Returns
    ----------
    problems = dict mapping each `id` to its score or label.
    """
    problems = {}
    # `with` closes the handle, which the bare `open()` in a for-loop did not
    with open(fn, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines, e.g. a trailing newline at end of file
                continue
            d = json.loads(line)
            if 'value' in d:
                problems[d['id']] = d['value']
            else:
                problems[d['id']] = int(d['same'])
    return problems


def evaluate_all(true_y, pred_y):
    """
    Compute every PAN20 evaluation measure in one go.

    The measures `auc`, `c@1`, `f_05_u`, `F1` and `brier` are calculated
    in that order, their mean is added under the key `overall`, and every
    value is rounded to three digits.

    Returns
    ----------
    results = dict mapping each measure name to its rounded score.
    """
    scorers = (
        ('auc', auc),
        ('c@1', c_at_1),
        ('f_05_u', f_05_u_score),
        ('F1', f1),
        ('brier', brier_score),
    )
    scores = {name: scorer(true_y, pred_y) for name, scorer in scorers}

    # the overall score is the plain mean of the individual measures
    scores['overall'] = np.mean(list(scores.values()))

    return {name: round(value, 3) for name, value in scores.items()}


def aa_metrics(labels, predictions, raw_outputs, prefix='', no_auc=False, special=False):
    """
    Compute authorship-attribution (multi-class) metrics with scikit-learn.

    Parameters
    ----------
    labels : true class labels.
    predictions : predicted class labels.
    raw_outputs : per-class scores, only used for the AUC and top-k metrics
        (i.e. ignored when `no_auc` or `special` is set).
    prefix : string prepended to every key of the returned dict.
    no_auc : if True, skip the ROC-AUC and top-k accuracy metrics.
    special : if True, return only accuracy and balanced accuracy.

    Returns
    ----------
    results = dict of `{prefix}<metric name>` -> score. Always contains
        `accuracy` and `macro_accuracy`; unless `special`, also micro/macro
        recall, precision and F1; unless `no_auc`, also the four
        `ovr`/`ovo` x `weighted`/`macro` AUCs and `top2` ... `top10`.
    """
    results = {
        f'{prefix}accuracy': metrics.accuracy_score(labels, predictions),
        f'{prefix}macro_accuracy': metrics.balanced_accuracy_score(labels, predictions),
    }
    if special:
        return results

    # micro/macro recall, precision and F1. zero_division=0 keeps the value
    # scikit-learn reports for ill-defined classes but without the warning,
    # consistent with av_metrics.
    scorers = (
        ('recall', metrics.recall_score),
        ('precision', metrics.precision_score),
        ('f1', metrics.f1_score),
    )
    for name, scorer in scorers:
        for average in ('micro', 'macro'):
            results[f'{prefix}{average}_{name}'] = scorer(labels, predictions, average=average, zero_division=0)

    if not no_auc:
        for multi_class in ('ovr', 'ovo'):
            for average in ('weighted', 'macro'):
                results[f'{prefix}{multi_class}_{average}_auc'] = metrics.roc_auc_score(
                    labels, raw_outputs, average=average, multi_class=multi_class)

        # top-k accuracy for k = 2 .. 10 (the F1 scores are already in
        # `results`, so they are no longer computed a second time here)
        for k in range(2, 11):
            results[f'{prefix}top{k}'] = metrics.top_k_accuracy_score(labels, raw_outputs, k=k)

    return results


def av_metrics(labels, predictions=None, probas=None, threshold=0.5, prefix=''):
    """
    Compute authorship-verification (binary) metrics with scikit-learn.

    Parameters
    ----------
    labels : true binary labels.
    predictions : hard 0/1 decisions. When omitted they are derived from
        `probas` with `binarize(probas, threshold=threshold)`.
    probas : scores used for the AUC (and for deriving `predictions`).
    threshold : decision boundary used when binarizing `probas`; it is also
        reported in the result under `{prefix}threshold`.
    prefix : string prepended to every key of the returned dict.

    Returns
    ----------
    results = dict of `{prefix}<metric name>` -> score. The `auc` entry is
        only present when `probas` was given; calling with `predictions`
        alone used to crash inside `roc_auc_score`.

    Raises
    ----------
    AssertionError if neither `predictions` nor `probas` is passed.
    """

    assert (predictions is not None) or (probas is not None), "no predictions or probas were passed in. . ."
    if predictions is None:
        predictions = binarize(probas, threshold=threshold)

    results = {
        f'{prefix}accuracy': metrics.accuracy_score(labels, predictions),
        f'{prefix}macro_accuracy': metrics.balanced_accuracy_score(labels, predictions),
        f'{prefix}micro_recall': metrics.recall_score(labels, predictions, average='micro', zero_division=0),
        f'{prefix}macro_recall': metrics.recall_score(labels, predictions, average='macro', zero_division=0),
        f'{prefix}micro_precision': metrics.precision_score(labels, predictions, average='micro', zero_division=0),
        f'{prefix}macro_precision': metrics.precision_score(labels, predictions, average='macro', zero_division=0),
        f'{prefix}threshold': threshold,
    }

    if probas is not None:
        # the AUC needs continuous scores; skip it when only decisions exist
        results[f'{prefix}auc'] = metrics.roc_auc_score(labels, probas)
    results[f'{prefix}f1'] = metrics.f1_score(labels, predictions, zero_division=0)

    return results


def main():
    """
    Command-line entry point of the PAN20 evaluation script.

    Reads `truth.jsonl` from the folder given with `-i` and `answers.jsonl`
    from the folder given with `-a`, fills in 0.5 for every problem without
    an answer, evaluates with `evaluate_all`, prints the result, and writes
    `out.json` and `evaluation.prototext` into the folder given with `-o`
    (created if it does not exist yet).

    Raises
    ----------
    ValueError if one of the three arguments is missing.
    AssertionError if the answers contain ids unknown to the ground truth.
    """
    parser = argparse.ArgumentParser(description='Evaluation script AA@PAN2020')
    # the help texts describe folders because the file names are appended below
    parser.add_argument('-i', type=str,
                        help='Path to the folder containing the ground truth file (truth.jsonl)')
    parser.add_argument('-a', type=str,
                        help='Path to the folder containing the system predictions (answers.jsonl)')
    parser.add_argument('-o', type=str,
                        help='Path to output files')
    args = parser.parse_args()

    # validate:
    if not args.i:
        raise ValueError('The ground truth path is required')
    if not args.a:
        raise ValueError('The answers path is required')
    if not args.o:
        raise ValueError('The output folder path is required')

    # load:
    gt = load_file(f"{args.i}/truth.jsonl")
    pred = load_file(f"{args.a}/answers.jsonl")

    print('->', len(gt), 'problems in ground truth')
    print('->', len(pred), 'solutions explicitly proposed')

    # default missing problems to 0.5
    for probl_id in sorted(gt):
        if probl_id not in pred:
            pred[probl_id] = 0.5

    # sanity check:
    assert len(gt) == len(pred)
    assert set(gt.keys()).union(set(pred)) == set(gt.keys())

    # align the scores:
    scores = [(gt[k], pred[k]) for k in sorted(gt)]
    gt, pred = zip(*scores)
    gt = np.array(gt, dtype=np.float64)
    pred = np.array(pred, dtype=np.float64)

    assert len(gt) == len(pred)

    # evaluate:
    results = evaluate_all(gt, pred)
    print(results)

    # make sure the output folder exists before writing into it
    os.makedirs(args.o, exist_ok=True)

    with open(os.path.join(args.o, 'out.json'), 'w') as f:
        json.dump(results, f, indent=4, sort_keys=True)

    with open(os.path.join(args.o, 'evaluation.prototext'), 'w') as f:
        for metric, score in results.items():
            f.write('measure {\n')
            f.write(' key: "' + metric + '"\n')
            f.write(' value: "' + str(score) + '"\n')
            f.write('}\n')


def accuracy_calculator(normalized_similarities, truth, num_points=5):
    """
    Find the threshold with the best accuracy on an evenly spaced grid.

    `num_points` candidate thresholds are spread between the minimum and
    maximum of `normalized_similarities`. For each one the similarities are
    binarized with `binarize` and compared element-wise against `truth`.

    Returns
    ----------
    [accuracy, threshold] of the best candidate; on ties the later
    (larger) threshold wins. `[0, 0]` if there are no candidates.
    """
    lowest = np.min(normalized_similarities)
    highest = np.max(normalized_similarities)

    best_acc = [0, 0]
    for threshold in np.linspace(lowest, highest, num_points):
        binarized_predictions = binarize(normalized_similarities, threshold)
        hits = np.zeros_like(binarized_predictions)
        hits[truth == binarized_predictions] = 1
        accuracy = sum(hits) / float(len(hits))
        # only a strictly better running best survives, so ties are replaced
        if not best_acc[0] > accuracy:
            best_acc = [accuracy, threshold]
    return best_acc


def accuracy_calculator_fixed_threshold(normalized_similarities, truth, threshold=None):
    """
    Accuracy of the decisions obtained by binarizing at one threshold.

    Parameters
    ----------
    normalized_similarities : array of scores.
    truth : array of gold labels, compared element-wise with the
        binarized scores.
    threshold : decision boundary; when `None` the mean of
        `normalized_similarities` is used. An explicit threshold of `0` is
        honoured (it used to be mistaken for "not given").

    Returns
    ----------
    [accuracy, threshold] with the threshold that was actually applied.
    """
    if threshold is None:
        threshold = np.mean(normalized_similarities)
    binarized_predictions = binarize(normalized_similarities, threshold)
    correct_predictions = np.zeros_like(binarized_predictions)
    correct_predictions[truth == binarized_predictions] = 1
    accuracy = sum(correct_predictions) / float(len(correct_predictions))
    return [accuracy, threshold]


def threshold_search(normalized_similarities, truth, num_thresholds=100):
    """
    Grid-search the decision threshold that maximises accuracy.

    `num_thresholds` candidates are spread evenly between the minimum and
    maximum of `normalized_similarities`. For each candidate the accuracy,
    c@1, F0.5u and F1 are computed at that threshold; the AUC does not depend
    on the threshold and is computed once. `overall` is the mean of those
    five values.

    Returns
    ----------
    best_results = dict with the keys `accuracy`, `auc`, `ca1`, `f_05_u`,
        `F1`, `overall` and `threshold` for the candidate with the highest
        accuracy (on ties the later threshold wins). All zeros if there are
        no candidates.
    """

    # i think we actually should normalize the similarities first
    # renormed_sims = normalized_similarities - np.min(normalized_similarities)
    # renormed_sims = renormed_sims / np.max(renormed_sims)

    best_results = {'accuracy': 0,
                    'auc': 0,
                    'ca1': 0,
                    'f_05_u': 0,
                    'F1': 0,
                    'overall': 0,
                    'threshold': 0}
    all_thresholds = np.linspace(np.min(normalized_similarities), np.max(normalized_similarities), num_thresholds)
    if len(all_thresholds) == 0:
        return best_results

    # threshold-independent, so hoisted out of the loop
    auc_score = roc_auc_score(truth, normalized_similarities)

    for threshold in all_thresholds:
        results = {
            'accuracy': accuracy_calculator_fixed_threshold(normalized_similarities, truth, threshold)[0],
            'auc': auc_score,
            'ca1': c_at_1(truth, normalized_similarities, threshold),
            'f_05_u': f_05_u_score(truth, normalized_similarities, threshold=threshold),
            'F1': f1(truth, normalized_similarities, threshold=threshold)
        }
        results['overall'] = np.mean(list(results.values()))
        results['threshold'] = threshold

        # `results` is a fresh dict of scalars each iteration, so it can be
        # kept directly without copying
        if not best_results['accuracy'] > results['accuracy']:
            best_results = results

    return best_results


# training

In [None]:
def base_preprocessor(string: str) -> str:
    """
    Normalise a text with three regular-expression substitutions.

    1. every digit is replaced by ``0``;
    2. every run of "space + newline" or "space + tab" pairs is removed;
    3. an ``https:`` URL prefix (scheme, slashes and the following run of
       letters, digits and dots) is replaced by ``@``.

    The URL pattern used to require backslashes after ``https:`` and so never
    matched a real ``https://`` URL; it now accepts forward slashes while
    still matching what the old pattern matched.
    """
    string = re.sub("[0-9]", "0", string)  # each digit will be represented as a 0
    string = re.sub(r'( \n| \t)+', '', string)
    # text = re.sub("[0-9]+(([.,^])[0-9]+)?", "#", text)
    # raw string: avoids the invalid "\+" escape of the previous literal
    string = re.sub(r"https:[/\\]+([a-zA-Z0-9.]+)?", "@", string)
    return string


def char_diff_preprocessor(string: str) -> str:
    """
    Apply ``base_preprocessor`` and then collapse every run of ASCII letters
    into a single ``*``, so that only the non-letter characters keep their
    identity.
    """
    normalised = base_preprocessor(string)
    # string = ''.join(['*' if char.isalpha() else char for char in string])
    return re.sub("[a-zA-Z]+", "*", normalised)


def word_preprocessor(string: str) -> str:
    """
    Apply ``base_preprocessor`` and then drop every character that is neither
    alphanumeric nor whitespace (i.e. strip punctuation for word n-gram
    models).
    """
    normalised = base_preprocessor(string)

    def _is_kept(char: str) -> bool:
        # keep letters, digits and whitespace only
        return char.isalnum() or char.isspace()

    return ''.join(filter(_is_kept, normalised))


def get_vectorizers(analyzer: str = 'char',
                    gram_range: List = (1, 2),
                    preprocessor: Callable = base_preprocessor,
                    max_features: int = 1000,
                    min_df: float = 0.1,
                    smooth_idf: bool = True,
                    sublinear_tf: bool = True) -> Tuple[CountVectorizer, TfidfTransformer]:
    """
    Build the (unfitted) count vectorizer and tf-idf transformer pair.

    ``analyzer``, ``gram_range``, ``min_df`` and ``max_features`` configure the
    ``CountVectorizer``; ``smooth_idf`` and ``sublinear_tf`` configure the
    ``TfidfTransformer`` (which always uses l2 normalisation and idf).

    NOTE(review): ``preprocessor`` only appears in the debug log message; it is
    not handed to ``CountVectorizer``. Confirm whether texts are expected to be
    preprocessed by the caller, or whether it should be wired in here.
    """
    logging.debug(f'Building a {gram_range} TfidfVectorizer for {analyzer} with the {preprocessor} preprocessor.')
    logging.debug(f'Other params:\n\t\tmax_features: {max_features}\n\t\tmin_df: {min_df}\n\t\tsmooth_idf: '
                  f'{smooth_idf}\n\t\tsublinear_tf: {sublinear_tf}')

    count_options = dict(decode_error='ignore',
                         strip_accents='unicode',
                         lowercase=False,
                         stop_words=None,
                         ngram_range=gram_range,
                         analyzer=analyzer,
                         min_df=min_df,
                         max_features=max_features)
    tfidf_options = dict(norm='l2',
                         use_idf=True,
                         smooth_idf=smooth_idf,
                         sublinear_tf=sublinear_tf)

    return CountVectorizer(**count_options), TfidfTransformer(**tfidf_options)


def ngram(analyzer: str, train_texts: List, train_labels: List, test_texts: List, test_labels: List, gram_range: List,
          preprocessor: Callable, max_features: int, min_df: float, sublinear_tf: bool, use_lsa: bool, lsa_factors: int,
          dual: bool, log_prefix: str, save_path: str = None, project: str = '', logistic_regression: bool = False,
          num_workers: int = 1):
    """
    Train and evaluate one n-gram classifier and log the results to wandb.

    Pipeline: count vectorizer -> tf-idf transformer -> MaxAbsScaler ->
    (optional) TruncatedSVD -> classifier. The fitted count vectorizer, tf-idf
    transformer and SVD are pickled under ``save_path`` and re-loaded on later
    calls whose parameters produce the same cache file name.

    Parameters
    ----------
    analyzer : n-gram type, also used as model name and in file names. Any
        value containing ``'dist'`` is vectorized with the ``'char'`` analyzer.
    train_texts, train_labels : training documents and their labels.
    test_texts, test_labels : evaluation documents and their labels.
    gram_range : (min_n, max_n) of the n-grams.
    preprocessor : forwarded to ``get_vectorizers``.
    max_features, min_df, sublinear_tf : vectorizer settings, forwarded to
        ``get_vectorizers``.
    use_lsa : if True, reduce dimensionality with ``TruncatedSVD``.
    lsa_factors : number of SVD components.
    dual : forwarded to ``LogisticRegression`` (unused for the SGD classifier).
    log_prefix : prefix for the metric names logged to wandb.
    save_path : folder for the cache files and the pickled classifier. It is
        concatenated with a file name, so the default ``None`` does not work.
    project : project name, part of the cache file names.
    logistic_regression : if True an ``SGDClassifier(loss='log_loss')`` is
        trained, otherwise a multinomial ``LogisticRegression``.
    num_workers : ``n_jobs`` of the SGD classifier.

    Returns
    -------
    The class probabilities predicted for ``test_texts``.
    """
    logging.info(f'{analyzer}: building the tf-idf vectorizer for the {analyzer} n-gram model')
    count_vectorizer, tfidf_transformer = get_vectorizers(analyzer=analyzer if 'dist' not in analyzer else 'char',
                                                          gram_range=gram_range,
                                                          preprocessor=preprocessor,
                                                          max_features=max_features,
                                                          min_df=min_df,
                                                          smooth_idf=True,
                                                          sublinear_tf=sublinear_tf)

    # cache the vectorizer, just load it if the params match up
    count_vectorizer_cache_path = save_path + f'/cv_{project}_{analyzer}_{gram_range[0]}-{gram_range[1]}_' \
                                  f'{max_features}_{min_df}.pkl'
    tfidf_vectorizer_cache_path = save_path + f'/idf_{project}_{analyzer}_{gram_range[0]}-{gram_range[1]}_' \
                                  f'{max_features}_{min_df}_{sublinear_tf}.pkl'

    # fit the count vectorizer
    # NOTE: pickle.load executes arbitrary code; only point save_path at
    # cache folders you trust.
    if os.path.isfile(count_vectorizer_cache_path):
        logging.info(f'loading the pre-fit count vectorizer from {count_vectorizer_cache_path}')
        start = time.time()
        with open(count_vectorizer_cache_path, 'rb') as f:
            count_vectorizer = pickle.load(f)
        logging.debug(f'took {(time.time() - start) / 60} minutes')
        logging.info(f'transforming the texts with the pre-fit vectorizer.')
        train_term_matrix = count_vectorizer.transform(train_texts)

    else:
        logging.info(f'{analyzer}: fitting the count vectorizer')
        start = time.time()
        train_term_matrix = count_vectorizer.fit_transform(train_texts).toarray()
        logging.info(f'saving count vectorizer to cache: {count_vectorizer_cache_path}')
        os.makedirs(os.path.dirname(count_vectorizer_cache_path), exist_ok=True)
        with open(count_vectorizer_cache_path, 'wb') as f:
            pickle.dump(count_vectorizer, f)
        logging.debug(f'took {(time.time() - start) / 60} minutes')

    # fit the tfidf transformer
    if os.path.isfile(tfidf_vectorizer_cache_path):
        logging.info(f'loading the pre-fit tfidf vectorizer from {tfidf_vectorizer_cache_path}')
        start = time.time()
        with open(tfidf_vectorizer_cache_path, 'rb') as f:
            tfidf_transformer = pickle.load(f)
        logging.debug(f'took {(time.time() - start) / 60} minutes')
        logging.info(f'transforming the training texts with the  tfidf transformer')
        train_data = tfidf_transformer.transform(train_term_matrix)
    else:
        logging.info(f'{analyzer}: fitting the tfidf vectorizer')
        start = time.time()
        train_data = tfidf_transformer.fit_transform(train_term_matrix).toarray()
        logging.info(f'saving tfidf vectorizer to cache: {tfidf_vectorizer_cache_path}')
        os.makedirs(os.path.dirname(tfidf_vectorizer_cache_path), exist_ok=True)
        with open(tfidf_vectorizer_cache_path, 'wb') as f:
            pickle.dump(tfidf_transformer, f)
        logging.debug(f'took {(time.time() - start) / 60} minutes')


    logging.info(f'{analyzer}: vectorizing the test texts')
    test_data = tfidf_transformer.transform(count_vectorizer.transform(test_texts).toarray()).toarray()

    logging.info(f'{analyzer}: scaling the vectorized data')
    max_abs_scaler = preprocessing.MaxAbsScaler()
    scaled_train_data = max_abs_scaler.fit_transform(train_data)
    scaled_test_data = max_abs_scaler.transform(test_data)

    if use_lsa:
        lsa_cache_path = save_path + f'/lsa_{project}_{analyzer}_{gram_range[0]}-{gram_range[1]}_{max_features}_' \
                                f'{min_df}_{sublinear_tf}_{lsa_factors}.pkl'
        if os.path.isfile(lsa_cache_path):
            logging.info(f'loading the svd transform from cache')
            start = time.time()
            with open(lsa_cache_path, 'rb') as f:
                svd = pickle.load(f)
            scaled_train_data = svd.transform(scaled_train_data)
            scaled_test_data = svd.transform(scaled_test_data)
            logging.debug(f'took {(time.time() - start) / 60} minutes')
        else:
            logging.info(f'{analyzer}: reducing demensionality with TruncatedSVD')
            start = time.time()
            svd = TruncatedSVD(n_components=lsa_factors, algorithm='randomized', random_state=0)
            # Char
            scaled_train_data = svd.fit_transform(scaled_train_data)
            scaled_test_data = svd.transform(scaled_test_data)
            logging.debug(f'took {(time.time() - start) / 60} minutes')
            # cache the svd
            with open(lsa_cache_path, 'wb') as f:
                pickle.dump(svd, f)

    logging.info(f'{analyzer}: fitting the classifier')
    start = time.time()
    # This was the classifier used in the original implementation, but we need a more efficient one
    # char_std = CalibratedClassifierCV(OneVsRestClassifier(SVC(C=1, kernel='linear',
    #                                                           gamma='auto', verbose=True)))
    if logistic_regression:
        # classifier = LogisticRegression(multi_class='multinomial', dual=dual)
        classifier = SGDClassifier(loss='log_loss', n_jobs=num_workers, early_stopping=False, verbose=1)
    else:
        classifier = LogisticRegression(multi_class='multinomial', dual=dual)

    classifier.fit(scaled_train_data, train_labels)
    logging.debug(f'took {(time.time() - start) / 60} minutes')

    logging.info(f'{analyzer}: inference on the test set')
    # restart the timer (this was `start - time.time()`, a no-op expression,
    # so the logged inference time included the whole training time)
    start = time.time()
    predictions = classifier.predict(scaled_test_data)
    predicted_probs = classifier.predict_proba(scaled_test_data)
    logging.debug(f'took {(time.time() - start) / 60} minutes')

    # compute and log char ngram
    logging.info(f'{analyzer}: logging to wandb')
    wandb.sklearn.plot_classifier(classifier,
                                  scaled_train_data, scaled_test_data,
                                  train_labels, test_labels,
                                  predictions, predicted_probs,
                                  [x for x in range(len(set(train_labels)))],
                                  is_binary=False,
                                  model_name=analyzer)
    results = aa_metrics(test_labels, predictions, predicted_probs, prefix=log_prefix, no_auc=True)
    wandb.log(results)

    # save the model
    clf_name = 'logreg_sgd' if logistic_regression else 'logreg'
    svm_path = os.path.join(os.path.dirname(tfidf_vectorizer_cache_path), f'{analyzer}_{clf_name}.pkl')
    logging.debug(f'saving the {analyzer}_{clf_name} to {svm_path}')
    with open(svm_path, 'wb') as f:
        pickle.dump(classifier, f)

    wandb.save(svm_path)

    return predicted_probs


def run_ngram(config={}, ngram_type: str = 'char', train_pth: str = None, val_pth: str = None, test_pth: str = None,
              project='', num_workers=10, wandb_name=''):
    """Train and evaluate a single n-gram model inside its own wandb run.

    Args:
        config: hyper-parameters, either a dict or an ``argparse.Namespace``. It must provide
            ``wandb_tags`` and every attribute forwarded to ``ngram`` below; outside of sweep mode it
            must also provide ``project`` and the ``<type>_range`` attribute for ``ngram_type``.
            NOTE: when a Namespace is passed, its ``model`` attribute is set to ``ngram_type`` in place.
        ngram_type: which model to run, one of ``'char'``, ``'dist_char'`` or ``'word'``.
        train_pth: path of the training csv.
        val_pth: optional path of a validation csv. When given (and non-empty) it is appended to the
            training data and metrics are logged under ``'test/'``; otherwise under ``'val/'``.
        test_pth: path of the csv the model is evaluated on.
        project: wandb project name. A non-empty value switches the function into "sweep" mode, where the
            hyper-parameters are read back from ``wandb.config`` (including ``gram_range``).
        num_workers: only used in sweep mode, where it is written to ``config.num_workers``.
        wandb_name: name given to the wandb run.

    Returns:
        The value returned by ``ngram`` (the predicted class probabilities for the test set).

    Raises:
        ValueError: if ``ngram_type`` is not one of the supported model types.
    """
    # Local import: keeps this definition self-contained with respect to the notebook's import cell.
    import ast

    # need to make sure config is a namespace
    if isinstance(config, dict):
        config = argparse.Namespace(**config)

    sweep = project != ''
    project = project if sweep else config.project

    # The tags arrive as the string form of a list (e.g. '["baseline"]'). literal_eval only accepts
    # Python literals, so unlike eval() it cannot execute arbitrary code taken from the command line.
    wandb_tags = config.wandb_tags
    if isinstance(wandb_tags, str):
        wandb_tags = ast.literal_eval(wandb_tags)

    # vars() returns the namespace's own __dict__, so this also records the model type on `config` itself.
    run_config = vars(config)
    run_config['model'] = ngram_type

    with wandb.init(project=project, config=run_config, reinit=True, name=wandb_name, tags=wandb_tags):
        if sweep:
            # in a sweep the hyper-parameters chosen by wandb replace the ones passed in
            config = wandb.config
            config.project = project
            config.num_workers = num_workers

        # Models are stored under <base>/<project>/<part of the run name before the first '_'>.
        # NOTE(review): runs whose names share that leading part end up in the same directory - confirm
        # that this is intended.
        save_path = os.path.join('/content/drive/MyDrive/msc_project/model/baseline', project, wandb.run.name.split('_')[0])
        print(save_path)
        logging.info('starting')

        # Resolve the preprocessor and the config attribute holding the n-gram range for this model type.
        # Done before loading any data so that a bad ngram_type fails fast.
        model_settings = {
            'char': (base_preprocessor, 'char_range'),
            'dist_char': (char_diff_preprocessor, 'dist_range'),
            'word': (word_preprocessor, 'word_range'),
        }
        if ngram_type not in model_settings:
            raise ValueError(f'ngram_type was not set properly, should be in [char, dist_char, word], got {ngram_type}')
        preprocessor, range_attr = model_settings[ngram_type]
        # a sweep supplies one shared `gram_range`; a normal run uses the range dedicated to the model type
        gram_range = config.gram_range if sweep else getattr(config, range_attr)

        # get the training and testing dataset as List[List[Union[int, str]]]
        logging.info('loading the datasets')
        train_dset = get_aa_dataset(train_pth)
        test_dset = get_aa_dataset(test_pth)

        if val_pth is not None and val_pth != '':
            # a validation set was supplied: train on train+val and report the scores as test scores
            log_prefix = 'test/'
            train_dset.extend(get_aa_dataset(val_pth))
        else:
            log_prefix = 'val/'

        # each dataset row is a (label, text) pair
        train_texts = [text for _, text in train_dset]
        train_labels = [label for label, _ in train_dset]
        test_texts = [text for _, text in test_dset]
        test_labels = [label for label, _ in test_dset]

        probas = ngram(analyzer=ngram_type, train_texts=train_texts, train_labels=train_labels, test_texts=test_texts,
                       test_labels=test_labels, gram_range=gram_range, preprocessor=preprocessor,
                       max_features=config.max_features,
                       min_df=config.min_df, sublinear_tf=config.sublinear_tf, use_lsa=config.use_lsa,
                       lsa_factors=config.lsa_factors, dual=not config.primal, log_prefix=log_prefix,
                       save_path=save_path, project=config.project, logistic_regression=config.logistic_regression,
                       num_workers=config.num_workers)

    return probas


def ensemble(config, test_labels, probas_word, probas_char, probas_dist, prefix='', wandb_name=''):
    """Average the class probabilities of the three n-gram models and log the result as a wandb run.

    Args:
        config: hyper-parameters as a dict or ``argparse.Namespace``; must provide ``project`` and
            ``wandb_tags``. Its ``model`` attribute is set to ``'ensemble'``.
        test_labels: gold labels of the test set, aligned with the rows of the probability arrays.
        probas_word: probabilities predicted by the word n-gram model.
        probas_char: probabilities predicted by the char n-gram model.
        probas_dist: probabilities predicted by the distorted-char n-gram model.
        prefix: prefix forwarded to ``aa_metrics`` for the logged metric names.
        wandb_name: name given to the wandb run.
    """
    # Local import: keeps this definition self-contained with respect to the notebook's import cell.
    import ast

    if isinstance(config, dict):
        config = argparse.Namespace(**config)

    config.model = 'ensemble'

    # The tags arrive as the string form of a list (e.g. '["baseline"]'). literal_eval only accepts
    # Python literals, so unlike eval() it cannot execute arbitrary code taken from the command line.
    wandb_tags = config.wandb_tags
    if isinstance(wandb_tags, str):
        wandb_tags = ast.literal_eval(wandb_tags)

    # Use the project of the config that was passed in rather than the notebook-global `args`, so the
    # function also works when it is called with a config other than that global.
    with wandb.init(project=config.project, config=vars(config), reinit=True, name=wandb_name, tags=wandb_tags):

        logging.info('ensembling the models')

        # unweighted mean of the three models' probabilities, then the highest-scoring class per text
        avg_probas = np.average([probas_word, probas_dist, probas_char], axis=0)
        avg_predictions = [np.argmax(text_probs) for text_probs in avg_probas]

        ensemble_results = aa_metrics(test_labels, avg_predictions, avg_probas, prefix=prefix, no_auc=True)
        wandb.log(ensemble_results)
        wandb.finish()
        logging.info('done')


# Run

## IMDB62

In [None]:
# Experiment cell: IMDB62, run name `imdb62_1`.
# Defaults: use_lsa=True, sublinear_tf=True, primal=True, logistic_regression=True
# (ngram() picks the SGDClassifier with log loss when logistic_regression is True).
# With --type left empty, the char, dist_char and word models are each trained in their own wandb run
# and their probabilities are then averaged by ensemble() in a fourth run.
if __name__ == '__main__':
    # get command line args
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --project is passed to wandb.init(project=...) by run_ngram.
    # NOTE(review): the help text mentions mlflow, but the value is used as the wandb project.
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the mlflow experiment name')
    # base run name; a suffix (_char, _dist, _word, _ensemble) is appended per run below
    parser.add_argument('--wandb_name', type=str, default='imdb62_1')
    # string form of a list; it is parsed into a list before being used as the wandb tags
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_train.csv')
    # a non-empty --val_dataset is appended to the training data by run_ngram,
    # which then logs its metrics under the 'test/' prefix
    parser.add_argument('--val_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_val.csv')
    # alternative kept for reference: an empty --val_dataset makes run_ngram train on the training set only
    # parser.add_argument('--val_dataset', type=str,
    #                     default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_test.csv')
    # NOTE(review): run_ngram builds its own save path and does not read --save_path
    parser.add_argument('--save_path', type=str, default=None)
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    # n-gram ranges, one per model type; run_ngram passes the matching one to ngram() as gram_range
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    # NOTE(review): --n_best_factor, --pt and --lower are not forwarded to ngram() by run_ngram;
    # they only appear in the config logged to wandb
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    # --type: empty runs all three models plus the ensemble; otherwise the single model type to run
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Experiment-specific defaults for the boolean flags. Because they are store_true flags, a flag that
    # defaults to True here cannot be switched off from the command line.
    parser.set_defaults(lower=True, use_lsa=True, sublinear_tf=True, primal=True, logistic_regression=True)

    # parse_known_args() returns unrecognised arguments in `unknown` instead of exiting with an error
    # (presumably to tolerate the extra argv entries present when running inside a notebook kernel)
    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' produces lists; convert the n-gram ranges to tuples
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used anywhere else in this cell; run_ngram derives its own log prefix
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # one wandb run per model type; each call returns the predicted probabilities for the test set
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        # the test set is read again only to obtain the gold labels
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # single model; wandb_name is not passed here, so run_ngram's default '' is used as the run name
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

In [None]:
# Experiment cell: IMDB62, run name `imdb62_2`.
# Defaults: use_lsa=True, sublinear_tf=True, primal=True, logistic_regression=False
# (ngram() picks the multinomial LogisticRegression when logistic_regression is False).
# With --type left empty, the char, dist_char and word models are each trained in their own wandb run
# and their probabilities are then averaged by ensemble() in a fourth run.
if __name__ == '__main__':
    # get command line args
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --project is passed to wandb.init(project=...) by run_ngram.
    # NOTE(review): the help text mentions mlflow, but the value is used as the wandb project.
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the mlflow experiment name')
    # base run name; a suffix (_char, _dist, _word, _ensemble) is appended per run below
    parser.add_argument('--wandb_name', type=str, default='imdb62_2')
    # string form of a list; it is parsed into a list before being used as the wandb tags
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_train.csv')
    # a non-empty --val_dataset is appended to the training data by run_ngram,
    # which then logs its metrics under the 'test/' prefix
    parser.add_argument('--val_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_val.csv')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_test.csv')
    # NOTE(review): run_ngram builds its own save path and does not read --save_path
    parser.add_argument('--save_path', type=str, default=None)
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    # n-gram ranges, one per model type; run_ngram passes the matching one to ngram() as gram_range
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    # NOTE(review): --n_best_factor, --pt and --lower are not forwarded to ngram() by run_ngram;
    # they only appear in the config logged to wandb
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    # --type: empty runs all three models plus the ensemble; otherwise the single model type to run
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Experiment-specific defaults for the boolean flags. Because they are store_true flags, a flag that
    # defaults to True here cannot be switched off from the command line.
    parser.set_defaults(lower=True, use_lsa=True, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args() returns unrecognised arguments in `unknown` instead of exiting with an error
    # (presumably to tolerate the extra argv entries present when running inside a notebook kernel)
    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' produces lists; convert the n-gram ranges to tuples
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used anywhere else in this cell; run_ngram derives its own log prefix
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # one wandb run per model type; each call returns the predicted probabilities for the test set
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        # the test set is read again only to obtain the gold labels
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # single model; wandb_name is not passed here, so run_ngram's default '' is used as the run name
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

In [None]:
# Experiment cell: IMDB62, run name `imdb62_3`.
# Defaults: use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False
# (ngram() picks the multinomial LogisticRegression when logistic_regression is False).
# With --type left empty, the char, dist_char and word models are each trained in their own wandb run
# and their probabilities are then averaged by ensemble() in a fourth run.
if __name__ == '__main__':
    # get command line args
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --project is passed to wandb.init(project=...) by run_ngram.
    # NOTE(review): the help text mentions mlflow, but the value is used as the wandb project.
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the mlflow experiment name')
    # base run name; a suffix (_char, _dist, _word, _ensemble) is appended per run below
    parser.add_argument('--wandb_name', type=str, default='imdb62_3')
    # string form of a list; it is parsed into a list before being used as the wandb tags
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_train.csv')
    # a non-empty --val_dataset is appended to the training data by run_ngram,
    # which then logs its metrics under the 'test/' prefix
    parser.add_argument('--val_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_val.csv')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_test.csv')
    # NOTE(review): run_ngram builds its own save path and does not read --save_path
    parser.add_argument('--save_path', type=str, default=None)
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    # n-gram ranges, one per model type; run_ngram passes the matching one to ngram() as gram_range
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    # NOTE(review): --n_best_factor, --pt and --lower are not forwarded to ngram() by run_ngram;
    # they only appear in the config logged to wandb
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    # --type: empty runs all three models plus the ensemble; otherwise the single model type to run
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Experiment-specific defaults for the boolean flags. Because they are store_true flags, a flag that
    # defaults to True here cannot be switched off from the command line.
    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args() returns unrecognised arguments in `unknown` instead of exiting with an error
    # (presumably to tolerate the extra argv entries present when running inside a notebook kernel)
    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' produces lists; convert the n-gram ranges to tuples
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used anywhere else in this cell; run_ngram derives its own log prefix
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # one wandb run per model type; each call returns the predicted probabilities for the test set
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        # the test set is read again only to obtain the gold labels
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # single model; wandb_name is not passed here, so run_ngram's default '' is used as the run name
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

In [None]:
# Experiment cell: IMDB62, run name `imdb62_4`.
# Defaults: use_lsa=False, sublinear_tf=False, primal=True, logistic_regression=False
# (ngram() picks the multinomial LogisticRegression when logistic_regression is False).
# With --type left empty, the char, dist_char and word models are each trained in their own wandb run
# and their probabilities are then averaged by ensemble() in a fourth run.
if __name__ == '__main__':
    # get command line args
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --project is passed to wandb.init(project=...) by run_ngram.
    # NOTE(review): the help text mentions mlflow, but the value is used as the wandb project.
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the mlflow experiment name')
    # base run name; a suffix (_char, _dist, _word, _ensemble) is appended per run below
    parser.add_argument('--wandb_name', type=str, default='imdb62_4')
    # string form of a list; it is parsed into a list before being used as the wandb tags
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_train.csv')
    # a non-empty --val_dataset is appended to the training data by run_ngram,
    # which then logs its metrics under the 'test/' prefix
    parser.add_argument('--val_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_val.csv')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_test.csv')
    # NOTE(review): run_ngram builds its own save path and does not read --save_path
    parser.add_argument('--save_path', type=str, default=None)
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    # n-gram ranges, one per model type; run_ngram passes the matching one to ngram() as gram_range
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    # NOTE(review): --n_best_factor, --pt and --lower are not forwarded to ngram() by run_ngram;
    # they only appear in the config logged to wandb
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    # --type: empty runs all three models plus the ensemble; otherwise the single model type to run
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Experiment-specific defaults for the boolean flags. Because they are store_true flags, a flag that
    # defaults to True here cannot be switched off from the command line.
    parser.set_defaults(use_lsa=False, sublinear_tf=False, primal=True, logistic_regression=False)

    # parse_known_args() returns unrecognised arguments in `unknown` instead of exiting with an error
    # (presumably to tolerate the extra argv entries present when running inside a notebook kernel)
    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' produces lists; convert the n-gram ranges to tuples
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used anywhere else in this cell; run_ngram derives its own log prefix
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # one wandb run per model type; each call returns the predicted probabilities for the test set
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        # the test set is read again only to obtain the gold labels
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # single model; wandb_name is not passed here, so run_ngram's default '' is used as the run name
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

In [None]:
# Experiment cell: IMDB62 without a validation set (--val_dataset defaults to '').
# Defaults: use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False
# (ngram() picks the multinomial LogisticRegression when logistic_regression is False).
# With --type left empty, the char, dist_char and word models are each trained in their own wandb run
# and their probabilities are then averaged by ensemble() in a fourth run.
# NOTE(review): the run name `imdb62_3` is also the default of an earlier IMDB62 cell - confirm that
# sharing the name is intended.
if __name__ == '__main__':
    # get command line args
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --project is passed to wandb.init(project=...) by run_ngram.
    # NOTE(review): the help text mentions mlflow, but the value is used as the wandb project.
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the mlflow experiment name')
    # base run name; a suffix (_char, _dist, _word, _ensemble) is appended per run below
    parser.add_argument('--wandb_name', type=str, default='imdb62_3')
    # string form of a list; it is parsed into a list before being used as the wandb tags
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_train.csv')
    # alternative kept for reference: the validation csv used by the other IMDB62 cells
    # parser.add_argument('--val_dataset', type=str,
    #                     default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_val.csv')
    # with an empty --val_dataset run_ngram trains on --train_dataset only
    # and logs its metrics under the 'val/' prefix
    parser.add_argument('--val_dataset', type=str,
                        default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_test.csv')
    # NOTE(review): run_ngram builds its own save path and does not read --save_path
    parser.add_argument('--save_path', type=str, default=None)
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    # n-gram ranges, one per model type; run_ngram passes the matching one to ngram() as gram_range
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    # NOTE(review): --n_best_factor, --pt and --lower are not forwarded to ngram() by run_ngram;
    # they only appear in the config logged to wandb
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    # --type: empty runs all three models plus the ensemble; otherwise the single model type to run
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Experiment-specific defaults for the boolean flags. Because they are store_true flags, a flag that
    # defaults to True here cannot be switched off from the command line.
    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args() returns unrecognised arguments in `unknown` instead of exiting with an error
    # (presumably to tolerate the extra argv entries present when running inside a notebook kernel)
    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' produces lists; convert the n-gram ranges to tuples
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used anywhere else in this cell; run_ngram derives its own log prefix
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # one wandb run per model type; each call returns the predicted probabilities for the test set
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        # the test set is read again only to obtain the gold labels
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # single model; wandb_name is not passed here, so run_ngram's default '' is used as the run name
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

## blogs50

In [None]:
# Experiment cell: blogs50, run name `blogs50_1`.
# Defaults: use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False
# (ngram() picks the multinomial LogisticRegression when logistic_regression is False).
# With --type left empty, the char, dist_char and word models are each trained in their own wandb run
# and their probabilities are then averaged by ensemble() in a fourth run.
if __name__ == '__main__':
    # get command line args
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --project is passed to wandb.init(project=...) by run_ngram.
    # NOTE(review): the help text mentions mlflow, but the value is used as the wandb project.
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the mlflow experiment name')
    # base run name; a suffix (_char, _dist, _word, _ensemble) is appended per run below
    parser.add_argument('--wandb_name', type=str, default='blogs50_1')
    # string form of a list; it is parsed into a list before being used as the wandb tags
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/blogs50/processed/blogs50_train.csv')
    # a non-empty --val_dataset is appended to the training data by run_ngram,
    # which then logs its metrics under the 'test/' prefix
    parser.add_argument('--val_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/blogs50/processed/blogs50_AA_val.csv')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/blogs50/processed/blogs50_AA_test.csv')
    # NOTE(review): run_ngram builds its own save path and does not read --save_path
    parser.add_argument('--save_path', type=str, default=None)
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    # n-gram ranges, one per model type; run_ngram passes the matching one to ngram() as gram_range
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    # NOTE(review): --n_best_factor, --pt and --lower are not forwarded to ngram() by run_ngram;
    # they only appear in the config logged to wandb
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    # --type: empty runs all three models plus the ensemble; otherwise the single model type to run
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Experiment-specific defaults for the boolean flags. Because they are store_true flags, a flag that
    # defaults to True here cannot be switched off from the command line.
    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args() returns unrecognised arguments in `unknown` instead of exiting with an error
    # (presumably to tolerate the extra argv entries present when running inside a notebook kernel)
    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' produces lists; convert the n-gram ranges to tuples
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used anywhere else in this cell; run_ngram derives its own log prefix
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # one wandb run per model type; each call returns the predicted probabilities for the test set
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        # the test set is read again only to obtain the gold labels
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # single model; wandb_name is not passed here, so run_ngram's default '' is used as the run name
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

In [None]:
# Experiment cell: blogs50 without a validation set (--val_dataset defaults to '').
# Defaults: use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False
# (ngram() picks the multinomial LogisticRegression when logistic_regression is False).
# With --type left empty, the char, dist_char and word models are each trained in their own wandb run
# and their probabilities are then averaged by ensemble() in a fourth run.
# NOTE(review): the run name `blogs50_1` is also the default of the previous blogs50 cell - confirm that
# sharing the name is intended.
if __name__ == '__main__':
    # get command line args
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --project is passed to wandb.init(project=...) by run_ngram.
    # NOTE(review): the help text mentions mlflow, but the value is used as the wandb project.
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the mlflow experiment name')
    # base run name; a suffix (_char, _dist, _word, _ensemble) is appended per run below
    parser.add_argument('--wandb_name', type=str, default='blogs50_1')
    # string form of a list; it is parsed into a list before being used as the wandb tags
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/blogs50/processed/blogs50_train.csv')
    # with an empty --val_dataset run_ngram trains on --train_dataset only
    # and logs its metrics under the 'val/' prefix
    parser.add_argument('--val_dataset', type=str,
                        default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/blogs50/processed/blogs50_AA_test.csv')
    # NOTE(review): run_ngram builds its own save path and does not read --save_path
    parser.add_argument('--save_path', type=str, default=None)
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    # n-gram ranges, one per model type; run_ngram passes the matching one to ngram() as gram_range
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    # NOTE(review): --n_best_factor, --pt and --lower are not forwarded to ngram() by run_ngram;
    # they only appear in the config logged to wandb
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    # --type: empty runs all three models plus the ensemble; otherwise the single model type to run
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Experiment-specific defaults for the boolean flags. Because they are store_true flags, a flag that
    # defaults to True here cannot be switched off from the command line.
    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args() returns unrecognised arguments in `unknown` instead of exiting with an error
    # (presumably to tolerate the extra argv entries present when running inside a notebook kernel)
    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' produces lists; convert the n-gram ranges to tuples
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used anywhere else in this cell; run_ngram derives its own log prefix
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # one wandb run per model type; each call returns the predicted probabilities for the test set
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        # the test set is read again only to obtain the gold labels
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # single model; wandb_name is not passed here, so run_ngram's default '' is used as the run name
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

# diffusionDB

In [None]:
if __name__ == '__main__':
    # Experiment cell: n-gram baseline on the train/test CSVs under diffusiondb/paraphrased
    # (see the --train_dataset / --test_dataset defaults). Every setting is an argparse default,
    # so the cell runs as-is in the notebook and can still be overridden from a command line.
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --- experiment tracking ---
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the wandb project name')
    parser.add_argument('--wandb_name', type=str, default='diffusiondbpara_1')
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')

    # --- data ---
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/train_random100_label_1.csv')
    parser.add_argument('--val_dataset', type=str,
                        default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
    parser.add_argument('--save_path', type=str, default=None)

    # --- model / feature settings ---
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--no_sublinear_tf', dest='sublinear_tf', action='store_false',
                        help='turn off --sublinear_tf (on by default)')
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--no_primal', dest='primal', action='store_false',
                        help='turn off --primal (on by default)')
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Defaults of the boolean switches. sublinear_tf and primal default to True, which is why each
    # has a --no_* counterpart above: a store_true flag on its own could never switch them off.
    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args (rather than parse_args) returns unrecognised arguments in `unknown`
    # instead of exiting, e.g. the arguments a notebook kernel is launched with.
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' yields lists; the n-gram ranges are converted to tuples here
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used again in this cell; it may be read as a module-level
    # global by code elsewhere in the notebook -- confirm before removing.
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # No single model type requested: run the 'char', 'dist_char' and 'word' models in turn
        # and hand the three run_ngram results to ensemble() together with the test labels.
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # A specific model type was requested via --type: run only that one.
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

In [None]:
if __name__ == '__main__':
    # Experiment cell: n-gram baseline on the train/test CSVs under diffusiondb/paraphrased
    # (see the --train_dataset / --test_dataset defaults). Every setting is an argparse default,
    # so the cell runs as-is in the notebook and can still be overridden from a command line.
    # NOTE(review): the wandb_name and dataset defaults here are identical to those of the
    # preceding cell, so running both appears to log the same experiment twice -- confirm
    # whether a different split (or seed) was intended for this cell.
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --- experiment tracking ---
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the wandb project name')
    parser.add_argument('--wandb_name', type=str, default='diffusiondbpara_1')
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')

    # --- data ---
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/train_random100_label_1.csv')
    parser.add_argument('--val_dataset', type=str,
                        default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
    parser.add_argument('--save_path', type=str, default=None)

    # --- model / feature settings ---
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--no_sublinear_tf', dest='sublinear_tf', action='store_false',
                        help='turn off --sublinear_tf (on by default)')
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--no_primal', dest='primal', action='store_false',
                        help='turn off --primal (on by default)')
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Defaults of the boolean switches. sublinear_tf and primal default to True, which is why each
    # has a --no_* counterpart above: a store_true flag on its own could never switch them off.
    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args (rather than parse_args) returns unrecognised arguments in `unknown`
    # instead of exiting, e.g. the arguments a notebook kernel is launched with.
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' yields lists; the n-gram ranges are converted to tuples here
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used again in this cell; it may be read as a module-level
    # global by code elsewhere in the notebook -- confirm before removing.
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # No single model type requested: run the 'char', 'dist_char' and 'word' models in turn
        # and hand the three run_ngram results to ensemble() together with the test labels.
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # A specific model type was requested via --type: run only that one.
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

In [None]:
if __name__ == '__main__':
    # Experiment cell: n-gram baseline on the train/test CSVs under diffusiondb/clean
    # (see the --train_dataset / --test_dataset defaults). Every setting is an argparse default,
    # so the cell runs as-is in the notebook and can still be overridden from a command line.
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --- experiment tracking ---
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the wandb project name')
    parser.add_argument('--wandb_name', type=str, default='diffusiondbclean_1')
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')

    # --- data ---
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/clean/train_random100_label_1.csv')
    parser.add_argument('--val_dataset', type=str,
                        default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/clean/test_random100_label_1.csv')
    parser.add_argument('--save_path', type=str, default=None)

    # --- model / feature settings ---
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--no_sublinear_tf', dest='sublinear_tf', action='store_false',
                        help='turn off --sublinear_tf (on by default)')
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--no_primal', dest='primal', action='store_false',
                        help='turn off --primal (on by default)')
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Defaults of the boolean switches. sublinear_tf and primal default to True, which is why each
    # has a --no_* counterpart above: a store_true flag on its own could never switch them off.
    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args (rather than parse_args) returns unrecognised arguments in `unknown`
    # instead of exiting, e.g. the arguments a notebook kernel is launched with.
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' yields lists; the n-gram ranges are converted to tuples here
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used again in this cell; it may be read as a module-level
    # global by code elsewhere in the notebook -- confirm before removing.
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # No single model type requested: run the 'char', 'dist_char' and 'word' models in turn
        # and hand the three run_ngram results to ensemble() together with the test labels.
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # A specific model type was requested via --type: run only that one.
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

# diffusiondb_vary

In [None]:
if __name__ == '__main__':
    # Experiment cell: n-gram baseline on the random60 train/test CSVs under diffusiondb/vary
    # (see the --train_dataset / --test_dataset defaults). Every setting is an argparse default,
    # so the cell runs as-is in the notebook and can still be overridden from a command line.
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --- experiment tracking ---
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the wandb project name')
    parser.add_argument('--wandb_name', type=str, default='diffusiondbvary60_1')
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')

    # --- data ---
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/vary/train_random60_label_1.csv')
    parser.add_argument('--val_dataset', type=str,
                        default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/vary/test_random60_label_1.csv')
    parser.add_argument('--save_path', type=str, default=None)

    # --- model / feature settings ---
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--no_sublinear_tf', dest='sublinear_tf', action='store_false',
                        help='turn off --sublinear_tf (on by default)')
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--no_primal', dest='primal', action='store_false',
                        help='turn off --primal (on by default)')
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Defaults of the boolean switches. sublinear_tf and primal default to True, which is why each
    # has a --no_* counterpart above: a store_true flag on its own could never switch them off.
    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args (rather than parse_args) returns unrecognised arguments in `unknown`
    # instead of exiting, e.g. the arguments a notebook kernel is launched with.
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' yields lists; the n-gram ranges are converted to tuples here
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used again in this cell; it may be read as a module-level
    # global by code elsewhere in the notebook -- confirm before removing.
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # No single model type requested: run the 'char', 'dist_char' and 'word' models in turn
        # and hand the three run_ngram results to ensemble() together with the test labels.
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # A specific model type was requested via --type: run only that one.
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

In [None]:
if __name__ == '__main__':
    # Experiment cell: n-gram baseline on the random80 train/test CSVs under diffusiondb/vary
    # (see the --train_dataset / --test_dataset defaults). Every setting is an argparse default,
    # so the cell runs as-is in the notebook and can still be overridden from a command line.
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --- experiment tracking ---
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the wandb project name')
    parser.add_argument('--wandb_name', type=str, default='diffusiondbvary80_1')
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')

    # --- data ---
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/vary/train_random80_label_1.csv')
    parser.add_argument('--val_dataset', type=str,
                        default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/vary/test_random80_label_1.csv')
    parser.add_argument('--save_path', type=str, default=None)

    # --- model / feature settings ---
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--no_sublinear_tf', dest='sublinear_tf', action='store_false',
                        help='turn off --sublinear_tf (on by default)')
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--no_primal', dest='primal', action='store_false',
                        help='turn off --primal (on by default)')
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Defaults of the boolean switches. sublinear_tf and primal default to True, which is why each
    # has a --no_* counterpart above: a store_true flag on its own could never switch them off.
    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args (rather than parse_args) returns unrecognised arguments in `unknown`
    # instead of exiting, e.g. the arguments a notebook kernel is launched with.
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' yields lists; the n-gram ranges are converted to tuples here
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used again in this cell; it may be read as a module-level
    # global by code elsewhere in the notebook -- confirm before removing.
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # No single model type requested: run the 'char', 'dist_char' and 'word' models in turn
        # and hand the three run_ngram results to ensemble() together with the test labels.
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # A specific model type was requested via --type: run only that one.
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

In [None]:
if __name__ == '__main__':
    # Experiment cell: n-gram baseline on the random120 train/test CSVs under diffusiondb/vary
    # (see the --train_dataset / --test_dataset defaults). Every setting is an argparse default,
    # so the cell runs as-is in the notebook and can still be overridden from a command line.
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --- experiment tracking ---
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the wandb project name')
    parser.add_argument('--wandb_name', type=str, default='diffusiondbvary120_1')
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')

    # --- data ---
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/vary/train_random120_label_1.csv')
    parser.add_argument('--val_dataset', type=str,
                        default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/vary/test_random120_label_1.csv')
    parser.add_argument('--save_path', type=str, default=None)

    # --- model / feature settings ---
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--no_sublinear_tf', dest='sublinear_tf', action='store_false',
                        help='turn off --sublinear_tf (on by default)')
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--no_primal', dest='primal', action='store_false',
                        help='turn off --primal (on by default)')
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Defaults of the boolean switches. sublinear_tf and primal default to True, which is why each
    # has a --no_* counterpart above: a store_true flag on its own could never switch them off.
    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args (rather than parse_args) returns unrecognised arguments in `unknown`
    # instead of exiting, e.g. the arguments a notebook kernel is launched with.
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' yields lists; the n-gram ranges are converted to tuples here
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used again in this cell; it may be read as a module-level
    # global by code elsewhere in the notebook -- confirm before removing.
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # No single model type requested: run the 'char', 'dist_char' and 'word' models in turn
        # and hand the three run_ngram results to ensemble() together with the test labels.
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # A specific model type was requested via --type: run only that one.
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

In [None]:
if __name__ == '__main__':
    # Experiment cell: n-gram baseline on the random100_150 train/test CSVs under diffusiondb/vary
    # (see the --train_dataset / --test_dataset defaults). Every setting is an argparse default,
    # so the cell runs as-is in the notebook and can still be overridden from a command line.
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --- experiment tracking ---
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the wandb project name')
    parser.add_argument('--wandb_name', type=str, default='diffusiondbvary100150_1')
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')

    # --- data ---
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/vary/train_random100_150_label_1.csv')
    parser.add_argument('--val_dataset', type=str,
                        default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/vary/test_random100_150_label_1.csv')
    parser.add_argument('--save_path', type=str, default=None)

    # --- model / feature settings ---
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--no_sublinear_tf', dest='sublinear_tf', action='store_false',
                        help='turn off --sublinear_tf (on by default)')
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--no_primal', dest='primal', action='store_false',
                        help='turn off --primal (on by default)')
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Defaults of the boolean switches. sublinear_tf and primal default to True, which is why each
    # has a --no_* counterpart above: a store_true flag on its own could never switch them off.
    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args (rather than parse_args) returns unrecognised arguments in `unknown`
    # instead of exiting, e.g. the arguments a notebook kernel is launched with.
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' yields lists; the n-gram ranges are converted to tuples here
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used again in this cell; it may be read as a module-level
    # global by code elsewhere in the notebook -- confirm before removing.
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # No single model type requested: run the 'char', 'dist_char' and 'word' models in turn
        # and hand the three run_ngram results to ensemble() together with the test labels.
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # A specific model type was requested via --type: run only that one.
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

In [None]:
if __name__ == '__main__':
    # Experiment cell: n-gram baseline on the random100_200 train/test CSVs under diffusiondb/vary
    # (see the --train_dataset / --test_dataset defaults). Every setting is an argparse default,
    # so the cell runs as-is in the notebook and can still be overridden from a command line.
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # --- experiment tracking ---
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the wandb project name')
    parser.add_argument('--wandb_name', type=str, default='diffusiondbvary100200_1')
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')

    # --- data ---
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/vary/train_random100_200_label_1.csv')
    parser.add_argument('--val_dataset', type=str,
                        default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/vary/test_random100_200_label_1.csv')
    parser.add_argument('--save_path', type=str, default=None)

    # --- model / feature settings ---
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--no_sublinear_tf', dest='sublinear_tf', action='store_false',
                        help='turn off --sublinear_tf (on by default)')
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    parser.add_argument('--no_primal', dest='primal', action='store_false',
                        help='turn off --primal (on by default)')
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    # Defaults of the boolean switches. sublinear_tf and primal default to True, which is why each
    # has a --no_* counterpart above: a store_true flag on its own could never switch them off.
    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args (rather than parse_args) returns unrecognised arguments in `unknown`
    # instead of exiting, e.g. the arguments a notebook kernel is launched with.
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' yields lists; the n-gram ranges are converted to tuples here
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    # seed NumPy's global random number generator
    np.random.seed(args.seed)

    wandb.login()

    # NOTE(review): log_prf is not used again in this cell; it may be read as a module-level
    # global by code elsewhere in the notebook -- confirm before removing.
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # No single model type requested: run the 'char', 'dist_char' and 'word' models in turn
        # and hand the three run_ngram results to ensemble() together with the test labels.
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # A specific model type was requested via --type: run only that one.
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

# others

In [None]:
# Build three random train/test splits from the preprocessed DiffusionDB CSV. Each split draws
# 100 authors, 100 prompts per author, and divides every author's prompts 80/20 into train/test.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb.csv')

df_selected = df[['user_name', 'prompt']]
# keep only authors with at least 100 prompts, so 100 can be drawn per author without replacement
df_filtered = df_selected.groupby('user_name').filter(lambda x: len(x) >= 100)
print('number of authors', len(df_filtered['user_name'].drop_duplicates()))

for idx in range(3):
  # One explicitly seeded generator per split: re-running the cell regenerates identical files,
  # independent of whatever state the global NumPy RNG happens to be in.
  split_rng = np.random.RandomState(idx)

  sampled_authors = df_filtered['user_name'].drop_duplicates().sample(n=100, random_state=split_rng)
  df_sampled = df_filtered[df_filtered['user_name'].isin(sampled_authors)]
  # GroupBy.sample draws per author and keeps 'user_name' as a regular column, so it does not
  # rely on groupby.apply passing the grouping column to the applied function.
  df_final = df_sampled.groupby('user_name').sample(n=100, random_state=split_rng).reset_index(drop=True)

  # Collect the per-author pieces and concatenate once, instead of growing a frame in the loop.
  train_parts = []
  test_parts = []
  for author in df_final['user_name'].unique():
      author_data = df_final[df_final['user_name'] == author]
      train, test = train_test_split(author_data, test_size=0.2, random_state=split_rng)
      train_parts.append(train)
      test_parts.append(test)
  train_data = pd.concat(train_parts)
  test_data = pd.concat(test_parts)

  # NOTE(review): the label-encoding cell below reads train_random100_1.csv / test_random100_1.csv
  # from a 'processed/' subfolder rather than from this output folder -- confirm whether an
  # intermediate step is expected between the two cells.
  train_data.to_csv(f'/content/drive/MyDrive/msc_project/data/diffusiondb/train_random100_{idx+1}.csv', index=False)
  test_data.to_csv(f'/content/drive/MyDrive/msc_project/data/diffusiondb/test_random100_{idx+1}.csv', index=False)
  # print(train_data)
  # print(test_data)

In [None]:
# Replace author names with integer class labels for DiffusionDB split 1.
# The encoder is fitted on the training authors and then reused, unchanged,
# for the test split; it is pickled next to the encoded CSV files.
label_encoder = LabelEncoder()

df_train = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/train_random100_1.csv')
df_train = df_train.assign(user_name=label_encoder.fit_transform(df_train['user_name']))

df_test = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_1.csv')
df_test = df_test.assign(user_name=label_encoder.transform(df_test['user_name']))

# Persist the fitted encoder.
with open('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/label_encoder.pkl', 'wb') as encoder_fh:
    pickle.dump(label_encoder, encoder_fh)

df_train.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/train_random100_label_1.csv', index=False)
df_test.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv', index=False)

In [None]:
if __name__ == '__main__':
    # Entry point for the DiffusionDB experiment (label-encoded split 1).
    # Builds the argument namespace, then either runs the three n-gram models
    # ('char', 'dist_char', 'word') and ensembles their test probabilities, or
    # runs only the single model named by --type.

    # get command line args
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # NOTE(review): the help text mentions mlflow, but this cell only logs in to
    # wandb — confirm what `project` is used for and reword the help if needed.
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the mlflow experiment name')
    parser.add_argument('--wandb_name', type=str, default='diffusiondb_1')
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/processed/train_random100_label_1.csv')
    parser.add_argument('--val_dataset', type=str,
                        default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
    parser.add_argument('--save_path', type=str, default=None)
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    # sublinear_tf and primal default to True (see set_defaults below), so the
    # store_true flags above can never switch them off. These companion flags
    # write False to the same destinations; the existing flags keep working.
    parser.add_argument('--no_sublinear_tf', dest='sublinear_tf', action='store_false')
    parser.add_argument('--no_primal', dest='primal', action='store_false')
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args returns unrecognised arguments in `unknown` instead of
    # exiting (presumably needed because the notebook kernel passes its own argv).
    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' yields lists; convert the n-gram ranges to tuples.
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    np.random.seed(args.seed)

    wandb.login()

    # Not read again in this cell — presumably consumed as a global by the
    # helper functions defined elsewhere in the notebook; TODO confirm.
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # No specific feature type requested: run all three models, then ensemble.
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # Run only the requested feature type.
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

# twitter_micro

In [None]:
# Build three random train/test splits of the twitter_micro corpus:
# 100 authors per split, 100 texts per author, 80/20 train/test within each author.

# Base seed for the splits; split k draws from RandomState(SPLIT_SEED + k), so
# re-running this cell regenerates exactly the same files.
SPLIT_SEED = 0

out_dir = '/content/drive/MyDrive/msc_project/data/twitter_micro/processed'
os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing directories

df = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/author_texts_cleaned.csv')
# Rename the two columns of the input file.
df.columns = ['user_name', 'prompts']
# Keep only authors with at least 100 texts so that 100 can be drawn without replacement.
df_filtered = df.groupby('user_name').filter(lambda x: len(x) >= 100)
# Loop-invariant: the pool of eligible authors is the same for every split.
eligible_authors = df_filtered['user_name'].drop_duplicates()
print('number of authors', len(eligible_authors))

for idx in range(3):
    # One generator per split, shared by every random draw below, so the draws
    # differ from each other but the split as a whole is repeatable.
    rng = np.random.RandomState(SPLIT_SEED + idx)

    sampled_authors = eligible_authors.sample(n=100, random_state=rng)
    df_sampled = df_filtered[df_filtered['user_name'].isin(sampled_authors)]
    # GroupBy.sample draws 100 rows per author and keeps the 'user_name' column;
    # it avoids groupby.apply, whose handling of the grouping column is deprecated
    # in recent pandas releases.
    df_final = df_sampled.groupby('user_name').sample(n=100, random_state=rng).reset_index(drop=True)

    # Collect the per-author pieces and concatenate once instead of growing a
    # DataFrame inside the loop.
    train_parts = []
    test_parts = []
    for author, author_data in df_final.groupby('user_name', sort=False):
        train, test = train_test_split(author_data, test_size=0.2, random_state=rng)
        train_parts.append(train)
        test_parts.append(test)

    train_data = pd.concat(train_parts)
    test_data = pd.concat(test_parts)

    train_data.to_csv(f'{out_dir}/train_random100_{idx+1}.csv', index=False)
    test_data.to_csv(f'{out_dir}/test_random100_{idx+1}.csv', index=False)

In [None]:
# Replace author names with integer class labels for twitter_micro split 1.
# The encoder is fitted on the training authors and then reused, unchanged,
# for the test split; it is pickled next to the encoded CSV files.
label_encoder = LabelEncoder()

df_train = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/train_random100_1.csv')
df_train = df_train.assign(user_name=label_encoder.fit_transform(df_train['user_name']))

df_test = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/test_random100_1.csv')
df_test = df_test.assign(user_name=label_encoder.transform(df_test['user_name']))

# Persist the fitted encoder.
with open('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/label_encoder.pkl', 'wb') as encoder_fh:
    pickle.dump(label_encoder, encoder_fh)

df_train.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/train_random100_label_1.csv', index=False)
df_test.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/test_random100_label_1.csv', index=False)

In [None]:
if __name__ == '__main__':
    # Entry point for the twitter_micro experiment (label-encoded split 1).
    # Builds the argument namespace, then either runs the three n-gram models
    # ('char', 'dist_char', 'word') and ensembles their test probabilities, or
    # runs only the single model named by --type.

    # get command line args
    parser = argparse.ArgumentParser(description='Run a N-Gram model from the command line')

    # NOTE(review): the help text mentions mlflow, but this cell only logs in to
    # wandb — confirm what `project` is used for and reword the help if needed.
    parser.add_argument('--project', type=str, default='CharNGram',
                        help='the mlflow experiment name')
    parser.add_argument('--wandb_name', type=str, default='twitter_1')
    parser.add_argument('--wandb_tags', type=str, default='["baseline"]')
    # parser.add_argument('--wandb_notes', type=str, default='default hyperparameters')
    parser.add_argument('--train_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/twitter_micro/processed/train_random100_label_1.csv')
    parser.add_argument('--val_dataset', type=str,
                        default='')
    parser.add_argument('--test_dataset', type=str,
                        default='/content/drive/MyDrive/msc_project/data/twitter_micro/processed/test_random100_label_1.csv')
    parser.add_argument('--save_path', type=str, default=None)
    parser.add_argument('--seed', metavar='seed', type=int, default=0)
    parser.add_argument('--word_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--dist_range', nargs='+', type=int, default=[1, 3])
    parser.add_argument('--char_range', nargs='+', type=int, default=[2, 5])
    parser.add_argument('--n_best_factor', type=float, default=0.5)
    parser.add_argument('--pt', type=float, default=0.1)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--use_lsa', action='store_true') # to reduce the number of features
    parser.add_argument('--lsa_factors', type=int, default=63)
    parser.add_argument('--sublinear_tf', action='store_true') # suitable when you want to reduce the impact of very frequent terms or when working with datasets where document length varies significantly
    parser.add_argument('--primal', action='store_true') # set to True if the number of samples is greater than the number of features
    # sublinear_tf and primal default to True (see set_defaults below), so the
    # store_true flags above can never switch them off. These companion flags
    # write False to the same destinations; the existing flags keep working.
    parser.add_argument('--no_sublinear_tf', dest='sublinear_tf', action='store_false')
    parser.add_argument('--no_primal', dest='primal', action='store_false')
    parser.add_argument('--max_features', type=int, default=100_000)
    parser.add_argument('--min_df', type=float, default=0.01)
    parser.add_argument('--type', type=str, default='')
    parser.add_argument('--logistic_regression', action='store_true')
    parser.add_argument('--num_workers', type=int, default=10)

    parser.set_defaults(use_lsa=False, sublinear_tf=True, primal=True, logistic_regression=False)

    # parse_known_args returns unrecognised arguments in `unknown` instead of
    # exiting (presumably needed because the notebook kernel passes its own argv).
    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    print(args)
    print(unknown)

    # nargs='+' yields lists; convert the n-gram ranges to tuples.
    args.word_range = tuple(args.word_range)
    args.dist_range = tuple(args.dist_range)
    args.char_range = tuple(args.char_range)

    np.random.seed(args.seed)

    wandb.login()

    # Not read again in this cell — presumably consumed as a global by the
    # helper functions defined elsewhere in the notebook; TODO confirm.
    log_prf = 'test' if 'test' in args.test_dataset else 'val'

    total_time_start = time.time()

    if args.type == '':
        # No specific feature type requested: run all three models, then ensemble.
        char_probas = run_ngram(args, 'char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_char')
        dist_probas = run_ngram(args, 'dist_char', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_dist')
        word_probas = run_ngram(args, 'word', train_pth=args.train_dataset, val_pth=args.val_dataset,
                                test_pth=args.test_dataset, wandb_name=args.wandb_name+'_word')
        # now ensemble the results
        test_lbls = [lbl for lbl, _ in get_aa_dataset(args.test_dataset)]
        ensemble(args, test_labels=test_lbls, probas_char=char_probas, probas_dist=dist_probas, probas_word=word_probas, wandb_name=args.wandb_name+'_ensemble')
    else:
        # Run only the requested feature type.
        run_ngram(args, args.type, train_pth=args.train_dataset, val_pth=args.val_dataset, test_pth=args.test_dataset)

    logging.info(f'this run took {(time.time() - total_time_start)/60} minutes')

In [None]:
# Reload the label-encoded twitter_micro training split (split 1) and display it for inspection.
# Note: this rebinds `df`, which earlier in this section held the raw author/text frame.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/train_random100_label_1.csv')
df