### Train Scikit-Learn Models

- Joel Stremmel
- 11-20-23

##### About

Train Scikit-Learn models on the formatted data using K-Fold Cross-Validation and save the scores.

##### Imports

In [1]:
import os
import re
import glob
import pickle
import requests
import zipfile
import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nlpaug.augmenter.word import WordEmbsAug
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

##### Set Parameters

In [2]:
# Outcome to model; it prefixes the fold files that are read and the score files that are written
outcome = "cohesion"

# Single configuration object read by the cells below
params = {
    # Settings handed to the TF-IDF vectorizer
    "tfidf": {
        "ngram_range": (2, 4),
        "min_df": 5,
        "max_df": 0.9,
        "sublinear_tf": True,
    },
    # One entry per classifier; the keys select which model gets built
    "models": {
        "lr": {
            "C": 0.1,
            "seed": 42,
            "max_iter": 1000,
            "penalty": "l2",
            "fit_intercept": True,
            "solver": "saga",
        },
        "knn": {
            "n_neighbors": 3,
            "n_jobs": 1,
        },
        "gd_svm": {
            "loss": "hinge",
            "penalty": "l2",
            "fit_intercept": True,
            "max_iter": 1000,
            "learning_rate": "optimal",
        },
        "nb": {},
    },
    # Where formatted folds are read from and where scores are written to
    "io": {
        "results_dir": "./results",
        "input_dir": "./data",
    },
    # Text clean-up: optionally swap the separator token for a blank line
    "data": {
        "sep2newlines": True,
        "sep_token": "</s>",
        "double_newlines": "\n\n",
    },
    # Optional synthetic-sample generation backed by GloVe embeddings
    "augmentation": {
        "add_synthetic": False,
        "aug_p": 0.2,
        "glove_file": "data/glove.6B.50d.txt",
        "glove_zip": "data/glove.6B.zip",
        "glove_url": "http://nlp.stanford.edu/data/glove.6B.zip",
    },
    # Seed shared by the stochastic estimators
    "random": {"seed": 42},
}

##### Load Formatted Data

In [3]:
# Read the pickled text folds and label folds for the selected outcome.
# NOTE(review): pickle.load can execute arbitrary code, so these files must
# come from a trusted source (presumably this project's own formatting step).
x_folds_path = os.path.join(params["io"]["input_dir"], f"{outcome}_X_folds.pkl")
y_folds_path = os.path.join(params["io"]["input_dir"], f"{outcome}_y_folds.pkl")

with open(x_folds_path, mode="rb") as x_handle:
    X_folds = pickle.load(x_handle)

with open(y_folds_path, mode="rb") as y_handle:
    y_folds = pickle.load(y_handle)

##### Define a Download Function

In [4]:
def download_file(url, filename, chunk_size=1024 * 1024, timeout=60):
    """Download `url` and write the response body to `filename`.

    The body is streamed to disk in chunks so a large archive is never held
    in memory all at once. The HTTP status is checked before the target file
    is opened, so an error response does not leave an empty or bogus file.

    Args:
        url: Address of the resource to fetch.
        filename: Path of the file to create or overwrite.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTML error page as the download
        response.raise_for_status()

        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                # Skip keep-alive chunks, which arrive empty
                if chunk:
                    file.write(chunk)

##### Download GloVe Embeddings if Non-Existent

In [5]:
# Fetch and unpack the GloVe archive only when the embeddings file is missing
if not os.path.isfile(params["augmentation"]["glove_file"]):
    print("Downloading glove embeddings file...")
    os.makedirs(params["io"]["input_dir"], exist_ok=True)

    # The URL lives in the config; the bare name `url` was never defined
    download_file(
        params["augmentation"]["glove_url"], params["augmentation"]["glove_zip"]
    )

    with zipfile.ZipFile(params["augmentation"]["glove_zip"], "r") as zip_ref:
        zip_ref.extractall(params["io"]["input_dir"])

    # The archive is no longer needed once its contents are extracted
    os.remove(params["augmentation"]["glove_zip"])
    print("Finished downloading glove embeddings file.")

##### Check Data Shape

In [6]:
# Both mappings must describe the same number of folds before they are
# flattened into parallel lists (position i in X pairs with position i in y).
# NOTE(review): pairing relies on both mappings iterating in the same order.
assert len(X_folds) == len(y_folds), "Expected the same number of folds in X and y."
X, y = [*X_folds.values()], [*y_folds.values()]

##### Check Number of Samples

In [7]:
# Tally one per label across every fold
num_samples = sum(1 for fold_labels in y for _ in fold_labels)
print(f"Total number of samples: {num_samples}.")

Total number of samples: 117.


##### Check Target Prevalence

In [8]:
# Mean of the labels pooled over all folds, rounded to three decimals
print(f"Target prevalence: {round(np.mean(np.concatenate(y)), 3)}.")

Target prevalance: 0.607.


##### Vectorize Text and Fit Model to Each Fold

In [9]:
# Train every configured model with leave-one-fold-out cross-validation and
# collect, per model, the scores predicted for each held-out fold together
# with that fold's true labels.

# Dedicated, seeded generator so the shuffles below are reproducible
rng = np.random.default_rng(params["random"]["seed"])

# The augmenter does not depend on the model or the fold, so build it once
# instead of once per iteration.
aug = None
if params["augmentation"]["add_synthetic"]:
    # Synthetic text comes from GloVe-based word insertion; a contextual
    # alternative would be ContextualWordEmbsAug(model_path="roberta-large",
    # action="insert").
    aug = WordEmbsAug(
        model_type="glove",
        model_path=params["augmentation"]["glove_file"],
        action="insert",
        aug_p=params["augmentation"]["aug_p"],
    )

y_probs, y_trues = {}, {}
for model in params["models"].keys():
    model_params = params["models"][model]
    y_probs[model], y_trues[model] = [], []
    for i in range(len(X)):
        print(f"Fitting model: {model} using fold {i} as out of fold data.")

        # Define TFIDF vectorizer (fresh per fold so nothing leaks from the held-out fold)
        tfidf_vectorizer = TfidfVectorizer(
            sublinear_tf=params["tfidf"]["sublinear_tf"],
            strip_accents="unicode",
            analyzer="word",
            token_pattern=r"(?u)\b\w\w+\b",
            stop_words="english",
            ngram_range=params["tfidf"]["ngram_range"],
            norm="l2",
            min_df=params["tfidf"]["min_df"],
            max_df=params["tfidf"]["max_df"],
            smooth_idf=False,
            lowercase=True,
        )

        # Identify train folds (everything except fold i) and shuffle samples
        X_train = np.concatenate(X[0:i] + X[i + 1 :], axis=0)
        y_train = np.concatenate(y[0:i] + y[i + 1 :], axis=0)
        indices = rng.permutation(len(y_train))
        X_train, y_train = X_train[indices], y_train[indices]

        # Identify test fold
        X_test, y_test = X[i], y[i]

        # Optionally replace sep token with double newlines
        if params["data"]["sep2newlines"]:
            sep_token = params["data"]["sep_token"]
            replacement = params["data"]["double_newlines"]
            X_train = np.array(
                [sample.replace(sep_token, replacement) for sample in X_train]
            )
            X_test = np.array(
                [sample.replace(sep_token, replacement) for sample in X_test]
            )

        # Vectorize text; the vocabulary is learned from the training folds only
        train_features = tfidf_vectorizer.fit_transform(X_train)
        test_features = tfidf_vectorizer.transform(X_test)

        # Optionally add synthetic samples
        if aug is not None:
            # Only augment non-empty samples
            # In general, we should check for empty samples and possibly remove them
            X_train_aug = []
            y_train_aug = []
            for sample, target in zip(X_train, y_train):
                if sample != "":
                    X_train_aug.append(aug.augment(sample)[0])
                    y_train_aug.append(target)

            # Nothing to add when every training sample was empty
            if X_train_aug:
                # Vectorize synthetic text with the already-fitted vocabulary
                train_features_aug = tfidf_vectorizer.transform(np.array(X_train_aug))

                # Combine original and synthetic text; request CSR explicitly
                # because the stacked matrix is row-indexed right below
                train_features = sp.sparse.vstack(
                    [train_features, train_features_aug], format="csr"
                )
                y_train = np.concatenate((y_train, np.asarray(y_train_aug)), axis=0)

                # Shuffle combined training data
                new_indices = rng.permutation(len(y_train))
                train_features, y_train = (
                    train_features[new_indices],
                    y_train[new_indices],
                )

        # Define model to fit
        if model == "lr":
            clf = LogisticRegression(
                solver=model_params["solver"],
                fit_intercept=model_params["fit_intercept"],
                max_iter=model_params["max_iter"],
                penalty=model_params["penalty"],
                C=model_params["C"],
                class_weight=None,
                random_state=params["random"]["seed"],
            )
        elif model == "knn":
            clf = KNeighborsClassifier(
                n_neighbors=model_params["n_neighbors"],
                n_jobs=model_params["n_jobs"],
            )
            train_features = train_features.toarray()
            test_features = test_features.toarray()
        elif model == "nb":
            clf = GaussianNB()
            # GaussianNB needs dense input
            train_features = train_features.toarray()
            test_features = test_features.toarray()
        elif model == "gd_svm":
            clf = SGDClassifier(
                loss=model_params["loss"],
                penalty=model_params["penalty"],
                fit_intercept=model_params["fit_intercept"],
                max_iter=model_params["max_iter"],
                learning_rate=model_params["learning_rate"],
                random_state=params["random"]["seed"],
            )
        else:
            raise ValueError(f"Got unexpected model key: {model}.")

        # Fit model
        clf.fit(train_features, y_train)

        # Predict on test fold
        if model == "gd_svm":
            # NOTE(review): with hinge loss there is no predict_proba, so these
            # are hard class labels stored alongside the other models'
            # probabilities. clf.decision_function would give continuous
            # scores; confirm what the downstream evaluation expects first.
            y_prob = clf.predict(test_features)
        else:
            # Probability of the class in column 1 (second entry of clf.classes_)
            y_prob = clf.predict_proba(test_features)[:, 1]

        # Save scores and labels
        y_probs[model].append(y_prob)
        y_trues[model].append(y_test)

Fitting model: lr using fold 0 as out of fold data.
Fitting model: lr using fold 1 as out of fold data.
Fitting model: lr using fold 2 as out of fold data.
Fitting model: lr using fold 3 as out of fold data.
Fitting model: lr using fold 4 as out of fold data.
Fitting model: lr using fold 5 as out of fold data.
Fitting model: lr using fold 6 as out of fold data.
Fitting model: lr using fold 7 as out of fold data.
Fitting model: lr using fold 8 as out of fold data.
Fitting model: lr using fold 9 as out of fold data.
Fitting model: lr using fold 10 as out of fold data.
Fitting model: knn using fold 0 as out of fold data.
Fitting model: knn using fold 1 as out of fold data.
Fitting model: knn using fold 2 as out of fold data.
Fitting model: knn using fold 3 as out of fold data.
Fitting model: knn using fold 4 as out of fold data.
Fitting model: knn using fold 5 as out of fold data.
Fitting model: knn using fold 6 as out of fold data.
Fitting model: knn using fold 7 as out of fold data.
Fit

##### Save Model Scores on Test Folds and True Labels

In [10]:
# Persist the per-model true labels and out-of-fold scores for this outcome.
# Create the results directory first so a fresh checkout does not fail on open().
os.makedirs(params["io"]["results_dir"], exist_ok=True)

with open(os.path.join(params["io"]["results_dir"], f"{outcome}_sklearn_y_trues.pkl"), "wb") as f:
    pickle.dump(y_trues, f)

with open(os.path.join(params["io"]["results_dir"], f"{outcome}_sklearn_y_probs.pkl"), "wb") as f:
    pickle.dump(y_probs, f)