In [1]:
import os
import sys

import sklearn.metrics as metrics
from pyss3 import SS3
from pyss3.server import Live_Test
from pyss3.util import Evaluation, span

# Put the directory three levels up on the import path; the `src` package
# imported right below is expected to live there.
module_path = os.path.abspath("../../..")
if module_path not in sys.path:
    sys.path.append(module_path)

from src.config import END_OF_POST_TOKEN, PATH_INTERIM_CORPUS  # noqa: E402

In [2]:
# Corpus selection: these two values pick the sub-directory of
# PATH_INTERIM_CORPUS that is read and are also used to name the SS3 models.
CORPUS_KIND = "reddit"
CORPUS_NAME = "depression"

In [3]:
# Both raw splits live in <PATH_INTERIM_CORPUS>/<CORPUS_KIND>/<CORPUS_NAME>/.
corpus_dir = os.path.join(PATH_INTERIM_CORPUS, CORPUS_KIND, CORPUS_NAME)
input_file_path_train = os.path.join(corpus_dir, f"{CORPUS_NAME}-train-raw.txt")
input_file_path_test = os.path.join(corpus_dir, f"{CORPUS_NAME}-test-raw.txt")

In [4]:
# Load the training split. Each line holds one document in the form
# "<label> <posts separated by END_OF_POST_TOKEN>".
y_train = []
x_train = []
# Pin the encoding: without it open() falls back to the locale's default, so
# the same corpus could be decoded differently (or fail) on another machine.
# NOTE(review): assumes the interim corpus was written as UTF-8 — confirm
# against the script that generates it.
with open(input_file_path_train, encoding="utf-8") as f:
    for line in f:
        # The first whitespace-delimited field is the label; the rest is the text.
        label, document = line.split(maxsplit=1)
        y_train.append(label)
        # Replace the post separator with a space to get one flat text per document.
        posts = " ".join(document.split(END_OF_POST_TOKEN))
        x_train.append(posts)

In [5]:
# Load the test split. Each line holds one document in the form
# "<label> <posts separated by END_OF_POST_TOKEN>".
y_test = []
x_test = []
# Pin the encoding: without it open() falls back to the locale's default, so
# the same corpus could be decoded differently (or fail) on another machine.
# NOTE(review): assumes the interim corpus was written as UTF-8 — confirm
# against the script that generates it.
with open(input_file_path_test, encoding="utf-8") as f:
    for line in f:
        # The first whitespace-delimited field is the label; the rest is the text.
        label, document = line.split(maxsplit=1)
        y_test.append(label)
        # Replace the post separator with a space to get one flat text per document.
        posts = " ".join(document.split(END_OF_POST_TOKEN))
        x_test.append(posts)

First, train a base SS3 model. The hyper-parameters found for it can be reused later for the more complex models.

In [6]:
# Base SS3 classifier, named after the corpus kind and corpus name.
clf = SS3(name=f"{CORPUS_KIND}-{CORPUS_NAME}")

In [7]:
# Show the hyper-parameter values the newly created model starts with
# (the fourth returned value is not needed here).
s, l, p, _ = clf.get_hyperparameters()

print(f"Smoothness(s): {s}")
print(f"Significance(l): {l}")
print(f"Sanction(p): {p}")

Smoothness(s): 0.45
Significance(l): 0.5
Sanction(p): 1


In [8]:
# Train the base model on the training documents and their labels.
clf.fit(x_train, y_train)

Training on 'positive': 100%|███████████████████████████████████████████| 2/2 [00:25<00:00, 12.82s/it]


In [9]:
# Sanity check: list the categories the model knows after training.
clf.get_categories()

['negative', 'positive']

In [10]:
# Subset of values to search for each hyper-parameter.
# Six values per hyper-parameter give 6 * 6 * 6 = 216 combinations to evaluate.
s_vals = span(0.2, 0.8, 6)  # [0.2 , 0.32, 0.44, 0.56, 0.68, 0.8]
l_vals = span(0.1, 2, 6)  # [0.1 , 0.48, 0.86, 1.24, 1.62, 2]
p_vals = span(0.5, 2, 6)  # [0.5, 0.8, 1.1, 1.4, 1.7, 2]

In [11]:
# Cross-validated grid search over the training data only; the test split is
# not used for hyper-parameter selection.
k_fold = 12
# The call returns four values; only the best s, l and p are kept.
best_s, best_l, best_p, _ = Evaluation.grid_search(
    clf,
    x_train,
    y_train,
    k_fold=k_fold,
    s=s_vals,
    l=l_vals,
    p=p_vals,
    # NOTE(review): presumably stores the evaluations so they can be queried
    # afterwards via Evaluation.get_best_hyperparameters — confirm.
    cache=True,
    tag=f"grid search ({k_fold}-fold)",
)

[fold 12/12] Grid search: 100%|█████████████████████████████████████| 216/216 [03:29<00:00,  1.03it/s]


In [12]:
# Report the combination returned by the grid search.
print("The hyperparameter values that obtained the best accuracy are:")
print(f"Smoothness(s): {best_s}")
print(f"Significance(l): {best_l}")
print(f"Sanction(p): {best_p}")

The hyperparameter values that obtained the best accuracy are:
Smoothness(s): 0.44
Significance(l): 0.48
Sanction(p): 0.5


In [13]:
# Re-select the best combination by macro-averaged F1 instead of accuracy.
# This overwrites best_s, best_l and best_p from the grid-search cell.
# NOTE(review): no tag is passed, so this presumably reads the results of the
# most recent grid search — confirm against the PySS3 documentation.
best_s, best_l, best_p, _ = Evaluation.get_best_hyperparameters(
    metric="f1-score", metric_target="macro avg"
)

print(f"s={best_s:.2f}, l={best_l:.2f}, and p={best_p:.2f}")

s=0.44, l=0.48, and p=0.50


In [14]:
# Apply the selected hyper-parameters to the already-trained base model
# (the model is not re-fitted in this notebook after this call).
clf.set_hyperparameters(s=best_s, l=best_l, p=best_p)

In [15]:
# Evaluate the base model, with the selected hyper-parameters, on the test set.
y_pred = clf.predict(x_test)

classification_report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
confusion_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

# One print call, report first and matrix second, each on its own line(s).
print(classification_report, confusion_matrix, sep="\n")

Classification: 100%|███████████████████████████████████████████| 1121/1121 [00:00<00:00, 2238.31it/s]

              precision    recall  f1-score   support

    negative       0.91      0.92      0.92       633
    positive       0.90      0.88      0.89       488

    accuracy                           0.90      1121
   macro avg       0.90      0.90      0.90      1121
weighted avg       0.90      0.90      0.90      1121

[[584  49]
 [ 58 430]]





Train an SS3 model using word 3-grams (`n_grams=3`). It is evaluated first with the default hyper-parameters and then with the hyper-parameters found earlier for the base model.

In [16]:
# Separate classifier for the n-gram experiment; the "-ngrams" suffix keeps
# its name distinct from the base model's.
clf_ngrams = SS3(name=f"{CORPUS_KIND}-{CORPUS_NAME}-ngrams")

In [17]:
# Train on the same data as the base model, this time with n_grams=3.
# `fit` is used here to match the base-model cell (`train` is its alias).
clf_ngrams.fit(x_train, y_train, n_grams=3)

Training on 'positive': 100%|███████████████████████████████████████████| 2/2 [00:46<00:00, 23.22s/it]


In [18]:
# Evaluate the n-gram model on the test set before the selected
# hyper-parameters are applied to it.
y_pred_ngrams = clf_ngrams.predict(x_test)

classification_report_ngrams = metrics.classification_report(
    y_true=y_test, y_pred=y_pred_ngrams
)
confusion_matrix_ngrams = metrics.confusion_matrix(
    y_true=y_test, y_pred=y_pred_ngrams
)

# One print call, report first and matrix second, each on its own line(s).
print(classification_report_ngrams, confusion_matrix_ngrams, sep="\n")

Classification: 100%|█████████████████████████████████████████████| 1121/1121 [01:57<00:00,  9.52it/s]

              precision    recall  f1-score   support

    negative       0.91      0.92      0.91       633
    positive       0.89      0.89      0.89       488

    accuracy                           0.90      1121
   macro avg       0.90      0.90      0.90      1121
weighted avg       0.90      0.90      0.90      1121

[[580  53]
 [ 55 433]]





In [19]:
# Reuse the best hyper-parameters found by cross-validation on the base model;
# no separate search is run for the n-gram model.
clf_ngrams.set_hyperparameters(s=best_s, l=best_l, p=best_p)

In [20]:
# Evaluate the n-gram model on the test set again, now that the
# hyper-parameters selected on the base model have been applied to it.
y_pred_ngrams_best = clf_ngrams.predict(x_test)

classification_report_ngrams_best = metrics.classification_report(
    y_true=y_test, y_pred=y_pred_ngrams_best
)
confusion_matrix_ngrams_best = metrics.confusion_matrix(
    y_true=y_test, y_pred=y_pred_ngrams_best
)

# One print call, report first and matrix second, each on its own line(s).
print(classification_report_ngrams_best, confusion_matrix_ngrams_best, sep="\n")

Classification: 100%|█████████████████████████████████████████████| 1121/1121 [01:58<00:00,  9.43it/s]

              precision    recall  f1-score   support

    negative       0.91      0.93      0.92       633
    positive       0.90      0.88      0.89       488

    accuracy                           0.91      1121
   macro avg       0.91      0.90      0.91      1121
weighted avg       0.91      0.91      0.91      1121

[[586  47]
 [ 57 431]]





In [21]:
# Persist the trained n-gram model.
# NOTE(review): no path is given, so the storage location is whatever PySS3
# uses by default — confirm where the model ends up.
clf_ngrams.save()

In [22]:
# Launch the interactive Live Test tool for the n-gram model on the test set.
# The port is passed by keyword so the bare number is self-explanatory.
Live_Test.run(clf_ngrams, x_test, y_test, port=9876)