This notebook will generate the corpus that we'll use to train the new __DMC__.

In [None]:
import json
import os
import pickle
import sys

import numpy as np

module_path = os.path.abspath(os.path.join("../../.."))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.config import PATH_BEST_MODELS  # noqa: E402
from src.config import PATH_INTERIM_CORPUS  # noqa: E402
from src.config import END_OF_POST_TOKEN, PICKLE_PROTOCOL  # noqa: E402
from src.models.model import EarlyModel, SimpleStopCriterion  # noqa: E402

In [None]:
# Directory holding the interim depression datasets: the raw corpus is read from it
# and the generated DMC files are written to it.
INTERIM_DATASETS_PATH = os.path.join(PATH_INTERIM_CORPUS, "xml/depression")

In [None]:
# Collect the posts of the users labelled "positive" in the raw train corpus.
# Each line is split into a label and the rest of the line (the user's posts).
CORPUS_PATH = os.path.join(INTERIM_DATASETS_PATH, "depression-train-raw.txt")
posts_list = []
# Read with an explicit encoding so the result does not depend on the platform
# default; the corpora derived from this file are written as UTF-8 as well.
# NOTE(review): assumes the raw corpus itself is UTF-8 encoded -- confirm.
with open(CORPUS_PATH, encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file), which
        # would otherwise make the two-way unpacking below fail.
        if not line.strip():
            continue
        label, posts = line.split(maxsplit=1)
        if label == "positive":
            posts_list.append(posts.rstrip())

In [None]:
# Manually reviewed users, as a list of (post_index, stop_point) tuples: the first element is the index of
# the user's entry in `posts_list` and the second is the manually labelled point at which the reading should stop.
# A stop point of -1 means that the user does not seem to be a positive user based on the posts alone.
reviews = [
    (71, 3),
    (96, 1),
    (108, 23),
    (111, 9),
    (3, 12),
    (113, 36),
    (54, 1),
    (56, 8),
    (104, 1),
    (32, 1),
    (4, 1),
    (99, 117),
    (78, 207),
    (75, 518),
    (118, 196),
    (55, 89),
    (17, 6),
    (125, 21),
    (70, 1),
    (122, 240),
    (126, 1157),
    (59, -1),
    (16, -1),
    (46, -1),
    (74, -1),
    (24, 5),
    (129, 36),
    (94, 19),
    (97, 26),
    (36, 20),
    (107, 61),
    (58, 38),
    (26, -1),
    (47, 5),
    (69, 13),
    (61, 6),
    (21, -1),
    (28, 109),
    (7, -1),
    (133, 406),
    (132, 11),
    (72, 39),
    (41, 32),
    (20, 55),
    (11, 114),
    (80, 240),
    (76, 12),
    (90, -1),
    (35, 23),
    (77, 6),
]

In [None]:
# Keep only the reviewed users that have an actual stop point; a stop point of -1
# flags a user that doesn't seem positive.
reviews = [(idx, stop_time) for idx, stop_time in reviews if stop_time != -1]

-----------------------------------------------------

Generate a corpus to train the decision tree DMC model.

For this corpus, generate a sample for each post `i` from a user.
Each sample has all posts up to `i`.

For some representations, the information gets "diluted" as the number of posts increases, so we cannot assign the label `1` to every sample after the cutoff point `i`: this could cause the model to learn things that are not appropriate.
But if we only label as positive the sample at the cutoff point, we will obtain a very unbalanced corpus.
Therefore, besides the sample at the cutoff point, we also label the next nine samples as positive.

In [None]:
# Number of extra samples, after the manually chosen stop point, that are still labelled as positive.
NUMBER_POSTS_AFTER_STOP_POINT = 9
# Fraction of the reviewed users assigned to the train split; the rest go to the test split.
TRAIN_TEST_SPLIT = 0.5
# Position in `reviews` of the first user that belongs to the test split.
TRAIN_SPLIT = int(TRAIN_TEST_SPLIT * len(reviews))

In [None]:
# Build the raw (text) train corpus: one "<label>\t<posts>" line per partial
# history of each user in the train split.
raw_dmc_corpus_train = os.path.join(
    INTERIM_DATASETS_PATH, "depression-dmc-train-raw.txt"
)
if not os.path.exists(raw_dmc_corpus_train):
    # Write to a temporary file and rename it once it is complete, so that an
    # interrupted run cannot leave a truncated corpus behind that later runs would
    # mistake for a finished one. The file is opened once, not once per sample.
    partial_corpus_path = raw_dmc_corpus_train + ".tmp"
    with open(partial_corpus_path, "w", encoding="utf-8") as f:
        for idx, stop_time in reviews[:TRAIN_SPLIT]:
            upper_limit = stop_time + NUMBER_POSTS_AFTER_STOP_POINT
            current_posts = posts_list[idx].split(END_OF_POST_TOKEN)[: upper_limit + 1]
            for i in range(1, min(upper_limit, len(current_posts)) + 1):
                # Samples from the stop point onwards are the positive ones.
                label = "positive" if i >= stop_time else "negative"
                concatenated_post = END_OF_POST_TOKEN.join(current_posts[:i])
                f.write(f"{label}\t{concatenated_post}\n")
    os.replace(partial_corpus_path, raw_dmc_corpus_train)
else:
    print(f"The corpus {raw_dmc_corpus_train} was already created")

In [None]:
# Build the raw (text) test corpus: one "<label>\t<posts>" line per partial
# history of each user in the test split.
raw_dmc_corpus_test = os.path.join(INTERIM_DATASETS_PATH, "depression-dmc-test-raw.txt")
if not os.path.exists(raw_dmc_corpus_test):
    # Write to a temporary file and rename it once it is complete, so that an
    # interrupted run cannot leave a truncated corpus behind that later runs would
    # mistake for a finished one. The file is opened once, not once per sample.
    partial_corpus_path = raw_dmc_corpus_test + ".tmp"
    with open(partial_corpus_path, "w", encoding="utf-8") as f:
        for idx, stop_time in reviews[TRAIN_SPLIT:]:
            upper_limit = stop_time + NUMBER_POSTS_AFTER_STOP_POINT
            current_posts = posts_list[idx].split(END_OF_POST_TOKEN)[: upper_limit + 1]
            for i in range(1, min(upper_limit, len(current_posts)) + 1):
                # Samples from the stop point onwards are the positive ones.
                label = "positive" if i >= stop_time else "negative"
                concatenated_post = END_OF_POST_TOKEN.join(current_posts[:i])
                f.write(f"{label}\t{concatenated_post}\n")
    os.replace(partial_corpus_path, raw_dmc_corpus_test)
else:
    print(f"The corpus {raw_dmc_corpus_test} was already created")

-------

In [None]:
# Names of the DMC features, listed in the same order in which the feature
# vectors are assembled further down in this notebook.
feature_names = [
    "current_probability",
    "avg_last_10_probabilities",
    "avg_last_5_probabilities",
    "median_last_10_probabilities",
    "current_delay",
    "num_words_information_gain_percentile_0_01",
    "num_words_chi2_percentile_0_015",
    "current_cpi_decision",
    "avg_last_10_cpi_decision",
]

# Store the names next to the generated corpora.
dmc_corpus_feature_names = os.path.join(
    INTERIM_DATASETS_PATH, "depression-dmc-feature-names.json"
)
with open(dmc_corpus_feature_names, mode="w", encoding="utf-8") as names_file:
    json.dump(feature_names, names_file, indent="\t")

In [None]:
# Word collections used for the word-count features. The paths are relative, so
# they are resolved against the current working directory.
# The files are read explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open("depression_information_gain_words.json", encoding="utf-8") as fp:
    depression_information_gain_words = json.load(fp)

with open("depression_chi2_words.json", encoding="utf-8") as fp:
    depression_chi2_words = json.load(fp)

In [None]:
# Base model used to obtain the probabilities of the partial inputs.
simple_criterion = SimpleStopCriterion(threshold=0.07, min_delay=1)

# TODO: You need to update this.
model_path = os.path.join(
    PATH_BEST_MODELS,
    "positive_f1/reddit/depression/selected_models/03_model_SVC.json",
)

model = EarlyModel(
    stop_criterion=simple_criterion,
    path_to_model_information=model_path,
)
# Start from a clean model state before the first prediction.
model.clear_model_state()

In [None]:
# Build the train features: one sample per partial history of each train user.

# Build the word lookups once: membership tests against a set do not need to scan
# the whole word collection for every token.
# NOTE(review): assumes both JSON files hold collections of strings -- confirm.
information_gain_vocabulary = set(depression_information_gain_words)
chi2_vocabulary = set(depression_chi2_words)

y_train = []
x_train = []
groups_train = []
for idx, stop_time in reviews[:TRAIN_SPLIT]:
    # Outputs of the base model for (at most) the last 10 samples of this user.
    last_probabilities = []
    last_decisions = []
    upper_limit = stop_time + NUMBER_POSTS_AFTER_STOP_POINT
    current_posts = posts_list[idx].split(END_OF_POST_TOKEN)[: upper_limit + 1]
    for i in range(1, min(upper_limit, len(current_posts)) + 1):
        # Samples from the stop point onwards are the positive ones.
        label = 1 if i >= stop_time else 0
        y_train.append(label)
        # Tokenize the partial history once and reuse the tokens for both word counts.
        words = " ".join(current_posts[:i]).split()
        concatenated_post = END_OF_POST_TOKEN.join(current_posts[:i])
        model.predict(documents_test=[concatenated_post], delay=i)
        current_decision = model.predictions.item()
        current_probability = model.probabilities.item()
        # Append the newest values and keep only the 10 most recent ones.
        last_probabilities = (last_probabilities + [current_probability])[-10:]
        last_decisions = (last_decisions + [current_decision])[-10:]
        current_features = [
            current_probability,  # CURRENT_PROBABILITY
            np.average(last_probabilities).item(),  # AVG_LAST_10_PROBABILITIES
            np.average(last_probabilities[-5:]).item(),  # AVG_LAST_5_PROBABILITIES
            np.median(last_probabilities).item(),  # MEDIAN_LAST_10_PROBABILITIES
            i,  # CURRENT_DELAY
            sum(
                1 if w in information_gain_vocabulary else 0 for w in words
            ),  # NUM_WORDS_INFORMATION_GAIN_PERCENTILE_0_01
            sum(
                1 if w in chi2_vocabulary else 0 for w in words
            ),  # NUM_WORDS_CHI2_PERCENTILE_0_015
            current_decision,  # CURRENT_CPI_DECISION
            np.average(last_decisions).item(),  # AVG_LAST_10_CPI_DECISION
        ]
        x_train.append(current_features)
        # The index of the user in `posts_list` identifies the group of the sample.
        groups_train.append(idx)
        # Reset the model state after every prediction, since each sample already
        # contains the whole partial history.
        model.clear_model_state()

In [None]:
# Number of positive samples per user (group) in the train split.
cant_pos_train = {}
for position, group_id in enumerate(groups_train):
    cant_pos_train[group_id] = cant_pos_train.get(group_id, 0) + y_train[position]
cant_pos_train

In [None]:
# Turn the accumulated train features, labels and groups into numpy arrays.
x_train, y_train, groups_train = (
    np.array(values) for values in (x_train, y_train, groups_train)
)

In [None]:
# Persist the train split as a single (features, labels, groups) tuple.
dmc_corpus_train = os.path.join(INTERIM_DATASETS_PATH, "depression-dmc-train.pkl")
with open(dmc_corpus_train, mode="wb") as pickle_file:
    pickle.dump(
        (x_train, y_train, groups_train),
        pickle_file,
        protocol=PICKLE_PROTOCOL,
    )

------

In [None]:
# Build the test features: one sample per partial history of each test user.

# Build the word lookups once: membership tests against a set do not need to scan
# the whole word collection for every token.
# NOTE(review): assumes both JSON files hold collections of strings -- confirm.
information_gain_vocabulary = set(depression_information_gain_words)
chi2_vocabulary = set(depression_chi2_words)

y_test = []
x_test = []
groups_test = []
for idx, stop_time in reviews[TRAIN_SPLIT:]:
    # Outputs of the base model for (at most) the last 10 samples of this user.
    last_probabilities = []
    last_decisions = []
    upper_limit = stop_time + NUMBER_POSTS_AFTER_STOP_POINT
    current_posts = posts_list[idx].split(END_OF_POST_TOKEN)[: upper_limit + 1]
    for i in range(1, min(upper_limit, len(current_posts)) + 1):
        # Samples from the stop point onwards are the positive ones.
        label = 1 if i >= stop_time else 0
        y_test.append(label)
        # Tokenize the partial history once and reuse the tokens for both word counts.
        words = " ".join(current_posts[:i]).split()
        concatenated_post = END_OF_POST_TOKEN.join(current_posts[:i])
        model.predict(documents_test=[concatenated_post], delay=i)
        current_decision = model.predictions.item()
        current_probability = model.probabilities.item()
        # Append the newest values and keep only the 10 most recent ones.
        last_probabilities = (last_probabilities + [current_probability])[-10:]
        last_decisions = (last_decisions + [current_decision])[-10:]
        current_features = [
            current_probability,  # CURRENT_PROBABILITY
            np.average(last_probabilities).item(),  # AVG_LAST_10_PROBABILITIES
            np.average(last_probabilities[-5:]).item(),  # AVG_LAST_5_PROBABILITIES
            np.median(last_probabilities).item(),  # MEDIAN_LAST_10_PROBABILITIES
            i,  # CURRENT_DELAY
            sum(
                1 if w in information_gain_vocabulary else 0 for w in words
            ),  # NUM_WORDS_INFORMATION_GAIN_PERCENTILE_0_01
            sum(
                1 if w in chi2_vocabulary else 0 for w in words
            ),  # NUM_WORDS_CHI2_PERCENTILE_0_015
            current_decision,  # CURRENT_CPI_DECISION
            np.average(last_decisions).item(),  # AVG_LAST_10_CPI_DECISION
        ]
        x_test.append(current_features)
        # The index of the user in `posts_list` identifies the group of the sample.
        groups_test.append(idx)
        # Reset the model state after every prediction, since each sample already
        # contains the whole partial history.
        model.clear_model_state()

In [None]:
# Number of positive samples per user (group) in the test split.
cant_pos_test = {}
for position, group_id in enumerate(groups_test):
    cant_pos_test[group_id] = cant_pos_test.get(group_id, 0) + y_test[position]
cant_pos_test

In [None]:
# Turn the accumulated test features, labels and groups into numpy arrays.
x_test, y_test, groups_test = (
    np.array(values) for values in (x_test, y_test, groups_test)
)

In [None]:
# Persist the test split as a single (features, labels, groups) tuple.
dmc_corpus_test = os.path.join(INTERIM_DATASETS_PATH, "depression-dmc-test.pkl")
with open(dmc_corpus_test, mode="wb") as pickle_file:
    pickle.dump(
        (x_test, y_test, groups_test),
        pickle_file,
        protocol=PICKLE_PROTOCOL,
    )