In [None]:
import json
import os
import sys

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile, chi2, mutual_info_classif

# Make the directory three levels above the working directory importable,
# so the `src` package imported below can be resolved.
module_path = os.path.abspath("../../..")
if module_path not in sys.path:
    sys.path.append(module_path)

from src.config import END_OF_POST_TOKEN, PATH_INTERIM_CORPUS  # noqa: E402

In [None]:
# Name of the corpus to analyse; it selects the input file and prefixes the
# names of the JSON word lists written further down.
corpus_name = "depression"

In [None]:
# Raw training file for the selected corpus, located under the interim corpus
# directory (PATH_INTERIM_CORPUS from src.config).
CORPUS_PATH = os.path.join(
    PATH_INTERIM_CORPUS, f"xml/{corpus_name}/{corpus_name}-train-raw.txt"
)

In [None]:
# Parse the training corpus. Each non-empty line is "<label> <document>", where
# the document is a sequence of posts separated by END_OF_POST_TOKEN.
labels_train = []
documents_train = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale. NOTE(review): assumes the corpus is UTF-8, like the files this
# notebook writes — confirm.
with open(CORPUS_PATH, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # A blank line (e.g. a stray trailing newline) holds no sample and
            # would otherwise break the two-way unpacking below.
            continue
        label, document = line.split(maxsplit=1)
        # Binary target: 1 for "positive", 0 for any other label.
        labels_train.append(1 if label == "positive" else 0)
        # Replace the post separator with a space so each line becomes a
        # single flat document.
        posts = " ".join(document.split(END_OF_POST_TOKEN))
        documents_train.append(posts)

In [None]:
# Keyword arguments forwarded to CountVectorizer. With binary=True the matrix
# records term presence/absence instead of raw counts.
cv_params = dict(binary=True)

In [None]:
# Bag-of-words vectorizer configured from cv_params.
count_vect = CountVectorizer(**cv_params)

In [None]:
# Learn the vocabulary from the training documents and build the
# document-term matrix in one pass.
x_train_counts = count_vect.fit_transform(documents_train)

-----

In [None]:
# Score every vocabulary term by mutual information with the class label and
# keep only the top-scoring slice.
# NOTE(review): SelectPercentile's `percentile` is on a 0-100 scale, so 0.01
# keeps roughly 0.01% of the features (not 1%) — confirm this is intended.
selector_01 = SelectPercentile(mutual_info_classif, percentile=0.01)
selector_01.fit(x_train_counts, labels_train)

In [None]:
# Per-feature mask from the fitted selector; truthy entries mark the columns
# that were kept.
is_selected_01 = selector_01.get_support()

In [None]:
# Column indices of the features retained by the mutual-information selector.
selected_variables_01 = [
    column for column, keep in enumerate(is_selected_01) if keep
]

In [None]:
# Show the terms selected by mutual information.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    vocabulary = count_vect.get_feature_names_out()
else:
    vocabulary = count_vect.get_feature_names()
for i in selected_variables_01:
    print(vocabulary[i])

In [None]:
# Map the selected column indices back to vocabulary terms and save them.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    vocabulary = count_vect.get_feature_names_out()
else:
    vocabulary = count_vect.get_feature_names()
# str() keeps every entry a plain Python string, so the list stays
# JSON-serializable whichever container type the vectorizer returns.
words_most_information_gain_01 = [str(vocabulary[i]) for i in selected_variables_01]

with open(f"{corpus_name}_information_gain_words.json", "w", encoding="utf-8") as fp:
    json.dump(fp=fp, obj=words_most_information_gain_01, indent="\t")

-----

In [None]:
# Score every vocabulary term with the chi-squared statistic against the class
# label and keep only the top-scoring slice.
# NOTE(review): SelectPercentile's `percentile` is on a 0-100 scale, so 0.015
# keeps roughly 0.015% of the features (not 1.5%) — confirm this is intended.
selector_015_chi2 = SelectPercentile(chi2, percentile=0.015)
selector_015_chi2.fit(x_train_counts, labels_train)

In [None]:
# Per-feature mask from the fitted chi-squared selector; truthy entries mark
# the columns that were kept.
is_selected_015_chi2 = selector_015_chi2.get_support()

In [None]:
# Column indices of the features retained by the chi-squared selector.
selected_variables_015_chi2 = [
    column for column, keep in enumerate(is_selected_015_chi2) if keep
]

In [None]:
# Show the terms selected by the chi-squared test.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    vocabulary = count_vect.get_feature_names_out()
else:
    vocabulary = count_vect.get_feature_names()
for i in selected_variables_015_chi2:
    print(vocabulary[i])

In [None]:
# Map the chi-squared-selected column indices back to vocabulary terms and
# save them.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    vocabulary = count_vect.get_feature_names_out()
else:
    vocabulary = count_vect.get_feature_names()
# str() keeps every entry a plain Python string, so the list stays
# JSON-serializable whichever container type the vectorizer returns.
words_chi2_015 = [str(vocabulary[i]) for i in selected_variables_015_chi2]

with open(f"{corpus_name}_chi2_words.json", "w", encoding="utf-8") as fp:
    json.dump(fp=fp, obj=words_chi2_015, indent="\t")