In [1]:
import json
import os
import sys

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile, chi2, mutual_info_classif

# Put the directory three levels above the working directory on sys.path
# (once only) so that the `src` package imported below can be resolved.
module_path = os.path.abspath("../../..")
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

from src.config import END_OF_POST_TOKEN, PATH_INTERIM_CORPUS  # noqa: E402

In [2]:
# Corpus to analyse: this name selects the training file that is read and
# prefixes the JSON word lists written by the later cells.
corpus_name = "depression"

In [3]:
# Location of the raw training split for the chosen corpus, relative to the
# interim corpus directory configured in src.config.
train_file_relative = f"xml/{corpus_name}/{corpus_name}-train-raw.txt"
CORPUS_PATH = os.path.join(PATH_INTERIM_CORPUS, train_file_relative)

In [4]:
# Parse the raw training corpus: every non-empty line holds a label, then
# whitespace, then the document text.
labels_train = []
documents_train = []
# Read explicitly as UTF-8 so parsing does not depend on the platform's default
# locale encoding (the JSON outputs of this notebook are written as UTF-8 too).
with open(CORPUS_PATH, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Blank lines (e.g. a stray newline at the end of the file) carry no
            # label/document pair and would make the unpacking below fail.
            continue
        label, document = line.split(maxsplit=1)
        # Binary target: 1 for "positive", 0 for any other label.
        labels_train.append(1 if label == "positive" else 0)
        # Merge the posts into one text by replacing each post separator
        # with a single space.
        documents_train.append(document.replace(END_OF_POST_TOKEN, " "))

In [5]:
# Keyword arguments forwarded to CountVectorizer in the next cell.
cv_params = dict(binary=True)

In [6]:
# Build the vectorizer from the options collected in cv_params.
count_vect = CountVectorizer(**cv_params)

In [7]:
# Fit the vectorizer on the training documents and keep the transformed
# result; its columns are indexed by the vectorizer's vocabulary.
x_train_counts = count_vect.fit_transform(documents_train)

-----

In [8]:
# Rank every vocabulary term by mutual information with the label and keep
# the top `percentile` percent of them (here 0.01).
selector_01 = SelectPercentile(score_func=mutual_info_classif, percentile=0.01)
# Left as the last expression so the fitted selector is displayed by the cell.
selector_01.fit(X=x_train_counts, y=labels_train)

SelectPercentile(percentile=0.01,
                 score_func=<function mutual_info_classif at 0x7f0fd433e3b0>)

In [9]:
# Per-feature selection mask from the mutual-information selector; the next
# cell collects the positions whose entry is truthy.
is_selected_01 = selector_01.get_support()

In [10]:
# Column indices of the features kept by the mutual-information selector.
selected_variables_01 = [
    position for position, keep in enumerate(is_selected_01) if keep
]

In [11]:
# Show the terms kept by the mutual-information selector.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() when it exists and fall
# back to the old accessor on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    vocabulary = count_vect.get_feature_names_out()
else:
    vocabulary = count_vect.get_feature_names()
for i in selected_variables_01:
    print(vocabulary[i])

2015
anxiety
california
china
congress
depressed
depression
depressive
diagnosed
footage
global
gop
iran
iraq
isis
launch
medication
meds
nasa
obama
released
suicidal
technology
therapist


In [12]:
# Persist the terms kept by the mutual-information selector as a JSON list.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() when it exists and fall
# back to the old accessor on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    vocabulary = count_vect.get_feature_names_out()
else:
    vocabulary = count_vect.get_feature_names()
# str() keeps the entries plain Python strings whichever accessor was used,
# so they are always JSON-serialisable.
words_most_information_gain_01 = [str(vocabulary[i]) for i in selected_variables_01]

with open(f"{corpus_name}_information_gain_words.json", "w", encoding="utf-8") as fp:
    json.dump(fp=fp, obj=words_most_information_gain_01, indent="\t")

-----

In [13]:
# Rank every vocabulary term with the chi-squared score function and keep the
# top `percentile` percent of them (here 0.015).
selector_015_chi2 = SelectPercentile(score_func=chi2, percentile=0.015)
# Left as the last expression so the fitted selector is displayed by the cell.
selector_015_chi2.fit(X=x_train_counts, y=labels_train)

SelectPercentile(percentile=0.015, score_func=<function chi2 at 0x7f0fd4949710>)

In [14]:
# Per-feature selection mask from the chi-squared selector; the next cell
# collects the positions whose entry is truthy.
is_selected_015_chi2 = selector_015_chi2.get_support()

In [15]:
# Column indices of the features kept by the chi-squared selector.
selected_variables_015_chi2 = [
    position for position, keep in enumerate(is_selected_015_chi2) if keep
]

In [16]:
# Show the terms kept by the chi-squared selector.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() when it exists and fall
# back to the old accessor on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    vocabulary = count_vect.get_feature_names_out()
else:
    vocabulary = count_vect.get_feature_names()
for i in selected_variables_015_chi2:
    print(vocabulary[i])

25mg
antidepressants
anxiety
boyfriend
california
cbt
citalopram
clinically
confide
cope
coping
depressants
depressed
depression
depressive
diagnosed
emotionally
exhausting
footage
global
hobbies
hugs
hurtful
lexapro
medication
meds
prescribed
psychiatrist
rut
sertraline
suicidal
supportive
therapist
venlafaxine
wellbutrin
zoloft


In [17]:
# Persist the terms kept by the chi-squared selector as a JSON list.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() when it exists and fall
# back to the old accessor on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    vocabulary = count_vect.get_feature_names_out()
else:
    vocabulary = count_vect.get_feature_names()
# str() keeps the entries plain Python strings whichever accessor was used,
# so they are always JSON-serialisable.
words_chi2_015 = [str(vocabulary[i]) for i in selected_variables_015_chi2]

with open(f"{corpus_name}_chi2_words.json", "w", encoding="utf-8") as fp:
    json.dump(fp=fp, obj=words_chi2_015, indent="\t")