In [None]:
import os
import sys
from multiprocessing import Pool

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from wordcloud import WordCloud

module_path = os.path.abspath(os.path.join("../../.."))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.config import END_OF_POST_TOKEN, PATH_INTERIM_CORPUS  # noqa: E402
from src.features.build_features import get_corpus_id2word  # noqa: E402

In [None]:
# Corpus selection: used to build the input paths under PATH_INTERIM_CORPUS.
CORPUS_KIND = "reddit"
CORPUS_NAME = "depression"
# Use roughly two thirds of the available CPUs for the worker pools.
# os.cpu_count() may return None, and on a single-CPU host two thirds rounds
# down to 0, which multiprocessing.Pool rejects; so always keep at least 1.
NUM_SUB_PROCESSES = max(1, (2 * (os.cpu_count() or 1)) // 3)

## Generate a DataFrame for the training and testing datasets

In [None]:
# Both cleaned splits live in the same <interim>/<kind>/<name>/ directory.
_corpus_dir = os.path.join(PATH_INTERIM_CORPUS, CORPUS_KIND, CORPUS_NAME)
input_file_path_train = os.path.join(_corpus_dir, f"{CORPUS_NAME}-train-clean.txt")
input_file_path_test = os.path.join(_corpus_dir, f"{CORPUS_NAME}-test-clean.txt")

In [None]:
# Each line of the training file is "<label> <document>", where the document
# holds all the posts of one user separated by END_OF_POST_TOKEN.
labels = []
documents = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(input_file_path_train, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # A blank line would break the two-way unpacking below.
            continue
        label, document = line.split(maxsplit=1)
        labels.append(label)
        posts = document.split(END_OF_POST_TOKEN)
        documents.append(posts)
# One row per user: the label and the list of that user's posts.
df_train = pd.DataFrame({"label": labels, "posts": documents})

In [None]:
# Each line of the testing file is "<label> <document>", where the document
# holds all the posts of one user separated by END_OF_POST_TOKEN.
labels = []
documents = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(input_file_path_test, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # A blank line would break the two-way unpacking below.
            continue
        label, document = line.split(maxsplit=1)
        labels.append(label)
        posts = document.split(END_OF_POST_TOKEN)
        documents.append(posts)
# One row per user: the label and the list of that user's posts.
df_test = pd.DataFrame({"label": labels, "posts": documents})

## Define functions to obtain interesting features from the documents

In [None]:
# Sanity check: TextBlob sentiment of the first post of the first testing user.
first_test_post = df_test.posts[0][0]
TextBlob(first_test_post).sentiment

In [None]:
def get_num_tokens(posts_list):
    """Return the number of whitespace-separated tokens of each post.

    Parameters
    ----------
    posts_list : iterable of str
        The posts of a single user.

    Returns
    -------
    list of int
        One token count per post, in the same order as `posts_list`.
    """
    token_counts = []
    for post in posts_list:
        token_counts.append(len(post.split()))
    return token_counts


def get_num_tokens_first_person(posts_list, first_person_tokens=("i",)):
    """Count the first-person tokens of each post.

    Parameters
    ----------
    posts_list : iterable of str
        The posts of a single user.
    first_person_tokens : str or iterable of str, optional
        Tokens counted as first-person references. Matching is exact and
        case-sensitive on whitespace-separated tokens. Defaults to ``("i",)``,
        which reproduces the previous hard-coded behaviour.

    Returns
    -------
    list of int
        One count per post, in the same order as `posts_list`.
    """
    if isinstance(first_person_tokens, str):
        # A bare string would otherwise be split into single characters.
        first_person_tokens = (first_person_tokens,)
    targets = frozenset(first_person_tokens)
    return [sum(token in targets for token in post.split()) for post in posts_list]


def get_polarity(posts_list):
    """Return the TextBlob sentiment polarity of each post, rounded to 2 decimals.

    Parameters
    ----------
    posts_list : iterable of str
        The posts of a single user.

    Returns
    -------
    list
        One rounded polarity value per post, in the same order as `posts_list`.
    """
    polarities = []
    for post in posts_list:
        sentiment = TextBlob(post).sentiment
        polarities.append(round(sentiment.polarity, 2))
    return polarities


def get_subjectivity(posts_list):
    """Return the TextBlob sentiment subjectivity of each post, rounded to 2 decimals.

    Parameters
    ----------
    posts_list : iterable of str
        The posts of a single user.

    Returns
    -------
    list
        One rounded subjectivity value per post, in the same order as `posts_list`.
    """
    subjectivities = []
    for post in posts_list:
        sentiment = TextBlob(post).sentiment
        subjectivities.append(round(sentiment.subjectivity, 2))
    return subjectivities

## Apply functions to the training corpus

In [None]:
# Add one list-valued column per feature; each extractor maps a user's list of
# posts to a list with one value per post.
train_feature_extractors = {
    "num_tokens": get_num_tokens,
    "num_tokens_first_person": get_num_tokens_first_person,
    "polarity": get_polarity,
    "subjectivity": get_subjectivity,
}
for feature_name, extractor in train_feature_extractors.items():
    df_train[feature_name] = df_train.posts.apply(extractor)

In [None]:
# Display the training DataFrame (rich notebook output of the bare expression).
df_train

## Apply functions to the testing corpus

In [None]:
# Same per-post features as for the training corpus, computed on the testing
# corpus; the columns are added in the listed order.
for feature_name, extractor in (
    ("num_tokens", get_num_tokens),
    ("num_tokens_first_person", get_num_tokens_first_person),
    ("polarity", get_polarity),
    ("subjectivity", get_subjectivity),
):
    df_test[feature_name] = df_test.posts.apply(extractor)

In [None]:
# Display the testing DataFrame (rich notebook output of the bare expression).
df_test

## Compare the datasets

In [None]:
# Side-by-side count of users per label in each split (shared y-axis).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(8, 3))
fig.suptitle("Number of users with depression")
for ax, split_name, split_df in (
    (ax1, "Training", df_train),
    (ax2, "Testing", df_test),
):
    ax.set_title(split_name)
    sns.countplot(x="label", data=split_df, ax=ax)
plt.tight_layout(pad=2.8, w_pad=0.5, h_pad=1.0);

In [None]:
# Pool the per-post token counts of every user into one sorted array per label.
# The string alias "sum" explicitly selects pandas' group-wise sum, which
# concatenates the per-user lists. Passing the builtin `sum` relied on pandas
# silently substituting its own implementation, which newer pandas versions
# deprecate and then stop doing (builtin sum over lists raises TypeError).
num_tokens_train = (
    df_train.groupby("label")
    .agg({"num_tokens": "sum"})
    .num_tokens.apply(lambda counts: np.sort(np.array(counts)))
)
num_tokens_test = (
    df_test.groupby("label")
    .agg({"num_tokens": "sum"})
    .num_tokens.apply(lambda counts: np.sort(np.array(counts)))
)

fig, axis = plt.subplots(nrows=2, ncols=2, sharey=False, figsize=(10, 6))

axis[0, 0].set_title("Training")
axis[0, 1].set_title("Testing")
fig.suptitle("Posts length")

# Column 0 = training, column 1 = testing; row 0 = KDE, row 1 = histogram.
# "positive" is drawn before "negative" on every axis.
for col, counts_by_label in enumerate((num_tokens_train, num_tokens_test)):
    for label in ("positive", "negative"):
        sns.kdeplot(counts_by_label[label], ax=axis[0, col], label=label)
        sns.distplot(counts_by_label[label], ax=axis[1, col], label=label, kde=False)

for ax in axis.ravel():
    ax.legend()

plt.tight_layout(pad=2.8, w_pad=0.5, h_pad=1.0);

In [None]:
# Per-label cut-off: values at or above this quantile are treated as outliers.
# The Series are indexed by label, so select by label rather than by position:
# integer keys on a string-indexed Series only work through pandas' deprecated
# positional fallback and depend on the sort order of the group keys.
outlier_threshold = 0.97
outlier_train_negative, outlier_train_positive = (
    np.quantile(num_tokens_train["negative"], outlier_threshold),
    np.quantile(num_tokens_train["positive"], outlier_threshold),
)
outlier_test_negative, outlier_test_positive = (
    np.quantile(num_tokens_test["negative"], outlier_threshold),
    np.quantile(num_tokens_test["positive"], outlier_threshold),
)

fig, axis = plt.subplots(nrows=2, ncols=2, sharey=False, figsize=(10, 6))
axis[0, 0].set_title("Training")
axis[0, 1].set_title("Testing")
fig.suptitle("Posts length (without outliers)")

# Column 0 = training, column 1 = testing; row 0 = KDE, row 1 = histogram.
panels = (
    (
        0,
        num_tokens_train,
        {"positive": outlier_train_positive, "negative": outlier_train_negative},
    ),
    (
        1,
        num_tokens_test,
        {"positive": outlier_test_positive, "negative": outlier_test_negative},
    ),
)
for col, counts_by_label, cutoffs in panels:
    for label in ("positive", "negative"):
        counts = counts_by_label[label]
        without_outliers = counts[counts < cutoffs[label]]
        sns.kdeplot(without_outliers, ax=axis[0, col], label=label)
        sns.distplot(without_outliers, ax=axis[1, col], label=label, kde=False)

for ax in axis.ravel():
    ax.legend()

plt.tight_layout(pad=2.8, w_pad=0.5, h_pad=1.0);

In [None]:
# Pool the per-post first-person counts of every user into one sorted array per
# label. The string alias "sum" explicitly selects pandas' group-wise sum, which
# concatenates the per-user lists. Passing the builtin `sum` relied on pandas
# silently substituting its own implementation, which newer pandas versions
# deprecate and then stop doing (builtin sum over lists raises TypeError).
num_first_person_train = (
    df_train.groupby("label")
    .agg({"num_tokens_first_person": "sum"})
    .num_tokens_first_person.apply(lambda counts: np.sort(np.array(counts)))
)
num_first_person_test = (
    df_test.groupby("label")
    .agg({"num_tokens_first_person": "sum"})
    .num_tokens_first_person.apply(lambda counts: np.sort(np.array(counts)))
)

fig, axis = plt.subplots(nrows=2, ncols=2, sharey=False, figsize=(10, 6))
axis[0, 0].set_title("Training")
axis[0, 1].set_title("Testing")
fig.suptitle("References to the first person in the posts")

# Column 0 = training, column 1 = testing; row 0 = KDE, row 1 = histogram.
# "positive" is drawn before "negative" on every axis.
for col, counts_by_label in enumerate((num_first_person_train, num_first_person_test)):
    for label in ("positive", "negative"):
        sns.kdeplot(counts_by_label[label], ax=axis[0, col], label=label)
        sns.distplot(counts_by_label[label], ax=axis[1, col], label=label, kde=False)

for ax in axis.ravel():
    ax.legend()

plt.tight_layout(pad=2.8, w_pad=0.5, h_pad=1.0);

In [None]:
# Per-label cut-off: values at or above this quantile are treated as outliers.
# The Series are indexed by label, so select by label rather than by position:
# integer keys on a string-indexed Series only work through pandas' deprecated
# positional fallback and depend on the sort order of the group keys.
outlier_threshold = 0.98
outlier_train_negative, outlier_train_positive = (
    np.quantile(num_first_person_train["negative"], outlier_threshold),
    np.quantile(num_first_person_train["positive"], outlier_threshold),
)
outlier_test_negative, outlier_test_positive = (
    np.quantile(num_first_person_test["negative"], outlier_threshold),
    np.quantile(num_first_person_test["positive"], outlier_threshold),
)

fig, axis = plt.subplots(nrows=2, ncols=2, sharey=False, figsize=(10, 6))
axis[0, 0].set_title("Training")
axis[0, 1].set_title("Testing")
fig.suptitle("References to the first person in the posts (without outliers)")

# Column 0 = training, column 1 = testing; row 0 = KDE, row 1 = histogram.
panels = (
    (
        0,
        num_first_person_train,
        {"positive": outlier_train_positive, "negative": outlier_train_negative},
    ),
    (
        1,
        num_first_person_test,
        {"positive": outlier_test_positive, "negative": outlier_test_negative},
    ),
)
for col, counts_by_label, cutoffs in panels:
    for label in ("positive", "negative"):
        counts = counts_by_label[label]
        without_outliers = counts[counts < cutoffs[label]]
        sns.kdeplot(without_outliers, ax=axis[0, col], label=label)
        sns.distplot(without_outliers, ax=axis[1, col], label=label, kde=False)

for ax in axis.ravel():
    ax.legend()

plt.tight_layout(pad=2.8, w_pad=0.5, h_pad=1.0);

## Latent Dirichlet Allocation (LDA)
We fit an LDA model on the training documents (users labelled both positive and negative for depression) to obtain the most relevant topics, and then use that model to infer the topic distribution of documents from both the training and the testing corpora.

In [None]:
# One flat token list per training user: the tokens of all their posts, in order.
posts = [
    [word for post in user_posts for word in post.split()]
    for user_posts in df_train.posts
]

In [None]:
# get_corpus_id2word is a project helper (src.features.build_features).
# NOTE(review): presumably it returns the bag-of-words corpus, the id-to-word
# dictionary and a fitted bigram model for the tokenized posts — confirm
# against the helper's definition.
corpus, id2word, bigram = get_corpus_id2word(posts)

In [None]:
# LDA hyper-parameters gathered in one place; random_state is fixed so that
# repeated runs use the same seed.
lda_train_kwargs = {
    "corpus": corpus,
    "num_topics": 15,
    "id2word": id2word,
    "chunksize": 100,
    "passes": 50,
    "eval_every": 1,
    "random_state": 30,
    "per_word_topics": True,
}
lda_train = gensim.models.LdaModel(**lda_train_kwargs)

In [None]:
# Show the 20 highest-weighted words of the topics; num_topics=-1 asks for all
# of them.
lda_train.print_topics(num_topics=-1, num_words=20)

In [None]:
# Topic distribution of the second training document (index 1).
top_topics = lda_train.get_document_topics(corpus[1], minimum_probability=0.0)
# Keep the probability of every returned topic. The previous fixed range(10)
# truncated the vector for a model with more than 10 topics and would raise an
# IndexError if fewer than 10 were returned.
topic_vec = [probability for _topic_id, probability in top_topics]
top_topics, topic_vec

In [None]:
# One flat token list per testing user: the tokens of all their posts, in order.
posts_test = []
for user_posts in df_test.posts:
    user_tokens = []
    for post in user_posts:
        user_tokens.extend(post.split())
    posts_test.append(user_tokens)

In [None]:
# Peek at the first 10 tokens of the first testing user.
posts_test[0][:10]

In [None]:
# Encode the testing tokens, passing the bigram model and id2word obtained from
# the training corpus.
# NOTE(review): presumably this makes the helper reuse the training vocabulary
# instead of fitting new ones — confirm against get_corpus_id2word.
corpus_test, _, _ = get_corpus_id2word(posts_test, bigram_model=bigram, id2word=id2word)

In [None]:
# Topic distribution of the second testing document (index 1), inferred with
# the model trained on the training corpus.
top_topics_test = lda_train.get_document_topics(corpus_test[1], minimum_probability=0.0)
# Keep the probability of every returned topic. The previous fixed range(10)
# truncated the vector for a model with more than 10 topics and would raise an
# IndexError if fewer than 10 were returned.
topic_vec_test = [probability for _topic_id, probability in top_topics_test]
top_topics_test, topic_vec_test

## Graphs related to the posts sentiment analysis

In [None]:
# Pool the per-post polarity values of every user into one array per label.
# The string alias "sum" explicitly selects pandas' group-wise sum, which
# concatenates the per-user lists. Passing the builtin `sum` relied on pandas
# silently substituting its own implementation, which newer pandas versions
# deprecate and then stop doing (builtin sum over lists raises TypeError).
polarity_train = (
    df_train.groupby("label")
    .agg({"polarity": "sum"})
    .polarity.apply(lambda values: np.array(values))
)
polarity_test = (
    df_test.groupby("label")
    .agg({"polarity": "sum"})
    .polarity.apply(lambda values: np.array(values))
)

fig, axis = plt.subplots(nrows=2, ncols=2, sharey=False, figsize=(10, 6))
axis[0, 0].set_title("Training")
axis[0, 1].set_title("Testing")
fig.suptitle("Polarity of the posts")

# Column 0 = training, column 1 = testing; row 0 = KDE, row 1 = histogram.
# "positive" is drawn before "negative" on every axis.
for col, values_by_label in enumerate((polarity_train, polarity_test)):
    for label in ("positive", "negative"):
        sns.kdeplot(values_by_label[label], ax=axis[0, col], label=label)
        sns.distplot(values_by_label[label], ax=axis[1, col], label=label, kde=False)

for ax in axis.ravel():
    ax.legend()

plt.tight_layout(pad=2.8, w_pad=0.5, h_pad=1.0);

In [None]:
# Pool the per-post subjectivity values of every user into one array per label.
# The string alias "sum" explicitly selects pandas' group-wise sum, which
# concatenates the per-user lists. Passing the builtin `sum` relied on pandas
# silently substituting its own implementation, which newer pandas versions
# deprecate and then stop doing (builtin sum over lists raises TypeError).
subjectivity_train = (
    df_train.groupby("label")
    .agg({"subjectivity": "sum"})
    .subjectivity.apply(lambda values: np.array(values))
)
subjectivity_test = (
    df_test.groupby("label")
    .agg({"subjectivity": "sum"})
    .subjectivity.apply(lambda values: np.array(values))
)

fig, axis = plt.subplots(nrows=2, ncols=2, sharey=False, figsize=(10, 6))
axis[0, 0].set_title("Training")
axis[0, 1].set_title("Testing")
fig.suptitle("Subjectivity of the posts")

# Column 0 = training, column 1 = testing; row 0 = KDE, row 1 = histogram.
# "positive" is drawn before "negative" on every axis.
for col, values_by_label in enumerate((subjectivity_train, subjectivity_test)):
    for label in ("positive", "negative"):
        sns.kdeplot(values_by_label[label], ax=axis[0, col], label=label)
        sns.distplot(values_by_label[label], ax=axis[1, col], label=label, kde=False)

for ax in axis.ravel():
    ax.legend()

plt.tight_layout(pad=2.8, w_pad=0.5, h_pad=1.0);

## Word cloud

In [None]:
# Flatten the per-user post lists into one list of posts per split.
plain_train_posts = []
for _user_posts in df_train.posts:
    plain_train_posts.extend(_user_posts)

plain_test_posts = []
for _user_posts in df_test.posts:
    plain_test_posts.extend(_user_posts)

# Training posts first, then testing posts.
plain_posts = [*plain_train_posts, *plain_test_posts]

In [None]:
# Fit the TF-IDF vocabulary and IDF weights on every post of both splits.
# The token pattern also keeps apostrophes inside tokens.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b[\w']+\b", use_idf=True).fit(
    plain_posts
)
vectorizer

In [None]:
# Invert the fitted vocabulary: feature index -> word.
idx2word = dict(
    zip(vectorizer.vocabulary_.values(), vectorizer.vocabulary_.keys())
)

In [None]:
# np.argsort sorts ascending, so the first indices are the words with the
# SMALLEST IDF.
idf_sort_idxs = np.argsort(vectorizer.idf_)

# The header previously said "biggest", which contradicted the values printed.
print("Top 10 words with smallest IDF")
for i in idf_sort_idxs[:10]:
    print(f"'{idx2word[i]}': {vectorizer.idf_[i]}")

In [None]:
# Histogram of the IDF values of the fitted vocabulary (kde=False: no density
# curve is drawn).
sns.distplot(vectorizer.idf_, kde=False);

In [None]:
# np.percentile takes q on a 0-100 scale, so 0.1 means the 0.1th percentile
# (not the 10th); the figure titles print this value followed by "%".
min_percentile = 0.1
# IDF value at that percentile; get_words_in_vectorizer keeps only words whose
# IDF is strictly greater than it.
min_idf_allowed = np.percentile(vectorizer.idf_, min_percentile)

In [None]:
def get_words_in_vectorizer(posts_list):
    """Return the words of the given posts that pass the vocabulary/IDF filter.

    A whitespace-separated word is kept when it is in the fitted vocabulary of
    the module-level `vectorizer` and its IDF is strictly greater than the
    module-level `min_idf_allowed`.

    Parameters
    ----------
    posts_list : iterable of str
        The posts of a single user.

    Returns
    -------
    list of str
        The kept words, in order of appearance (duplicates included).
    """
    vocabulary = vectorizer.vocabulary_
    idf_values = vectorizer.idf_
    kept_words = []
    for post in posts_list:
        for word in post.split():
            if word not in vocabulary:
                continue
            if idf_values[vocabulary[word]] > min_idf_allowed:
                kept_words.append(word)
    return kept_words

### Training corpus

In [None]:
users_posts = df_train.posts

# Filter every training user's words in parallel, one user per task.
with Pool(processes=NUM_SUB_PROCESSES) as pool:
    kept_words_per_user = pool.map(get_words_in_vectorizer, users_posts)
# Collapse to a single space-separated string: words joined per user, then
# users joined together.
result_users_posts = " ".join(" ".join(words) for words in kept_words_per_user)

In [None]:
# WordCloud draws up to max_words words (200 by default); cap it explicitly at
# the 100 words the title promises.
wordcloud = WordCloud(
    width=1600, height=800, background_color="white", max_words=100
).generate(result_users_posts)

fig = plt.figure(figsize=(30, 10), facecolor="white")
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title(
    "The top 100 most used words in the training corpus\n(without considering the "
    f"{min_percentile}%\nwords with least value of idf)",
    fontsize=70,
)
plt.tight_layout(pad=0)
plt.show()

In [None]:
# The 25 most frequent filtered words of the training corpus, as (word, count).
mostcommon_small_training = FreqDist(result_users_posts.split()).most_common(25)

# Split the pairs into the bar labels (x) and bar heights (y).
x, y = zip(*mostcommon_small_training)
plt.figure(figsize=(50, 30))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel("Words", fontsize=50)
plt.ylabel("Frequency of Words", fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
# Title spelling fixed ("Frecuency" -> "Frequency").
plt.title(
    "Frequency of the 25 most common words for the training corpus\n(without considering the "
    f"{min_percentile}%\nwords with least value of idf)",
    fontsize=60,
)
plt.show()

### Wordcloud for positive users in the training corpus

In [None]:
positive_users_posts = df_train.posts[df_train.label == "positive"]

# Filter the words of every positive training user in parallel.
with Pool(processes=NUM_SUB_PROCESSES) as pool:
    positive_words_per_user = pool.map(get_words_in_vectorizer, positive_users_posts)
# Collapse to a single space-separated string: words joined per user, then
# users joined together.
result_positive_users_posts = " ".join(
    " ".join(words) for words in positive_words_per_user
)

In [None]:
# Word cloud built from the filtered words of the positive training users.
positive_cloud_builder = WordCloud(width=1600, height=800, background_color="white")
positive_wordcloud = positive_cloud_builder.generate(result_positive_users_posts)

positive_cloud_title = (
    "Training corpus WordCloud for the positive users\n(without considering the "
    f"{min_percentile}% words with least value of idf)"
)

fig = plt.figure(figsize=(30, 10), facecolor="white")
plt.imshow(positive_wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title(positive_cloud_title, fontsize=70)
plt.tight_layout(pad=0)
plt.show()

In [None]:
# The 25 most frequent filtered words of the positive training users, as
# (word, count) pairs.
positive_mostcommon_small = FreqDist(result_positive_users_posts.split()).most_common(
    25
)

# Split the pairs into the bar labels (x) and bar heights (y).
x, y = zip(*positive_mostcommon_small)
plt.figure(figsize=(50, 30))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel("Words", fontsize=50)
plt.ylabel("Frequency of Words", fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
# Title spelling fixed ("Frecuency" -> "Frequency").
plt.title(
    "Frequency of the 25 most common words for the training corpus\nfor the positive users (without considering the "
    f"{min_percentile}%\nwords with least value of idf)",
    fontsize=60,
)
plt.show()

### Wordcloud for negative users in the training corpus

In [None]:
negative_users_posts = df_train.posts[df_train.label == "negative"]

# Filter the words of every negative training user in parallel.
with Pool(processes=NUM_SUB_PROCESSES) as pool:
    negative_words_per_user = pool.map(get_words_in_vectorizer, negative_users_posts)
# Collapse to a single space-separated string: words joined per user, then
# users joined together.
result_negative_users_posts = " ".join(
    " ".join(words) for words in negative_words_per_user
)

In [None]:
# WordCloud draws up to max_words words (200 by default); cap it explicitly at
# the 100 words the title promises.
negative_wordcloud = WordCloud(
    width=1600, height=800, background_color="white", max_words=100
).generate(result_negative_users_posts)

fig = plt.figure(figsize=(30, 10), facecolor="white")
plt.imshow(negative_wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title(
    "The top 100 most used words in the training corpus\nfor the negative users (without considering the "
    f"{min_percentile}%\nwords with least value of idf)",
    fontsize=70,
)
plt.tight_layout(pad=0)
plt.show()

In [None]:
# The 25 most frequent filtered words of the negative training users, as
# (word, count) pairs.
negative_mostcommon_small = FreqDist(result_negative_users_posts.split()).most_common(
    25
)

# Split the pairs into the bar labels (x) and bar heights (y).
x, y = zip(*negative_mostcommon_small)
plt.figure(figsize=(50, 30))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel("Words", fontsize=50)
plt.ylabel("Frequency of Words", fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
# Title spelling fixed ("Frecuency" -> "Frequency").
plt.title(
    "Frequency of the 25 most common words for the training corpus\nfor the negative users (without considering the "
    f"{min_percentile}% words with least value of idf)",
    fontsize=60,
)
plt.show()

### Testing corpus

In [None]:
users_posts = df_test.posts

# Filter every testing user's words in parallel, one user per task.
with Pool(processes=NUM_SUB_PROCESSES) as pool:
    test_words_per_user = pool.map(get_words_in_vectorizer, users_posts)
# Collapse to a single space-separated string: words joined per user, then
# users joined together.
result_users_posts = " ".join(" ".join(words) for words in test_words_per_user)

In [None]:
# WordCloud draws up to max_words words (200 by default); cap it explicitly at
# the 100 words the title promises.
wordcloud = WordCloud(
    width=1600, height=800, background_color="white", max_words=100
).generate(result_users_posts)

fig = plt.figure(figsize=(30, 10), facecolor="white")
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title(
    "The top 100 most used words in the testing corpus\n(without considering the "
    f"{min_percentile}%\nwords with least value of idf)",
    fontsize=70,
)
plt.tight_layout(pad=0)
plt.show()

In [None]:
# The 25 most frequent filtered words of the testing corpus, as (word, count).
mostcommon_small = FreqDist(result_users_posts.split()).most_common(25)

# Split the pairs into the bar labels (x) and bar heights (y).
x, y = zip(*mostcommon_small)
plt.figure(figsize=(50, 30))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel("Words", fontsize=50)
plt.ylabel("Frequency of Words", fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
# Title spelling fixed ("Frecuency" -> "Frequency").
plt.title(
    "Frequency of the 25 most common words for the testing corpus\n(without considering the "
    f"{min_percentile}%\nwords with least value of idf)",
    fontsize=60,
)
plt.show()

### Wordcloud for positive users in the testing corpus

In [None]:
positive_users_posts = df_test.posts[df_test.label == "positive"]

# Filter the words of every positive testing user in parallel.
with Pool(processes=NUM_SUB_PROCESSES) as pool:
    positive_test_words_per_user = pool.map(
        get_words_in_vectorizer, positive_users_posts
    )
# Collapse to a single space-separated string: words joined per user, then
# users joined together.
result_positive_users_posts = " ".join(
    " ".join(words) for words in positive_test_words_per_user
)

In [None]:
# Word cloud built from the filtered words of the positive testing users.
positive_cloud_builder = WordCloud(width=1600, height=800, background_color="white")
positive_wordcloud = positive_cloud_builder.generate(result_positive_users_posts)

positive_cloud_title = (
    "Testing corpus WordCloud for the positive users\n(without considering the "
    f"{min_percentile}% words with least value of idf)"
)

fig = plt.figure(figsize=(30, 10), facecolor="white")
plt.imshow(positive_wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title(positive_cloud_title, fontsize=70)
plt.tight_layout(pad=0)
plt.show()

In [None]:
# The 25 most frequent filtered words of the positive testing users, as
# (word, count) pairs.
positive_mostcommon_small = FreqDist(result_positive_users_posts.split()).most_common(
    25
)

# Split the pairs into the bar labels (x) and bar heights (y).
x, y = zip(*positive_mostcommon_small)
plt.figure(figsize=(50, 30))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel("Words", fontsize=50)
plt.ylabel("Frequency of Words", fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
# Title spelling fixed ("Frecuency" -> "Frequency").
plt.title(
    "Frequency of the 25 most common words for the testing corpus\nfor the positive users (without considering the "
    f"{min_percentile}%\nwords with least value of idf)",
    fontsize=60,
)
plt.show()

### Wordcloud for negative users in the testing corpus

In [None]:
negative_users_posts = df_test.posts[df_test.label == "negative"]

# Filter the words of every negative testing user in parallel.
with Pool(processes=NUM_SUB_PROCESSES) as pool:
    negative_test_words_per_user = pool.map(
        get_words_in_vectorizer, negative_users_posts
    )
# Collapse to a single space-separated string: words joined per user, then
# users joined together.
result_negative_users_posts = " ".join(
    " ".join(words) for words in negative_test_words_per_user
)

In [None]:
# WordCloud draws up to max_words words (200 by default); cap it explicitly at
# the 100 words the title promises.
negative_wordcloud = WordCloud(
    width=1600, height=800, background_color="white", max_words=100
).generate(result_negative_users_posts)

fig = plt.figure(figsize=(30, 10), facecolor="white")
plt.imshow(negative_wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title(
    "The top 100 most used words in the testing corpus\nfor the negative users (without considering the "
    f"{min_percentile}%\nwords with least value of idf)",
    fontsize=70,
)
plt.tight_layout(pad=0)
plt.show()

In [None]:
# The 25 most frequent filtered words of the negative testing users, as
# (word, count) pairs.
negative_mostcommon_small = FreqDist(result_negative_users_posts.split()).most_common(
    25
)

# Split the pairs into the bar labels (x) and bar heights (y).
x, y = zip(*negative_mostcommon_small)
plt.figure(figsize=(50, 30))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel("Words", fontsize=50)
plt.ylabel("Frequency of Words", fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
# Title spelling fixed ("Frecuency" -> "Frequency").
plt.title(
    "Frequency of the 25 most common words for the testing corpus\nfor the negative users (without considering the "
    f"{min_percentile}% words with least value of idf)",
    fontsize=60,
)
plt.show()