In [None]:
import pandas as pd

# Read the CSV and keep only the tweet text and its sentiment label.
df = pd.read_csv(
    "../data/Corona_NLP_train.csv",
    encoding="latin1",
)[["OriginalTweet", "Sentiment"]]
# Show (rows, columns) as the cell output.
df.shape

In [None]:
import spacy
from collections import Counter

# Load the spaCy English pipeline once; later cells reuse `nlp` to lemmatise the tweets.
nlp = spacy.load("en_core_web_md")
# Stop-word collection exposed by the pipeline's language defaults.
stopwords = nlp.Defaults.stop_words

In [None]:
# Corpus-specific stop words, used in addition to the spaCy stop-word set
# when tweets are filtered.
custom_stopwords = (
    # Spellings and hashtag-style variants of the pandemic's name.
    {
        "covid",
        "coronavirus",
        "covid19",
        "corona",
        "coranaviru",
        "covid2019",
        "coronacrisis",
        "coronavirusoutbreak",
        "coronaviruspandemic",
        "coronavirusupdate",
        "coronavirusupdates",
        "coronavirususa",
        "coronavirusuk",
        "covid19uk",
        "covid19usa",
    }
    # Bare number tokens and markup residue.
    | {
        "19",
        "2019",
        "amp",  # probably left over from the HTML entity &amp;
    }
    # Words taken from the word cloud that show up under every sentiment.
    | {
        "food",
        "prices",
        "people",
        "store",
        "supermarket",
        "grocery",
        "will",
    }
)

# Build a normalised, stop-word-free version of every tweet.
# Steps, in order: strip URLs, @mentions and #hashtags; NFKD-decompose and
# drop anything that is not ASCII; collapse whitespace; keep only purely
# alphabetic tokens longer than two characters that are in neither stop-word
# set; lower-case and trim the result.
#
# NOTE(review): tokens come from whitespace splitting and are then filtered
# with str.isalpha(), so a word with attached punctuation ("prices,") is
# dropped entirely rather than trimmed -- confirm this is intended.
df["CleanedTweet"] = (
    df["OriginalTweet"]
    # The previous pattern listed `https\S+` twice and never matched plain
    # http:// links; `https?` covers both schemes.
    .str.replace(r"https?\S+|www\S+", "", regex=True)
    .str.replace(r"\@\w+", "", regex=True)
    .str.replace(r"\#(\w+)", "", regex=True)
    .str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .str.decode("utf-8")
    .str.replace(r"\s+", " ", regex=True)
    .apply(
        lambda text: " ".join(
            word
            for word in text.split()
            if word.isalpha()
            and len(word) > 2
            and word.lower() not in stopwords
            and word.lower() not in custom_stopwords
        )
    )
    .str.lower()
    .str.strip()
)

# Keep only tweets that still contain more than two tokens after cleaning.
df = df.loc[df["CleanedTweet"].str.split().str.len() > 2]

In [None]:
# Lemmatise every cleaned tweet with the spaCy pipeline.
# Stop words were filtered on surface forms before this step, so a lemma can
# re-introduce a word that was meant to be excluded (an inflected form whose
# lemma is on one of the lists, e.g. a plural of "store"). The lemmas are
# therefore checked against both stop-word sets again, so the lists also hold
# after lemmatisation.
docs = nlp.pipe(df["CleanedTweet"])

df["CleanedTweet"] = [
    " ".join(
        token.lemma_
        for token in doc
        if not token.is_punct
        and token.lemma_ not in stopwords
        and token.lemma_ not in custom_stopwords
    )
    for doc in docs
]

In [None]:
# Flatten the corpus into one token list, then count each distinct token.
words = [token for tweet in df["CleanedTweet"] for token in tweet.split()]
types = Counter()
types.update(words)

# Report corpus size, vocabulary size and their ratio (type/token).
print(f"Total de palavras: {len(words):,}")
print(f"Tamanho do vocabulário: {len(types):,}")
print(f"Riqueza do corpus: {len(types) / len(words):.2%}")

In [None]:
# Collect every token that occurs exactly once in the whole corpus.
low_freq_words = {term for term in types if types[term] == 1}
print(f"Palavras de baixa frequência: {len(low_freq_words):,}")

In [None]:
# Strip the once-only tokens from each tweet ...
df["CleanedTweet"] = df["CleanedTweet"].apply(
    lambda tweet: " ".join(
        token for token in tweet.split() if token not in low_freq_words
    )
)
# ... and discard tweets that ended up with no tokens at all.
df = df.loc[df["CleanedTweet"].ne("")]

In [None]:
import tomotopy as tp

# Fit a 10-topic LDA model on the cleaned tweets.
# A fixed seed makes the random initialisation repeatable, so the topics can
# be compared between fresh runs of the notebook.
# NOTE(review): confirm in the tomotopy docs whether the number of worker
# threads must also be pinned for bit-identical results.
mdl = tp.LDAModel(k=10, seed=42)

# Each tweet becomes one document, tokenised on whitespace.
for doc in df["CleanedTweet"]:
    mdl.add_doc(doc.split())

# Train in chunks of 10 iterations; `i` is the number of iterations completed
# so far (10, 20, ..., 100), reported together with the per-word log-likelihood.
for i in range(10, 101, 10):
    mdl.train(10)
    print("Iteration: {}\tLog-likelihood: {}".format(i, mdl.ll_per_word))

# Print the 10 highest-ranked words of every topic.
for k in range(mdl.k):
    print("Top 10 words of topic #{}".format(k))
    print(mdl.get_topic_words(k, top_n=10))

mdl.summary()

In [None]:
# Build one small frame per topic from the top-10 (word, prob) pairs returned
# by the model, tag it with a 1-based topic number, and stack all of them into
# a single long table.
topics = pd.concat(
    [
        pd.DataFrame(
            mdl.get_topic_words(topic_id, top_n=10), columns=["word", "prob"]
        ).assign(topic=topic_id + 1)
        for topic_id in range(mdl.k)
    ]
)

In [None]:
import seaborn.objects as so
import matplotlib.pyplot as plt


# One bar chart of word probabilities per topic, laid out on a 5x2 grid
# (10 panels) that shares the x axis.
fig, axes = plt.subplots(5, 2, figsize=(15, 15), sharex=True)

for k, ax in enumerate(axes.ravel()):
    (
        so.Plot(topics[topics["topic"] == k + 1], x="prob", y="word")
        .add(so.Bar())
        .on(ax)
        .plot()
    )
    # Label each panel; without a title the charts cannot be told apart.
    ax.set_title(f"Topic {k + 1}")