<div align="center">
  <h1 style="color:darkblue"> Classificação de sentimentos nos Tweets - Parte 1🐦</h1>
</div>

Nesse notebook, vamos abordar um problema de classificação de sentimentos em tweets. O objetivo é classificar os tweets em cinco categorias: muito negativo, negativo, neutro, positivo e muito positivo. 

In [None]:
# %%bash

# python -m spacy download en_core_web_md

## 1. Importando as bibliotecas necessárias

In [None]:
from collections import Counter

from tqdm.auto import tqdm

import pandas as pd
import matplotlib.pyplot as plt

import spacy
import numpy as np
import itertools

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Corpus-specific terms that are not relevant for the analysis; they were
# picked from the word-frequency analysis.
CUSTOM_STOPWORDS = {
    "coranaviru",
    "corona",
    "coronacrisis",
    "coronavirus",
    "coronavirusoutbreak",
    "coronaviruspandemic",
    "coronavirusuk",
    "coronavirusupdate",
    "coronavirusupdates",
    "coronavirususa",
    "covid",
    "coviduk",
    "covidusa",
}


# spaCy English pipeline shared by the lemmatization and embedding cells.
nlp = spacy.load("en_core_web_md")
# Full stop-word set: spaCy's English list plus the corpus-specific terms.
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS.union(CUSTOM_STOPWORDS)

In [None]:
# Read the training file as Latin-1 and keep only the tweet text and its label.
# `.copy()` detaches the two-column selection from the full frame so that the
# column assignments made on `df` afterwards do not trigger pandas'
# SettingWithCopyWarning.
df = pd.read_csv("../data/Corona_NLP_train.csv", encoding="latin1")
df = df[["OriginalTweet", "Sentiment"]].copy()
df.sample(5)

## 2. Pré-processamento

In [None]:
def preprocess_text(text):
    """Normalize a Series of raw tweets into lowercase ASCII text.

    The steps are applied in this order: lowercase, drop links, drop
    ``@user`` mentions, drop ``#hashtag`` tokens (the tag word is removed
    too), strip non-ASCII characters (accented letters are first decomposed
    so their base letter survives), keep only ``a-z``, whitespace and
    apostrophes, collapse whitespace runs and trim both ends.

    Parameters
    ----------
    text : pandas.Series
        Series of strings (anything exposing the ``.str`` accessor).

    Returns
    -------
    pandas.Series
        The cleaned strings, aligned with the input index.
    """
    return (
        text.str.lower()
        # remove links: any token starting with "http" (covers both http://
        # and https://) or with "www"
        .str.replace(r"http\S+|www\S+", "", regex=True)
        # remove usernames
        .str.replace(r"\@\w+", "", regex=True)
        # remove hashtags
        .str.replace(r"\#(\w+)", "", regex=True)
        # remove non-ascii characters (NFKD splits accents from base letters)
        .str.normalize("NFKD")
        .str.encode("ascii", errors="ignore")
        .str.decode("utf-8")
        # keep only letters, whitespace and apostrophes
        .str.replace(r"[^a-z\s\']", "", regex=True)
        # collapse runs of whitespace
        .str.replace(r"\s+", " ", regex=True)
        # trim leading and trailing whitespace
        .str.strip()
    )


# Cleaned text, plus a variant of it with the stop words filtered out.
df["CleanTweet"] = preprocess_text(df["OriginalTweet"])
df["CleanTweetNoStopwords"] = df["CleanTweet"].map(
    lambda tweet: " ".join(token for token in tweet.split() if token not in STOPWORDS)
)
# Keep tweets with at least three whitespace-separated tokens, then drop
# rows that repeat the same (cleaned text, label) pair.
df = df.loc[df["CleanTweet"].str.split().str.len().gt(2)].drop_duplicates(
    subset=["CleanTweet", "Sentiment"]
)
df.sample(5)

No notebook de exploração de dados, vimos que mais da metade do vocabulário dos tweets era composta por palavras que ocorrem uma única vez no corpus (hapax legomena). Para acelerar o processamento, vamos remover essas palavras do vocabulário.

In [None]:
# Token stream of the whole corpus, its frequency table, and the set of
# words that occur exactly once (hapax legomena).
words = df["CleanTweet"].str.cat(sep=" ").split()
types = Counter(words)
hapax = {word for word, count in types.items() if count < 2}

for label, collection in (
    ("Total de palavras", words),
    ("Tamanho do vocabulário", types),
    ("Palavras únicas", hapax),
):
    print(f"{label}: {len(collection):,}")

In [None]:
# Drop the single-occurrence words from BOTH plain-text columns.
# `CleanTweetNoStopwords` was derived from `CleanTweet` before this step, so
# it has to be filtered as well; otherwise it would be the only text column
# still containing hapax words and the column comparison would be skewed.
for column in ("CleanTweet", "CleanTweetNoStopwords"):
    df[column] = df[column].apply(
        lambda text: " ".join([word for word in text.split() if word not in hapax])
    )
# remove tweets with fewer than 3 words; `.copy()` so that the columns added
# to `df` afterwards are written to an independent frame (no
# SettingWithCopyWarning)
df = df.loc[df["CleanTweet"].str.split().str.len() > 2].copy()
df.shape

In [None]:
# Only `token.lemma_` is read below, so the dependency parser and the entity
# recognizer are skipped to avoid paying for annotations that are never used.
# NOTE(review): assumes the pipeline uses spaCy's standard component names
# ("parser", "ner") - confirm with `nlp.pipe_names`.
docs = nlp.pipe(df["CleanTweet"], disable=["parser", "ner"])

df["Lemmatized"] = [
    " ".join([token.lemma_ for token in doc])
    for doc in tqdm(docs, total=len(df), desc="Lemmatizing")
]

In [None]:
# Only the token texts are needed to filter stop words, so the texts are run
# through the tokenizer alone instead of the full pipeline.
docs = nlp.tokenizer.pipe(df["Lemmatized"])
df["LemmatizedNoStopwords"] = [
    " ".join([token.text for token in doc if token.text not in STOPWORDS])
    for doc in tqdm(docs, total=len(df), desc="Extracting stopwords")
]

In [None]:
df.sample(5)

## 3. Treinamento

No pré-processamento, criamos quatro colunas no dataframe de treino: 
- `CleanTweet`: tweets padronizados em minúsculas e sem caracteres especiais
- `CleanTweetNoStopwords`: tweets padronizados sem stopwords
- `Lemmatized`: tweets padronizados e lematizados
- `LemmatizedNoStopwords`: tweets padronizados, lematizados e sem stopwords

Pretendemos treinar um modelo para cada coluna a fim de comparar a performance dos modelos.

In [None]:
# Store the label as an ordered categorical, listed from the most negative to
# the most positive class.
df["Sentiment"] = df["Sentiment"].astype(
    pd.CategoricalDtype(
        categories=[
            "Extremely Negative",
            "Negative",
            "Neutral",
            "Positive",
            "Extremely Positive",
        ],
        ordered=True,
    )
)

In [None]:
def plot_confusion_matrix(
    confusion_matrix, target_names, title="Confusion matrix", cmap=None, normalize=True
):
    """Draw a confusion matrix as an annotated heat map and show it.

    Parameters
    ----------
    confusion_matrix : 2-D array-like
        Matrix to draw. Rows go on the "True label" axis and columns on the
        "Predicted label" axis. Inside this function the parameter name
        shadows the scikit-learn function of the same name.
    target_names : sequence of str or None
        Tick labels for both axes; ticks are left untouched when ``None``.
    title : str
        Figure title.
    cmap : matplotlib colormap or None
        Colormap for the heat map; defaults to "Blues".
    normalize : bool
        When true, every row is divided by its sum before drawing, and cell
        values are printed with four decimals. Rows that sum to zero are
        shown as zeros instead of NaN.

    Returns
    -------
    None
    """
    if cmap is None:
        cmap = plt.get_cmap("Blues")

    matrix = np.asarray(confusion_matrix)
    if normalize:
        # Normalize BEFORE drawing so that the colours, the colour bar and the
        # printed numbers all describe the same matrix.
        row_totals = matrix.sum(axis=1, keepdims=True)
        matrix = np.divide(
            matrix.astype("float"),
            row_totals,
            out=np.zeros(matrix.shape, dtype="float"),
            where=row_totals != 0,
        )

    plt.figure(figsize=(6, 6))
    plt.imshow(matrix, interpolation="nearest", cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = range(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so the value stays legible.
    peak = matrix.max() if matrix.size else 0
    thresh = peak / 1.5 if normalize else peak / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(matrix.shape[0]), range(matrix.shape[1])):
        plt.text(
            j,
            i,
            cell_format.format(matrix[i, j]),
            horizontalalignment="center",
            color="white" if matrix[i, j] > thresh else "black",
        )

    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    # Layout is computed last so the axis labels are taken into account.
    plt.tight_layout()
    plt.show()

In [None]:
# Names of the classifiers to compare. Kept in a list rather than a set:
# iteration order over a set of strings can differ between interpreter runs,
# which would shuffle the order of fitting, logging and reporting.
models = [
    "MultinomialNB",
    "LogisticRegression",
    "RandomForestClassifier",
    "LinearSVC",
]
# Re-number the rows 0..n-1 so that index labels coincide with row positions.
df = df.reset_index(drop=True)
X = df[["CleanTweet", "CleanTweetNoStopwords", "Lemmatized", "LemmatizedNoStopwords"]]
y = df["Sentiment"]

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### 3.1 Bag of Words vs TF-IDF

In [None]:
# One independent vectorizer per (representation, text column) pair.
(
    clean_count_vectorizer,
    clean_no_stopwords_count_vectorizer,
    lemmatized_count_vectorizer,
    lemmatized_no_stopwords_count_vectorizer,
) = (CountVectorizer() for _ in range(4))

(
    clean_tfidf_vectorizer,
    clean_no_stopwords_tfidf_vectorizer,
    lemmatized_tfidf_vectorizer,
    lemmatized_no_stopwords_tfidf_vectorizer,
) = (TfidfVectorizer() for _ in range(4))

# Each vectorizer is fitted on the training split of its own column only;
# the result is indexed as vectorizers[representation][column].
vectorizers = {
    "CountVectorizer": {
        column: vectorizer.fit(X_train[column])
        for column, vectorizer in (
            ("CleanTweet", clean_count_vectorizer),
            ("CleanTweetNoStopwords", clean_no_stopwords_count_vectorizer),
            ("Lemmatized", lemmatized_count_vectorizer),
            ("LemmatizedNoStopwords", lemmatized_no_stopwords_count_vectorizer),
        )
    },
    "TfidfVectorizer": {
        column: vectorizer.fit(X_train[column])
        for column, vectorizer in (
            ("CleanTweet", clean_tfidf_vectorizer),
            ("CleanTweetNoStopwords", clean_no_stopwords_tfidf_vectorizer),
            ("Lemmatized", lemmatized_tfidf_vectorizer),
            ("LemmatizedNoStopwords", lemmatized_no_stopwords_tfidf_vectorizer),
        )
    },
}

In [None]:
# Quick baseline: bag-of-words counts of `CleanTweet` fed to Multinomial
# Naive Bayes. The vectorizer gets its own name so that `clf` only ever
# refers to the classifier.
baseline_vectorizer = CountVectorizer()
X_train_vec = baseline_vectorizer.fit_transform(X_train["CleanTweet"])
X_test_vec = baseline_vectorizer.transform(X_test["CleanTweet"])

clf = MultinomialNB()

clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)

print(classification_report(y_test, y_pred))

In [None]:
def evaluate_models(fitted_models, fitted_vectorizers, X_test, y_test):
    """Score every fitted model on the test split of every text column.

    Parameters
    ----------
    fitted_models : dict
        ``{model_name: {column_name: fitted_estimator}}``.
    fitted_vectorizers : dict
        ``{column_name: fitted_vectorizer}``; each vectorizer transforms the
        column of ``X_test`` with the same name.
    X_test : pandas.DataFrame
        Test texts, one column per key of ``fitted_vectorizers``.
    y_test : pandas.Series
        True labels. When it is categorical, its category order defines the
        order of the report rows and of the confusion-matrix axes.

    Returns
    -------
    dict
        ``{model_name: {column_name: {"classification_report": str,
        "confusion_matrix": ndarray}}}``; the matrix is normalized per true
        class.
    """
    # The metrics sort labels on their own unless told otherwise, which would
    # not match names supplied in category order. Passing `labels` explicitly
    # keeps rows, columns and displayed names aligned.
    if hasattr(y_test, "cat"):
        labels = list(y_test.cat.categories)
    else:
        labels = sorted(set(y_test))
    target_names = [str(label) for label in labels]

    # Vectorize each column once instead of once per model.
    test_matrices = {
        column_name: vectorizer.transform(X_test[column_name])
        for column_name, vectorizer in fitted_vectorizers.items()
    }

    results = {}
    for model_name, model in tqdm(fitted_models.items(), desc="Evaluating models"):
        results[model_name] = {}
        for column_name in tqdm(
            fitted_vectorizers, desc=f"Evaluating {model_name}", leave=False
        ):
            y_pred = model[column_name].predict(test_matrices[column_name])
            results[model_name][column_name] = {
                "classification_report": classification_report(
                    y_test, y_pred, labels=labels, target_names=target_names
                ),
                "confusion_matrix": confusion_matrix(
                    y_test, y_pred, labels=labels, normalize="true"
                ),
            }
            print(f"Evaluating {model_name} for {column_name}")

    return results


def fit_models(models, fitted_vectorizers, y_train, X_train=None):
    """Fit one fresh estimator per (model name, text column) pair.

    Parameters
    ----------
    models : iterable of str
        Model names; supported values are "MultinomialNB",
        "LogisticRegression", "RandomForestClassifier" and "LinearSVC".
    fitted_vectorizers : dict
        ``{column_name: fitted_vectorizer}`` used to turn the training texts
        into feature matrices.
    y_train : array-like
        Training labels, aligned with the rows of ``X_train``.
    X_train : pandas.DataFrame, optional
        Training texts, one column per key of ``fitted_vectorizers``. When
        omitted, the notebook-level ``X_train`` is used, which is what calls
        with three arguments have always relied on.

    Returns
    -------
    dict
        ``{model_name: {column_name: fitted_estimator}}``.

    Raises
    ------
    ValueError
        If a model name is not one of the supported values.
    """
    if X_train is None:
        X_train = globals()["X_train"]

    # Factories (not instances) so every column gets its own new estimator.
    model_factories = {
        "MultinomialNB": lambda: MultinomialNB(),
        "LogisticRegression": lambda: LogisticRegression(
            max_iter=1000, random_state=42, n_jobs=-1
        ),
        "RandomForestClassifier": lambda: RandomForestClassifier(
            random_state=42, n_jobs=-1
        ),
        "LinearSVC": lambda: LinearSVC(dual="auto", random_state=42),
    }

    # Vectorize each column once instead of once per model.
    train_matrices = {
        column_name: vectorizer.transform(X_train[column_name])
        for column_name, vectorizer in fitted_vectorizers.items()
    }

    fitted_models = {}
    for model_name in tqdm(models, desc="Fitting models"):
        if model_name not in model_factories:
            # Fail loudly instead of silently refitting the previous estimator.
            raise ValueError(
                f"Unknown model name {model_name!r}; "
                f"expected one of {sorted(model_factories)}"
            )
        fitted_models[model_name] = {}
        for column_name in fitted_vectorizers:
            print(f"Fitting {model_name} for {column_name}")
            model = model_factories[model_name]()
            fitted_models[model_name][column_name] = model.fit(
                train_matrices[column_name], y_train
            )
        print()
    return fitted_models

#### 3.1.1 Bag of Words

In [None]:
# Fit every classifier on the CountVectorizer features of each text column.
fitted_models = fit_models(models, vectorizers["CountVectorizer"], y_train)

In [None]:
# Score the CountVectorizer models on the held-out split.
results = evaluate_models(fitted_models, vectorizers["CountVectorizer"], X_test, y_test)

In [None]:
# Print the text report of every (model, text column) combination.
for model_name, model_results in results.items():
    for column_name, column_results in model_results.items():
        print(
            f"{model_name} - {column_name}",
            column_results["classification_report"],
            "\n\n",
            sep="\n",
        )

In [None]:
# Draw one confusion matrix per (model, text column) combination.
for model_name, model_results in results.items():
    for column_name, column_results in model_results.items():
        plot_confusion_matrix(
            confusion_matrix=column_results["confusion_matrix"],
            target_names=y.cat.categories,
            title=" - ".join((model_name, column_name)),
        )

#### 3.1.2 TF-IDF

In [None]:
# Fit every classifier on the TfidfVectorizer features of each text column.
fitted_models_tfidf = fit_models(models, vectorizers["TfidfVectorizer"], y_train)

In [None]:
# Score the TfidfVectorizer models on the held-out split.
results_tfidf = evaluate_models(
    fitted_models_tfidf, vectorizers["TfidfVectorizer"], X_test, y_test
)

In [None]:
# Print the text report of every (model, text column) combination.
for model_name, model_results in results_tfidf.items():
    for column_name, column_results in model_results.items():
        print(
            f"{model_name} - {column_name}",
            column_results["classification_report"],
            "\n\n",
            sep="\n",
        )

In [None]:
# Draw one confusion matrix per (model, text column) combination.
for model_name, model_results in results_tfidf.items():
    for column_name, column_results in model_results.items():
        plot_confusion_matrix(
            confusion_matrix=column_results["confusion_matrix"],
            target_names=y.cat.categories,
            title=" - ".join((model_name, column_name)),
        )

### 3.2 Word Embeddings

In [None]:
def spacy_doc_vectors(texts, desc="Vectorizing tweets"):
    """Return an array holding spaCy's ``doc.vector`` for every text.

    Parameters
    ----------
    texts : sized iterable of str
        Texts to run through ``nlp.pipe``; ``len(texts)`` sizes the progress bar.
    desc : str
        Label shown on the progress bar.

    Returns
    -------
    numpy.ndarray
        One row per text, in the same order as ``texts``.
    """
    return np.array(
        [doc.vector for doc in tqdm(nlp.pipe(texts), total=len(texts), desc=desc)]
    )


# One embedding matrix per text column; row i corresponds to row i of `df`.
clean_tweets_vec = spacy_doc_vectors(df["CleanTweet"])

clean_no_stopwords_vec = spacy_doc_vectors(df["CleanTweetNoStopwords"])

lemmatized_vec = spacy_doc_vectors(df["Lemmatized"])

lemmatized_no_stopwords_vec = spacy_doc_vectors(df["LemmatizedNoStopwords"])

In [None]:
fitted_models = {}

# Class labels in their declared (ordinal) order. They are passed explicitly
# as `labels` because the metrics otherwise sort the labels themselves, which
# would not match names given in category order.
sentiment_labels = list(y.cat.categories)

# Embedding matrix of each text column (rows follow the rows of `df`).
embedding_features = {
    "CleanTweet": clean_tweets_vec,
    "CleanTweetNoStopwords": clean_no_stopwords_vec,
    "Lemmatized": lemmatized_vec,
    "LemmatizedNoStopwords": lemmatized_no_stopwords_vec,
}

# Factories so that each (model, column) pair trains a brand-new estimator.
classifier_factories = {
    "LogisticRegression": lambda: LogisticRegression(
        max_iter=1000, random_state=42, n_jobs=-1
    ),
    "RandomForestClassifier": lambda: RandomForestClassifier(
        random_state=42, n_jobs=-1
    ),
    "LinearSVC": lambda: LinearSVC(dual="auto", random_state=42),
}

for model_name, make_classifier in classifier_factories.items():
    fitted_models[model_name] = {}
    for column_name, vec in embedding_features.items():
        print(f"Fitting {model_name} for {column_name}")

        # `df` was re-indexed 0..n-1 before the split, so the index labels of
        # y_train / y_test are also row positions in the embedding matrix.
        X_train_vec = vec[y_train.index]
        X_test_vec = vec[y_test.index]

        fitted_model = make_classifier().fit(X_train_vec, y_train)
        y_pred = fitted_model.predict(X_test_vec)

        report = classification_report(
            y_test, y_pred, labels=sentiment_labels, target_names=sentiment_labels
        )
        cm = confusion_matrix(
            y_test, y_pred, labels=sentiment_labels, normalize="true"
        )

        fitted_models[model_name][column_name] = {
            "model": fitted_model,
            "classification_report": report,
            "confusion_matrix": cm,
        }

In [None]:
# Print the text report of every (model, text column) combination.
for model_name, model_results in fitted_models.items():
    for column_name, column_results in model_results.items():
        print(
            f"{model_name} - {column_name}",
            column_results["classification_report"],
            "\n\n",
            sep="\n",
        )

In [None]:
# Draw one confusion matrix per (model, text column) combination.
for model_name, model_results in fitted_models.items():
    for column_name, column_results in model_results.items():
        plot_confusion_matrix(
            confusion_matrix=column_results["confusion_matrix"],
            target_names=y.cat.categories,
            title=" - ".join((model_name, column_name)),
        )

### 3.3 Sentence Embeddings

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from sentence_transformers import SentenceTransformer

# Sentence-embedding model, loaded by name.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

In [None]:
# Encode each text column with the sentence model; the columns are processed
# in the order listed and row i of every result corresponds to row i of `df`.
(
    clean_embeddings,
    clean_no_stops,
    lemmatized_embeddings,
    lemmatized_no_stops,
) = (
    model.encode(df[column].tolist(), show_progress_bar=True)
    for column in (
        "CleanTweet",
        "CleanTweetNoStopwords",
        "Lemmatized",
        "LemmatizedNoStopwords",
    )
)

In [None]:
fitted_models = {}

# Class labels in their declared (ordinal) order. They are passed explicitly
# as `labels` because the metrics otherwise sort the labels themselves, which
# would not match names given in category order.
sentiment_labels = list(y.cat.categories)

# Sentence-embedding matrix of each text column (rows follow the rows of `df`).
embedding_features = {
    "CleanTweet": clean_embeddings,
    "CleanTweetNoStopwords": clean_no_stops,
    "Lemmatized": lemmatized_embeddings,
    "LemmatizedNoStopwords": lemmatized_no_stops,
}

# Factories so that each (model, column) pair trains a brand-new estimator.
# The estimator is deliberately NOT stored in a variable called `model`:
# that name holds the sentence encoder, and overwriting it would break a
# re-run of the encoding cell.
classifier_factories = {
    "LogisticRegression": lambda: LogisticRegression(
        max_iter=1000, random_state=42, n_jobs=-1
    ),
    "RandomForestClassifier": lambda: RandomForestClassifier(
        random_state=42, n_jobs=-1
    ),
    "LinearSVC": lambda: LinearSVC(dual="auto", random_state=42),
}

for model_name, make_classifier in classifier_factories.items():
    fitted_models[model_name] = {}
    for column_name, vec in embedding_features.items():
        print(f"Fitting {model_name} for {column_name}")

        # `df` was re-indexed 0..n-1 before the split, so the index labels of
        # y_train / y_test are also row positions in the embedding matrix.
        X_train_vec = vec[y_train.index]
        X_test_vec = vec[y_test.index]

        fitted_model = make_classifier().fit(X_train_vec, y_train)
        y_pred = fitted_model.predict(X_test_vec)

        report = classification_report(
            y_test, y_pred, labels=sentiment_labels, target_names=sentiment_labels
        )
        cm = confusion_matrix(
            y_test, y_pred, labels=sentiment_labels, normalize="true"
        )

        fitted_models[model_name][column_name] = {
            "model": fitted_model,
            "classification_report": report,
            "confusion_matrix": cm,
        }
    print()
    print()

In [None]:
# Print the text report of every (model, text column) combination.
for model_name, model_results in fitted_models.items():
    for column_name, column_results in model_results.items():
        print(
            f"{model_name} - {column_name}",
            column_results["classification_report"],
            "\n\n",
            sep="\n",
        )

In [None]:
# Draw one confusion matrix per (model, text column) combination.
for model_name, model_results in fitted_models.items():
    for column_name, column_results in model_results.items():
        plot_confusion_matrix(
            confusion_matrix=column_results["confusion_matrix"],
            target_names=y.cat.categories,
            title=" - ".join((model_name, column_name)),
        )