#### Imports

In [1]:
import os
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#### Remoção de StopWords

In [None]:
# English stop-word list from the NLTK corpus, kept as a set because
# remover_stopwords only uses it for membership tests.
stop_words = set(stopwords.words('english'))

def remover_stopwords(texto):
    """Normalize a text and drop English stop words.

    The text is lower-cased, every character listed in
    ``string.punctuation`` is deleted, the result is tokenized with
    ``word_tokenize`` and tokens found in the module-level ``stop_words``
    set are discarded.

    Args:
        texto: Raw text to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Punctuation is deleted (not replaced by a space) before tokenizing, so
    # the stop-word lookup only ever sees punctuation-free tokens.
    tabela_pontuacao = str.maketrans('', '', string.punctuation)
    texto_limpo = texto.lower().translate(tabela_pontuacao)
    return ' '.join(
        token for token in word_tokenize(texto_limpo) if token not in stop_words
    )

def carregar_reviews_stopwords(caminho_base, preprocessar=None):
    """Load labelled reviews from ``<caminho_base>/pos`` and ``<caminho_base>/neg``.

    Every entry of the two sub-folders is read as UTF-8, cleaned, and stored
    together with its label (1 for ``pos``, 0 for ``neg``).

    Args:
        caminho_base: Directory that contains the ``pos`` and ``neg`` folders.
        preprocessar: Optional callable applied to each raw text. When it is
            omitted, ``remover_stopwords`` is used.

    Returns:
        A DataFrame with the columns ``review`` (processed text) and
        ``sentiment`` (integer label); ``pos`` rows come first, then ``neg``
        rows, each group ordered by file name.

    Raises:
        FileNotFoundError: If ``pos`` or ``neg`` is missing under
            ``caminho_base``.
    """
    if preprocessar is None:
        # Resolved at call time so the default follows the notebook's
        # current definition of remover_stopwords.
        preprocessar = remover_stopwords

    textos = []
    sentimentos = []

    for sentimento, rotulo in (('pos', 1), ('neg', 0)):
        pasta = os.path.join(caminho_base, sentimento)
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and any seeded split made from it) reproducible.
        for nome_arquivo in sorted(os.listdir(pasta)):
            caminho_arquivo = os.path.join(pasta, nome_arquivo)
            with open(caminho_arquivo, encoding='utf-8') as f:
                texto_original = f.read()
            # The file is already closed here; cleaning does not need it open.
            textos.append(preprocessar(texto_original))
            sentimentos.append(rotulo)

    return pd.DataFrame({'review': textos, 'sentiment': sentimentos})

# Dataset folders, given relative to the notebook's working directory.
caminho_train = 'resenhas_dataset/aclImdb/train'
caminho_test = 'resenhas_dataset/aclImdb/test'

# Load both folders with the same cleaning routine (train first, then test).
df_train, df_test = (
    carregar_reviews_stopwords(caminho)
    for caminho in (caminho_train, caminho_test)
)

# Stack the two frames into a single corpus with a fresh 0..n-1 index.
df_stopwords_removidas = pd.concat([df_train, df_test], ignore_index=True)

print(df_stopwords_removidas.head())

                                              review  sentiment
0  bromwell high cartoon comedy ran time programs...          1
1  homelessness houselessness george carlin state...          1
2  brilliant overacting lesley ann warren best dr...          1
3  easily underrated film inn brooks cannon sure ...          1
4  typical mel brooks film much less slapstick mo...          1


#### Stemming

In [10]:
# Single PorterStemmer instance reused by every call to aplicar_stemming.
stemmer = PorterStemmer()

def aplicar_stemming(texto):
    """Stem every token of a text with the module-level ``stemmer``.

    Args:
        texto: Text to split into tokens with ``word_tokenize``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    return ' '.join(map(stemmer.stem, word_tokenize(texto)))

# Build a new frame so df_stopwords_removidas keeps the unstemmed text.
df_stemmer = df_stopwords_removidas.assign(
    review=df_stopwords_removidas['review'].apply(aplicar_stemming)
)

print(df_stemmer.head())

                                              review  sentiment
0  bromwel high cartoon comedi ran time program s...          1
1  homeless houseless georg carlin state issu yea...          1
2  brilliant overact lesley ann warren best drama...          1
3  easili underr film inn brook cannon sure flaw ...          1
4  typic mel brook film much less slapstick movi ...          1


#### Lemmatization

In [14]:
# Single WordNetLemmatizer instance reused by every call to aplicar_lemmatizer.
lemmatizer = WordNetLemmatizer()

def aplicar_lemmatizer(texto):
    """Lemmatize every token of a text with the module-level ``lemmatizer``.

    No part-of-speech tag is passed, so each token is lemmatized with the
    lemmatizer's default setting.

    Args:
        texto: Text to split into tokens with ``word_tokenize``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(texto)))

# Build a new frame so df_stopwords_removidas keeps the unlemmatized text.
df_lemmatization = df_stopwords_removidas.assign(
    review=df_stopwords_removidas['review'].apply(aplicar_lemmatizer)
)

print(df_lemmatization.head())

                                              review  sentiment
0  bromwell high cartoon comedy ran time program ...          1
1  homelessness houselessness george carlin state...          1
2  brilliant overacting lesley ann warren best dr...          1
3  easily underrated film inn brook cannon sure f...          1
4  typical mel brook film much less slapstick mov...          1


#### Bag-of-Words

In [15]:
# Unigram count matrix learned from the stemmed reviews.
bow = CountVectorizer()
resultado_bow = bow.fit_transform(df_stemmer['review'])

# shape is (number of reviews, vocabulary size).
print(f"Resultado: {resultado_bow.shape}")

Resultado: (50000, 142188)


#### Bag-of-Bigrams

In [None]:
# Bigram-only counts (ngram_range=(2, 2)) learned from the lemmatized reviews.
bigrams = CountVectorizer(ngram_range=(2, 2))

resultado_bigrams = bigrams.fit_transform(df_lemmatization['review'])

# shape is (number of reviews, number of distinct bigrams).
print("Resultado:", resultado_bigrams.shape)
# Label the vocabulary sample so the printed line is self-describing; this
# restores the "Exemplo: " prefix shown in the cell's recorded output.
print("Exemplo: ", bigrams.get_feature_names_out()[:10])

Resultado: (50000, 3053156)
Exemplo:  ['00 acting' '00 agent' '00 come' '00 including' '00 schneider' '00 worth'
 '000 000' '000 overboard' '000 produce' '0000000000001 10']


#### QUESTÃO 6 - TF-IDF e Regressão Logística

In [25]:
# Features are the cleaned review texts; the target is the 0/1 sentiment label.
X, y = df_stopwords_removidas['review'], df_stopwords_removidas['sentiment']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The vocabulary and IDF weights are learned on the training split only and
# then reused, unchanged, on the held-out split.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# fit() returns the estimator itself, so `model` is the trained classifier.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

print("Acurácia:", accuracy_score(y_test, y_pred))

Acurácia: 0.8998
