# Sentiment Analysis

### 1) Classifier per Category

In [1]:
## Scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay

## Librerias para graficación
import matplotlib.pyplot as plt
import seaborn as sns

# NLTK es una librería particular para PLN. Tiene muchas funcionalidades entre ellas stemming y lista de palabras de parada.
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
import os
import csv

stemmer = nltk.stem.SnowballStemmer('english') # Vamos a utlizar el Snowball Stemmer para realizar stemming (nos permite llevar las palabras a una forma estandar).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#Preprocessing of words
def processing_text(texto):
    processed_feature = re.sub(r'\W', ' ', str(texto))
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature)
    processed_feature = re.sub(r'[0-9]+', ' ', processed_feature)

    return processed_feature

In [7]:
# Define function to read and preprocess the reviews
def read_and_preprocess_reviews(folder_path):
    
    dictionary = {}

    #Define stopwords
    stopwords_english = set(stopwords.words('english'))

    for category in os.listdir(folder_path):
        category_path = os.path.join(folder_path, category) #Category path
        if os.path.isdir(category_path):
            for sentiment in ["positive","negative","unlabeled"]:
                sentiment_path = os.path.join(category_path, sentiment + '.review') #Sentiment path
                if os.path.isfile(sentiment_path):
                    with open(sentiment_path, 'r',encoding='utf-8') as file:
                        review_text = file.read()
                        # Preprocess input data
                        preprocess = processing_text(review_text)
                        words = nltk.word_tokenize(preprocess)
                        string = ""
                        train_reviews = []
                        train_labels = []
                        
                        for word in words:
                            if word == "positive" or word == "negative":
                                train_labels.append(word)
                                train_reviews.append(string)
                                string = ""
                            else:
                                string += word + " "
                        
        # list of reviews per category with their respective class(posituive or negative)     
        dictionary[category] = (train_reviews,train_labels)



    return dictionary
                

data_folder = './Multi Domain Sentiment/processed_acl'
reviews_training_set = read_and_preprocess_reviews(data_folder)


In [9]:
print(len(reviews_training_set["dvd"][0]))
print(len(reviews_training_set["dvd"][1]))

3674
3674


## Naive Bayes

In [5]:
#Bag of words
vectorizer = CountVectorizer(max_features=2500, stop_words=stopwords.words('english'))
# Vocabulario y tambien transforme nuestro texto
for categories in reviews_training_set:
    
    reviews = reviews_training_set[categories][0] #Reviews
    labels = reviews_training_set[categories][1] #Labels of the review respectively
    
    texto_features = vectorizer.fit_transform(reviews).toarray()

    # Dividir los dartos en training(70%), validation (15%), and test (15%).
    x_train, x_temp, y_train, y_temp = train_test_split(texto_features, labels, test_size=0.3, random_state=0)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)
    #Entrenar el modelo naive bayes
    nb = MultinomialNB()
    nb.fit(x_train, y_train)

    #Evaluar que tan buneo es el modelo
    predictions = nb.predict(x_test)

    # Ahora calculamos el score de accuracy enviando las predicciónes y los valores reales de polaridad.
    print(accuracy_score(y_test, predictions))

NameError: name 'x_test' is not defined

## Logistic Regression