# Sentiment Analysis

### 1) Classifier per Category

In [1]:
## Scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay

## Librerias para graficación
import matplotlib.pyplot as plt
import seaborn as sns

# NLTK es una librería particular para PLN. Tiene muchas funcionalidades entre ellas stemming y lista de palabras de parada.
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
import os
import csv

stemmer = nltk.stem.SnowballStemmer('english') # Vamos a utlizar el Snowball Stemmer para realizar stemming (nos permite llevar las palabras a una forma estandar).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#Preprocessing of words
def processing_text(texto):
    processed_feature = re.sub(r'\W', ' ', str(texto))
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature)
    processed_feature = re.sub(r'[0-9]+', ' ', processed_feature)

    return processed_feature

In [3]:
# Define function to read and preprocess the reviews
def read_and_preprocess_reviews(folder_path):
    
    dictionary_training = {}
    dictionary_test = {}

    #Define stopwords
    stopwords_english = set(stopwords.words('english'))

    for category in os.listdir(folder_path):
        category_path = os.path.join(folder_path, category) #Category path
        if os.path.isdir(category_path):
            for sentiment in ["positive","negative"]:
                sentiment_path = os.path.join(category_path, sentiment + '.review') #Sentiment path
                if os.path.isfile(sentiment_path):
                    with open(sentiment_path, 'r',encoding='utf-8') as file:
                        review_text = file.read()
                        # Preprocess input data
                        preprocess = processing_text(review_text)
                        words = nltk.word_tokenize(preprocess)
                        string = ""
                        train_reviews = []
                        train_labels = []
                        
                        for word in words:
                            if word == "positive" or word == "negative":
                                train_labels.append(word)
                                train_reviews.append(string)
                                string = ""
                            else:
                                string += word + " "
                        
        # list of reviews per category with their respective class(posituive or negative)     
        dictionary_training[category] = (train_reviews,train_labels)

        test_reviews = []
        test_label = []
        #Test set
        sentiment_test_path = os.path.join(category_path, 'unlabeled.review') #Sentiment path test
        if os.path.isfile(sentiment_test_path):
            with open(sentiment_test_path, 'r',encoding='utf-8') as file:
                review_text = file.read()
                # Preprocess and remove stopwords
                preprocess = processing_text(review_text)
                words = nltk.word_tokenize(preprocess)
                for word in words:
                    if word == "positive" or word == "negative":
                        test_label.append(word)
                        test_reviews.append(string)
                        string = ""
                    else:
                        string += word + " "
                        
        dictionary_test[category] = (test_reviews,test_label)

    return dictionary_trining,test_reviews
                

data_folder = './Multi Domain Sentiment/processed_acl'
data_per_category = read_and_preprocess_reviews(data_folder)[0]
unlabeled_test_text = read_and_preprocess_reviews(data_folder)[1]

In [8]:
#print(data_per_category["kitchen"][0][0])
print(len(unlabeled_test_text))

6052


In [5]:
#Bag of words
vectorizer = CountVectorizer(max_features=2500, stop_words=stopwords.words('english'))
# Vocabulario y tambien transforme nuestro texto
for categories in data_per_category:
    reviews = data_per_category[categories][0]
    texto_features = vectorizer.fit_transform(reviews).toarray()
    
    nb = MultinomialNB()
    nb.fit(texto_features, y_train)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


NameError: name 'X_train' is not defined