# Sentiment Analysis

### 1) Classifier per Category

In [1]:
## Scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer

## Librerias para graficación
import matplotlib.pyplot as plt
import seaborn as sns

# NLTK es una librería particular para PLN. Tiene muchas funcionalidades entre ellas stemming y lista de palabras de parada.
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
import os
# Logistic regression
from sklearn.linear_model import LogisticRegression, SGDClassifier

stemmer = nltk.stem.SnowballStemmer('english') # Vamos a utlizar el Snowball Stemmer para realizar stemming (nos permite llevar las palabras a una forma estandar).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#Preprocessing of words
def processing_text(texto):
    processed_feature = re.sub(r'\W', ' ', str(texto))
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature)
    processed_feature = re.sub(r'[0-9]+', ' ', processed_feature)

    return processed_feature

In [3]:
# Define function to read and preprocess the reviews
def read_and_preprocess_reviews(folder_path):
    
    dictionary = {}
    merged_categories_reviews = []
    merged_categories_labels = []
    
    #Define stopwords
    stopwords_english = set(stopwords.words('english'))

    for category in os.listdir(folder_path):
        category_path = os.path.join(folder_path, category) #Category path
        if os.path.isdir(category_path):
            for sentiment in ["positive","negative","unlabeled"]:
                sentiment_path = os.path.join(category_path, sentiment + '.review') #Sentiment path
                if os.path.isfile(sentiment_path):
                    with open(sentiment_path, 'r',encoding='utf-8') as file:
                        review_text = file.read()
                        # Preprocess input data
                        preprocess = processing_text(review_text)
                        words = nltk.word_tokenize(preprocess)
                        string = ""
                        train_reviews = []
                        train_labels = []
                        
                        for word in words:
                            if word == "positive" or word == "negative":
                                train_labels.append(word)
                                train_reviews.append(string)
                                #For the overrall reviews without category discrimination
                                merged_categories_reviews.append(string)
                                merged_categories_labels.append(word)
                                string = ""
                            else:
                                string += word + " "
                        
        # list of reviews per category with their respective class(posituive or negative)     
        dictionary[category] = (train_reviews,train_labels)



    return dictionary,(merged_categories_reviews,merged_categories_labels)
                

data_folder = './Multi Domain Sentiment/processed_acl'
reviews_training_set = read_and_preprocess_reviews(data_folder)[0]
merged_reviews_data = read_and_preprocess_reviews(data_folder)[1]

In [4]:
print(len(merged_reviews_data[0]))
print(len(merged_reviews_data[1]))

28360
28360


## Naive Bayes

In [19]:
#Bag of words
vectorizer = CountVectorizer(max_features=2500, stop_words=stopwords.words('english'))
#TFIDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
for categories in reviews_training_set:
    # Usar el Tf vectorizer
    reviews = reviews_training_set[categories][0] #Reviews
    labels = reviews_training_set[categories][1] #Labels of the review respectively
    texto_features = vectorizer.fit_transform(reviews).toarray()
    # Dividir los datos en training(70%), validation (15%), and test (15%).
    x_train, x_temp, y_train, y_temp = train_test_split(texto_features, labels, test_size=0.3, random_state=0)
    x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=0)
    #Entrenar el modelo naive bayes
    nb = MultinomialNB()
    nb.fit(x_train, y_train)
    val_predictions_tf = nb.predict(x_val) # Evaluate on the validation set
    #Evaluar que tan bueno es el modelo
    test_predictions_tf = nb.predict(x_test)
    # Ahora calculamos el score de accuracy enviando las predicciónes y los valores reales de polaridad.
    print(f"{accuracy_score(y_test, test_predictions_tf)} tf representation {categories} accuracy")

    #Tf-idf representation    
    # Dividir los datos en training(70%), validation (15%), and test (15%) nuevamente
    x_train_tfidf, x_temp_tfidf, y_train_tfidf, y_temp_tfidf = train_test_split(reviews, labels, test_size=0.3, random_state=0)
    x_val_tfidf, x_test_tfidf, y_val_tfidf, y_test_tfidf = train_test_split(x_temp_tfidf, y_temp_tfidf, test_size=0.5, random_state=0)
    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(x_train_tfidf)
    X_val_tfidf = tfidf_vectorizer.transform(x_val_tfidf)
    X_test_tfidf = tfidf_vectorizer.transform(x_test_tfidf)
    #Naive Bayes
    clf = MultinomialNB()
    #Train the model
    clf.fit(X_train_tfidf, y_train)
    val_predictions = clf.predict(X_val_tfidf)
    #Evaluar el modelo
    test_predictions = clf.predict(X_test_tfidf)
    # Ahora calculamos el score de accuracy enviando las predicciónes y los valores reales de polaridad.
    print(f"{accuracy_score(y_test_tfidf, test_predictions)} tfidf representation {categories} accuracy")
    
    print("-"*150) 
    print(f"{categories.upper()}")
    # Evaluation metrics precision, recall, F1, and accuracy
    print("Evalutation Metrics")
    #Tf representation
    print("TF")
    print(classification_report(y_test, test_predictions_tf, digits=4))
    #TfIdf representation
    print("TFIDF")
    print(classification_report(y_test_tfidf, test_predictions, digits=4))
    print("-"*150)  

0.8172661870503597 tf representation books accuracy
0.8633093525179856 tfidf representation books accuracy
------------------------------------------------------------------------------------------------------------------------------------------------------
BOOKS
Evalutation Metrics
TF
              precision    recall  f1-score   support

    negative     0.8142    0.8118    0.8130       340
    positive     0.8202    0.8225    0.8214       355

    accuracy                         0.8173       695
   macro avg     0.8172    0.8171    0.8172       695
weighted avg     0.8173    0.8173    0.8173       695

TFIDF
              precision    recall  f1-score   support

    negative     0.8679    0.8500    0.8588       340
    positive     0.8591    0.8761    0.8675       355

    accuracy                         0.8633       695
   macro avg     0.8635    0.8630    0.8632       695
weighted avg     0.8634    0.8633    0.8633       695

-----------------------------------------------------

### Prueba con una frase Naive Bayes

In [6]:
test = "Its the best movie ive seen in terms of quality but I loved the overrall of the movie. Recommend it!"

procesamiento = processing_text(test)

test_bow =vectorizer.transform([procesamiento]) # Ahora lo representamos como una bolsa de palabras. El vector resultante tiene 2500 posiciones.
print("PASO 2 representación:" ,test_bow)

clase_test = nb.predict(test_bow)
print("PASO 3 predecir con el modelo:" ,clase_test)

PASO 2 representación:   (0, 288)	1
  (0, 1208)	1
  (0, 1620)	1
  (0, 1660)	1
  (0, 1745)	1
PASO 3 predecir con el modelo: ['positive']


## Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [8]:
logistic_model_SGD = SGDClassifier(loss='log_loss',learning_rate='constant',eta0=0.0001 ) # Se usa el descenso de gradiente estocastico

In [28]:
#Bag of words
vectorizer = CountVectorizer(max_features=2500, stop_words=stopwords.words('english'))
#TfIdf 
vectorizer_tfIdf = TfidfVectorizer()
logistic_model_SGD = SGDClassifier(loss='squared_error',learning_rate='constant',eta0=0.0001)
for categories in reviews_training_set:
    
    reviews = reviews_training_set[categories][0] #Reviews
    labels = reviews_training_set[categories][1] #Labels of the review respectively
    
    # Usar el Tf vectorizer
    texto_features = vectorizer.fit_transform(reviews).toarray()
    # Dividir los datos en training(70%), validation (15%), and test (15%).
    x_train, x_temp, y_train, y_temp = train_test_split(texto_features, labels, test_size=0.3, random_state=0)
    x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=0)
    #Train the model
    logistic_model_SGD.fit(x_train, y_train)
    #Evaluar que tan bueno es el modelo
    y_pred_tf = logistic_model_SGD.predict(x_test)

    #Tf-idf representation    
    # Dividir los datos en training(70%), validation (15%), and test (15%) nuevamente
    x_train_tfidf, x_temp_tfidf, y_train_tfidf, y_temp_tfidf = train_test_split(reviews, labels, test_size=0.3, random_state=0)
    x_val_tfidf, x_test_tfidf, y_val_tfidf, y_test_tfidf = train_test_split(x_temp_tfidf, y_temp_tfidf, test_size=0.5, random_state=0)
    
    X_train_tfidf = vectorizer_tfIdf.fit_transform(x_train_tfidf)
    X_val_tfidf = vectorizer_tfIdf.transform(x_val_tfidf)
    X_test_tfidf = vectorizer_tfIdf.transform(x_test_tfidf)

    #Train the model
    logistic_model_SGD.fit(X_train_tfidf, y_train_tfidf)
    val_predictions = logistic_model_SGD.predict(X_val_tfidf)
    #Evaluar el modelo
    test_predictions = logistic_model_SGD.predict(X_test_tfidf)
    # Ahora calculamos el score de accuracy enviando las predicciónes y los valores reales de polaridad.
    print(f"{accuracy_score(y_test_tfidf, test_predictions)} tfidf representation {categories} accuracy")



    #METRICS
    print("-"*150) 
    print(f"{categories.upper()}")
    # Evaluation metrics precision, recall, F1, and accuracy
    print("Evalutation Metrics")
    #Tf representation
    print("TF")
    print(classification_report(y_test, y_pred_tf, digits=4))
    #TfIdf representation
    print("TFIDF")
    print(classification_report(y_test_tfidf, test_predictions, digits=4))
    print("-"*150)  

    

0.814388489208633 tfidf representation books accuracy
------------------------------------------------------------------------------------------------------------------------------------------------------
BOOKS
Evalutation Metrics
TF
              precision    recall  f1-score   support

    negative     0.8095    0.8000    0.8047       340
    positive     0.8106    0.8197    0.8151       355

    accuracy                         0.8101       695
   macro avg     0.8101    0.8099    0.8099       695
weighted avg     0.8101    0.8101    0.8100       695

TFIDF
              precision    recall  f1-score   support

    negative     0.8287    0.7824    0.8048       340
    positive     0.8021    0.8451    0.8230       355

    accuracy                         0.8144       695
   macro avg     0.8154    0.8137    0.8139       695
weighted avg     0.8151    0.8144    0.8141       695

----------------------------------------------------------------------------------------------------------

# 2) Classifier for all categories

In [31]:
#Data merged without category differentiation
reviews_merged = merged_reviews_data[0]
labels_merged = merged_reviews_data[1]

## Naive Bayes

In [30]:
# NAIVE BAYES
texto_features = vectorizer.fit_transform(reviews_merged).toarray()
# Dividir los datos en training(70%), validation (15%), and test (15%).
x_train, x_temp, y_train, y_temp = train_test_split(texto_features, labels_merged, test_size=0.3, random_state=0)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=0)
#Entrenar el modelo naive bayes
nb = MultinomialNB()
nb.fit(x_train, y_train)
val_predictions_tf = nb.predict(x_val) # Evaluate on the validation set
#Evaluar que tan bueno es el modelo
test_predictions_tf = nb.predict(x_test)
# Ahora calculamos el score de accuracy enviando las predicciónes y los valores reales de polaridad.
print(f"{accuracy_score(y_test, test_predictions_tf)} tf representation accuracy")

#Tf-idf representation    
# Dividir los datos en training(70%), validation (15%), and test (15%) nuevamente
x_train_tfidf, x_temp_tfidf, y_train_tfidf, y_temp_tfidf = train_test_split(reviews_merged, labels_merged, test_size=0.3, random_state=0)
x_val_tfidf, x_test_tfidf, y_val_tfidf, y_test_tfidf = train_test_split(x_temp_tfidf, y_temp_tfidf, test_size=0.5, random_state=0)
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(x_train_tfidf)
X_val_tfidf = tfidf_vectorizer.transform(x_val_tfidf)
X_test_tfidf = tfidf_vectorizer.transform(x_test_tfidf)
#Naive Bayes
clf = MultinomialNB()
#Train the model
clf.fit(X_train_tfidf, y_train)
val_predictions = clf.predict(X_val_tfidf)
#Evaluar el modelo
test_predictions = clf.predict(X_test_tfidf)
# Ahora calculamos el score de accuracy enviando las predicciónes y los valores reales de polaridad.
print(f"{accuracy_score(y_test_tfidf, test_predictions)} tfidf representation accuracy")

# Evaluation metrics precision, recall, F1, and accuracy
print("Evalutation Metrics")
#Tf representation
print("TF")
print(classification_report(y_test, test_predictions_tf, digits=4))
#TfIdf representation
print("TFIDF")
print(classification_report(y_test_tfidf, test_predictions, digits=4))
print("-"*150) 


0.8328631875881524 tf representation accuracy
0.8791725434884814 tfidf representation accuracy
Evalutation Metrics
TF
              precision    recall  f1-score   support

    negative     0.8492    0.8061    0.8270      2109
    positive     0.8184    0.8592    0.8383      2145

    accuracy                         0.8329      4254
   macro avg     0.8338    0.8326    0.8327      4254
weighted avg     0.8336    0.8329    0.8327      4254

TFIDF
              precision    recall  f1-score   support

    negative     0.8556    0.9099    0.8819      2109
    positive     0.9055    0.8490    0.8763      2145

    accuracy                         0.8792      4254
   macro avg     0.8805    0.8794    0.8791      4254
weighted avg     0.8807    0.8792    0.8791      4254

------------------------------------------------------------------------------------------------------------------------------------------------------


## Logistic Regression

In [32]:
# Usar el Tf vectorizer
texto_features = vectorizer.fit_transform(reviews_merged).toarray()
# Dividir los datos en training(70%), validation (15%), and test (15%).
x_train, x_temp, y_train, y_temp = train_test_split(texto_features, labels_merged, test_size=0.3, random_state=0)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=0)
#Train the model
logistic_model_SGD.fit(x_train, y_train)
#Evaluar que tan bueno es el modelo
y_pred_tf = logistic_model_SGD.predict(x_test)

#Tf-idf representation    
# Dividir los datos en training(70%), validation (15%), and test (15%) nuevamente
x_train_tfidf, x_temp_tfidf, y_train_tfidf, y_temp_tfidf = train_test_split(reviews_merged, labels_merged, test_size=0.3, random_state=0)
x_val_tfidf, x_test_tfidf, y_val_tfidf, y_test_tfidf = train_test_split(x_temp_tfidf, y_temp_tfidf, test_size=0.5, random_state=0)

X_train_tfidf = vectorizer_tfIdf.fit_transform(x_train_tfidf)
X_val_tfidf = vectorizer_tfIdf.transform(x_val_tfidf)
X_test_tfidf = vectorizer_tfIdf.transform(x_test_tfidf)

#Train the model
logistic_model_SGD.fit(X_train_tfidf, y_train_tfidf)
val_predictions = logistic_model_SGD.predict(X_val_tfidf)
#Evaluar el modelo
test_predictions = logistic_model_SGD.predict(X_test_tfidf)
# Ahora calculamos el score de accuracy enviando las predicciónes y los valores reales de polaridad.
print(f"{accuracy_score(y_test_tfidf, test_predictions)} tfidf representation accuracy")



#METRICS
print("-"*150) 
# Evaluation metrics precision, recall, F1, and accuracy
print("Evalutation Metrics")
#Tf representation
print("TF")
print(classification_report(y_test, y_pred_tf, digits=4))
#TfIdf representation
print("TFIDF")
print(classification_report(y_test_tfidf, test_predictions, digits=4))
print("-"*150)  



0.7769158439116126 tfidf representation kitchen accuracy
------------------------------------------------------------------------------------------------------------------------------------------------------
KITCHEN
Evalutation Metrics
TF
              precision    recall  f1-score   support

    negative     0.8472    0.8388    0.8430      2109
    positive     0.8430    0.8513    0.8471      2145

    accuracy                         0.8451      4254
   macro avg     0.8451    0.8450    0.8451      4254
weighted avg     0.8451    0.8451    0.8451      4254

TFIDF
              precision    recall  f1-score   support

    negative     0.7115    0.9251    0.8044      2109
    positive     0.8955    0.6312    0.7405      2145

    accuracy                         0.7769      4254
   macro avg     0.8035    0.7782    0.7724      4254
weighted avg     0.8043    0.7769    0.7722      4254

-----------------------------------------------------------------------------------------------------