# Sentiment Analysis

This notebook builds a binary sentiment classifier (positive / negative) for each category (“Books”, “DVD”, “Electronics”, “Kitchen”) of the “Multi-Domain Sentiment Dataset”.

### Library imports

In [None]:
pip install nltk

In [1]:
#Scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay

#Libraries to graph
import matplotlib.pyplot as plt
import seaborn as sns

#NLTK
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd


# English Snowball stemmer shared by the whole notebook; text_processing() uses it to stem every token.
stemmer = nltk.stem.SnowballStemmer('english') 
# Fetch the NLTK stop-word corpus that bow() reads through stopwords.words('english').
nltk.download('stopwords') 

ModuleNotFoundError: No module named 'nltk'

### Read and transform the `.review` files

Run the Python script `PreProcessingSentimentAnalysis.py` before continuing with this notebook.

### Creating the training/validation dataframe

In [None]:
def create_df(file_name):
    """Read a comma-separated file into a pandas DataFrame.

    Args:
        file_name: Path (or file-like object) of the CSV file to read.

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    return pd.read_csv(file_name, sep=',')


### Text processing function

In [None]:
def text_processing(text):
    """Normalise one raw review into a space-separated string of stemmed tokens.

    The steps run in this order: replace non-word characters with spaces,
    drop single-letter tokens, drop digits, collapse repeated spaces,
    lowercase, and stem every token with the notebook-level ``stemmer``.

    Args:
        text: The raw review. Any object is accepted; it is converted with
            ``str()`` before processing.

    Returns:
        str: The processed text, with tokens joined by single spaces.
    """
    # Step 1: Replace special (non-word) characters with spaces.
    processed_feature = re.sub(r'\W', ' ', str(text))
    # Step 2: Remove single-letter tokens.
    # Word boundaries are used instead of surrounding whitespace so that a
    # lone letter is removed wherever it appears: at the very start, at the
    # very end, or next to another lone letter. (A pattern beginning with an
    # escaped caret, r'\^', would look for a literal '^', which step 1 has
    # already replaced, and would therefore never match.)
    processed_feature = re.sub(r'\b[a-zA-Z]\b', ' ', processed_feature)
    # Step 3: Remove numbers (very sporadic occurrences in our dataset).
    processed_feature = re.sub(r'[0-9]+', ' ', processed_feature)
    # Step 4: Simplify consecutive spaces to a single space between words.
    processed_feature = re.sub(' +', ' ', processed_feature)
    # Step 5: Convert all text to lowercase.
    processed_feature = processed_feature.lower()
    # Step 6: Apply stemming to bring words to a common root, so that
    # inflected forms of the same word share one vocabulary entry.
    # split()/join also trims any leading or trailing space left by the
    # substitutions above.
    processed_feature = " ".join(stemmer.stem(token) for token in processed_feature.split())

    return processed_feature


Apply the text processing function to every review in a dataset.

In [None]:
def apply_processing(category) -> pd.DataFrame:
    """Add a ``processed`` column holding the cleaned text of every review.

    Each value of the ``review`` column is passed through
    ``text_processing`` and the results are stored, row for row, in a new
    ``processed`` column.

    Args:
        category: DataFrame with a ``review`` column.

    Returns:
        pandas.DataFrame: The same DataFrame object that was passed in. It is
        modified in place, so the caller's original variable also gains the
        ``processed`` column.
    """
    # Only the review text is needed here; labels stay untouched in the frame.
    category['processed'] = [
        text_processing(review) for review in category['review'].values
    ]

    return category


### Text representation

In this part we take the processed text and represent it numerically so that a model can operate on it. We are going to create a bag of words (BoW).

### Vectorizer

We use scikit-learn's `CountVectorizer` to create the bag of words.

In [None]:
def bow(processed_text:list, max_features:int=2500):
    """Build a bag-of-words count matrix from already processed reviews.

    Args:
        processed_text: Sequence of review strings, as produced by
            ``text_processing``.
        max_features: Upper bound on the vocabulary size passed to
            ``CountVectorizer``. Defaults to 2500.

    Returns:
        A dense array with one row per review and one column per vocabulary
        term, holding term counts.
    """
    # The reviews have been normalised and stemmed, so the stop-word list has
    # to go through the same processing; otherwise stop words whose stem
    # differs from the raw word would never match and would stay in the
    # vocabulary. split() also breaks contractions into their word parts.
    stop_words = sorted({
        token
        for word in stopwords.words('english')
        for token in text_processing(word).split()
    })

    #Bag of words
    vectorizer = CountVectorizer(max_features=max_features, stop_words=stop_words)

    #Now we build the vocabulary and also transform our text using our dataset
    text_features = vectorizer.fit_transform(processed_text).toarray()

    return text_features

### Training and evaluation

In [None]:
def NB(features, labels):
    """Train a multinomial Naive Bayes classifier on a hold-out split.

    The data is split with ``test_size=0.2`` and ``random_state=0`` and the
    classifier is fitted on the training part.

    Args:
        features: Feature matrix, one row per sample.
        labels: Target labels, one per sample.

    Returns:
        list: ``[model, X_train, X_test, y_train, y_test]``.
    """
    # Fixed random_state: the same split is produced on every run.
    split = train_test_split(features, labels, test_size=0.2, random_state=0)
    train_x, test_x, train_y, test_y = split

    classifier = MultinomialNB()
    classifier.fit(train_x, train_y)

    return [classifier, train_x, test_x, train_y, test_y]

In [None]:
def evaluation(model, x_test, y_test):
    """Print the accuracy of ``model`` on the given test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_test: Test features.
        y_test: True labels for ``x_test``.
    """
    predicted = model.predict(x_test)
    score = accuracy_score(y_test, predicted)
    print(score)

## Entire process for each category

### Books

In [None]:
# Load 'books_consolidated.csv' for the Books category.
book_df = create_df('books_consolidated.csv')
#print(book_df.sample(5))

In [None]:
# apply_processing adds the 'processed' column to book_df in place and returns that same object,
# so processed_book_df and book_df refer to one DataFrame.
processed_book_df = apply_processing(book_df)
#print(processed_book_df.sample(5))

Let's check that the processing was done correctly.

In [None]:
# Show the same review (index label 1000) before and after text processing.
print("Review 1000 before processing")
print(book_df.loc[1000, 'review'])
print("Review 1000 after processing")
print(processed_book_df.loc[1000, 'processed'])

In [None]:
# Build the bag-of-words count matrix from the processed Books reviews.
book_features = bow(processed_book_df['processed'].values)

In [None]:
book_labels = processed_book_df['label'].values
# NB returns [model, X_train, X_test, y_train, y_test]; keep the full list and unpack it by position.
book_model = NB(book_features, book_labels)
book_nb, book_x_train, book_x_test, book_y_train, book_y_test = book_model