In [6]:
# Importing the required libraries

import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


# Preprocessing text

In [7]:
# Function to preprocess text
def cleanData(text):
    """Normalize one sentence into a space-separated string of stemmed tokens.

    The text is lowercased, split with the module-level ``tokenizer``,
    filtered against ``en_stopwords``, and every surviving token is passed
    through ``ps.stem``. Those three names are looked up as globals, so they
    must be defined before this function is called.

    Parameters
    ----------
    text : str
        Raw sentence to clean.

    Returns
    -------
    str
        The stemmed, stopword-free tokens joined by single spaces.
    """
    words = tokenizer.tokenize(text.lower())
    kept_words = (word for word in words if word not in en_stopwords)
    return " ".join(ps.stem(word) for word in kept_words)

# Train the model

In [8]:
# Load the Amazon reviews: the file is tab-separated and the column names
# are supplied explicitly.
df_amazon = pd.read_csv(
    'amazon_cells_labelled.txt',
    sep='\t',
    names=['sentence', 'score'],
    engine='python',
)

# Pull the sentence column (a pandas Series) out as a plain Python list;
# this is the raw training text.
train_sentences = df_amazon['sentence'].tolist()

In [9]:
# Preview the first five rows of the training data
df_amazon.head(n=5)

Unnamed: 0,sentence,score
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [10]:
# Text-preprocessing helpers (looked up as globals by cleanData).
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus to be
# available locally - confirm it is installed in the target environment.
en_stopwords = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
ps = PorterStemmer()

# Bag-of-words features over unigrams and bigrams
cv = CountVectorizer(ngram_range=(1, 2))

# The two classifiers compared below, both with default hyperparameters
lr = LogisticRegression()
mn = MultinomialNB()

In [11]:
# Run every training sentence through cleanData
clean_text = list(map(cleanData, train_sentences))

# Vectorization. 

In [12]:
# Fit the vectorizer's vocabulary on the cleaned training text and expand the
# resulting count matrix into a dense NumPy array.
# NOTE(review): the dense copy may be unnecessary if the downstream
# estimators accept sparse input - confirm before removing .toarray().
sentence_vect = (
    cv
    .fit_transform(raw_documents=clean_text)
    .toarray()
)

# Test Data

In [13]:
# Yelp reviews serve as the test set; read with the same options as the
# training file.
df_yelp = pd.read_csv(
    'yelp_labelled.txt',
    sep='\t',
    names=['sentence', 'score'],
    engine='python',
)

# Training labels (the Amazon scores) as a plain list
train_score = df_amazon['score'].tolist()

# Test sentences: raw first, then cleaned with the same routine that was
# applied to the training text
test_text = df_yelp['sentence'].tolist()
clean_test_text = list(map(cleanData, test_text))

In [14]:
# Encode the cleaned test text with the already-fitted vectorizer
# (transform only, no refit) and convert the result to a dense array.
test_vect = (
    cv
    .transform(raw_documents=clean_test_text)
    .toarray()
)

# Classification: Multinomial Naive Bayes

In [12]:
# Ground-truth labels of the test set
original_score = df_yelp['score']

# Train Multinomial Naive Bayes on the training vectors, then predict the
# test labels; fit() returns the fitted estimator, so the calls are chained.
test_score = mn.fit(sentence_vect, train_score).predict(test_vect)


### Evaluate performance 

In [13]:
# Share of test sentences classified correctly, scaled to a percentage
accuracy = 100 * accuracy_score(y_true=original_score, y_pred=test_score)

# Report it with two decimal places
print(f"Accuracy of the model: {accuracy:.2f}%")

Accuracy of the model: 70.20%


# Classification: Logistic Regression

In [14]:
# Train Logistic Regression on the same training vectors and predict the test
# labels (this overwrites the Naive Bayes predictions held in test_score).
test_score = lr.fit(sentence_vect, train_score).predict(test_vect)

# Share of test sentences classified correctly, scaled to a percentage
accuracy = 100 * accuracy_score(y_true=original_score, y_pred=test_score)

# Print the accuracy (no confusion matrix is computed in this cell)
print(f"Accuracy of the model: {accuracy:.2f}%")

Accuracy of the model: 69.50%


# Conclusion