In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
import numpy as np


# NLP Classifiers from Scraped Product Reviews

In [2]:
# Methods
def evaluate_predictions(predictions, texts=None):
    """Print one line per prediction, paired with the sentence it was made for.

    Parameters
    ----------
    predictions : iterable
        Predicted labels, in the same order as the sentences.
    texts : sequence of str, optional
        Sentences the predictions belong to. When omitted, the notebook-level
        ``sentences`` list is used, so existing ``evaluate_predictions(pred)``
        calls keep working unchanged.

    Raises
    ------
    IndexError
        If there are more predictions than sentences.
    NameError
        If ``texts`` is omitted and no global ``sentences`` has been defined.
    """
    if texts is None:
        # Backward-compatible fallback: read the global set by the calling cell.
        texts = sentences
    for position, pred in enumerate(predictions):
        print(f'The sentence "{texts[position]}" is {pred}')

In [3]:
# Raw data read in
# Star ratings covered by the scraped files: 1 through 5 inclusive
range_ = range(1,6)

# Read each per-rating CSV, then concatenate them in a single call.
# One concat over a list avoids re-copying the accumulated frame on every
# iteration and avoids concatenating onto an empty placeholder DataFrame.
# Row labels from the individual files are kept as-is (no ignore_index).
frames = [pd.read_csv(f'data/scraped_{i}star_data.csv') for i in range_]
df = pd.concat(frames)

# Keep only the first character of the 'stars' column & convert it to int
df['stars'] = df['stars'].str[0].astype(int)
df

Unnamed: 0,date,text,stars
0,2/7/2024,Unfortunately the tv fell on me after purchasi...,1
1,11/1/2023,I love Vizio that’s always my go to brand I ha...,1
2,11/22/2023,"I ordered a Vizio 50"" Class V-Series 4L UHD LE...",1
3,12/5/2023,It didn't work right out of the box. Nothing b...,1
4,12/24/2023,"I ordered 75” Vizio TV and just put it up, bu...",1
...,...,...,...
1141,11/22/2023,Absolutely love this picture is so clear sound...,5
1142,11/6/2023,I am satisfied with my Vizio 50 inch . Perfect...,5
1143,10/28/2023,GREAT CUSTOMER SERVICE!!!,5
1144,12/9/2023,Gave as a gift. She had one previously.,5


# Binary Sentiment Analysis Model

In [4]:
# Binary Preprocess

# Rows rated 5 stars form the positive class. .assign() returns a new frame,
# so df itself is not modified. The label is kept as a float (1.0) to match
# the values the original list-based construction produced.
five = df.loc[df['stars'] == 5].assign(bin_sent=1.0)
# Rows rated 1 star form the negative class (0.0)
one = df.loc[df['stars'] == 1].assign(bin_sent=0.0)
# Stack the two classes; reviews with 2-4 stars are left out of this task
pos_neg = pd.concat( [five,one] )

# Separate df into data & target
data = np.array(pos_neg['text'])
target = np.array(pos_neg['bin_sent'])

In [5]:
# Train test split: hold out 30% of the binary-labelled reviews.
# random_state pins the shuffle so the metrics reported below are the same
# on every Restart & Run All.
docs_train, docs_test, y_train, y_test = train_test_split(
    data, target, test_size=0.3, random_state=42)

# Construct Pipeline object: TF-IDF over unigrams and bigrams feeding a
# multinomial Naive Bayes classifier
txt_clf = Pipeline(
    [
        ('vect', TfidfVectorizer(ngram_range=(1,2))),
        ('clf', MultinomialNB())
    ]
)
txt_clf.fit(docs_train, y_train)

# Predict from Pipeline
y_pred = txt_clf.predict(docs_test)

In [6]:
# Evaluate the predictions of our model
# Head the report with the class name of the pipeline's 'clf' step, so the
# heading always names the estimator that was actually fitted.
model_name = type(txt_clf.named_steps['clf']).__name__
print(f"{model_name} Model:\n{metrics.classification_report(y_test, y_pred)}")
cm = metrics.confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{cm}")

Perceptron Model:
              precision    recall  f1-score   support

         0.0       0.95      0.91      0.93       290
         1.0       0.92      0.96      0.94       340

    accuracy                           0.94       630
   macro avg       0.94      0.93      0.94       630
weighted avg       0.94      0.94      0.94       630

Confusion Matrix:
[[263  27]
 [ 13 327]]


In [7]:
# Probe the binary classifier with short hand-written sentences.
# The list must stay bound to the name `sentences`: evaluate_predictions()
# looks the sentences up under that global name when printing.
sentences = [
    "I like this product.",       # subjective, 1st person
    "I dislike this product.",    # subjective, 1st person
    "The product is amazing.",    # subjective, 3rd person
    "The product is terrible.",   # subjective, 3rd person
    "The world is good.",         # general, 3rd person
    "The world is bad.",          # general, 3rd person
]

# Predict a binary label for each sentence and print sentence/label pairs
new_pred = txt_clf.predict(sentences)
evaluate_predictions(new_pred)

The sentence "I like this product." is 1.0
The sentence "I dislike this product." is 1.0
The sentence "The product is amazing." is 1.0
The sentence "The product is terrible." is 0.0
The sentence "The world is good." is 1.0
The sentence "The world is bad." is 0.0


# Binned Sentiment Analysis Model

In [8]:
# Bin Preprocess

# Map each star rating to a three-way sentiment label:
#   more than 3 stars -> 1, exactly 3 stars -> 0, fewer than 3 stars -> -1.
# The sign of (stars - 3) is exactly that mapping, computed for the whole
# column at once instead of appending to a list in a Python loop.
df['sent'] = np.sign(df['stars'] - 3).astype(int)

# Separate df into data & target
data = np.array(df['text'])
target = np.array(df['sent'])

In [9]:
# Train Test Split: hold out 40% of the binned-sentiment reviews.
# random_state pins the shuffle so the metrics reported below are the same
# on every Restart & Run All (the classifier below is already seeded with 42).
docs_train, docs_test, y_train, y_test = train_test_split(
    data, target, test_size=0.4, random_state=42)

# Construct Pipeline object: TF-IDF over unigrams and bigrams feeding a
# linear model trained by SGD with squared hinge loss and L2 regularisation.
# NOTE(review): tol=None is understood to switch off the tolerance-based
# stopping criterion, so training runs for the full max_iter - confirm
# against the SGDClassifier documentation.
txt_clf = Pipeline(
    [
        ('vect', TfidfVectorizer(ngram_range=(1,2))),
        ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          tol=None))
    ]
)
txt_clf.fit(docs_train, y_train)

# Predict from Pipeline
y_pred = txt_clf.predict(docs_test)

In [10]:
# Evaluate the predictions of our model
# Head the report with the class name of the pipeline's 'clf' step, so the
# heading always names the estimator that was actually fitted.
model_name = type(txt_clf.named_steps['clf']).__name__
print(f"{model_name} Model:\n{metrics.classification_report(y_test, y_pred)}")
cm = metrics.confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{cm}")

Perceptron Model:
              precision    recall  f1-score   support

          -1       0.73      0.86      0.79       628
           0       0.53      0.27      0.36       306
           1       0.81      0.86      0.83       694

    accuracy                           0.75      1628
   macro avg       0.69      0.66      0.66      1628
weighted avg       0.73      0.75      0.73      1628

Confusion Matrix:
[[540  33  55]
 [139  82  85]
 [ 60  39 595]]


In [11]:
# Probe the binned (-1 / 0 / 1) classifier with short hand-written sentences.
# The list must stay bound to the name `sentences`: evaluate_predictions()
# looks the sentences up under that global name when printing.
sentences = [
    "I like this product.",       # subjective, 1st person
    "I dislike this product.",    # subjective, 1st person
    "The product is amazing.",    # subjective, 3rd person
    "The product is terrible.",   # subjective, 3rd person
    "The world is good.",         # general, 3rd person
    "The world is bad.",          # general, 3rd person
]

# Predict a sentiment bin for each sentence and print sentence/label pairs
new_pred = txt_clf.predict(sentences)
evaluate_predictions(new_pred)

The sentence "I like this product." is 1
The sentence "I dislike this product." is 1
The sentence "The product is amazing." is 1
The sentence "The product is terrible." is -1
The sentence "The world is good." is 1
The sentence "The world is bad." is -1


# Categorical Sentiment Analysis Model

In [12]:
# Categorical Preprocess

# Features are the review texts; labels are the integer star ratings themselves
data, target = (np.array(df[column]) for column in ('text', 'stars'))

In [13]:
# Train Test Split: hold out 40% of the star-labelled reviews.
# random_state pins the shuffle so the metrics reported below are the same
# on every Restart & Run All (the classifier below is already seeded with 42).
docs_train, docs_test, y_train, y_test = train_test_split(
    data, target, test_size=0.4, random_state=42)

# Construct Pipeline object: TF-IDF over unigrams and bigrams feeding a
# linear model trained by SGD with squared hinge loss and L2 regularisation.
# NOTE(review): tol=None is understood to switch off the tolerance-based
# stopping criterion, so training runs for the full max_iter - confirm
# against the SGDClassifier documentation.
txt_clf = Pipeline(
    [
        ('vect', TfidfVectorizer(ngram_range=(1,2))),
        ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          tol=None))
    ]
)
txt_clf.fit(docs_train, y_train)

# Predict from Pipeline
y_pred = txt_clf.predict(docs_test)

In [14]:
# Evaluate the predictions of our model
# Head the report with the class name of the pipeline's 'clf' step, so the
# heading always names the estimator that was actually fitted.
model_name = type(txt_clf.named_steps['clf']).__name__
print(f"{model_name} Model:\n{metrics.classification_report(y_test, y_pred)}")
cm = metrics.confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{cm}")

Perceptron Model:
              precision    recall  f1-score   support

           1       0.61      0.85      0.71       390
           2       0.43      0.18      0.25       244
           3       0.46      0.46      0.46       322
           4       0.43      0.19      0.26       233
           5       0.69      0.88      0.77       439

    accuracy                           0.58      1628
   macro avg       0.52      0.51      0.49      1628
weighted avg       0.55      0.58      0.54      1628

Confusion Matrix:
[[331  15  24   4  16]
 [ 95  43  79   9  18]
 [ 74  34 147  27  40]
 [ 18   6  63  44 102]
 [ 23   3   9  19 385]]


In [15]:
# Probe the star-rating classifier with short hand-written sentences.
# The list must stay bound to the name `sentences`: evaluate_predictions()
# looks the sentences up under that global name when printing.
sentences = [
    "I like this product.",       # subjective, 1st person
    "I dislike this product.",    # subjective, 1st person
    "The product is amazing.",    # subjective, 3rd person
    "The product is terrible.",   # subjective, 3rd person
    "The world is good.",         # general, 3rd person
    "The world is bad.",          # general, 3rd person
]

# Predict a star rating for each sentence and print sentence/label pairs
new_pred = txt_clf.predict(sentences)
evaluate_predictions(new_pred)

The sentence "I like this product." is 5
The sentence "I dislike this product." is 1
The sentence "The product is amazing." is 5
The sentence "The product is terrible." is 2
The sentence "The world is good." is 3
The sentence "The world is bad." is 3
