In [14]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,f1_score

##### Using **LiYuan/amazon-review-sentiment-analysis** pretrained model from Hugging Face 

In [15]:
#Using pipeline for transfer learning 
from transformers import pipeline

pipe = pipeline("text-classification", model="LiYuan/amazon-review-sentiment-analysis")

In [16]:
# Example text for sentiment analysis
text = "I loved the food at that restaurant, it was amazing!"

# Perform sentiment analysis using the pipeline
result = pipe(text)

print(result)

[{'label': '5 stars', 'score': 0.9333951473236084}]


#### Checking number of labels

In [17]:
# Get model configuration
model_config = pipe.model.config

# Check the number of labels/classes
num_labels = model_config.num_labels
print(f"Number of labels/classes: {num_labels}")

Number of labels/classes: 5


In [18]:
# Truncate the text to fit within maximum sequence length
max_seq_length = pipe.tokenizer.model_max_length

max_seq_length

512

#### Original Test Dataset - Vectorized

In [19]:
test_df = pd.read_csv("/content/test_df.csv")

In [20]:
print(test_df[10156:10157]['text'])

10156    friend love place much wed reception find head...
Name: text, dtype: object


In [21]:
test_df.text = test_df.text.astype(str)

In [22]:
# Sample 10% of the DataFrame
test_df_small = test_df.sample(frac=0.1, random_state=42)

In [23]:
test_df_small.dtypes

label     int64
text     object
dtype: object

In [24]:
y_true = test_df_small['label']

#### Batch processing

In [25]:
#intializing lists to store pred and labels for batch processing
preds = []

In [26]:
#iterating through test data set

for index, row in test_df_small.iterrows():

    #performing sentiment analysis on each row
    text = row['text']

    # Truncate the text to fit within maximum sequence length
    truncated_text = text[:max_seq_length]

    # Performing sentiment analysis on truncated text
    result = pipe(truncated_text)

    #appending the result to the list
    preds.append(result[0]['label'])

In [29]:
# Mapping dictionary from labels to numerical values
label_map = {
    '1 star': 0,
    '2 stars': 1,
    '3 stars': 2,
    '4 stars': 3,
    '5 stars': 4
}

# Convert predictions to numerical values using list comprehension
preds = [label_map[label] for label in preds]

In [30]:
test_df_small

Unnamed: 0,label,text
33553,4,come day ago lease sure size need guess three ...
9427,0,choose 4 queen visit la vega several reason ad...
199,3,go day wed town last minute pedicure really kn...
12447,1,strange little thing sour experience good time...
39489,4,visit several time year food always fresh well...
...,...,...
39885,2,since edc 3 venue hold last many year think gi...
17566,0,constantly search great mexican madison area n...
16062,0,former tourist current resident la vega nv bro...
48445,4,take temporary residence flood disaster home t...


##### Comparison of Original Labels vs Predicted Labels [:20]

In [46]:
print("First 20 Original labels",test_df_small['label'][:20].tolist())
print("First 20 predicted labels",preds[:20])

First 20 Original labels [4, 0, 3, 1, 4, 0, 4, 0, 1, 2, 1, 0, 3, 0, 3, 2, 2, 4, 1, 2]
First 20 predicted labels [4, 4, 4, 4, 4, 0, 4, 0, 1, 2, 0, 4, 4, 0, 4, 4, 3, 0, 4, 2]


##### Calculating Metrics

In [31]:
accuracy = accuracy_score(y_true, preds)

confusion_mat = confusion_matrix(y_true, preds)

classification_rep = classification_report(y_true, preds)

f1 = f1_score(y_true, preds, average='macro')

In [32]:
print(f"\nAccuracy: {accuracy}")
print(f"\nF1 Score: {f1}")
print("\n\nConfusion Matrix:")
print(confusion_mat)
print("\n\nClassification Report:")
print(classification_rep)



Accuracy: 0.3574

F1 Score: 0.2860476356245246


Confusion Matrix:
[[629   5  35  20 326]
 [313  55  88  76 486]
 [ 92  20 131 148 613]
 [ 34   1  32  62 883]
 [ 19   0   6  16 910]]


Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.62      0.60      1015
           1       0.68      0.05      0.10      1018
           2       0.45      0.13      0.20      1004
           3       0.19      0.06      0.09      1012
           4       0.28      0.96      0.44       951

    accuracy                           0.36      5000
   macro avg       0.44      0.36      0.29      5000
weighted avg       0.44      0.36      0.28      5000



The accuracy accounted for the pre trained model is 35% which is quite lower than our Logistic Regression, Random Forest and Naive Bayes accuracy. This has not been trained and fine tuned on our training data set yet so we can expect better results but poor performance on this dataset for now is understandable.