# **Sentiment Analysis using LLM (GPT-3.5)**

In [1]:
!pip install openai



In [2]:
import openai
import os

import pandas as pd
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,  classification_report

load_dotenv()
openai.api_key = os.environ.get("API_KEY")

### Loading Dataset 

In [3]:
# Load the dataset
data = pd.read_csv("downsampled_dataset_10k.csv")

In [4]:
columns_to_drop = ['marketplace', 'review_id', 'product_parent', 'product_category', 'vine', 'helpful_votes']

# Dropping the specified columns
data.drop(columns=columns_to_drop, inplace=True)

In [5]:
data.head()

Unnamed: 0,customer_id,product_id,product_title,star_rating,total_votes,verified_purchase,review_headline,review_body,review_date,sentiment
0,6108596,B00P8LFJ3Y,Lava HD-8000 OmniPro,5,0,Y,Five Stars,Great tv signal very good buy.I like it,2015-08-21,Positive
1,37870958,B00WBS32K4,PonoMusic Pono Portable Music Player,5,2,Y,Five Stars,Good sound. Fine Material. Simply perfect!,2015-08-29,Positive
2,38204831,B00XVT4DLO,DIVOIX® DV110 In-Ear Headphone Earbuds Lightwe...,5,0,Y,Five Stars,looks good as shown lots of base. BTW fast shi...,2015-08-22,Positive
3,26611731,B00N1KWERI,"Eclipse 8GB 2.8"" Supra Fit Bluetooth MP3 with ...",5,0,Y,Love it!,This player is totally awesome! I has all the ...,2015-08-18,Positive
4,47611641,B004LTEUDO,Mediabridge 3.5mm Male To Male Stereo Audio Ca...,5,0,Y,High Quality Cable,Very quick delivery and high quality. Sound is...,2015-08-20,Positive


### Splitting data in Train/Validation/Test Splits 

In [6]:
# split into train, val and test data 
X_train, X_test = train_test_split(data, test_size=0.2, random_state = 42, stratify=data['sentiment'])
X_val, X_test = train_test_split(X_test, test_size=0.5, random_state = 42, stratify=X_test['sentiment'])


In [7]:
X_test.head()

Unnamed: 0,customer_id,product_id,product_title,star_rating,total_votes,verified_purchase,review_headline,review_body,review_date,sentiment
9205,22614796,B00XYFD5WS,100 Pack Aa Batteries Extra Heavy Duty 1.5v Wh...,5,0,Y,Five Stars,Fantastic price.,2015-08-27,Positive
1915,107973,B00XJFU3AA,TryAceWireless Bluetooth Waterproof Shower Spe...,5,0,Y,Nice and compact. The sound is loud and crisp....,Nice and compact. The sound is loud and crisp....,2015-08-30,Positive
7073,8568048,B00C7YT3OQ,SAMSUNG-Compatible ValueView 3D Glasses. Recha...,5,1,Y,Great 3D Glasses,Delivery was fast and arrived the day it said...,2015-08-18,Positive
2102,48034147,B00LH81W9C,PARENT-PS-U,5,0,Y,Five Stars,"I love this device. In fact, this is the seco...",2015-08-20,Positive
8421,37521038,B00Q2CB5VS,Onedayshop® High Quality & Speed HDMI to YPBPR...,1,0,Y,Limited use,The product works beautifully when the input i...,2015-08-29,Negative


In [8]:
X_test['sentiment'].value_counts()

Positive    763
Negative    168
Neutral      69
Name: sentiment, dtype: int64

In [9]:
df = X_test[["star_rating", "review_body", "sentiment"]]
df.head()

Unnamed: 0,star_rating,review_body,sentiment
9205,5,Fantastic price.,Positive
1915,5,Nice and compact. The sound is loud and crisp....,Positive
7073,5,Delivery was fast and arrived the day it said...,Positive
2102,5,"I love this device. In fact, this is the seco...",Positive
8421,1,The product works beautifully when the input i...,Negative


### Using OpenAI API for sentiment generation with GPT-3.5 Model 

In [10]:

def get_sentiments_from_reviews(df, model="gpt-3.5-turbo"):
    sentiments = []

    # Iterate over each review in the DataFrame
    for review in df["review_body"]:
        # Creating a message as required by the API
        prompt = f"Analyze the sentiment of this review: '{review}'. The sentiment options are [\"Positive\", \"Negative\", \"Neutral\"]. Provide your answer without any explanation."
        messages = [{"role": "user", "content": prompt}]

        # Calling the ChatCompletion API
        response = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            temperature=0,
        )

        # Extracting sentiment from the response and appending to the list
        sentiment = response.choices[0].message["content"]
        sentiments.append(sentiment)

    return sentiments


# Get sentiments for each review
sentiments = get_sentiments_from_reviews(df)


In [13]:
#print(sentiments)

['Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Negative', 'Negative', 'Positive', 'Positive', 'Positive', 'Negative', 'Positive', 'Positive', 'Positive', 'Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Positive', 'Positive', 'Positive', 'Negative', 'Negative', 'Negative', 'Positive', 'Negative', 'Positive', 'Neutral', 'Positive', 'Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Positive', 'Positive', 'Negative', 'Negative', 'Positive', 'Positive', 'Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Neutral', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Negative', 'Positive', 'Positive', 'Negative', 'Positive', 'Negative', 'Positive', 'Positive', 'Negative', 'Negative', 'Positive', 'Positive', 'Positive', 'Negative', 'Negative', 'Positive', 'Positive', 'Positive', 'Nega

In [11]:
# Adding generated sentiments as column in test data
df["generated_sentiment"] = sentiments

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["generated_sentiment"] = sentiments


In [12]:
df

Unnamed: 0,star_rating,review_body,sentiment,generated_sentiment
9205,5,Fantastic price.,Positive,Positive
1915,5,Nice and compact. The sound is loud and crisp....,Positive,Positive
7073,5,Delivery was fast and arrived the day it said...,Positive,Positive
2102,5,"I love this device. In fact, this is the seco...",Positive,Positive
8421,1,The product works beautifully when the input i...,Negative,Negative
...,...,...,...,...
3357,5,Works fine.,Positive,Positive
2157,5,Great product.,Positive,Positive
9632,5,Work,Positive,Neutral
245,5,Use it near the pool. Great sound and can answ...,Positive,Positive


In [13]:
# Saving results as excel file for analysis
df.to_excel("llm_result.xlsx")

In [14]:

# Calculate accuracy
accuracy = accuracy_score(df["sentiment"], df["generated_sentiment"])

# Calculate precision
precision = precision_score(df["sentiment"], df["generated_sentiment"], average='weighted')

# Calculate recall
recall = recall_score(df["sentiment"], df["generated_sentiment"], average='weighted')

# Calculate F1-score
f1 = f1_score(df["sentiment"], df["generated_sentiment"], average='weighted')


# Generate the classification report
report = classification_report(df["sentiment"], df["generated_sentiment"])


# Print the scores & classification report
print("Accuracy of GPT-3.5:", accuracy)
print("\nPrecision of GPT-3.5:", precision)
print("\nRecall of GPT-3.5:", recall)
print("\nF1-score of GPT-3.5:", f1)
print("\nClassification Report:\n", report)

Accuracy of GPT-3.5: 0.875

Precision of GPT-3.5: 0.8731753007796367

Recall of GPT-3.5: 0.875

F1-score of GPT-3.5: 0.8680025145643393

Classification Report:
               precision    recall  f1-score   support

    Negative       0.67      0.98      0.80       168
     Neutral       0.18      0.10      0.13        69
    Positive       0.98      0.92      0.95       763

    accuracy                           0.88      1000
   macro avg       0.61      0.67      0.63      1000
weighted avg       0.87      0.88      0.87      1000

