# Mini-Project 2 : Customer Feedback Analysis Project

1. Importing libraries

In [1]:
import pandas as pd
import openai
from openai import OpenAI
import os
# import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns

2. Loading the dataset and preprocessing

In [2]:
# Load the reviews file; lines=True reads it as one JSON record per line
reviews = pd.read_json('Musical_Instruments_5.json', lines=True)
# Work on a copy so `reviews` keeps the data exactly as loaded
data = reviews.copy()
# Preview the first rows
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [3]:
# Column dtypes and non-null counts, to spot missing values and wrong types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10261 entries, 0 to 10260
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   reviewerID      10261 non-null  object
 1   asin            10261 non-null  object
 2   reviewerName    10234 non-null  object
 3   helpful         10261 non-null  object
 4   reviewText      10261 non-null  object
 5   overall         10261 non-null  int64 
 6   summary         10261 non-null  object
 7   unixReviewTime  10261 non-null  int64 
 8   reviewTime      10261 non-null  object
dtypes: int64(2), object(7)
memory usage: 721.6+ KB


In [4]:
# Number of missing values per column
data.isnull().sum()

reviewerID         0
asin               0
reviewerName      27
helpful            0
reviewText         0
overall            0
summary            0
unixReviewTime     0
reviewTime         0
dtype: int64

In [5]:
# Inspect the last 30 rows of the frame
data.tail(30)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
10231,A31RULW0KNYJ5H,B00IZCSW3M,LA,"[1, 1]","For a long time, I never thought much about gu...",5,Cadillac of Guitar Strings,1398902400,"05 1, 2014"
10232,A3KZEGBTPH6MMF,B00IZCSW3M,"Lucy Cat ""Mandy""","[0, 0]",My father is a full-time gigging musician prim...,5,Great for pickin' on the Tele or almost anythi...,1402704000,"06 14, 2014"
10233,AWCJ12KBO5VII,B00IZCSW3M,Michael L. Knapp,"[7, 7]",D'Addario has always been one of the best stri...,5,All they claim to be,1400716800,"05 22, 2014"
10234,A29B4PAIOL7HYG,B00IZCSW3M,"N. Caruso ""gibsonjunkie""","[1, 1]",I usually use Gibson Vintage Nickle Strings on...,5,Nice strings,1399766400,"05 11, 2014"
10235,A27L5L6I7OSV5B,B00IZCSW3M,Otto Correct,"[1, 2]","Excellent tone, and I'm a bit surprised by tha...",5,Great strings! They really sing!,1401494400,"05 31, 2014"
10236,AOMEH9W6LHC4S,B00IZCSW3M,Personne,"[0, 0]",I've been stringing my guitars with D'Addario ...,5,"A little skinny for me, but the quality is und...",1400976000,"05 25, 2014"
10237,A3VDSGNIS92OVZ,B00IZCSW3M,"P. Hamm ""p-squared""","[8, 10]","Don't get me wrong, the improvement over the o...",3,Noticeable Improvement... but at what cost?,1398643200,"04 28, 2014"
10238,A27H0T39U3FZB5,B00IZCSW3M,"P. MSakamoto ""boy clothes""","[0, 0]",These have a nice bright sound and are easy on...,5,Nice bright sound with easy action,1403049600,"06 18, 2014"
10239,A2PD27UKAD3Q00,B00IZCSW3M,"Wilhelmina Zeitgeist ""coolartsybabe""","[2, 2]",D'Addario's NYXL1046 Nickel Plated Electric Gu...,5,Stay In Tune Better,1402963200,"06 17, 2014"
10240,A146H4KN4LFR60,B00J4TBMVO,angelfood,"[0, 0]",Just put these on my Martin DCX1E and they sou...,5,A Good Guitar Deserves Good Strings,1405036800,"07 11, 2014"


In [6]:
# Fill missing reviewer names with 'Unknown'.
# `reviewerName` is the only column with missing values (27 rows), so fill that
# column alone: a frame-wide fillna('Unknown') would also write the string into
# any numeric column that happened to contain NaN.
data = data.fillna({'reviewerName': 'Unknown'})
data.isnull().sum()

reviewerID        0
asin              0
reviewerName      0
helpful           0
reviewText        0
overall           0
summary           0
unixReviewTime    0
reviewTime        0
dtype: int64

In [7]:
# Distinct rating values present in `overall`
data['overall'].unique() # no outliers found: only the ratings 1-5 occur

array([5, 3, 4, 2, 1], dtype=int64)

3. Natural Language Analysis with ChatGPT:

In [8]:
# Hand the key stored in the OPENAI_API_KEY environment variable to the openai module
openai.api_key = os.environ.get('OPENAI_API_KEY')
# Client object used for the chat-completion requests
client = OpenAI()

In [9]:
# Function to analyze each review
def analyze_review(review_text, model="gpt-3.5-turbo", max_tokens=150, temperature=0.5):
    """Ask the chat model to categorize one review into themes and sentiments.

    Parameters
    ----------
    review_text : str
        Text of a single customer review.
    model : str, optional
        Chat model name passed to the API.
    max_tokens : int, optional
        Upper bound on the length of the generated reply.
    temperature : float, optional
        Sampling temperature passed to the API.

    Returns
    -------
    str
        The model's reply with surrounding whitespace stripped, or an empty
        string when there is nothing to analyze or the reply has no content.

    Notes
    -----
    Uses the module-level ``client``. Errors raised by the API call are not
    caught here and propagate to the caller.
    """
    # Skip the API round-trip for missing / blank input (e.g. NaN or "")
    if not isinstance(review_text, str) or not review_text.strip():
        return ""

    messages = [
        {"role": "system", "content": "You are an assistant that analyzes customer reviews."},
        {"role": "user", "content": f"Analyze the following review and categorize the feedback into themes and sentiments: {review_text}"}
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        n=1,  # a single completion is requested; only choices[0] is read below
        temperature=temperature,
    )

    # The reply content may be None; guard before stripping
    content = response.choices[0].message.content
    return content.strip() if content else ""

In [10]:
import tiktoken

# Initialize the tokenizer for the OpenAI model
# (same model name as the one requested in analyze_review)
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Function to estimate tokens
def estimate_tokens(text):
    """Return the number of tokens `text` encodes to with the module-level `tokenizer`.

    Review text is arbitrary user input, so special-token markers that happen
    to appear in it (e.g. "<|endoftext|>") are encoded as ordinary text
    rather than letting the encoder raise ValueError on them.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Token count for every review in the frame (the whole column, not a sample)
data['TokenCount'] = data['reviewText'].map(estimate_tokens)

# Report the mean token count across all reviews
average_tokens = data['TokenCount'].mean()
print(f"Average tokens per review: {average_tokens}")

Average tokens per review: 112.03352499756359


In [11]:
# Apply the function to the reviews in the DataFrame.
# Every review costs one blocking API call, so analysing all rows of the frame
# sequentially takes hours. Cap the number of reviews while iterating;
# set MAX_REVIEWS_TO_ANALYZE to None to analyse every review.
MAX_REVIEWS_TO_ANALYZE = 200

reviews_to_analyze = data['reviewText']
if MAX_REVIEWS_TO_ANALYZE is not None:
    # Fixed random_state keeps the selected sample identical across re-runs
    reviews_to_analyze = reviews_to_analyze.sample(
        n=min(MAX_REVIEWS_TO_ANALYZE, len(reviews_to_analyze)),
        random_state=42,
    )

# Assignment aligns on the index: rows outside the sample are left as NaN
data['Analysis'] = reviews_to_analyze.apply(analyze_review)

# Display the analysed rows (rich display instead of print)
data.loc[reviews_to_analyze.index].head()

Applying my function to every review took more than 6 hours because of the large amount of data, so I skipped this step.

In [None]:
# Count the frequency of each theme
# NOTE(review): no cell visible here creates a 'Themes' column (the analysis
# cell only writes 'Analysis'), so this lookup raises KeyError unless the
# column is derived somewhere else first — confirm.
theme_counts = data['Themes'].value_counts()

In [None]:
# Bar chart: how often each theme occurs
_, themes_ax = plt.subplots(figsize=(10, 6))
theme_counts.plot(kind='bar', ax=themes_ax)
themes_ax.set_title('Common Themes in Reviews')
themes_ax.set_xlabel('Themes')
themes_ax.set_ylabel('Frequency')
plt.show()

# Frequency of each sentiment label
# NOTE(review): no cell visible here creates a 'Sentiments' column — confirm
# it is derived before this cell runs.
sentiment_counts = data['Sentiments'].value_counts()

# Pie chart: share of each sentiment
_, sentiments_ax = plt.subplots(figsize=(8, 8))
sentiment_counts.plot(kind='pie', autopct='%1.1f%%', ax=sentiments_ax)
sentiments_ax.set_title('Sentiment Distribution in Reviews')
sentiments_ax.set_ylabel('')  # clear the y-axis label on the pie
plt.show()