# **ChatGPT Review Sentiment Analysis** 🤖



In [4]:
# import packages
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords

In [5]:
# Fetch the NLTK stop-word corpus that the text-cleaning step reads from
# (per the recorded output, this reports "already up-to-date" when it is present).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### **1️⃣ Data Preparation & Cleaning**

- Load and inspect the dataset.
- Address missing values and duplicates.
- Clean the text data (remove special characters, stopwords, etc.).
- Convert ratings into categorical labels (e.g., Positive, Neutral, Negative) for sentiment comparison.
- Explore the data with descriptive statistics.




In [6]:
# Read the raw reviews CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='ChatGPT_Reviews.csv')

In [7]:
# Preview the top of the freshly loaded frame
print("First 5 rows of the dataset:")
df.head(n=5)

First 5 rows of the dataset:


Unnamed: 0,Review Id,Review,Ratings,Review Date
0,6fb93778-651a-4ad1-b5ed-67dd0bd35aac,good,5.0,8/23/2024 19:30
1,81caeefd-3a28-4601-a898-72897ac906f5,good,5.0,8/23/2024 19:28
2,452af49e-1d8b-4b68-b1ac-a94c64cb1dd5,nice app,5.0,8/23/2024 19:22
3,372a4096-ee6a-4b94-b046-cef0b646c965,"nice, ig",5.0,8/23/2024 19:20
4,b0d66a4b-9bde-4b7c-8b11-66ed6ccdd7da,"this is a great app, the bot is so accurate to...",5.0,8/23/2024 19:20


In [8]:
# Per-column count of missing cells, printed under a heading
print("\nMissing values:", df.isna().sum(), sep="\n")


Missing values:
Review Id      0
Review         1
Ratings        1
Review Date    1
dtype: int64


In [9]:
# Count rows that exactly repeat an earlier row
print("\nDuplicate entries:", df.duplicated(keep="first").sum())


Duplicate entries: 0


In [10]:
# Discard every row that has at least one missing value, then confirm none remain
df = df.dropna(axis=0, how="any")
print(df.isna().sum())

Review Id      0
Review         0
Ratings        0
Review Date    0
dtype: int64


In [11]:
# Keep only the first occurrence of each fully identical row, then re-count
df = df.drop_duplicates(keep="first")
print("\nDuplicate entries:", df.duplicated(keep="first").sum())


Duplicate entries: 0


In [12]:
# Text cleaning function
# Built once: the punctuation-deleting table is identical for every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Lazily filled so the stop-word corpus is read once, not once per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalise one review string for downstream text analysis.

    Steps, in order: lowercase, drop ``[...]`` bracketed spans, drop
    ``http...``/``www...`` tokens, drop digit runs, drop ASCII punctuation
    (``string.punctuation``), split on whitespace, drop stop words, and
    re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. ``None`` or a float NaN
        left over from missing data) yields ``""`` instead of raising.
    stop_words : container of str, optional
        Words to remove; anything supporting ``in`` works, a set is fastest.
        Defaults to the NLTK English stop-word list, which is loaded once and
        cached.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = _english_stopwords()

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(_PUNCT_TABLE)  # Remove punctuation
    # NOTE(review): punctuation is stripped before this filter, so a stop word
    # that itself contains punctuation (e.g. a contraction) can no longer
    # match here - confirm whether that is intended.
    words = [word for word in text.split() if word not in stop_words]  # Remove stopwords
    return " ".join(words)

Inspired by: https://www.analyticsvidhya.com/blog/2022/01/text-cleaning-methods-in-nlp/

In [13]:
# Run clean_text over every review and keep the result in a new column
df["Cleaned_Review"] = df["Review"].map(clean_text)
df.loc[:, "Cleaned_Review"]

Unnamed: 0,Cleaned_Review
0,good
1,good
2,nice app
3,nice ig
4,great app bot accurate anything gives tips gam...
...,...
43671,really good pretty accurate
43672,amazing
43673,difficult create image
43674,awesome 👍


In [14]:
# Convert Ratings into categorical sentiment labels
def categorize_rating(rating):
    """Map a numeric star rating to a sentiment label.

    Parameters
    ----------
    rating : int or float
        The review's rating.

    Returns
    -------
    str or None
        ``"Positive"`` for ratings >= 4, ``"Neutral"`` for ratings in [3, 4),
        ``"Negative"`` for anything lower, and ``None`` when the rating is
        missing (``None`` or NaN) so that missing data is not silently
        counted as negative.
    """
    # NaN is the only value that compares unequal to itself.
    if rating is None or rating != rating:
        return None
    if rating >= 4:
        return "Positive"
    # A range check (rather than == 3) keeps fractional ratings such as 3.5
    # out of the "Negative" bucket.
    if rating >= 3:
        return "Neutral"
    return "Negative"

In [15]:
# Label every review from its rating, then show label and rating side by side
df["Sentiment"] = df["Ratings"].map(categorize_rating)
df.loc[:, ["Sentiment", "Ratings"]]

Unnamed: 0,Sentiment,Ratings
0,Positive,5.0
1,Positive,5.0
2,Positive,5.0
3,Positive,5.0
4,Positive,5.0
...,...,...
43671,Positive,4.0
43672,Positive,5.0
43673,Positive,4.0
43674,Positive,5.0


In [16]:
# Count how many reviews fall into each sentiment class
print("\nSentiment Distribution:", df["Sentiment"].value_counts(), sep="\n")


Sentiment Distribution:
Sentiment
Positive    38835
Negative     3088
Neutral      1752
Name: count, dtype: int64


In [17]:
# Summary statistics, with the default quartiles spelled out explicitly
print("\nBasic statistics for Ratings:")
df.describe(percentiles=[0.25, 0.5, 0.75])


Basic statistics for Ratings:


Unnamed: 0,Ratings
count,43675.0
mean,4.540653
std,1.037341
min,1.0
25%,5.0
50%,5.0
75%,5.0
max,5.0


In [18]:
# Persist the cleaned frame to CSV, leaving the row index out of the file
df.to_csv(path_or_buf="Cleaned_ChatGPT_Reviews.csv", index=False)