In [None]:
import pandas as pd

# Load the reviews dataset from a CSV file (read_csv, not an Excel reader)
df = pd.read_csv("chatgpt_style_reviews.csv")

print("First 5 rows:")
# Bare last expression so the notebook renders the preview as a table
df.head()

In [None]:
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

In [None]:
print("\nMissing values:")
# Per-column count of null entries
null_counts = df.isna().sum()
print(null_counts)

In [None]:
print("\nRating distribution:")
# Number of reviews at each distinct rating value
rating_counts = df['rating'].value_counts()
print(rating_counts)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources used by the cleaning step below:
# the stopword lists and the WordNet data.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# English stopwords, held in a set for fast membership tests in clean_text
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Single lemmatizer instance shared by every clean_text call
lemmatizer = WordNetLemmatizer()

In [None]:

def clean_text(text):
    """Normalize a raw review into a space-joined string of lemmatized words.

    Steps: lowercase, drop every character that is not a-z or whitespace,
    split on whitespace, remove stopwords, lemmatize what remains.
    Relies on the module-level ``stop_words`` set and ``lemmatizer``.

    Parameters
    ----------
    text : str or missing value
        Raw review text. Non-string input (None, the float NaN pandas uses
        for an empty cell, numbers) is treated as an empty review instead of
        raising AttributeError on ``.lower()``.

    Returns
    -------
    str
        Cleaned text; '' when nothing survives the cleaning.
    """
    if not isinstance(text, str):
        return ''
    lowered = text.lower()
    # Digits, punctuation and non-ASCII letters are removed here
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )

In [None]:
# Run the cleaning function over every raw review
df['cleaned_review'] = df['review'].map(clean_text)

In [None]:
# Preview the first rows, which now include the cleaned_review column
df.head()

In [None]:
# Add review length (number of characters in the raw review).
# Missing reviews count as length 0; a bare len() raises TypeError on the
# float NaN that pandas stores for an empty cell.
df['review_length'] = df['review'].fillna('').astype(str).str.len()

In [None]:
# Parse the date column into pandas datetime values
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [None]:
# Bare expression instead of print(): the notebook renders the frame as a
# table rather than wrapped plain text.
df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Rating distribution (bar chart): one bar per rating value
plt.figure(figsize=(10, 6))
rating_ax = sns.countplot(data=df, x='rating', palette='viridis')
rating_ax.set_title('Distribution of Review Ratings (1-5 Stars)')
rating_ax.set_xlabel('Rating')
rating_ax.set_ylabel('Count')
plt.show()

In [None]:
# Helpful reviews (pie chart): share of reviews at or above the vote threshold
helpful_threshold = 10
meets_threshold = df['helpful_votes'] >= helpful_threshold
below_threshold = df['helpful_votes'] < helpful_threshold
helpful = df[meets_threshold]
not_helpful = df[below_threshold]

slice_sizes = [len(helpful), len(not_helpful)]
slice_labels = [f'Helpful (≥{helpful_threshold} votes)', 'Not Helpful']

plt.figure(figsize=(8, 8))
plt.pie(
    slice_sizes,
    labels=slice_labels,
    autopct='%1.1f%%',
    colors=['#66b3ff', '#ff9999'],
)
plt.title('Proportion of Helpful Reviews')
plt.show()

In [None]:
#Keywords in Positive vs Negative Reviews (Word Clouds)
from wordcloud import WordCloud

# Positive reviews (4-5 stars): concatenate their cleaned text into one string
is_positive = df['rating'] >= 4
positive_text = ' '.join(df.loc[is_positive, 'cleaned_review'])
wordcloud_pos = WordCloud(width=800, height=400, background_color='white').generate(positive_text)

# Negative reviews (1-2 stars)
is_negative = df['rating'] <= 2
negative_text = ' '.join(df.loc[is_negative, 'cleaned_review'])
wordcloud_neg = WordCloud(width=800, height=400, background_color='black').generate(negative_text)

# Plot both clouds side by side, without axis ticks
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
panels = [
    (ax1, wordcloud_pos, 'Positive Reviews (4-5 Stars)'),
    (ax2, wordcloud_neg, 'Negative Reviews (1-2 Stars)'),
]
for panel_ax, cloud, panel_title in panels:
    panel_ax.imshow(cloud, interpolation='bilinear')
    panel_ax.set_title(panel_title)
    panel_ax.axis('off')
plt.show()

In [None]:
# Average rating over time (line chart), one point per calendar month
df['month'] = df['date'].dt.to_period('M')
monthly_avg = df.groupby('month')['rating'].mean()

plt.figure(figsize=(12, 6))
trend_ax = monthly_avg.plot(marker='o', color='purple')
trend_ax.set_title('Average Rating Trend Over Time')
trend_ax.set_xlabel('Month')
trend_ax.set_ylabel('Average Rating')
trend_ax.grid(True)
plt.show()

In [None]:
# Ratings by location (box plot), limited to the 10 most frequent locations
top_locations = df['location'].value_counts().head(10).index
top_location_rows = df[df['location'].isin(top_locations)]

plt.figure(figsize=(12, 6))
location_ax = sns.boxplot(data=top_location_rows, x='location', y='rating', palette='Set3')
location_ax.set_title('Rating Distribution by Top 10 Locations')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Platform comparison: rating spread per platform value
plt.figure(figsize=(8, 6))
platform_ax = sns.boxplot(data=df, x='platform', y='rating', palette='pastel')
platform_ax.set_title('Rating Distribution by Platform')
plt.show()

In [None]:
# Verified vs non-verified users: review counts per rating within each group
plt.figure(figsize=(8, 6))
verified_ax = sns.countplot(data=df, x='verified_purchase', hue='rating', palette='coolwarm')
verified_ax.set_title('Rating Distribution: Verified vs Non-Verified Users')
plt.show()

In [None]:
# Define sentiment mapping
def get_sentiment(rating):
    """Map a numeric star rating to a sentiment label.

    Parameters
    ----------
    rating : int or float
        Star rating of a review.

    Returns
    -------
    str
        'positive' for rating >= 4, 'negative' for rating <= 2, and
        'neutral' for everything else. Values strictly between 2 and 4
        (e.g. 3 or 3.5) are therefore neutral rather than negative. NaN
        also falls through to 'neutral', so rows with a missing rating
        should be dropped before labelling.
    """
    if rating >= 4:
        return 'positive'
    if rating <= 2:
        return 'negative'
    return 'neutral'

# Label every review by passing its rating through get_sentiment
df['sentiment'] = df['rating'].map(get_sentiment)

In [None]:
# Review length vs sentiment (box plot), with the length axis on a log scale
sentiment_order = ['negative', 'neutral', 'positive']

plt.figure(figsize=(10, 6))
length_ax = sns.boxplot(
    data=df,
    x='sentiment',
    y='review_length',
    palette='autumn',
    order=sentiment_order,
)
length_ax.set_yscale('log')
length_ax.set_title('Review Length by Sentiment')
plt.show()

In [None]:
# Top Words in 1-Star Reviews
from collections import Counter

# Pool the cleaned text of all 1-star reviews and count individual words
is_one_star = df['rating'] == 1
one_star_words = ' '.join(df.loc[is_one_star, 'cleaned_review']).split()
word_freq = Counter(one_star_words).most_common(20)

top_terms = [term for term, _ in word_freq]
term_counts = [count for _, count in word_freq]

plt.figure(figsize=(12, 6))
sns.barplot(x=top_terms, y=term_counts, palette='Reds_r')
plt.title('Top 20 Words in 1-Star Reviews')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Best-rated ChatGPT version: mean rating per version, sorted ascending
plt.figure(figsize=(12, 6))
version_means = df.groupby('version')['rating'].mean().sort_values()
version_ax = version_means.plot.barh(color='teal')
version_ax.set_title('Average Rating by ChatGPT Version')
version_ax.set_xlabel('Average Rating')
plt.show()

In [None]:
#Model Training & Evaluation
#Feature Engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorization over unigrams and bigrams, limited to 5000 features
# NOTE(review): the vectorizer is fitted on the whole dataset before the
# train/test split, so vocabulary and IDF statistics from the test rows are
# visible during training.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X = tfidf.fit_transform(df['cleaned_review'])
y = df['sentiment']  # Created during EDA

In [None]:
#Train-Test Split
from sklearn.model_selection import train_test_split
# Hold out 20% for testing. Stratifying on the label keeps the class
# proportions equal in both halves, matching the later split of final_data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=43
)

In [None]:
#Train Models 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Candidate classifiers, all trained on the same TF-IDF features
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    # Seeded so the forest, and with it the model ranking, is repeatable
    "Random Forest": RandomForestClassifier(random_state=43)
}

best_model = None
best_model_name = None
# Start below any attainable accuracy so a model is always selected,
# even if every candidate scores 0.
best_accuracy = float('-inf')

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")

    # Track the best model (first one wins on ties)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_model_name = name

print(f"Best model: {best_model_name} ({best_accuracy:.2f})")

In [None]:
#Evaluate Metrics
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Generate predictions with the best model
y_pred = best_model.predict(X_test)

# Classification report
print("Best Model Evaluation:")
print(classification_report(y_test, y_pred))

# Confusion matrix
# Pass the model's own class order to confusion_matrix and derive the tick
# labels from it. Hard-coded labels would be misaligned whenever a class is
# absent from both y_test and y_pred, because the matrix would shrink.
class_labels = list(best_model.classes_)
tick_labels = [str(label).capitalize() for label in class_labels]

plt.figure(figsize=(8, 6))
sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels
)
plt.title("Confusion Matrix (Best Model)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
import pandas as pd
from datasets import load_dataset
from google_play_scraper import Sort, reviews_all
import os


In [None]:
# Create directory if it doesn't exist; exist_ok=True makes re-running this
# cell harmless when 'data' is already there.
os.makedirs('data', exist_ok=True)


In [None]:
from google_play_scraper import reviews_all, Sort
from google_play_scraper import reviews as fetch_reviews

# reviews_all() pages through every available review and discards any
# `count` argument, so for an app of this size it can run for a very long
# time. reviews() honours `count` and returns (review_list, continuation_token).
reviews, _continuation_token = fetch_reviews(
    'com.openai.chatgpt',
    lang='en',
    country='us',
    sort=Sort.NEWEST,
    count=1000  # Number of reviews
)

# Keep only text, star score and timestamp. Passing columns= to the
# constructor still yields the three columns when the scrape returns nothing,
# where indexing an empty frame would raise KeyError.
df = pd.DataFrame(reviews, columns=['content', 'score', 'at'])
df.columns = ['review', 'rating', 'date']

In [None]:
# Preview the first rows of df
df.head()

In [None]:
# Check basic stats
# info() prints its own report and returns None, so it is not wrapped in
# print(), which would add a stray "None" line.
df.info()
print(df['rating'].value_counts())

In [None]:
# Write df to CSV; index=False leaves the row index out of the file
df.to_csv('chatgpt_play_store_reviews.csv', index=False)

In [None]:
# Handle missing values: drop rows that have no review text.
# .copy() makes scraped_data an independent frame, so adding columns to it
# later does not trigger SettingWithCopyWarning or touch df.
scraped_data = df.dropna(subset=['review']).copy()

In [None]:
import re

def clean_text(text):
    """Lowercase a review, strip punctuation and collapse whitespace.

    NOTE: this rebinds the name ``clean_text`` that an earlier cell defines
    with stopword removal and lemmatization; cells run after this one get
    this lighter version.

    Parameters
    ----------
    text : str or missing value
        Raw review text. Non-string input (None, float NaN, numbers) is
        treated as an empty review instead of raising AttributeError.

    Returns
    -------
    str
        Lowercased text containing only word characters (letters, digits,
        underscore) separated by single spaces.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Store the normalized text in a new column, then preview the result
scraped_data['cleaned_review'] = scraped_data['review'].apply(clean_text)
scraped_data.head()

In [None]:
# Reload both datasets from disk: the original reviews file and the
# Play Store reviews CSV written by the earlier to_csv cell.
# NOTE(review): the cleaned_review column added to scraped_data is not in
# that CSV, so it is not part of scraped_df — confirm this is intended.
existing_df = pd.read_csv("chatgpt_style_reviews.csv") 
scraped_df = pd.read_csv("chatgpt_play_store_reviews.csv")

In [None]:
# Align the scraped frame's column names with the existing dataset.
# NOTE(review): the CSV was saved after the columns were already renamed to
# review/rating/date, so this rename is presumably a no-op — confirm.
scraped_df = scraped_df.rename(columns={'content': 'review'})
scraped_df['platform'] = 'Mobile'  # Add missing column

In [None]:
# Merge vertically: stack the two frames row-wise and renumber the index.
# Columns present in only one frame are filled with NaN for the other's rows.
combined_df = pd.concat([existing_df, scraped_df], ignore_index=True)
combined_df.head()


In [None]:
# info() prints its own report and returns None; calling it directly avoids
# the stray "None" that print() would add.
combined_df.info()

In [None]:
import re

def is_english_simple(text, threshold=0.7):
    """Heuristically decide whether a review looks like English text.

    The share of characters that are ASCII letters (a-z after lowercasing)
    or whitespace must exceed ``threshold``.

    Parameters
    ----------
    text : object
        Review text. None and float NaN are missing values and give False;
        any other value is converted with str().
    threshold : float, default 0.7
        Minimum fraction (exclusive) of letter/whitespace characters.

    Returns
    -------
    bool
        True when the text passes the check; False for missing, empty or
        whitespace-only input.
    """
    # NaN is the only float not equal to itself. Without this guard str()
    # turns it into the word "nan", which passes as English.
    if text is None or (isinstance(text, float) and text != text):
        return False
    text = str(text).strip().lower()
    if not text:
        return False
    # `\s` is the whitespace class. The previous pattern `\\s` matched a
    # literal backslash and the letter "s", so spaces counted against the text.
    english_chars = len(re.findall(r'[a-z\s]', text))
    return english_chars / len(text) > threshold

# Keep only rows whose review passes the English heuristic.
# .copy() detaches the result from the unfiltered frame.
english_mask = combined_df['review'].apply(is_english_simple)
combined_df = combined_df[english_mask].copy()

In [None]:
# Inspect the last rows of the filtered frame
combined_df.tail()

In [None]:
# Per-column count of null values in the combined frame
combined_df.isnull().sum()

In [None]:
# Columns the modelling stage needs; everything else is dropped
columns_to_keep = ['review', 'rating', 'date', 'platform']  

# Work out which columns are not on the keep-list, then remove them in place
extra_columns = combined_df.columns.difference(columns_to_keep)
combined_df.drop(columns=extra_columns, inplace=True)
combined_df.head()

In [None]:
# Save the filtered, column-trimmed dataset; index=False omits the row index
combined_df.to_csv('final_data.csv', index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load preprocessed English data.
# read_csv parses empty cells and tokens such as "NA" or "null" as NaN, so a
# short review can come back missing after the CSV round trip. A row without
# review text makes TfidfVectorizer raise, and a row without a rating cannot
# be labelled, so both are dropped here.
df = (
    pd.read_csv('final_data.csv')
    .dropna(subset=['review', 'rating'])
    .reset_index(drop=True)
)

# Create sentiment labels: >= 4 positive, <= 2 negative, otherwise neutral
df['sentiment'] = df['rating'].apply(
    lambda x: 'positive' if x >= 4 else 'negative' if x <= 2 else 'neutral'
)

# Verify class distribution
print(df['sentiment'].value_counts())

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer

# Option A: TF-IDF (Best for traditional ML)
# The built-in 'english' stop list contains negations such as "not" and "no".
# Stop words are removed before n-grams are built, so with that list a bigram
# like "not good" could never become a feature. Negations are taken out of
# the stop list so they survive into the bigrams.
NEGATION_WORDS = {'no', 'nor', 'not', 'never', 'cannot'}
sentiment_stop_words = sorted(ENGLISH_STOP_WORDS - NEGATION_WORDS)

# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so IDF statistics include the test rows.
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),  # Capture phrases like "not good"
    stop_words=sentiment_stop_words
)
X = tfidf.fit_transform(df['review'])

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    df['sentiment'],
    test_size=0.2,
    stratify=df['sentiment'],  # Preserve class balance
    random_state=43
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Candidate classifiers compared on the held-out test set
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Seeded so the reported accuracy is the same on every run
    "Random Forest": RandomForestClassifier(random_state=43),
    "Naive Bayes": MultinomialNB()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    # score() returns mean accuracy on the given test data
    print(f"{name} Accuracy: {model.score(X_test, y_test):.2f}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# For best traditional model
# NOTE(review): Logistic Regression is hard-coded here rather than picked
# from the accuracies printed by the comparison loop — confirm it really is
# the best performer.
best_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))

# Confusion Matrix
# labels= pins the row/column order to the same classes_ used for the tick
# labels, so the two cannot drift apart if a class is missing from the test
# predictions.
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred, labels=best_model.classes_),
            annot=True, fmt='d',
            xticklabels=best_model.classes_,
            yticklabels=best_model.classes_)
plt.title("Confusion Matrix (Logistic Regression)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
import pickle
# Save both TF-IDF and model
# They are stored together in one dict so the fitted vectorizer and the
# classifier trained on its features are always loaded as a pair.
with open('sentiment_pipeline.pkl', 'wb') as f:
    pickle.dump({
        'tfidf': tfidf,
        'model': best_model
    }, f)

In [None]:
import pickle
from textblob import TextBlob

def load_model(path='sentiment_pipeline.pkl'):
    """Load the sentiment analysis pipeline.

    Parameters
    ----------
    path : str, default 'sentiment_pipeline.pkl'
        Pickle file holding a dict with the keys 'model' and 'tfidf'.

    Returns
    -------
    tuple
        ``(model, vectorizer)`` on success. ``(None, None)`` when the file is
        missing, unreadable as a pickle, or lacks the expected keys; callers
        then fall back to TextBlob.
    """
    try:
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load pickles produced by this notebook.
        with open(path, 'rb') as f:
            pipeline = pickle.load(f)
        return pipeline['model'], pipeline['tfidf']
    except FileNotFoundError:
        print("Model file not found. Using TextBlob as fallback.")
        return None, None
    except (pickle.UnpicklingError, EOFError, KeyError, TypeError) as exc:
        # Truncated/corrupt file, or a pickle that is not the expected dict
        print(f"Model file could not be loaded ({exc!r}). Using TextBlob as fallback.")
        return None, None

def predict_sentiment(text, model=None, vectorizer=None):
    """Predict sentiment with confidence score.

    Parameters
    ----------
    text : str
        Review text to classify.
    model : object, optional
        Fitted classifier providing ``predict`` and ``predict_proba``.
    vectorizer : object, optional
        Fitted vectorizer providing ``transform``.

    Returns
    -------
    tuple
        ``(sentiment, confidence)``. With a model and vectorizer, sentiment
        is the model's predicted label and confidence is the highest class
        probability in percent, rounded to one decimal. Otherwise TextBlob
        polarity decides: > 0 positive, < 0 negative, exactly 0 neutral,
        with confidence ``abs(polarity) * 100``.
    """
    # Compare against None explicitly: some estimators define __len__, so a
    # plain truthiness test can reject a perfectly usable object.
    if model is not None and vectorizer is not None:
        vec = vectorizer.transform([text])
        proba = model.predict_proba(vec)[0]
        sentiment = model.predict(vec)[0]
        confidence = round(max(proba) * 100, 1)
    else:
        polarity = TextBlob(text).sentiment.polarity
        if polarity > 0:
            sentiment = 'positive'
        elif polarity < 0:
            sentiment = 'negative'
        else:
            # Zero polarity carries no signal in either direction
            sentiment = 'neutral'
        confidence = round(abs(polarity) * 100, 1)
    return sentiment, confidence

def main():
    """Run the interactive review-sentiment prompt.

    Loads the saved pipeline via load_model(), then repeatedly reads a
    review from stdin, classifies it with predict_sentiment() and prints the
    label and confidence. The loop ends when the user types 'quit', or when
    input is closed or interrupted.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("💬 CHATGPT REVIEW SENTIMENT ANALYZER")
    print(banner)
    print("Type a ChatGPT review and press Enter to analyze")
    print("Type 'quit' to exit\n")

    model, vectorizer = load_model()

    # Any label other than positive/negative is shown as neutral
    sentiment_display = {
        'positive': '👍 POSITIVE',
        'negative': '👎 NEGATIVE',
    }

    while True:
        try:
            review = input("\nEnter a ChatGPT review (or 'quit' to exit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel: leave cleanly, as if the
            # user had typed 'quit', instead of surfacing a traceback.
            review = 'quit'

        # Exit condition
        if review.lower() == 'quit':
            print("\nThank you for using the analyzer! Goodbye! 👋")
            break  # This exits the loop

        if not review:
            print("⚠️ Please enter a valid review")
            continue

        sentiment, confidence = predict_sentiment(review, model, vectorizer)

        print("\n" + banner)
        print(f"📝 REVIEW: {review}")
        print("-" * 50)
        print(f"🧠 SENTIMENT: {sentiment_display.get(sentiment, '😐 NEUTRAL')}")
        print(f"🎯 CONFIDENCE: {confidence}%")
        print(banner)

if __name__ == "__main__":
    main()  # Program will exit completely after loop breaks