In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tkinter as tk
from tkinter import messagebox
import webbrowser
import os
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

# Keyword lists consulted by get_sentiment (simplified for demonstration)
positive_words = ['good', 'great', 'excellent', 'love', 'happy', 'best', 'awesome', 'amazing', 'positive', 'fantastic']
negative_words = ['bad', 'terrible', 'hate', 'worst', 'awful', 'horrible', 'negative', 'fail', 'poor', 'disappointed']


def get_sentiment(text):
    """Classify a review as 'positive', 'negative' or 'neutral' by keyword matching.

    The text is lower-cased and each keyword from ``positive_words`` and
    ``negative_words`` scores one point if it occurs anywhere in the text.
    Matching is by substring, so "love" also matches "loved" (and "happy"
    also matches "unhappy"); a keyword counts once however often it appears.

    Args:
        text: The review text. Any non-string value (for example ``None`` or
            the float NaN that pandas uses for an empty CSV cell) is treated
            as a review without keywords.

    Returns:
        'positive' if more positive than negative keywords were found,
        'negative' for the opposite, and 'neutral' on a tie (including the
        case where no keyword is found).
    """
    # Missing reviews arrive as NaN/None; calling .lower() on them would raise.
    if not isinstance(text, str):
        return 'neutral'

    text = text.lower()
    positive_score = sum(word in text for word in positive_words)
    negative_score = sum(word in text for word in negative_words)

    if positive_score > negative_score:
        return 'positive'
    if negative_score > positive_score:
        return 'negative'
    return 'neutral'

def _rating_group(rating):
    """Map a numeric star rating to the label of its display bucket."""
    if rating <= 2:
        return '1-2 stars'
    if rating <= 4:
        return '3-4 stars'
    # Rating 4 is already covered by '3-4 stars', so only ratings above 4
    # reach this point.
    return '5 stars'


def _sentiment_percentages(data, min_reviews=1000, top_n=5):
    """Compute the sentiment mix per rating group for the most-reviewed apps.

    Args:
        data: DataFrame with 'applicationName', 'review' and 'rating' columns.
        min_reviews: An application is kept only if it has strictly more
            than this many reviews.
        top_n: Number of most-reviewed applications to report.

    Returns:
        A ``(top_apps, table)`` tuple. ``top_apps`` is the index of selected
        application names, most reviewed first. ``table`` has one row per
        (applicationName, rating_group) with 'negative', 'neutral' and
        'positive' columns holding percentages that sum to 100 per row.

    Raises:
        ValueError: If no rated review of a sufficiently reviewed app remains.
    """
    sentiments = ['negative', 'neutral', 'positive']

    review_counts = data['applicationName'].value_counts()
    popular_apps = review_counts[review_counts > min_reviews].index
    # Rows without a rating cannot be bucketed: NaN fails every comparison in
    # _rating_group and would silently be counted in the top bucket.
    # The trailing .copy() makes the column assignments below write to an
    # independent frame instead of a slice of `data`.
    reviews = (
        data[data['applicationName'].isin(popular_apps)]
        .dropna(subset=['rating'])
        .copy()
    )
    if reviews.empty:
        raise ValueError(
            f"No rated reviews found for applications with more than {min_reviews} reviews."
        )

    # Empty review cells are read as NaN (a float); score them as empty text.
    review_text = reviews['review'].fillna('').astype(str)
    reviews['sentiment'] = review_text.apply(get_sentiment)
    reviews['rating_group'] = reviews['rating'].apply(_rating_group)

    top_apps = reviews['applicationName'].value_counts().head(top_n).index
    top_reviews = reviews[reviews['applicationName'].isin(top_apps)]

    counts = (
        top_reviews
        .groupby(['applicationName', 'rating_group', 'sentiment'])
        .size()
        .unstack('sentiment', fill_value=0)
        # Guarantee all three columns even if a sentiment never occurs.
        .reindex(columns=sentiments, fill_value=0)
    )

    # Normalise within each (app, rating group) row so the three stacked
    # segments of a bar add up to exactly 100 %. Every row stems from at
    # least one review, so the row total is never zero.
    row_totals = counts.sum(axis=1)
    percentages = counts.div(row_totals, axis=0) * 100
    return top_apps, percentages.reset_index()


def _plot_sentiment_grid(top_apps, percentages, output_path):
    """Draw one stacked-bar panel per application and save the figure.

    Args:
        top_apps: Application names to plot, one subplot each (at most six
            fit in the 2x3 grid).
        percentages: Table as returned by ``_sentiment_percentages``.
        output_path: File the figure is written to.
    """
    # (column, bar colour, legend label), listed bottom-to-top in the stack.
    layers = [
        ('negative', 'tomato', 'Negative'),
        ('neutral', 'goldenrod', 'Neutral'),
        ('positive', 'forestgreen', 'Positive'),
    ]

    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(24, 14))
    try:
        fig.suptitle('Sentiment Distribution by Rating Group and Application', fontsize=20, y=0.95)
        axes = axes.flatten()

        for ax, app in zip(axes, top_apps):
            app_data = percentages[percentages['applicationName'] == app]

            segments = []
            bottom = pd.Series(0.0, index=app_data.index)
            for column, color, label in layers:
                container = ax.bar(
                    app_data['rating_group'],
                    app_data[column],
                    bottom=bottom,
                    color=color,
                    edgecolor='white',
                    label=label
                )
                segments.extend(container)
                bottom = bottom + app_data[column]

            ax.set_title(f'Application: {app}', fontsize=14)
            ax.set_xlabel('Rating Group', fontsize=12)
            ax.set_ylabel('Percentage of Reviews (%)', fontsize=12)
            ax.set_ylim(0, 100)
            ax.legend(loc='upper left')
            ax.grid(True, axis='y', linestyle='--', alpha=0.7)

            # Label only segments tall enough to hold the text.
            for bar in segments:
                height = bar.get_height()
                if height > 5:
                    ax.annotate(f'{height:.1f}%',
                                (bar.get_x() + bar.get_width() / 2, bar.get_y() + height / 2),
                                ha='center', va='center', fontsize=9, color='white')

        # Remove the grid cells that did not receive an application.
        for unused_ax in axes[len(top_apps):]:
            fig.delaxes(unused_ax)

        fig.tight_layout(rect=[0, 0, 1, 0.95])
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)


def generate_visualization():
    """Build the sentiment chart from the review CSV and open the report page.

    Reads 'US_APPLE_APPS_REVIEWS.csv' from the working directory, keeps
    applications with more than 1,000 reviews, classifies each review with
    ``get_sentiment``, and plots the sentiment percentages per rating group
    for the five most-reviewed applications. The chart is saved as
    'sentiment_distribution.png'; afterwards a success dialog is shown and
    'index.html' is opened in the default browser.

    Any exception is reported to the user in an error dialog instead of
    propagating, because this function runs as a Tkinter button callback.
    """
    # Function-scope import: pathlib is not among the file-level imports.
    from pathlib import Path

    try:
        # Load your dataset
        data = pd.read_csv('US_APPLE_APPS_REVIEWS.csv')

        # Ensure the dataset has the required columns
        required_columns = ['applicationName', 'review', 'rating']
        if not all(col in data.columns for col in required_columns):
            raise ValueError(f"Dataset is missing required columns. Required columns: {required_columns}")

        top_apps, percentages = _sentiment_percentages(data)
        _plot_sentiment_grid(top_apps, percentages, 'sentiment_distribution.png')

        messagebox.showinfo("Success", "Visualization generated successfully!")
        # as_uri() yields a well-formed file:// URL on every platform
        # (drive letters, backslashes and special characters included).
        webbrowser.open(Path('index.html').resolve().as_uri())

    except Exception as e:
        messagebox.showerror("Error", str(e))

# Build the one-window Tkinter front end
root = tk.Tk()
root.title("Sentiment Analysis Visualization")

# Container that gives the button vertical breathing room
frame = tk.Frame(master=root)
frame.pack(pady=20)

# The button triggers the whole pipeline; its look is configured separately
# from its behaviour.
button = tk.Button(frame, text="Generate Visualization", command=generate_visualization)
button.configure(
    font=("Arial", 12),
    bg="#4CAF50",
    fg="white",
    padx=20,
    pady=10,
)
button.pack()

# Hand control to Tk; this call returns only once the window is closed.
root.mainloop()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ckish\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['sentiment'] = filtered_data['review'].apply(get_sentiment)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['rating_group'] = filtered_data['rating'].apply(get_rating_group)
