In [3]:
# ===============================
# IMPORT LIBRARIES
# ===============================
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import webbrowser
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')

# ===============================
# HTML OUTPUT SETUP
# ===============================
html_files_path = "./html_outputs"
os.makedirs(html_files_path, exist_ok=True)
plot_containers = ""
plot_width = 400
plot_height = 300

def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    plot_containers += f'''
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    '''
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')

# ===============================
# LOAD DATA
# ===============================
apps_df = pd.read_csv('googleplaystore.csv')
reviews_df = pd.read_csv('googleplaystore_user_reviews.csv')

# ===============================
# CLEAN DATA
# ===============================
apps_df['Rating'] = apps_df['Rating'].fillna(apps_df['Rating'].mean())
# Remove commas and plus signs
apps_df['Installs'] = apps_df['Installs'].str.replace('[+,]', '', regex=True)

# Convert to numeric, invalid parsing will become NaN
apps_df['Installs'] = pd.to_numeric(apps_df['Installs'], errors='coerce')

# Fill NaN with 0
apps_df['Installs'] = apps_df['Installs'].fillna(0).astype(int)
apps_df['Price'] = apps_df['Price'].str.replace('[$,]', '', regex=True)
apps_df['Price'] = pd.to_numeric(apps_df['Price'], errors='coerce').fillna(0)
apps_df['Size'] = apps_df['Size'].apply(lambda x: float(str(x).replace('M','')) if 'M' in str(x) else (float(str(x).replace('k',''))/1024 if 'k' in str(x) else np.nan))
apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']
apps_df['Log_Installs'] = np.log1p(apps_df['Installs'])
apps_df['Reviews'] = pd.to_numeric(apps_df['Reviews'], errors='coerce').fillna(0).astype(int)
apps_df['Log_Reviews'] = np.log1p(apps_df['Reviews'])
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')
apps_df['Year'] = apps_df['Last Updated'].dt.year

def rating_group(rating):
    if rating >= 4: return 'Top rated app'
    elif rating >= 3: return 'Above average'
    elif rating >= 2: return 'Average'
    else: return 'Below Average'
apps_df['Rating_Group'] = apps_df['Rating'].apply(rating_group)

# ===============================
# SENTIMENT ANALYSIS
# ===============================
sia = SentimentIntensityAnalyzer()
reviews_df['Sentiment_Score'] = reviews_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

# Map compound score to simple sentiment
reviews_df['Sentiment_Label'] = reviews_df['Sentiment_Score'].apply(lambda x: 'Positive' if x>0 else ('Negative' if x<0 else 'Neutral'))

# ===============================
# FIGURE 1: Top Categories
# ===============================
category_counts = apps_df['Category'].value_counts().nlargest(10)
fig1 = px.bar(x=category_counts.index, y=category_counts.values,
              title='Top Categories on Play Store',
              labels={'x':'Category','y':'Count'},
              color=category_counts.index, color_discrete_sequence=px.colors.sequential.Plasma,
              width=plot_width, height=plot_height)
fig1.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='white')
save_plot_as_html(fig1, "Category_Graph_1.html", "Top categories: tools, entertainment, productivity.")

# ===============================
# FIGURE 2: App Type Distribution
# ===============================
type_counts = apps_df['Type'].value_counts()
fig2 = px.pie(values=type_counts.values, names=type_counts.index,
              title='App Type Distribution', color_discrete_sequence=px.colors.sequential.RdBu,
              width=plot_width, height=plot_height)
fig2.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='white')
save_plot_as_html(fig2, "Type_Graph_2.html", "Most apps are free.")

# ===============================
# FIGURE 3: Rating Distribution
# ===============================
fig3 = px.histogram(apps_df, x='Rating', nbins=20,
                    title='Rating Distribution', color_discrete_sequence=['#636EFA'],
                    width=plot_width, height=plot_height)
fig3.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='white')
save_plot_as_html(fig3, "Rating_Graph_3.html", "Ratings skewed towards higher values.")

# ===============================
# FIGURE 4: Sentiment Distribution
# ===============================
sentiment_counts = reviews_df['Sentiment_Label'].value_counts()
fig4 = px.bar(x=sentiment_counts.index, y=sentiment_counts.values,
              title='Sentiment Distribution', labels={'x':'Sentiment','y':'Count'},
              color=sentiment_counts.index, color_discrete_sequence=px.colors.sequential.RdBu,
              width=plot_width, height=plot_height)
fig4.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='white')
save_plot_as_html(fig4, "Sentiment_Graph_4.html", "Mostly positive reviews with some neutral/negative.")

# ===============================
# FIGURE 5: Installs by Category
# ===============================
installs_by_category = apps_df.groupby('Category')['Installs'].sum().nlargest(10)
fig5 = px.bar(x=installs_by_category.index, y=installs_by_category.values,
              title='Installs by Category', labels={'x':'Category','y':'Installs'},
              color=installs_by_category.index, color_discrete_sequence=px.colors.sequential.Blues,
              width=plot_width, height=plot_height)
fig5.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='white')
save_plot_as_html(fig5, "Installs_Graph_5.html", "Social and communication apps lead in installs.")

# ===============================
# FIGURE 6: Updates per Year
# ===============================
updates_per_year = apps_df['Year'].value_counts().sort_index()
fig6 = px.line(x=updates_per_year.index, y=updates_per_year.values,
               title='Number of Updates per Year', labels={'x':'Year','y':'Number of Updates'},
               color_discrete_sequence=['#d8b3fa'], width=plot_width, height=plot_height)
fig6.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='white')
save_plot_as_html(fig6, "Updates_Graph_6.html", "Developers actively maintain apps over time.")

# ===============================
# FIGURE 7: Revenue by Category
# ===============================
revenue_by_category = apps_df.groupby('Category')['Revenue'].sum().nlargest(10)
fig7 = px.bar(x=revenue_by_category.index, y=revenue_by_category.values,
              title='Revenue by Category', labels={'x':'Category','y':'Revenue'},
              color=revenue_by_category.index, color_discrete_sequence=px.colors.sequential.Greens,
              width=plot_width, height=plot_height)
fig7.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='white')
save_plot_as_html(fig7, "Revenue_Graph_7.html", "Business & Productivity apps lead in revenue.")

# ===============================
# FIGURE 8: Top Genres
# ===============================
genre_counts = apps_df['Genres'].str.split(';', expand=True).stack().value_counts().nlargest(10)
fig8 = px.bar(x=genre_counts.index, y=genre_counts.values,
              title='Top Genres', labels={'x':'Genre','y':'Count'},
              color=genre_counts.index, color_discrete_sequence=px.colors.sequential.OrRd,
              width=plot_width, height=plot_height)
fig8.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='white')
save_plot_as_html(fig8, "Genre_Graph_8.html", "Action and Casual are most common genres.")

# ===============================
# FIGURE 9: Rating vs Last Update
# ===============================
fig9 = px.scatter(apps_df, x='Last Updated', y='Rating', color='Type',
                  title='Rating vs Last Update', color_discrete_sequence=px.colors.qualitative.Vivid,
                  width=plot_width, height=plot_height)
fig9.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='white')
save_plot_as_html(fig9, "Update_Graph_9.html", "No strong correlation between last update and rating.")

# ===============================
# FIGURE 10: Rating Paid vs Free
# ===============================
fig10 = px.box(apps_df, x='Type', y='Rating', color='Type',
               title='Rating for Paid vs Free Apps', color_discrete_sequence=px.colors.qualitative.Pastel,
               width=plot_width, height=plot_height)
fig10.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='white')
save_plot_as_html(fig10, "Paid_Free_Graph_10.html", "Paid apps generally have higher ratings.")

# ===============================
# CREATE DASHBOARD
# ===============================
dashboard_html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Google Play Store Analytics</title>
<style>
body {{background-color:#333; color:#fff; font-family: Arial; margin:0; padding:0;}}
.container {{display:flex; flex-wrap:wrap; justify-content:center; padding:20px;}}
.plot-container {{border:2px solid #555; margin:10px; padding:10px; width:{plot_width}px; height:{plot_height}px; overflow:hidden; position:relative; cursor:pointer;}}
.insights {{display:none; position:absolute; right:10px; top:10px; background-color:rgba(0,0,0,0.7); padding:5px; border-radius:5px; color:#fff;}}
.plot-container:hover .insights {{display:block;}}
</style>
<script>
function openPlot(filename) {{window.open(filename, '_blank');}}
</script>
</head>
<body>
<div class="container">
{plot_containers}
</div>
</body>
</html>
"""

dashboard_path = os.path.join(html_files_path, "dashboard.html")
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(dashboard_html)

webbrowser.open('file://' + os.path.realpath(dashboard_path))


[nltk_data] Error loading vader_lexicon: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


True