<a href="https://colab.research.google.com/github/laxmivaishnavee22/OasisInfobyte/blob/main/Android_App_Market.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
uploaded = files.upload()

Saving apps.csv to apps.csv
Saving user_reviews.csv to user_reviews.csv


In [None]:
!pip install -q vaderSentiment wordcloud openpyxl

import os
import re
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS

OUTDIR = 'outputs'
os.makedirs(OUTDIR, exist_ok=True)

apps_fname = 'apps.csv'
reviews_fname = 'user_reviews.csv'

def read_csv_or_excel(fname):
    if not os.path.exists(fname):
        raise FileNotFoundError(f"{fname} not found in workspace. Upload it or change the filename.")
    if fname.lower().endswith(('.xls', '.xlsx')):
        return pd.read_excel(fname)
    else:
        return pd.read_csv(fname)

print("Loading datasets...")
apps = read_csv_or_excel(apps_fname)
reviews = read_csv_or_excel(reviews_fname)
print("Apps shape:", apps.shape)
print("Reviews shape:", reviews.shape)

apps.columns = [c.strip() for c in apps.columns]
reviews.columns = [c.strip() for c in reviews.columns]

def find_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c

    cols_lower = {col.lower(): col for col in df.columns}
    for c in candidates:
        if c.lower() in cols_lower:
            return cols_lower[c.lower()]
    return None

print("\napps columns:", apps.columns.tolist())
print("reviews columns:", reviews.columns.tolist())

app_name_col = find_col(apps, ['App', 'app', 'app_name', 'name'])
apps_category_col = find_col(apps, ['Category', 'category'])
apps_rating_col = find_col(apps, ['Rating', 'rating', 'average_rating'])
apps_installs_col = find_col(apps, ['Installs', 'installs', 'Number of installs', 'Installs'])
apps_size_col = find_col(apps, ['Size', 'size'])
apps_price_col = find_col(apps, ['Price', 'price'])
apps_reviews_count_col = find_col(apps, ['Reviews','Number of Reviews','Reviews'])
apps_updated_col = find_col(apps, ['Last Updated', 'Updated', 'last updated'])

reviews_app_col = find_col(reviews, ['App', 'app', 'app_name', 'App Name'])
review_text_col = find_col(reviews, ['Translated_Review', 'Content', 'Review', 'review', 'translated_review', 'review_text'])
review_sentiment_col = find_col(reviews, ['Sentiment', 'sentiment'])  # if labelled already

print("\nDetected columns mapping:")
print("app_name_col:", app_name_col)
print("apps_category_col:", apps_category_col)
print("apps_rating_col:", apps_rating_col)
print("apps_installs_col:", apps_installs_col)
print("apps_size_col:", apps_size_col)
print("reviews_app_col:", reviews_app_col)
print("review_text_col:", review_text_col)
print("review_sentiment_col (if any):", review_sentiment_col)

apps_clean = apps.copy()

if apps_installs_col:
    def parse_installs(x):
        if pd.isna(x):
            return np.nan
        s = str(x)
        s = s.replace(',', '').replace('+','').strip()
        try:
            return int(s)
        except:

            return np.nan
    apps_clean['installs_num'] = apps_clean[apps_installs_col].map(parse_installs)
else:
    apps_clean['installs_num'] = np.nan

def parse_size(s):
    if pd.isna(s):
        return np.nan
    s = str(s).strip()
    if s.lower() in ('varies with device','not available', 'nan', ''):
        return np.nan
    try:
        if s.endswith('M'):
            return float(s[:-1]) * 1e6
        if s.endswith('k') or s.endswith('K'):
            return float(s[:-1]) * 1e3

        return float(s)
    except:
        return np.nan

if apps_size_col:
    apps_clean['size_bytes'] = apps_clean[apps_size_col].map(parse_size)
else:
    apps_clean['size_bytes'] = np.nan


if apps_rating_col:
    apps_clean['rating'] = pd.to_numeric(apps_clean[apps_rating_col], errors='coerce')
else:
    apps_clean['rating'] = np.nan


if apps_price_col:
    def parse_price(x):
        if pd.isna(x): return 0.0
        s = str(x).strip().replace('$','')
        try:
            return float(s)
        except:
            return 0.0
    apps_clean['price'] = apps_clean[apps_price_col].map(parse_price)
else:
    apps_clean['price'] = 0.0


if apps_reviews_count_col:
    apps_clean['reviews_count'] = pd.to_numeric(apps_clean[apps_reviews_count_col], errors='coerce')
else:
    apps_clean['reviews_count'] = np.nan


if apps_category_col:
    apps_clean['category'] = apps_clean[apps_category_col].astype(str)
else:
    apps_clean['category'] = 'Unknown'


if app_name_col:
    apps_clean['app_name'] = apps_clean[app_name_col].astype(str)
else:
    raise KeyError("App name column not detected in apps.csv. Expected 'App' or similar.")


apps_clean = apps_clean.drop_duplicates(subset=['app_name']).reset_index(drop=True)
print("\nCleaned apps sample:")
display(apps_clean[['app_name','category','rating','installs_num','size_bytes','price','reviews_count']].head())


reviews_clean = reviews.copy()

if review_text_col:
    reviews_clean['review_text'] = reviews_clean[review_text_col].astype(str)
else:
    raise KeyError("Review text column not detected in user_reviews.csv. Expected 'Translated_Review' or 'Review'.")


if reviews_app_col:
    reviews_clean['app_name'] = reviews_clean[reviews_app_col].astype(str)
else:

    raise KeyError("App column not detected in user_reviews.csv. Expected 'App' or similar.")


reviews_clean = reviews_clean[~reviews_clean['review_text'].isna()].reset_index(drop=True)
print("\nSample reviews:")
display(reviews_clean[['app_name','review_text']].head())


analyzer = SentimentIntensityAnalyzer()

def vader_sentiment_label(text):
    s = analyzer.polarity_scores(str(text))
    comp = s['compound']
    if comp >= 0.05:
        return 'positive', comp
    elif comp <= -0.05:
        return 'negative', comp
    else:
        return 'neutral', comp


if review_sentiment_col:
    reviews_clean['orig_sentiment'] = reviews_clean[review_sentiment_col].astype(str)
reviews_clean[['sentiment_label','sentiment_score']] = reviews_clean['review_text'].apply(lambda t: pd.Series(vader_sentiment_label(t)))
print("\nSentiment distribution (computed):")
print(reviews_clean['sentiment_label'].value_counts(normalize=True).mul(100).round(2))


apps_clean['app_name_lc'] = apps_clean['app_name'].str.lower().str.strip()
reviews_clean['app_name_lc'] = reviews_clean['app_name'].str.lower().str.strip()


agg = reviews_clean.groupby('app_name_lc').agg(
    n_reviews=('review_text','count'),
    pct_positive=('sentiment_label', lambda s: (s=='positive').sum()/len(s)),
    pct_neutral=('sentiment_label', lambda s: (s=='neutral').sum()/len(s)),
    pct_negative=('sentiment_label', lambda s: (s=='negative').sum()/len(s)),
    mean_sentiment_score=('sentiment_score','mean'),
).reset_index()


apps_summary = apps_clean.merge(agg, left_on='app_name_lc', right_on='app_name_lc', how='left')


apps_summary[['n_reviews','pct_positive','pct_neutral','pct_negative','mean_sentiment_score']] = \
    apps_summary[['n_reviews','pct_positive','pct_neutral','pct_negative','mean_sentiment_score']].fillna(0)

print("\nMerged app summary sample:")
display(apps_summary[['app_name','category','rating','installs_num','n_reviews','pct_positive','mean_sentiment_score']].head())

cat_counts = apps_summary['category'].value_counts().head(12)
plt.figure(figsize=(8,4))
cat_counts.plot(kind='bar')
plt.title('Top categories by number of apps')
plt.xlabel('Category')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig(os.path.join(OUTDIR, 'top_categories.png'))
plt.close()


plt.figure(figsize=(6,4))
apps_summary['rating'].dropna().hist(bins=20)
plt.title('App rating distribution')
plt.xlabel('Rating')
plt.ylabel('Number of apps')
plt.tight_layout()
plt.savefig(os.path.join(OUTDIR, 'rating_distribution.png'))
plt.close()


plt.figure(figsize=(6,5))
x = apps_summary['installs_num'].replace(0, np.nan)
y = apps_summary['rating']
mask = ~x.isna() & ~y.isna()
plt.scatter(x[mask].apply(lambda v: np.log1p(v)), y[mask], alpha=0.5)
plt.xlabel('log(1 + installs)')
plt.ylabel('rating')
plt.title('Rating vs installs (log-scale)')
plt.tight_layout()
plt.savefig(os.path.join(OUTDIR, 'rating_vs_installs.png'))
plt.close()


top_installed = apps_summary.sort_values('installs_num', ascending=False).head(15)[['app_name','installs_num','rating']]
top_installed.to_csv(os.path.join(OUTDIR, 'top_installed_apps.csv'), index=False)
print("\nTop installed apps saved to outputs/top_installed_apps.csv")


all_positive_text = " ".join(reviews_clean.loc[reviews_clean['sentiment_label']=='positive', 'review_text'].astype(str).tolist())[:1000000]
if len(all_positive_text) > 100:
    wc = WordCloud(width=800, height=400, stopwords=STOPWORDS, collocations=False).generate(all_positive_text)
    plt.figure(figsize=(10,5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title('WordCloud - Positive Reviews (global)')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTDIR, 'wordcloud_positive_reviews.png'))
    plt.close()
    print("Saved positive reviews wordcloud.")
else:
    print("Not enough positive review text for a global wordcloud.")


top_cats = apps_summary.groupby('category').agg(
    n_apps=('app_name','count'),
    avg_rating=('rating','mean'),
    avg_sentiment=('mean_sentiment_score','mean'),
    avg_installs=('installs_num','median'),
).sort_values('n_apps', ascending=False).head(12)
top_cats.to_csv(os.path.join(OUTDIR,'top_categories_summary.csv'))
print("Saved category summary to outputs/top_categories_summary.csv")

plt.figure(figsize=(8,4))
top_cats['avg_rating'].plot(kind='bar')
plt.title('Average rating for top categories')
plt.ylabel('avg rating')
plt.tight_layout()
plt.savefig(os.path.join(OUTDIR,'top_categories_avg_rating.png'))
plt.close()


apps_summary_out = apps_summary[['app_name','category','rating','installs_num','price','reviews_count','n_reviews','pct_positive','pct_neutral','pct_negative','mean_sentiment_score','size_bytes']]
apps_summary_out.to_csv(os.path.join(OUTDIR,'app_review_summary.csv'), index=False)
print("Saved app-level review summary to outputs/app_review_summary.csv")


print("\n--- Quick insights ---")

frac_with_reviews = (apps_summary['n_reviews']>0).mean()
print(f"Fraction of apps with at least one review in reviews.csv: {frac_with_reviews:.2%}")


min_reviews_filter = 30
candidates = apps_summary.loc[apps_summary['n_reviews']>=min_reviews_filter].sort_values('pct_positive', ascending=False)
print("\nTop apps by positive review percentage (min reviews={}):".format(min_reviews_filter))
display(candidates[['app_name','category','n_reviews','pct_positive','mean_sentiment_score']].head(10))


corr_cols = ['rating','installs_num','n_reviews','pct_positive','mean_sentiment_score','size_bytes','price']
corr_df = apps_summary_out[corr_cols].copy()
corr = corr_df.corr()
plt.figure(figsize=(7,6))
plt.imshow(corr, interpolation='nearest')
plt.colorbar()
plt.xticks(range(len(corr_cols)), corr_cols, rotation=90)
plt.yticks(range(len(corr_cols)), corr_cols)
plt.title('Correlation matrix (app-level)')
plt.tight_layout()
plt.savefig(os.path.join(OUTDIR,'app_correlation_matrix.png'))
plt.close()
print("Saved correlation matrix to outputs/app_correlation_matrix.png")

print("\nAll done. Check the 'outputs/' folder for CSVs and images.")
print("If you want, I can now: (a) split this into a Colab notebook with markdown cells, (b) add deeper NLP / topic modeling, (c) add time-based review trends.")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hLoading datasets...
Apps shape: (9659, 14)
Reviews shape: (64295, 5)

apps columns: ['Unnamed: 0', 'App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
reviews columns: ['App', 'Translated_Review', 'Sentiment', 'Sentiment_Polarity', 'Sentiment_Subjectivity']

Detected columns mapping:
app_name_col: App
apps_category_col: Category
apps_rating_col: Rating
apps_installs_col: Installs
apps_size_col: Size
reviews_app_col: App
review_text_col: Translated_Review
review_sentiment_col (if any): Sentiment

Cleaned apps sample:


Unnamed: 0,app_name,category,rating,installs_num,size_bytes,price,reviews_count
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,10000,19.0,0.0,159
1,Coloring book moana,ART_AND_DESIGN,3.9,500000,14.0,0.0,967
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,5000000,8.7,0.0,87510
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,50000000,25.0,0.0,215644
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,100000,2.8,0.0,967



Sample reviews:


Unnamed: 0,app_name,review_text
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...
1,10 Best Foods for You,This help eating healthy exercise regular basis
2,10 Best Foods for You,
3,10 Best Foods for You,Works great especially going grocery store
4,10 Best Foods for You,Best idea us



Sentiment distribution (computed):
sentiment_label
neutral     48.48
positive    39.82
negative    11.70
Name: proportion, dtype: float64

Merged app summary sample:


Unnamed: 0,app_name,category,rating,installs_num,n_reviews,pct_positive,mean_sentiment_score
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,10000,0.0,0.0,0.0
1,Coloring book moana,ART_AND_DESIGN,3.9,500000,58.0,0.448276,0.152845
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,5000000,0.0,0.0,0.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,50000000,0.0,0.0,0.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,100000,0.0,0.0,0.0



Top installed apps saved to outputs/top_installed_apps.csv
Saved positive reviews wordcloud.
Saved category summary to outputs/top_categories_summary.csv
Saved app-level review summary to outputs/app_review_summary.csv

--- Quick insights ---
Fraction of apps with at least one review in reviews.csv: 10.59%

Top apps by positive review percentage (min reviews=30):


Unnamed: 0,app_name,category,n_reviews,pct_positive,mean_sentiment_score
1102,Down Dog: Great Yoga Anywhere,HEALTH_AND_FITNESS,40.0,0.975,0.82651
244,Crew - Free Messaging and Scheduling,BUSINESS,80.0,0.95,0.617143
930,Current debit card and app made for teens,FINANCE,40.0,0.95,0.625955
1851,Baritastic - Bariatric Tracker,MEDICAL,40.0,0.95,0.642732
572,Duolingo: Learn Languages Free,EDUCATION,240.0,0.929167,0.689193
1071,Couch to 10K Running Trainer,HEALTH_AND_FITNESS,40.0,0.925,0.636123
2396,850 Sports News Digest,SPORTS,40.0,0.925,0.652375
1486,Honkai Impact 3rd,GAME,80.0,0.9,0.73113
824,Goldstar: Live Event Tickets,EVENTS,40.0,0.9,0.578767
2775,Easy Voice Recorder,PRODUCTIVITY,40.0,0.9,0.614648


Saved correlation matrix to outputs/app_correlation_matrix.png

All done. Check the 'outputs/' folder for CSVs and images.
If you want, I can now: (a) split this into a Colab notebook with markdown cells, (b) add deeper NLP / topic modeling, (c) add time-based review trends.
