<a href="https://colab.research.google.com/github/kenchin138/DA-and-DS-Projects/blob/main/Google_Play_Store_Analysis/Google_Play_Store_App_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building Environment

In [2]:
import pandas as pd
import plotly.express as px
import os
import numpy as np
pd.options.mode.chained_assignment = None
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
pwd = os.getcwd()

# Loading Dataset

The dataset contains 10,841 entries and 13 features in total.

In [4]:
data = pd.read_csv(pwd + '/googleplaystore.csv', encoding='unicode_escape')

In [6]:
data.sample(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content_Rating,Genres,Last_Updated,Current_Ver,Android_Ver
2736,Candy Crush Saga,GAME,4.4,22428456,74.0,"500,000,000+",Free,0.0,Everyone,Casual,5-Jul-18,1.129.0.2,4.1 and up
2201,Shadow Fight 2,GAME,4.6,10981850,88.0,"100,000,000+",Free,0.0,Everyone 10+,Action,2-Jul-18,1.9.38,3.0 and up
8038,Blood Donor,MEDICAL,4.2,4476,20.0,"500,000+",Free,0.0,Everyone,Medical,31-Jul-18,1.6.2,4.1 and up
104,Who,COMMUNICATION,4.3,2451093,,"100,000,000+",Free,0.0,Teen,Communication,3-Aug-18,Varies with device,Varies with device
10269,MegaNDS (NDS Emulator),GAME,3.4,2218,11.0,"500,000+",Free,0.0,Everyone,Arcade,18-Oct-17,2,4.0 and up
8253,Stickman and Axe,GAME,4.4,69,2.8,"50,000+",Free,0.0,Teen,Action,25-Nov-17,1,4.0.3 and up
9311,EGW Writings 2,BOOKS_AND_REFERENCE,4.7,6547,16.0,"100,000+",Free,0.0,Everyone,Books & Reference,8-Dec-17,4.0.3,4.0.3 and up
6172,EP Gem Hunter,GAME,4.6,79,31.0,"1,000+",Free,0.0,Everyone 10+,Action,17-Jan-16,1.1.0,2.3 and up
126,True Contact - Real Caller ID,COMMUNICATION,4.1,32283,,"1,000,000+",Free,0.0,Everyone,Communication,26-Jul-18,Varies with device,Varies with device
3164,Where's My Water?,FAMILY,4.7,188740,69.0,"1,000,000+",Paid,1.99,Everyone,Puzzle;Brain Games,5-Jul-18,1.16.0,4.2 and up


In [5]:
data.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content_Rating     object
Genres             object
Last_Updated       object
Current_Ver        object
Android_Ver        object
dtype: object

In [72]:
data.shape

(10841, 13)

# Data Preprocessing

## Dropping Unused Columns
Since `Last_Updated`, `Current_Ver`, and `Android_Ver` are not relevant to the analysis, they will be dropped.

In [5]:
data_cleaned = data.copy()
data_cleaned.drop(['Last_Updated', 'Current_Ver', 'Android_Ver'], axis=1, inplace=True)

##Formatting Values

In [6]:
data_cleaned['Category'].replace({'_':' ', 'AND':'&'}, regex=True, inplace=True)
data_cleaned['Category'] = data_cleaned['Category'].str.title()
data_cleaned['Installs'].replace({'\+':'', ',':''}, regex=True, inplace=True)
data_cleaned.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content_Rating,Genres
0,Floor Plan Creator,Art & Design,4.1,36639,,5000000,Free,0,Everyone,Art & Design
1,Textgram - write on photos,Art & Design,4.4,295221,,10000000,Free,0,Everyone,Art & Design
2,Used Cars and Trucks for Sale,Auto & Vehicles,4.6,17057,,1000000,Free,0,Everyone,Auto & Vehicles
3,Ulysse Speedometer,Auto & Vehicles,4.3,40211,,5000000,Free,0,Everyone,Auto & Vehicles
4,REPUVE,Auto & Vehicles,3.9,356,,100000,Free,0,Everyone,Auto & Vehicles


## Checking Inconsistencies

`Rating` is the only feature that has a numerical data type while `Reviews`, `Size`, `Installs`, and `Price` should also be numerical. These features may contain strings that should be removed.

`Rating` also has an outlier of 19 when ratings should only go up to 5.

`Type` should have only two categories, `free` and `paid`. However, there are three categories.

Taking a closer look reveals that a single row with missing data in `Category` caused its values to shift to the left. Removing this row resolves the above errors.

The data also contains duplicates. The total count for `App` does not match the unique count. Duplicates with the highest reviews are considered to be the most up to date and will be kept.

In [7]:
data_cleaned.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.19
std,0.54
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


In [10]:
data_cleaned.describe(include='object')

Unnamed: 0,App,Category,Reviews,Size,Installs,Type,Price,Content_Rating,Genres
count,10841,10841,10841,9146,10841,10840,10841,10840,10841
unique,9660,34,6002,461,21,3,93,6,120
top,ROBLOX,Family,0,11,1000000,Free,0,Everyone,Tools
freq,9,1972,596,198,1579,10039,10040,8714,842


In [22]:
data_cleaned.loc[data_cleaned['Reviews'] == '3.0M']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content_Rating,Genres
10840,Life Made WI-Fi Touchscreen Photo Frame,,19.0,3.0M,1000,Free,0,Everyone,,11-Feb-18


In [8]:
data_cleaned.drop(data_cleaned.index[10840], inplace=True)

In [9]:
data_cleaned['Reviews'] = data_cleaned['Reviews'].astype(int)
data_cleaned['Price'] = data_cleaned['Price'].astype(float)
data_cleaned['Installs'] = data_cleaned['Installs'].astype(int)
data_cleaned['Size'] = data_cleaned['Size'].astype(float)
data_cleaned.describe()

Unnamed: 0,Rating,Reviews,Size,Installs,Price
count,9366.0,10840.0,9145.0,10840.0,10840.0
mean,4.19,444152.9,21.52,15464338.88,1.03
std,0.52,2927760.6,22.59,85029361.4,15.95
min,1.0,0.0,0.01,0.0,0.0
25%,4.0,38.0,4.9,1000.0,0.0
50%,4.3,2094.0,13.0,100000.0,0.0
75%,4.5,54775.5,30.0,5000000.0,0.0
max,5.0,78158306.0,100.0,1000000000.0,400.0


In [25]:
data_cleaned.describe(include='object')

Unnamed: 0,App,Category,Type,Content_Rating,Genres
count,10840,10840,10839,10840,10840
unique,9659,33,2,6,119
top,ROBLOX,Family,Free,Everyone,Tools
freq,9,1972,10039,8714,842


## Removing Duplicates


In [9]:
dupe_rows = data_cleaned.duplicated(subset=['App'], keep='first')
data_cleaned = data_cleaned[~dupe_rows]
data_cleaned.shape


(9659, 10)

In [10]:
data_cleaned = data_cleaned.sort_values(by='Reviews').drop_duplicates('App').sort_index()
data_cleaned.shape

(9659, 10)

## Removing NaNs

Null values are represented by `NaN` in the dataset.

There are over 1,400 missing data for `Rating` and over 1,600 missing for `Size`. It's likely that newer apps may not have a rating yet, resulting in NaNs, or users may choose not to include a rating with their review. Apps can also vary in size between devices; this inconsistency would lead to NaNs in the data.

In [14]:
data_cleaned.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size              1695
Installs             0
Type                 1
Price                0
Content_Rating       0
Genres               0
dtype: int64

In [11]:
data_cleaned = data_cleaned.dropna()
data_cleaned.shape

(7027, 10)

#Exploratory Analytics

## Which Apps Are Rated Highest?
Apps on the Play Store have a 4.16 rating on average, with a standard deviation of 0.56. Given that the median, 4.3, is close to the average, the data is not strongly skewed by outliers.

Ranking the top 10 highest rated apps with the most reviews shows that apps rated at 5 have at most 141 reviews. The average number of reviews for apps in this rating threshold is 8 compared to over 197,000 reviews for apps in the next highest threshold (between 4 and 5 stars).

The top 10 highest rated apps also tend to have between 1,000 to 5,000 installs with the highest number of installs being between 10,000 to 50,000. However, the median range of installs for 5 star reviewed apps is actually between 100 and 500.

The top 10 highest rated apps cost between \$1.99 to \$29.99. Among the 266 5-star rated apps, only 28 are paid which equates to 10.5%. The average cost of paid apps with 5 stars is \$3.64.

In [56]:
print ('The mean rating for apps is ' + str(data_cleaned['Rating'].mean().round(2)) + ', and the standard deviation is ' + str(data_cleaned['Rating'].std().round(2)) + '. The median is ' + str(data_cleaned['Rating'].median()) + '.' )

The mean rating for apps is 4.16, and the standard deviation is 0.56. The median is 4.3.


In [57]:
data_cleaned[['App', 'Rating', 'Reviews', 'Category']].sort_values(by=['Rating', 'Reviews'], ascending=False).head(10)

Unnamed: 0,App,Rating,Reviews,Category
9504,R??os de Fe,5.0,141,Lifestyle
8579,"FD Calculator (EMI, SIP, RD & Loan Eligilibility)",5.0,104,Finance
6375,Oraci??n CX,5.0,103,Lifestyle
10388,Barisal University App-BU Face,5.0,100,Family
9122,CL REPL,5.0,47,Tools
8250,AJ Cam,5.0,44,Photography
3682,Ek Vote,5.0,43,Productivity
6677,CS & IT Interview Questions,5.0,43,Family
8532,AI Today : Artificial Intelligence News & AI 101,5.0,43,News & Magazines
5517,Quran Khmer Offline AY,5.0,41,Family


In [12]:
data_cleaned['Rating_Threshold'] = np.where((data_cleaned['Rating'] >= 1) & (data_cleaned['Rating'] < 2), 1,
                                 np.where((data_cleaned['Rating'] >= 2) & (data_cleaned['Rating'] < 3), 2,
                                 np.where((data_cleaned['Rating'] >= 3) & (data_cleaned['Rating'] < 4), 3,
                                 np.where((data_cleaned['Rating'] >= 4) & (data_cleaned['Rating'] < 5), 4, 5))))

data_cleaned[['Rating_Threshold', 'Reviews']].groupby('Rating_Threshold').mean()

Unnamed: 0_level_0,Reviews
Rating_Threshold,Unnamed: 1_level_1
1,157.06
2,1112.79
3,14776.7
4,197784.94
5,8.43


In [54]:
data_cleaned[['App', 'Rating', 'Installs', 'Category']].sort_values(by=['Rating', 'Installs'], ascending=False).head(10)

Unnamed: 0,App,Rating,Installs,Category
6872,Ek Bander Ne Kholi Dukan,5.0,10000,Family
6375,Oraci??n CX,5.0,5000,Lifestyle
6740,CL Keyboard - Myanmar Keyboard (No Ads),5.0,5000,Tools
10059,"Superheroes, Marvel, DC, Comics, TV, Movies News",5.0,5000,Comics
2105,BM SPM Practice,5.0,1000,Family
2142,ReactNative BG Geolocation,5.0,1000,Tools
2455,Fr. Daoud Lamei,5.0,1000,Family
2605,Jigsaw Volvo FH 16 Trucks,5.0,1000,Family
2880,GKPB FP Online Church,5.0,1000,Lifestyle
3169,Florida Wildflowers,5.0,1000,Family


In [53]:
data_cleaned[['Rating_Threshold', 'Installs']].groupby(['Rating_Threshold']).median()

Unnamed: 0_level_0,Installs
Rating_Threshold,Unnamed: 1_level_1
1,1000.0
2,10000.0
3,50000.0
4,100000.0
5,100.0


In [52]:
data_cleaned[['App', 'Rating', 'Price', 'Category']].sort_values(by=['Rating', 'Price'], ascending=False).head(10)

Unnamed: 0,App,Rating,Price,Category
1794,AP Art History Flashcards,5.0,29.99,Family
5066,USMLE Step 2 CK Flashcards,5.0,19.99,Family
3257,meStudying: AP English Lit,5.0,4.99,Family
3297,Hey AJ! It's Bedtime!,5.0,4.99,Family
7156,TI-84 CE Graphing Calculator Manual TI 84,5.0,4.99,Family
4115,Hey AJ! It's Saturday!,5.0,3.99,Books & Reference
10789,AC DC Power Monitor,5.0,3.04,Lifestyle
7657,Super Hearing Secret Voices Recorder PRO,5.0,2.99,Medical
10765,FHR 5-Tier 2.0,5.0,2.99,Medical
3618,ADS-B Driver,5.0,1.99,Tools


In [51]:
data_cleaned[['Rating_Threshold', 'Type', 'App']].groupby(['Rating_Threshold', 'Type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,App
Rating_Threshold,Type,Unnamed: 2_level_1
1,Free,49
1,Paid,4
2,Free,201
2,Paid,15
3,Free,1369
3,Paid,92
4,Free,4628
4,Paid,401
5,Free,238
5,Paid,28


In [13]:
print ('Free apps account for', (data_cleaned['App'][(data_cleaned['Rating_Threshold'] == 5) & (data_cleaned['Type'] == 'Paid')].count()/data_cleaned['App'][data_cleaned['Rating_Threshold'] == 5].count()).round(3) * 100, '% of 5-star apps.')

Free apps account for 10.5 % of 5-star apps.


In [81]:
free_subgroup = data_cleaned.drop(data_cleaned[data_cleaned['Type'] == 'Free'].index)
free_subgroup[['Rating_Threshold', 'Price']].groupby(['Rating_Threshold']).mean()

Unnamed: 0_level_0,Price
Rating_Threshold,Unnamed: 1_level_1
1,2.74
2,29.76
3,33.86
4,11.39
5,3.64


## Which Apps Are Largest In Size?

The largest apps on the Play Store don't break 100 mB. This is due to a 100 mB file size limit imposed on developers. However, apps have no minimum limit and can be as small as 0.008 mB (8.5 kB).

There are 14 apps at the maximum size limit, the majority of which are gaming apps.

In [50]:
data_cleaned[['App', 'Size', 'Category']].sort_values('Size', ascending=False).head(10)

Unnamed: 0,App,Size,Category
10460,Vi Trainer,100.0,Health & Fitness
10462,The Walking Dead: Our World,100.0,Game
10468,Stickman Legends: Shadow Wars,100.0,Game
10467,Draft Simulator for FUT 18,100.0,Sports
10466,Car Crash III Beam DH Real Damage Simulator 2018,100.0,Game
10465,SimCity BuildIt,100.0,Family
10463,Miami crime simulator,100.0,Game
10464,Gangster Town: Vice District,100.0,Family
10461,Ultimate Tennis,100.0,Sports
10458,Navi Radiography Pro,100.0,Medical


In [35]:
data_cleaned[['App', 'Size', 'Category']].sort_values('Size', ascending=True).head(10).style.format(precision=3)

Unnamed: 0,App,Size,Category
2492,Essential Resources,0.008,Libraries & Demo
10309,Market Update Helper,0.011,Libraries & Demo
9717,My baby firework (Remove ad),0.014,Family
9176,ExDialer PRO Key,0.017,Communication
9175,Ad Remove Plugin for App2SD,0.017,Productivity
9013,Fill 'er Up,0.018,Travel & Local
8151,Morse Code Reader,0.02,Communication
7746,Plugin:AOT v5.0,0.022,Business
7623,EM Launcher Pro,0.023,Personalization
7482,Bixby Button Remapper - bxActions Pro / Coffee,0.024,Tools


In [49]:
print ('There are', data_cleaned['App'][data_cleaned['Size'] == 100].count(), 'apps at the maximum size.')

There are 14 apps at the maximum size.


In [80]:
maxsize_subset = data_cleaned.drop(data_cleaned[data_cleaned['Size'] != 100].index)
maxsize_subset[['Category', 'App']].groupby(['Category']).count()

Unnamed: 0_level_0,App
Category,Unnamed: 1_level_1
Family,2
Finance,1
Game,6
Health & Fitness,1
Lifestyle,1
Medical,1
Sports,2


## Which Apps Have The Most Reviews?

Ranking the top apps by number of reviews reveals that apps with the most reviews tend to be free gaming apps. More in-depth analysis on differences between free/paid apps and between app categories will follow in a dedicated section.

In [37]:
data_cleaned[['App', 'Reviews', 'Type', 'Category']].sort_values('Reviews', ascending=False).head(10)

Unnamed: 0,App,Reviews,Type,Category
1736,Clash of Clans,44891723,Free,Game
2697,Subway Surfers,27722264,Free,Game
1757,Clash Royale,23133508,Free,Game
2733,Candy Crush Saga,22426677,Free,Game
5020,UC Browser - Fast Download Private & Secure,17712922,Free,Communication
4015,8 Ball Pool,14198297,Free,Game
9683,DU Battery Saver - Battery Charger & Battery Life,13479633,Free,Tools
9402,Cache Cleaner-DU Speed Booster (booster & clea...,12759663,Free,Tools
2191,Shadow Fight 2,10979062,Free,Game
7511,Pou,10485308,Free,Game


In [82]:
data_cleaned[['App', 'Category', 'Type']].groupby(['Category', 'Type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,App
Category,Type,Unnamed: 2_level_1
Art & Design,Free,56
Art & Design,Paid,3
Auto & Vehicles,Free,63
Beauty,Free,37
Books & Reference,Free,134
Books & Reference,Paid,7
Business,Free,214
Business,Paid,8
Comics,Free,47
Communication,Free,170


# How Many Apps Had Over 1 Billion Installations?

Two apps, Subway Surfers and Google News, reached 1 billion installs. The majority of apps have between 10 thousand and 1 million installs.

In [89]:
data_cleaned[['App', 'Category', 'Rating', 'Reviews', 'Size', 'Type', 'Price', 'Content_Rating'] ][data_cleaned['Installs'] == 1000000000]

Unnamed: 0,App,Category,Rating,Reviews,Size,Type,Price,Content_Rating
2697,Subway Surfers,Game,4.5,27722264,76.0,Free,0.0,Everyone 10+
9779,Google News,News & Magazines,3.9,877635,13.0,Free,0.0,Teen


In [40]:
data_cleaned[['App', 'Installs']].groupby('Installs').count()

Unnamed: 0_level_0,App
Installs,Unnamed: 1_level_1
1,3
5,9
10,67
50,56
100,297
500,195
1000,673
5000,413
10000,947
50000,427


## What Are The Most Expensive Apps?

Examining `Price` more closely, the most expensive apps are populated by I am Rich alternatives. These apps have minimal features and are likely created as joke apps. After filtering these apps, the most expensive app costs \$79.99.

An estimate of app revenue, `Gross Revenue`, can be included by multiplying the price of the app and the number of installs.

The highest grossing paid apps tend to be gaming apps. Out of the top 10 highest grossing paid apps, how many are games?


In [85]:
data_cleaned[['App', 'Price', 'Rating', 'Reviews', 'Installs']].sort_values('Price', ascending=False).head(20)

Unnamed: 0,App,Price,Rating,Reviews,Installs
3029,I'm Rich - Trump Edition,400.0,3.6,275,10000
6354,I am Rich,399.99,4.3,180,5000
7813,I am Rich!,399.99,3.8,93,1000
2403,I am Rich Plus,399.99,4.0,856,10000
1806,I am rich(premium),399.99,3.5,472,5000
8315,I am rich (Most expensive app),399.99,4.1,129,1000
10650,most expensive app (H),399.99,4.3,6,100
8314,I Am Rich Pro,399.99,4.4,201,5000
5181,I Am Rich Premium,399.99,4.1,1867,50000
4986,I AM RICH PRO PLUS,399.99,4.0,36,1000


In [16]:
under100_subset = data_cleaned[data_cleaned['Price'] < 100]
under100_subset[['App', 'Price', 'Rating', 'Reviews', 'Installs']].sort_values('Price', ascending=False).head(10)

Unnamed: 0,App,Price,Rating,Reviews,Installs
6058,Vargo Anesthesia Mega App,79.99,4.6,92,1000
10725,LTC AS Legal,39.99,4.0,6,100
10530,I am Rich Person,37.99,4.2,134,1000
3176,A Manual of Acupuncture,33.99,3.5,214,1000
1794,AP Art History Flashcards,29.99,5.0,1,10
7390,Golfshot Plus: Golf GPS,29.99,4.1,3387,50000
4980,PTA Content Master,29.99,4.2,64,1000
8468,EMT PASS,29.99,3.4,51,1000
7375,Human Anatomy Atlas 2018: Complete 3D Human Body,24.99,4.5,2921,100000
7376,"Muscle Premium - Human Anatomy, Kinesiology, B...",24.99,4.2,168,10000


In [24]:
under100_subset['Gross_Revenue'] = data_cleaned.Installs.mul(data_cleaned.Price)
under100_subset[['App', 'Gross_Revenue', 'Reviews', 'Rating', 'Category']].sort_values(['Gross_Revenue', 'Reviews', 'Rating'], ascending=False).head(10)

Unnamed: 0,App,Gross_Revenue,Reviews,Rating,Category
6920,Hitman Sniper,9900000.0,408292,4.6,Game
7267,Grand Theft Auto: San Andreas,6990000.0,348962,4.4,Game
4619,Facetune - For Free,5990000.0,49553,4.4,Photography
2223,Sleep as Android Unlock,5990000.0,23966,4.5,Lifestyle
10069,DraStic DS Emulator,4990000.0,87766,4.6,Game
1845,Bloons TD 5,2990000.0,190086,4.6,Family
7656,Card Wars - Adventure Time,2990000.0,129603,4.3,Family
4117,Five Nights at Freddy's,2990000.0,100805,4.6,Game
7375,Human Anatomy Atlas 2018: Complete 3D Human Body,2499000.0,2921,4.5,Medical
4909,NBA JAM by EA SPORTS???,2495000.0,56444,4.3,Family


## Are There Differences Between Categories?

**Family apps** account for the highest number of apps on the Play Store, with **gaming** and **tools** coming in at second and third, respectively. The other categories have roughly the same volume of apps.  

However, gaming apps are the most popular apps in terms of installs. Tools and social apps are second and third in popularity, while family apps come in at fourth.

In [None]:
top10_category = data_cleaned.Category.value_counts()[:10]
top10_category

Family             1617
Game                974
Tools               634
Medical             324
Personalization     280
Lifestyle           280
Finance             266
Sports              247
Business            246
Photography         236
Name: Category, dtype: int64

In [25]:
print ('Family apps account for', data_cleaned[data_cleaned['Category'] == 'Family'].count(), '% of the data.')

Family apps account for App                 1534
Category            1534
Rating              1534
Reviews             1534
Size                1534
Installs            1534
Type                1534
Price               1534
Content_Rating      1534
Genres              1534
Rating_Threshold    1534
dtype: int64 % of the data.


In [None]:
bar = px.bar(x=top10_category.index,
             y=top10_category.values)
bar.show()

In [None]:
category_installs = data_cleaned.groupby('Category').agg({'Installs':pd.Series.sum})
category_installs.sort_values('Installs', ascending=True, inplace=True)

In [None]:
h_bar = px.bar(x=category_installs.Installs, y=category_installs.index, orientation='h', title='Category Popularity by Number of Downloads')
h_bar.update_layout(xaxis_title='Number of Installs', yaxis_title='Category')
h_bar.show()

### Downloads vs. Competition

In [None]:
category_info = df_apps_clean.groupby('Category').agg({'App':pd.Series.count, 'Installs':pd.Series.sum})
category_info.sort_values('Installs', ascending=False).head()

Unnamed: 0_level_0,App,Installs
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
GAME,910,13858762717
COMMUNICATION,257,11039241530
TOOLS,719,8099724500
PRODUCTIVITY,301,5788070180
SOCIAL,203,5487841475


In [None]:
scatter = px.scatter(category_info, # data
                    x='App', # column name
                    y='Installs',
                    title='Concentration of Categories',
                    size='App',
                    hover_name=category_info.index,
                    color='Installs')

scatter.update_layout(xaxis_title="Number of Apps (Lower=More Concentrated)",
                      yaxis_title="Installs",
                      yaxis=dict(type='log'))

scatter.show()

# Furthur Dividing the Genres

**Genres**: How many different types of genres are there? Can an app belong to more than one genre? Use `.stack()` and `.split()` along with `.value_counts()` to see how many apps in level 2 genres.

In [None]:
# Split the strings on the semi-colon and then .stack them.
stack = df_apps_clean.Genres.str.split(';', expand=True).stack()
print(f'We now have a single column with shape: {stack.shape}')
num_genres = stack.value_counts()
print(f'Number of genres: {len(num_genres)}')

We now have a single column with shape: (8564,)
Number of genres: 53


# Plot the divided Genres Info

In [None]:
bar = px.bar(x = num_genres.index[:15],
             y = num_genres.values[:15],
             title='Top Genres',
             hover_name=num_genres.index[:15],
             color=num_genres.values[:15],
             color_continuous_scale='Agsunset')

bar.update_layout(xaxis_title='Genre',yaxis_title='Number of Apps',coloraxis_showscale=False)
bar.show()

# What Are The Differences Between Paid and Free Apps?

Paid apps cost \$14.05 on average. They are also rated slighly higher than free apps on average. However, they have far less popularity, as their reviews and installs account for

Free apps dominate the Play Store in each category. Beauty, Comics, Events, and House/Home apps have no paid apps.

In [None]:
data_cleaned[['Type', 'Price', 'Reviews', 'Installs', 'Rating']].groupby('Type').mean(numeric_only=True)

Unnamed: 0_level_0,Price,Reviews,Installs,Rating
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Free,0.0,234242.33,8446665.47,4.17
Paid,14.05,8736.37,75978.65,4.26


In [None]:
#free_vs_paid = data_cleaned.groupby(["Category", "Type"], as_index=False).agg({'App': pd.Series.count})
free_vs_paid = data_cleaned[['Category', 'Type', 'App']].groupby(['Category', 'Type']).count()

free_vs_paid

Unnamed: 0_level_0,Unnamed: 1_level_0,App
Category,Type,Unnamed: 2_level_1
Art & Design,Free,61
Art & Design,Paid,3
Auto & Vehicles,Free,82
Auto & Vehicles,Paid,3
Beauty,Free,53
...,...,...
Travel & Local,Paid,12
Video Players,Free,159
Video Players,Paid,4
Weather,Free,71


ERROR:root:Did not find quickchart key chart-c93a491c-7b3c-461a-b208-99955478ebf1 in chart cache


In [None]:
g_bar = px.bar(data_cleaned, x='Category', y='App', title='Free vs. Paid Apps by Category', color='Type', barmode='group')
g_bar.update_layout(xaxis_title='Category', yaxis_title='Number of Apps', xaxis={'categoryorder':'total descending'}, yaxis=dict(type='log'))
g_bar.show()

# Revenue by App Category


In [None]:
paid_apps = data_cleaned[data_cleaned['Type'] == 'Paid']
box = px.box(data_cleaned,
             x='Category',
             y='Revenue_Estimate',
             title='How Much Can Paid Apps Earn?')

box.update_layout(xaxis_title='Category',
                  yaxis_title='Paid App Ballpark Revenue',
                  xaxis={'categoryorder':'min ascending'},
                  yaxis=dict(type='log'))
box.show()

ValueError: ignored

# How Much Do Developers Charge?

While the overall median price for an app is 2.99, Paid App Median Price vs. Paid App Price Distribution per Category, to show which category is about the median paid app price.

In [None]:
paid_apps.Price.median()

NameError: ignored

In [None]:
box = px.box(df_paid_apps,
             x='Category',
             y="Price",
             title='Price per Category')

box.update_layout(xaxis_title='Category',
                  yaxis_title='Paid App Price',
                  xaxis={'categoryorder':'max descending'},
                  yaxis=dict(type='log'))
box.show()

# Visualing Content Ratings

Apps intended for users of all ages are the most popular in the Play Store. This is expected, given that the popular of potential users is higher

In [None]:
ratings = data_cleaned.Content_Rating.value_counts()
ratings.head()

Everyone           6172
Teen                868
Mature 17+          368
Everyone 10+        318
Adults only 18+       2
Name: Content_Rating, dtype: int64

In [None]:
fig = px.pie(labels=ratings.index, values=ratings.values, names=ratings.index, title='Content Rating', hole=0.6)
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()


Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.



#End Notebook