In [1]:
# uses googleplaystore.csv in the google drive
# mounting from google drive
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# path of the .csv file change if needed
path = "/content/drive/MyDrive/Colab Notebooks/googleplaystore.csv"

In [3]:
# 1) import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# 2) read csv files 
df = pd.read_csv(path)

In [5]:
# testing
df.iloc[10472]

App               Life Made WI-Fi Touchscreen Photo Frame
Category                                              1.9
Rating                                               19.0
Reviews                                              3.0M
Size                                               1,000+
Installs                                             Free
Type                                                    0
Price                                            Everyone
Content Rating                                        NaN
Genres                                  February 11, 2018
Last Updated                                       1.0.19
Current Ver                                    4.0 and up
Android Ver                                           NaN
Name: 10472, dtype: object

In [6]:
# 3) display top 5 rows of dataset
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [7]:
# 4) Check the last 3 rows of the dataset
df.tail(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device
10840,iHoroscope - 2018 Daily Horoscope & Astrology,LIFESTYLE,4.5,398307,19M,"10,000,000+",Free,0,Everyone,Lifestyle,"July 25, 2018",Varies with device,Varies with device


In [8]:
# 5) Find shape of our dataset (number of rows & number of columns)
row, col = df.shape
print("the rows of our dataset is {} \nthe columns in our dataset is {}".format(row,col))

the rows of our dataset is 10841 
the columns in our dataset is 13


In [9]:
# 6) Get information about our dataset like total number rows, 
# total number of columns, datatypes of each column and memory requirement.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


From this info we can see that our dataset has 10841 rows(entries) and a total off 13 columns. </br>
The datatype of all but 1 column(rating) is an datatype of object. The memory usage is about a bit over 1.1MB but less than 1.2MB

In [10]:
str(df.memory_usage().sum())+" Bytes which is 1.127592 MegaBytes"

'1127592 Bytes which is 1.127592 MegaBytes'

In [11]:
# 7) Get overall statistics about the data frame
# 7a) Use parameter to show stats of numerical and categorical columns
df.describe(include='all')

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
count,10841,10841,9367.0,10841.0,10841,10841,10840,10841.0,10840,10841,10841,10833,10838
unique,9660,34,,6002.0,462,22,3,93.0,6,120,1378,2832,33
top,ROBLOX,FAMILY,,0.0,Varies with device,"1,000,000+",Free,0.0,Everyone,Tools,"August 3, 2018",Varies with device,4.1 and up
freq,9,1972,,596.0,1695,1579,10039,10040.0,8714,842,326,1459,2451
mean,,,4.193338,,,,,,,,,,
std,,,0.537431,,,,,,,,,,
min,,,1.0,,,,,,,,,,
25%,,,4.0,,,,,,,,,,
50%,,,4.3,,,,,,,,,,
75%,,,4.5,,,,,,,,,,


In [12]:
# 8) Total number of app titles which contain ‘astrology’
# 8a) Show columns names first and find out on which column you have to work
df.columns
# the App title must be in App column

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [13]:
# 8b) Check the column where you can find that word
# 8c) Use contains to find that word 
# 8d) Do case sensitive and insensitive search
df['App'].str.contains('Astrology').sum(), df['App'].str.contains('astrology').sum()# case sensitive search

(3, 0)

In [14]:
df['App'].str.contains('astrology',na=False, case=False).sum() # case insensitive search

3

In [15]:
# 8e) Find the length of data frame by which one can get to know that how many time this word is used
df.App.str.count("Astrology").sum()

3

In [16]:
# all occurences of Astrology seem to be 3
df.App.str.contains(r'Astrology|astrology').sum()

3

In [17]:
# 9) Find average app rating
# 9a) Show all columns names to find out on which column you have to work
df.columns
# we are using Rating column

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [18]:
# 9b) Use mean method to find average app rating
df['Rating'].mean()

4.193338315362443

In [19]:
# 10) Find total number of unique categories
# 10a) Find names of columns
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [20]:
# 10b) Show how many unique are there
set(df.columns)

{'Android Ver',
 'App',
 'Category',
 'Content Rating',
 'Current Ver',
 'Genres',
 'Installs',
 'Last Updated',
 'Price',
 'Rating',
 'Reviews',
 'Size',
 'Type'}

In [21]:
# going to redo 10 im guessing you mean show the number of unique values in the column "Category"
df.Category.unique()

array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
       'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
       'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
       'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
       'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
       'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
       'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
       'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION',
       '1.9'], dtype=object)

In [22]:
# 11) Which category getting the highest average rating?
# 11a) Group by the category and mean the rating 
tmp = df.groupby([df['Category']]).mean()
tmp

Unnamed: 0_level_0,Rating
Category,Unnamed: 1_level_1
1.9,19.0
ART_AND_DESIGN,4.358065
AUTO_AND_VEHICLES,4.190411
BEAUTY,4.278571
BOOKS_AND_REFERENCE,4.346067
BUSINESS,4.121452
COMICS,4.155172
COMMUNICATION,4.158537
DATING,3.970769
EDUCATION,4.389032


In [23]:
# 11)b Sort them by descending order using to find which category has highest average rating
df2 = tmp.sort_values('Rating', ascending=False)
df2

Unnamed: 0_level_0,Rating
Category,Unnamed: 1_level_1
1.9,19.0
EVENTS,4.435556
EDUCATION,4.389032
ART_AND_DESIGN,4.358065
BOOKS_AND_REFERENCE,4.346067
PERSONALIZATION,4.335987
PARENTING,4.3
GAME,4.286326
BEAUTY,4.278571
HEALTH_AND_FITNESS,4.277104


In [24]:
# 12) Find total number of app having 5-star rating
df.loc[df['Rating'] == 5.0].shape[0]

274

In [25]:
# 13) Find average value of reviews
# 13a) Show all columns
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [26]:
# 13b) Find data type of related column
df.Reviews.head()

0       159
1       967
2     87510
3    215644
4       967
Name: Reviews, dtype: object

In [27]:
# 13c) Convert data type with integer or float
#df["Reviews"] = pd.to_numeric(df["Reviews"])
# "3.0M" at row 10472
#df.loc[df['Reviews'] == "3.0M"] = "3000000"
#df.loc[df['App'] == "Life Made WI-Fi Touchscreen Photo Frame"]
#df.loc[df['Reviews'] == "3000000"]
#df.loc[df['Reviews'] == "3.0M"]
df.loc[10472]

App               Life Made WI-Fi Touchscreen Photo Frame
Category                                              1.9
Rating                                               19.0
Reviews                                              3.0M
Size                                               1,000+
Installs                                             Free
Type                                                    0
Price                                            Everyone
Content Rating                                        NaN
Genres                                  February 11, 2018
Last Updated                                       1.0.19
Current Ver                                    4.0 and up
Android Ver                                           NaN
Name: 10472, dtype: object

In [28]:
df.at[10472,'Reviews']='3000000'

In [29]:
df.loc[10472]

App               Life Made WI-Fi Touchscreen Photo Frame
Category                                              1.9
Rating                                               19.0
Reviews                                           3000000
Size                                               1,000+
Installs                                             Free
Type                                                    0
Price                                            Everyone
Content Rating                                        NaN
Genres                                  February 11, 2018
Last Updated                                       1.0.19
Current Ver                                    4.0 and up
Android Ver                                           NaN
Name: 10472, dtype: object

In [30]:
df["Reviews"] = pd.to_numeric(df["Reviews"])

In [31]:
# 13e) After having solution kindly convert data type 
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  int64  
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 1.1+ MB


In [32]:
# 13f) Time to find average
df.Reviews.mean()

444388.6535374965

In [33]:
# 14) Find total number of free and paid apps
df.Type.unique()

array(['Free', 'Paid', nan, '0'], dtype=object)

In [34]:
df.Price.unique()

array(['0', '$4.99', '$3.99', '$6.99', '$1.49', '$2.99', '$7.99', '$5.99',
       '$3.49', '$1.99', '$9.99', '$7.49', '$0.99', '$9.00', '$5.49',
       '$10.00', '$24.99', '$11.99', '$79.99', '$16.99', '$14.99',
       '$1.00', '$29.99', '$12.99', '$2.49', '$10.99', '$1.50', '$19.99',
       '$15.99', '$33.99', '$74.99', '$39.99', '$3.95', '$4.49', '$1.70',
       '$8.99', '$2.00', '$3.88', '$25.99', '$399.99', '$17.99',
       '$400.00', '$3.02', '$1.76', '$4.84', '$4.77', '$1.61', '$2.50',
       '$1.59', '$6.49', '$1.29', '$5.00', '$13.99', '$299.99', '$379.99',
       '$37.99', '$18.99', '$389.99', '$19.90', '$8.49', '$1.75',
       '$14.00', '$4.85', '$46.99', '$109.99', '$154.99', '$3.08',
       '$2.59', '$4.80', '$1.96', '$19.40', '$3.90', '$4.59', '$15.46',
       '$3.04', '$4.29', '$2.60', '$3.28', '$4.60', '$28.99', '$2.95',
       '$2.90', '$1.97', '$200.00', '$89.99', '$2.56', '$30.99', '$3.61',
       '$394.99', '$1.26', 'Everyone', '$1.20', '$1.04'], dtype=object)

In [35]:
# i was going to sort by a $ vs [0,'Everyone'] in prices as you can see here but
# so here i am at a dilemma beacuse nan could be used to define a product that no longer exists but its price is still there
# what im planning to do here is change the 0 in Type to Free and only count Paid and Free products and ignore the nans types


In [36]:
# 14a) First find out the column on which you must work ( the Type column)
# 14b) Show the number of free and paid in customized format
# first im going to convert all '0' types -> 'Free'
df.Type.unique()

array(['Free', 'Paid', nan, '0'], dtype=object)

In [37]:
df['Type'] = df['Type'].replace('0','Free')

In [38]:
df.Type.unique()

array(['Free', 'Paid', nan], dtype=object)

In [39]:
# now im going to count all Free and Paid apps by grouping them as Free and Paid then counting
CountFreeAndPaid = df.groupby(df["Type"])
CountFreeAndPaid.count().iloc[:,-1:]

Unnamed: 0_level_0,Android Ver
Type,Unnamed: 1_level_1
Free,10038
Paid,799


In [40]:
# 15) Which app has maximum reviews?
df['Reviews'].max()

78158306

In [41]:
# 15a) Show the name of that app which has maximum reviews
df.loc[df['Reviews'].idxmax()].App

'Facebook'

In [42]:
df.loc[df['Reviews'].idxmax()]

App                         Facebook
Category                      SOCIAL
Rating                           4.1
Reviews                     78158306
Size              Varies with device
Installs              1,000,000,000+
Type                            Free
Price                              0
Content Rating                  Teen
Genres                        Social
Last Updated          August 3, 2018
Current Ver       Varies with device
Android Ver       Varies with device
Name: 2544, dtype: object

In [43]:
# 16) Display top 5 apps having highest reviews
df.nlargest(5,'Reviews')

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2544,Facebook,SOCIAL,4.1,78158306,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"August 3, 2018",Varies with device,Varies with device
3943,Facebook,SOCIAL,4.1,78128208,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"August 3, 2018",Varies with device,Varies with device
336,WhatsApp Messenger,COMMUNICATION,4.4,69119316,Varies with device,"1,000,000,000+",Free,0,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device
381,WhatsApp Messenger,COMMUNICATION,4.4,69119316,Varies with device,"1,000,000,000+",Free,0,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device
3904,WhatsApp Messenger,COMMUNICATION,4.4,69109672,Varies with device,"1,000,000,000+",Free,0,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device


In [44]:
# we dont want facebook to show up twice or whatsapp showing up 3 times
df2 = df.drop_duplicates(subset=["App"])
df2.nlargest(5,'Reviews')

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2544,Facebook,SOCIAL,4.1,78158306,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"August 3, 2018",Varies with device,Varies with device
336,WhatsApp Messenger,COMMUNICATION,4.4,69119316,Varies with device,"1,000,000,000+",Free,0,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device
2545,Instagram,SOCIAL,4.5,66577313,Varies with device,"1,000,000,000+",Free,0,Teen,Social,"July 31, 2018",Varies with device,Varies with device
335,Messenger – Text and Video Chat for Free,COMMUNICATION,4.0,56642847,Varies with device,"1,000,000,000+",Free,0,Everyone,Communication,"August 1, 2018",Varies with device,Varies with device
1670,Clash of Clans,GAME,4.6,44891723,98M,"100,000,000+",Free,0,Everyone 10+,Strategy,"July 15, 2018",10.322.16,4.1 and up


In [45]:
# 17) Find average rating of free and paid apps
# 17a) Use mean function for that
CountFreeAndPaid.mean()

Unnamed: 0_level_0,Rating,Reviews
Type,Unnamed: 1_level_1,Unnamed: 2_level_1
Free,4.187901,478912.225398
Paid,4.266615,11673.3125


In [46]:
# 18) Display top 5 apps having maximum installs
# 18a) If need convert the data type
# 18b) If required kindly replace values as well
# 18c) Use sorting, indexing , iloc for final output
# 18d) Output should be simple enough in quite easily readable format.
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [47]:
df.iloc[[152,335,336]]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
152,Google Play Books,BOOKS_AND_REFERENCE,3.9,1433233,Varies with device,"1,000,000,000+",Free,0,Teen,Books & Reference,"August 3, 2018",Varies with device,Varies with device
335,Messenger – Text and Video Chat for Free,COMMUNICATION,4.0,56642847,Varies with device,"1,000,000,000+",Free,0,Everyone,Communication,"August 1, 2018",Varies with device,Varies with device
336,WhatsApp Messenger,COMMUNICATION,4.4,69119316,Varies with device,"1,000,000,000+",Free,0,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device


In [48]:
#removing the + at the end of every installs and replacing
df["Installs"] = df["Installs"].str[:-1]

In [49]:
#installs = df.groupby(df["Installs"])
#df.nlargest(5,'Installs')
df["Installs"].replace(',','', regex=True, inplace=True)

In [50]:
df.iloc[10472]

App               Life Made WI-Fi Touchscreen Photo Frame
Category                                              1.9
Rating                                               19.0
Reviews                                           3000000
Size                                               1,000+
Installs                                              Fre
Type                                                 Free
Price                                            Everyone
Content Rating                                        NaN
Genres                                  February 11, 2018
Last Updated                                       1.0.19
Current Ver                                    4.0 and up
Android Ver                                           NaN
Name: 10472, dtype: object

In [51]:
# after checking df.iloc[10472] the inital install is Free which makes no sense so im just going to remove this row from the data
update_df = df.drop([df.index[10472]])

In [52]:
df["Installs"] = pd.to_numeric(update_df["Installs"])

In [53]:
df.nlargest(5,'Installs')

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
152,Google Play Books,BOOKS_AND_REFERENCE,3.9,1433233,Varies with device,1000000000.0,Free,0,Teen,Books & Reference,"August 3, 2018",Varies with device,Varies with device
335,Messenger – Text and Video Chat for Free,COMMUNICATION,4.0,56642847,Varies with device,1000000000.0,Free,0,Everyone,Communication,"August 1, 2018",Varies with device,Varies with device
336,WhatsApp Messenger,COMMUNICATION,4.4,69119316,Varies with device,1000000000.0,Free,0,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device
338,Google Chrome: Fast & Secure,COMMUNICATION,4.3,9642995,Varies with device,1000000000.0,Free,0,Everyone,Communication,"August 1, 2018",Varies with device,Varies with device
340,Gmail,COMMUNICATION,4.3,4604324,Varies with device,1000000000.0,Free,0,Everyone,Communication,"August 2, 2018",Varies with device,Varies with device
