# Data Analysis for Mobile Apps

We work as data analysts for a company that builds Android and iOS mobile apps. We make our apps available on Google Play and the App Store.

We only build apps that are free to download and install, and our main source of revenue consists of in-app ads. This means our revenue for any given app is mostly influenced by the number of users who use our app — the more users that see and engage with the ads, the better. 

Our goal for this project is to analyze data to help our developers understand what type of apps are likely to attract more users.

In [1]:
from csv import reader

### The Google Play data set ###
# `with` closes the file as soon as the rows are read; newline='' lets the
# csv module handle line endings inside quoted fields itself.
with open('googleplaystore.csv', encoding='utf-8', newline='') as opened_file:
    read_file = reader(opened_file)
    android = list(read_file)
# The first row holds the column names; the remaining rows are the apps.
android_header = android[0]
android = android[1:]

### The App Store data set ###
with open('AppleStore.csv', encoding='utf-8', newline='') as opened_file:
    read_file = reader(opened_file)
    ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]

In [2]:
# Function to print rows repeatedly
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a data set, optionally followed by its dimensions.

    Args:
        dataset: A list of rows, where each row is itself a list.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When True, also print the total number of rows
            and the number of columns (taken from the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        n_rows = len(dataset)
        print('Number of rows:', n_rows)
        # An empty data set has no first row to measure, so report 0
        # columns instead of raising IndexError on dataset[0].
        n_columns = len(dataset[0]) if n_rows > 0 else 0
        print('Number of columns:', n_columns)

In [3]:
# Show the Google Play column names, then the first three rows and the
# overall dimensions of the data set.
print(android_header)
explore_data(android, 0, 3, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


In [4]:
# Show the App Store column names followed by a blank separator.
print(ios_header)
print('\n')
# rows_and_columns=True also prints the row and column counts;
# passing False would print only the rows.
explore_data(ios, 0, 3, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16


In [5]:
# Print the header next to row 10472 so its values can be compared against
# the column names.
print(android_header)
explore_data(android, start=10472, end=10473)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']




In [6]:
print(len(android))
# Row 10472 has one value fewer than the header (its Category is missing).
# Delete it only while it is still malformed, so that re-running this cell
# cannot silently remove a valid row that has shifted into that position.
if len(android[10472]) != len(android_header):
    del android[10472]
print(len(android))

10841
10840


The code below finds duplicate apps. It loops through the list of Android apps and, whenever an app name has already been seen, appends that name to a separate list of duplicates.

# Remove Duplicates: Part One

In [7]:
# Split the Android app names into first occurrences and repeats.
duplicate_apps = []
unique_apps = []
# Set used only for membership tests: checking `name in unique_apps` on the
# list would rescan it for every row.
seen_names = set()
for app in android:
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate android apps:', len(duplicate_apps))
print(duplicate_apps[:10])
print('Number of unique android apps:', len(unique_apps))


Number of duplicate android apps: 1181
['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']
Number of unique android apps: 9659


In [8]:
# Split the iOS entries into first occurrences and repeats. Column 0 of the
# App Store data is 'id', so duplicates are detected by app id, not by name.
duplicate_ios_apps = []
unique_ios_apps = []
# Set used only for membership tests, to avoid rescanning the list per row.
seen_ids = set()
for app in ios:
    app_id = app[0]
    if app_id in seen_ids:
        duplicate_ios_apps.append(app_id)
    else:
        seen_ids.add(app_id)
        unique_ios_apps.append(app_id)

print('Number of duplicate ios apps:', len(duplicate_ios_apps))
print(duplicate_ios_apps[:10])
print('Number of unique ios apps:', len(unique_ios_apps))


Number of duplicate ios apps: 0
[]
Number of unique ios apps: 7197


# Remove Duplicates: Part Two

In [9]:
# Map each Android app name to the largest review count seen for it.
reviews_max = {}
for row in android:
    title = row[0]
    # The Reviews column is read as text; convert it so counts compare
    # numerically rather than as strings.
    review_count = float(row[3])
    # Record the count for a new name, or when it beats the stored maximum.
    if title not in reviews_max or reviews_max[title] < review_count:
        reviews_max[title] = review_count
print(len(reviews_max))

# Peek at the first three (name, max reviews) pairs.
preview = iter(reviews_max.items())
for _ in range(3):
    print(next(preview))

9659
('Photo Editor & Candy Camera & Grid & ScrapBook', 159.0)
('Coloring book moana', 974.0)
('U Launcher Lite – FREE Live Cool Themes, Hide Apps', 87510.0)


# Build the De-duplicated Data Set, Then Remove Non-English Apps

In [10]:
# Keep one row per app: the row whose review count equals the maximum
# recorded for that name in reviews_max.
android_clean = []
# A set gives constant-time membership checks. It guards against adding an
# app twice when several of its rows share the same maximum review count.
already_added = set()
# Loop through Google Play data set
for app in android:
    name = app[0]
    n_reviews = float(app[3])
    if reviews_max[name] == n_reviews and name not in already_added:
        android_clean.append(app)
        already_added.add(name)

In [11]:
# Preview the de-duplicated Android data and confirm its dimensions.
explore_data(android_clean, 0, 3, rows_and_columns=True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9659
Number of columns: 13


In [12]:
# No de-duplication pass is needed for the iOS data: the duplicate check above found 0 duplicate entries.

In [13]:
def checkCharacter(string, max_non_ascii=0):
    """Return True when `string` has at most `max_non_ascii` non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127.

    Args:
        string: The text to inspect (for example an app name).
        max_non_ascii: How many non-ASCII characters are tolerated. The
            default of 0 rejects any string containing one, which also
            rejects names that only carry a symbol such as a trademark sign
            or an emoji; pass a small number (e.g. 3) to accept those.

    Returns:
        False as soon as the tolerance is exceeded, otherwise True.
    """
    non_ascii_count = 0
    for character in string:
        if ord(character) > 127:
            non_ascii_count += 1
            if non_ascii_count > max_non_ascii:
                return False
    # This return must sit outside the loop; placed inside it, the function
    # would answer after checking only the first character.
    return True
        
# Try the check on a plain name, a non-English name, and two English names
# that contain a single symbol or emoji.
for sample_name in (
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
):
    print(checkCharacter(sample_name))

True
False
False
False


In [14]:
# Keep only the apps whose name passes checkCharacter. The name is column 0
# in the Google Play data and column 1 (track_name) in the App Store data.
android_english = [row for row in android_clean if checkCharacter(row[0])]
ios_english = [row for row in ios if checkCharacter(row[1])]

print(android_header)
explore_data(android_english, 0, 3, rows_and_columns=True)
print('\n')
print(ios_header)
explore_data(ios_english, 0, 3, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


Number of rows: 9117
Number of columns: 13


['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
['284882215', 'Facebook', '

# Isolating the Free Apps

In [15]:
# Show the header with two sample rows to locate the price column.
print(android_header)
explore_data(android_english, start=0, end=2)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']




In [16]:
# Keep the Android apps whose Price column (index 7) is exactly the text '0'.
android_final = [row for row in android_english if row[7] == '0']
explore_data(android_final, 0, 2, rows_and_columns=True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 8408
Number of columns: 13


In [17]:
print(ios_header)
# Keep the iOS apps whose price (column 4) is zero.
ios_final = []
for app in ios_english:
    # Compare numerically rather than against the text '0.0', so a free app
    # written as '0' or '0.00' is still recognised.
    price = float(app[4])
    if price == 0.0:
        ios_final.append(app)
explore_data(ios_final,0,2,True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 2922
Number of columns: 16


# Most Common Apps by Genre

In [18]:
# Function to return frequency table
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of a data set.

    Args:
        dataset: An iterable of rows (each row indexable by position).
        index: Position of the column whose values are tallied.

    Returns:
        A dict mapping each distinct column value to the percentage of rows
        that carry it, in order of first appearance.
    """
    counts = {}
    for record in dataset:
        field = record[index]
        counts[field] = counts.get(field, 0) + 1

    # Every row contributes exactly one count, so the counts add up to the
    # number of rows.
    n_rows = sum(counts.values())
    return {field: (count / n_rows) * 100 for field, count in counts.items()}

In [19]:
# Display table function
def display_table(dataset, index):
    """Print the frequency table of one column, highest percentage first.

    Args:
        dataset: The rows passed through to freq_table.
        index: Position of the column to summarise.

    Each line is printed as `value : percentage`. Values that share a
    percentage appear in descending order of the value itself.
    """
    percentages = freq_table(dataset, index)
    # Rank by percentage, then by the value, both descending.
    ranked = sorted(
        percentages.items(),
        key=lambda pair: (pair[1], pair[0]),
        reverse=True,
    )
    for label, share in ranked:
        print(label, ':', share)


In [20]:
# Percentage of free English iOS apps per prime_genre (column 11).
display_table(ios_final, index=11)

Games : 59.171800136892536
Entertainment : 7.529089664613278
Photo & Video : 5.133470225872689
Education : 3.8329911019849416
Social Networking : 3.1143052703627654
Shopping : 2.4982888432580426
Utilities : 2.2587268993839835
Music : 2.1560574948665296
Sports : 2.0533880903490758
Health & Fitness : 1.9849418206707734
Productivity : 1.7111567419575633
Lifestyle : 1.4715947980835045
News : 1.3347022587268993
Travel : 1.1293634496919918
Finance : 1.0951403148528405
Weather : 0.8898015058179329
Food & Drink : 0.8898015058179329
Reference : 0.5133470225872689
Business : 0.5133470225872689
Book : 0.2737850787132101
Medical : 0.20533880903490762
Navigation : 0.13689253935660506
Catalogs : 0.10266940451745381


In [21]:
# Percentage of free English Android apps per Category (column 1).
display_table(android_final, index=1)

FAMILY : 18.803520456707897
GAME : 9.60989533777355
TOOLS : 8.575166508087536
BUSINESS : 4.709800190294957
PRODUCTIVITY : 3.9724072312083734
LIFESTYLE : 3.8891531874405327
FINANCE : 3.73453853472883
MEDICAL : 3.6393910561370126
PERSONALIZATION : 3.306374881065652
SPORTS : 3.258801141769743
COMMUNICATION : 3.2231208372978117
HEALTH_AND_FITNESS : 3.1279733587059946
PHOTOGRAPHY : 3.0090390104662226
NEWS_AND_MAGAZINES : 2.7949571836346334
SOCIAL : 2.664129400570885
TRAVEL_AND_LOCAL : 2.3073263558515698
SHOPPING : 2.247859181731684
BOOKS_AND_REFERENCE : 2.1883920076117986
DATING : 1.8315889628924835
VIDEO_PLAYERS : 1.7602283539486203
MAPS_AND_NAVIGATION : 1.3558515699333968
FOOD_AND_DRINK : 1.2012369172216937
EDUCATION : 1.165556612749762
ENTERTAINMENT : 0.939581351094196
AUTO_AND_VEHICLES : 0.939581351094196
LIBRARIES_AND_DEMO : 0.9039010466222646
HOUSE_AND_HOME : 0.8087535680304472
WEATHER : 0.7968601332064701
EVENTS : 0.7136060894386299
ART_AND_DESIGN : 0.6660323501427212
PARENTING : 0.6

In [22]:
# Percentage of free English Android apps per Genres value (column 9).
display_table(android_final, index=9)

Tools : 8.563273073263558
Entertainment : 6.089438629876309
Education : 5.387725975261656
Business : 4.709800190294957
Productivity : 3.9724072312083734
Lifestyle : 3.8772597526165553
Finance : 3.73453853472883
Medical : 3.6393910561370126
Sports : 3.3301617507136063
Personalization : 3.306374881065652
Communication : 3.2231208372978117
Health & Fitness : 3.1279733587059946
Action : 3.116079923882017
Photography : 3.0090390104662226
News & Magazines : 2.7949571836346334
Social : 2.664129400570885
Travel & Local : 2.3073263558515698
Shopping : 2.247859181731684
Books & Reference : 2.1883920076117986
Simulation : 2.0813510941960036
Dating : 1.8315889628924835
Arcade : 1.8315889628924835
Casual : 1.7721217887725977
Video Players & Editors : 1.736441484300666
Maps & Navigation : 1.3558515699333968
Food & Drink : 1.2012369172216937
Puzzle : 1.1298763082778307
Racing : 1.0228353948620361
Role Playing : 0.939581351094196
Auto & Vehicles : 0.939581351094196
Strategy : 0.9039010466222646
Librar

For Android, the FAMILY and GAME categories are the most common.
For iOS, Games is by far the most common genre, followed by Entertainment and then Photo & Video.

In [23]:
# Percentage frequency table of prime_genre (column 11) for the iOS data.
prime_genre_freq = freq_table(ios_final, index=11)
# The bare name as the last line of the cell displays the table.
prime_genre_freq

{'Social Networking': 3.1143052703627654,
 'Photo & Video': 5.133470225872689,
 'Games': 59.171800136892536,
 'Music': 2.1560574948665296,
 'Reference': 0.5133470225872689,
 'Health & Fitness': 1.9849418206707734,
 'Weather': 0.8898015058179329,
 'Travel': 1.1293634496919918,
 'Shopping': 2.4982888432580426,
 'News': 1.3347022587268993,
 'Navigation': 0.13689253935660506,
 'Lifestyle': 1.4715947980835045,
 'Entertainment': 7.529089664613278,
 'Food & Drink': 0.8898015058179329,
 'Sports': 2.0533880903490758,
 'Finance': 1.0951403148528405,
 'Education': 3.8329911019849416,
 'Productivity': 1.7111567419575633,
 'Utilities': 2.2587268993839835,
 'Book': 0.2737850787132101,
 'Business': 0.5133470225872689,
 'Catalogs': 0.10266940451745381,
 'Medical': 0.20533880903490762}

# IOS Apps Analysis
From the frequency table above, Games is the most common genre among the free English iOS apps.

In [24]:
# Average number of user ratings (rating_count_tot, column 5) per iOS genre.
# Totals and app counts are gathered in one pass over the data instead of
# rescanning every app once per genre.
rating_totals = {}
genre_counts = {}
for app in ios_final:
    genre_app = app[11]
    # Column 5 holds how many ratings the app received, not the rating score.
    n_ratings = float(app[5])
    rating_totals[genre_app] = rating_totals.get(genre_app, 0) + n_ratings
    genre_counts[genre_app] = genre_counts.get(genre_app, 0) + 1

# Report in the order of the frequency table.
for genre in prime_genre_freq:
    avg_genre = rating_totals[genre] / genre_counts[genre]
    print(genre, ': ', avg_genre)

Social Networking :  78567.30769230769
Photo & Video :  29249.766666666666
Games :  21560.75072296125
Music :  55396.01587301587
Reference :  89562.6
Health & Fitness :  19418.620689655174
Weather :  48275.57692307692
Travel :  34115.57575757576
Shopping :  28877.575342465752
News :  23382.17948717949
Navigation :  125037.25
Lifestyle :  17260.53488372093
Entertainment :  15006.227272727272
Food & Drink :  33333.92307692308
Sports :  25791.666666666668
Finance :  26038.6875
Education :  6103.464285714285
Productivity :  22842.22
Utilities :  11571.69696969697
Book :  16671.0
Business :  6839.6
Catalogs :  5195.0
Medical :  612.0


In [25]:
# Generate a frequency table for Android Category
android_category_table = freq_table(android_final, 1)

# Average number of installs per Android category. Totals and app counts are
# gathered in one pass over the data instead of rescanning every app once
# per category.
install_totals = {}
category_counts = {}
for app in android_final:
    category_app = app[1]
    # Installs is text such as '10,000+'; strip the '+' and the thousands
    # separators before converting it to a number.
    installs = float(app[5].replace('+', '').replace(',', ''))
    install_totals[category_app] = install_totals.get(category_app, 0) + installs
    category_counts[category_app] = category_counts.get(category_app, 0) + 1

# Report in the order of the frequency table, truncating each average to
# a whole number.
for category in android_category_table:
    avg_installs = int(install_totals[category] / category_counts[category])
    print('Category is:' + category + ', Avg Installs:' + str(avg_installs))

Category is:ART_AND_DESIGN, Avg Installs:1932519
Category is:AUTO_AND_VEHICLES, Avg Installs:645317
Category is:BEAUTY, Avg Installs:513151
Category is:BOOKS_AND_REFERENCE, Avg Installs:8504745
Category is:BUSINESS, Avg Installs:1602958
Category is:COMICS, Avg Installs:880440
Category is:COMMUNICATION, Avg Installs:36106662
Category is:DATING, Avg Installs:764959
Category is:EDUCATION, Avg Installs:1844897
Category is:ENTERTAINMENT, Avg Installs:12346329
Category is:EVENTS, Avg Installs:232885
Category is:FINANCE, Avg Installs:1348224
Category is:FOOD_AND_DRINK, Avg Installs:1974937
Category is:HEALTH_AND_FITNESS, Avg Installs:4263642
Category is:HOUSE_AND_HOME, Avg Installs:1391211
Category is:LIBRARIES_AND_DEMO, Avg Installs:674917
Category is:LIFESTYLE, Avg Installs:1375297
Category is:GAME, Avg Installs:15434835
Category is:FAMILY, Avg Installs:3633707
Category is:MEDICAL, Avg Installs:119216
Category is:SOCIAL, Avg Installs:24441088
Category is:SHOPPING, Avg Installs:7307823
Categ

# Android App Analysis
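From the category averages printed above (the listing is cut off), COMMUNICATION has the highest average installs shown (about 36.1 million) and MEDICAL the lowest (about 119 thousand). VIDEO_PLAYERS does not appear in the visible portion of the output, so its average should be checked against the full listing before it is described as the highest.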
