# Data Analysis: Android and iOS Apps

In this project, we will analyze data to understand what types of apps are likely to attract users. This will let interested parties (such as app developers) know where to focus to maximize revenue.

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its overall dimensions.

    Args:
        dataset: Sequence of rows (each row itself a sequence).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of columns, measured on the first row of ``dataset``.
    """
    for record in dataset[start:end]:
        # One newline ends the row; the next two leave a visual gap after it.
        print(record, end='\n\n\n')

    if not rows_and_columns:
        return

    print('Number of rows: ', len(dataset))
    print('Number of columns: ', len(dataset[0]))

In [2]:
from csv import reader

# Read each CSV fully into a list of rows (row 0 is the header row).
# The `with` blocks close the file handles as soon as the rows are in memory.
# encoding='utf8' makes decoding independent of the platform default
# (assumes the files are UTF-8 encoded; TODO confirm against the downloads).
# newline='' is what the csv module asks for so that newlines embedded in
# quoted fields are parsed correctly.
with open('AppleStore.csv', encoding='utf8', newline='') as apple_ds:
    apple_apps_data = list(reader(apple_ds))

with open('googleplaystore.csv', encoding='utf8', newline='') as gplay_ds:
    gplay_apps_data = list(reader(gplay_ds))

In [3]:
# Preview data rows 1-4 of each dataset (row 0 is the header) and print the
# total row and column counts.
explore_data(apple_apps_data, 1, 5, True)
explore_data(gplay_apps_data, 1, 5, True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Number of rows:  7198
Number of columns:  16
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Liv

## Data Set Columns
More info about the datasets:

- [App Store](https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps)
- [Google Play](https://www.kaggle.com/lava18/google-play-store-apps)

In [4]:
# Row 0 of each dataset holds the column names.
print(apple_apps_data[0])
print(gplay_apps_data[0])

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


In [5]:
# Inspect the row at list index 10473 (header included in the count).
print(gplay_apps_data[10473])

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


## Clean Data

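Per [this](https://www.kaggle.com/lava18/google-play-store-apps/discussion/66015) discussion, the row at index 10473 (counting the header row as index 0) is missing a column: its `Category` value is absent, so every later value is shifted one position to the left. We delete this row.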

In [6]:
# Remove the malformed row at index 10473 (it has fewer fields than the header).
# The length check keeps this cell idempotent: once the bad row is gone, the row
# that takes its place has the full set of columns, so re-running the cell does
# not delete a valid app.
if len(gplay_apps_data[10473]) != len(gplay_apps_data[0]):
    del gplay_apps_data[10473]

Let's check for duplicates in both datasets

In [7]:
def duplicates(dataset, dup_basis_index, has_header=True):
    """Split the values found in one column into first occurrences and repeats.

    Args:
        dataset: List of rows (each row a list of values).
        dup_basis_index: Index of the column whose values identify a row.
            The values must be hashable (strings read from a CSV are).
        has_header: When true, the first row is treated as a header and skipped.

    Returns:
        A ``(unique, dup)`` tuple of lists. ``unique`` holds each distinct value
        once, in order of first appearance. ``dup`` holds one entry for every
        additional occurrence, in row order, so a value seen three times
        appears twice in ``dup``.
    """
    unique = []
    dup = []
    # Set mirror of `unique`: constant-time membership checks instead of
    # scanning the growing list for every row.
    seen = set()

    apps_data = dataset[1:] if has_header else dataset

    for app in apps_data:
        reference = app[dup_basis_index]

        if reference in seen:
            dup.append(reference)
        else:
            seen.add(reference)
            unique.append(reference)

    return unique, dup

# Column 0 of the Google Play data is the app name ('App'); count repeated names.
gplay_unique, gplay_dup = duplicates(gplay_apps_data, 0)
print(len(gplay_dup))

1181


Let's deal with Android duplicates first

In [8]:
# Peek at the first ten repeated app names.
print(gplay_dup[:10])

['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']


In [9]:
# Print every row whose app name (column 0) is 'Slack' to see how the
# duplicate entries differ from each other.
for app in gplay_apps_data[1:]:
    name = app[0]
    if name == 'Slack':
        print(app)

['Slack', 'BUSINESS', '4.4', '51507', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'August 2, 2018', 'Varies with device', 'Varies with device']
['Slack', 'BUSINESS', '4.4', '51507', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'August 2, 2018', 'Varies with device', 'Varies with device']
['Slack', 'BUSINESS', '4.4', '51510', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'August 2, 2018', 'Varies with device', 'Varies with device']


We're not going to remove rows randomly. We're going to retain the one with the highest number of reviews.

In [10]:
# Map each app name to the highest review count seen across its rows.
reviews_max = {}

for app in gplay_apps_data[1:]:
    name = app[0]
    n_reviews = float(app[3])

    # Record the count for a new name, or when it beats the stored maximum.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews

print(len(reviews_max))

9659


In [11]:
# Keep exactly one row per app name: the first row whose review count equals
# the maximum recorded for that name in reviews_max.
android_clean = []
already_added = []
# Set mirror of already_added, so the per-row membership test is constant-time
# instead of a scan over a list that grows to thousands of names.
added_names = set()

for app in gplay_apps_data[1:]:
    name = app[0]
    n_reviews = float(app[3])

    # The membership check handles ties: several rows of one app can share the
    # maximum review count, and only the first of them is kept.
    if n_reviews == reviews_max[name] and name not in added_names:
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)

print(len(android_clean))

# Spot check: only the Slack row with the highest review count should remain.
for app in android_clean:
    name = app[0]
    if name == 'Slack':
        print(app)

9659
['Slack', 'BUSINESS', '4.4', '51510', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'August 2, 2018', 'Varies with device', 'Varies with device']


There are no duplicates in the iOS apps dataset

In [12]:
# Column 0 of the App Store data is the app 'id'; count repeated ids.
ios_unique, ios_dup = duplicates(apple_apps_data, 0)

print(len(ios_dup))

0


In [13]:
# No duplicate ids were found, so the cleaned iOS data is every row except the header.
ios_clean = apple_apps_data[1:]

### Remove non-English apps

In [14]:
def contains_only_english_characters(name, max_non_ascii=3):
    """Return whether ``name`` looks like an English-language app name.

    A character counts as non-ASCII when its code point is above 127. A few
    such characters are tolerated, so names containing symbols such as a
    trademark sign or an emoji are still accepted.

    Args:
        name: The app name to check.
        max_non_ascii: Highest number of non-ASCII characters still accepted.
            Defaults to 3.

    Returns:
        ``False`` as soon as more than ``max_non_ascii`` non-ASCII characters
        have been seen, otherwise ``True`` (including for an empty string).
    """
    non_ascii_count = 0
    for character in name:
        if ord(character) > 127:
            non_ascii_count += 1
            # Stop early: the rest of the name cannot change the verdict.
            if non_ascii_count > max_non_ascii:
                return False

    return True


# Sanity checks: a plain ASCII name, a mostly non-ASCII name, and two names
# containing a single non-ASCII symbol (a trademark sign and an emoji).
print(contains_only_english_characters('Instagram'))
print(contains_only_english_characters('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(contains_only_english_characters('Docs To Go™ Free Office Suite'))
print(contains_only_english_characters('Instachat 😜'))

True
False
True
True


In [15]:
def english_apps(dataset, name_index):
    """Return the rows of ``dataset`` whose name passes the English-name check.

    Args:
        dataset: List of rows without a header row.
        name_index: Index of the column holding the app name.

    Returns:
        A new list containing, in their original order, the rows for which
        ``contains_only_english_characters`` returns true.
    """
    return [
        record
        for record in dataset
        if contains_only_english_characters(record[name_index])
    ]

# The app name is column 0 ('App') in the Google Play data and
# column 1 ('track_name') in the App Store data.
android_clean = english_apps(android_clean, 0)
ios_clean = english_apps(ios_clean, 1)

print(len(android_clean))
print(len(ios_clean))

9614
6183


### Isolate free apps

In [16]:
def separate_free(dataset, reference_index):
    """Partition rows into free and paid apps based on a price column.

    Args:
        dataset: List of rows without a header row.
        reference_index: Index of the column holding the price as a string;
            any '$' characters are stripped before converting to float.

    Returns:
        A ``(free, non_free)`` tuple of lists. Rows with a price greater than
        zero go to ``non_free``; all other rows go to ``free``.
    """
    free, non_free = [], []

    for record in dataset:
        amount = float(record[reference_index].replace('$', ''))
        bucket = non_free if amount > 0 else free
        bucket.append(record)

    return free, non_free

# The price is column 7 ('Price') in the Google Play data and
# column 4 ('price') in the App Store data.
android_free, android_paid = separate_free(android_clean, 7)
ios_free, ios_paid = separate_free(ios_clean, 4)

print(len(android_free))
print(len(ios_free))

8864
3222


## What's clicking?
The strategy we may want to employ is to quickly release an app in the Google Play Store, and then if it becomes profitable, release an iOS version as well. Therefore, we need to find app profiles that are successful in both the iOS and Android platforms.

That's what we're going to do in this part of the notebook. We start by building frequency tables.

### Build Frequency Tables

In [17]:
def freq_table(dataset, index):
    """Count how often each value occurs in one column of ``dataset``.

    Args:
        dataset: List of rows without a header row.
        index: Index of the column to tabulate.

    Returns:
        A dict mapping each distinct column value to its number of
        occurrences, with keys in order of first appearance.
    """
    counts = {}

    for record in dataset:
        value = record[index]
        counts[value] = counts.get(value, 0) + 1

    return counts

def display_table(dataset, index):
    """Print the frequency table of one column, most frequent value first.

    Args:
        dataset: List of rows without a header row.
        index: Index of the column to tabulate.

    Each line is printed as ``value : count``. Entries are ordered by
    descending ``(count, value)``, so ties on count are broken by the value
    itself in descending order.
    """
    counts = freq_table(dataset, index)
    ranked = sorted(
        ((count, value) for value, count in counts.items()),
        reverse=True,
    )

    for count, value in ranked:
        print(value, ':', count)

# Frequency tables for the Google Play 'Category' (column 1) and 'Genres'
# (column 9) columns and for the App Store 'prime_genre' column (column 11).
# NOTE(review): these tables are built from android_clean / ios_clean, which
# contain both free and paid apps, not from android_free / ios_free.
# Confirm which population is intended.
print("ANDROID CATEGORIES")
print()
display_table(android_clean, 1)
print()
print("ANDROID GENRES")
print()
display_table(android_clean, 9)
print()
print("IOS PRIME GENRE")
print()
display_table(ios_clean, 11)

ANDROID CATEGORIES

FAMILY : 1858
GAME : 944
TOOLS : 828
BUSINESS : 419
MEDICAL : 395
PERSONALIZATION : 375
PRODUCTIVITY : 373
LIFESTYLE : 364
FINANCE : 345
SPORTS : 325
COMMUNICATION : 314
HEALTH_AND_FITNESS : 288
PHOTOGRAPHY : 280
NEWS_AND_MAGAZINES : 250
SOCIAL : 239
TRAVEL_AND_LOCAL : 219
BOOKS_AND_REFERENCE : 218
SHOPPING : 201
DATING : 170
VIDEO_PLAYERS : 163
MAPS_AND_NAVIGATION : 129
FOOD_AND_DRINK : 112
EDUCATION : 106
ENTERTAINMENT : 87
LIBRARIES_AND_DEMO : 84
AUTO_AND_VEHICLES : 84
WEATHER : 79
HOUSE_AND_HOME : 73
EVENTS : 64
PARENTING : 60
ART_AND_DESIGN : 60
COMICS : 55
BEAUTY : 53

ANDROID GENRES

Tools : 827
Entertainment : 557
Education : 503
Business : 419
Medical : 395
Personalization : 375
Productivity : 373
Lifestyle : 363
Finance : 345
Sports : 331
Communication : 314
Action : 299
Health & Fitness : 288
Photography : 280
News & Magazines : 250
Social : 239
Travel & Local : 218
Books & Reference : 218
Shopping : 201
Simulation : 190
Arcade : 184
Dating : 170
Casual :

## Analysis
These observations apply to English-language apps only.

### iOS
By far the largest number of free apps in the Apple App Store are games, followed by Entertainment apps.

It would seem that for an app to have a chance at success, practical purposes like News, Weather, Navigation and Travel should take a back seat in favor of apps that are meant to entertain.

But we should take note that this only shows the *number* of apps in the data, it doesn't show the number of users.

Although we can theorize that many apps of a type in the app store mean a lot of users, we still need to verify that with data.

### Android
The Categories and Genres data in the Android dataset shows some differences from what we saw in its iOS counterpart. Here we see that although the most common Category and Genre are Family and Tools, respectively, the numbers show that the app categories are a bit more balanced.

## Average users per genre

In [18]:
# Average number of user ratings (column 5, 'rating_count_tot') per iOS
# prime genre (column 11).
ios_prime_genre = freq_table(ios_clean, 11)

rating_counts = {}

# The frequency table already holds the number of apps in each genre, so it
# doubles as the denominator of the average.
for genre, n_apps in ios_prime_genre.items():
    total = 0

    for app in ios_clean:
        if app[11] == genre:
            total += float(app[5])

    rating_counts[genre] = total / n_apps

In [19]:
# Print the per-genre averages, highest average first.
table = rating_counts
table_display = [(table[key], key) for key in table]

# Sorting (value, key) tuples in reverse orders by value, then by key.
table_sorted = sorted(table_display, reverse=True)
for entry in table_sorted:
    print(entry[1], ':', entry[0])

Social Networking : 60253.84920634921
Music : 29047.109489051094
Reference : 27037.188679245282
Shopping : 26635.011764705883
Finance : 23353.530612244896
Weather : 23145.246376811596
Food & Drink : 19934.386363636364
Navigation : 19370.821428571428
Travel : 19030.183333333334
News : 16980.315789473683
Games : 15586.759433962265
Sports : 15350.913461538461
Photo & Video : 14688.715542521993
Health & Fitness : 10802.157575757576
Book : 10359.2
Lifestyle : 8930.373737373737
Entertainment : 8862.409799554565
Productivity : 8508.089285714286
Utilities : 7927.525821596244
Business : 5149.320754716981
Catalogs : 3465.0
Education : 2472.278048780488
Medical : 648.952380952381


In [20]:
# List every app in the 'Social Networking' genre with its total rating count.
# The genre is read from column 11 ('prime_genre'), the same index used for the
# per-genre averages, instead of -5, which only lands on that column while
# every row has exactly 16 fields.
for app in ios_clean:
    if app[11] == 'Social Networking':
        print(app[1], ':', app[5]) # print name and number of ratings

Facebook : 2974676
Pinterest : 1061624
Skype for iPhone : 373519
Messenger : 351466
Tumblr : 334293
WhatsApp Messenger : 287589
Kik : 260965
ooVoo – Free Video Call, Text and Voice : 177501
TextNow - Unlimited Text + Calls : 164963
Viber Messenger – Text & Call : 164249
Followers - Social Analytics For Instagram : 112778
MeetMe - Chat and Meet New People : 97072
We Heart It - Fashion, wallpapers, quotes, tattoos : 90414
InsTrack for Instagram - Analytics Plus More : 85535
Tango - Free Video Call, Voice and Chat : 75412
LinkedIn : 71856
Match™ - #1 Dating App. : 60659
Skype for iPad : 60163
POF - Best Dating App for Conversations : 52642
Timehop : 49510
Find My Family, Friends & iPhone - Life360 Locator : 43877
Whisper - Share, Express, Meet : 39819
Hangouts : 36404
LINE PLAY - Your Avatar World : 34677
WeChat : 34584
Badoo - Meet New People, Chat, Socialize. : 34428
Followers + for Instagram - Follower Analytics : 28633
GroupMe : 28260
Marco Polo Video Walkie Talkie : 27662
Miitomo : 2

## iOS Notes
We shouldn't blindly make recommendations based on the above data, because the averages may be inflated by a few giants (Facebook, Spotify, etc.). The genre might therefore not be as popular as the raw numbers suggest.

We could suggest a Reference-type app with game elements?

In [21]:
# Distribution of the Google Play 'Installs' column (column 5); the values are
# open-ended buckets such as '1,000+', not exact counts.
display_table(android_clean, 5)

1,000,000+ : 1414
100,000+ : 1106
10,000+ : 1021
10,000,000+ : 937
1,000+ : 880
100+ : 704
5,000,000+ : 605
500,000+ : 504
5,000+ : 465
50,000+ : 463
10+ : 384
500+ : 328
50,000,000+ : 204
50+ : 204
100,000,000+ : 189
5+ : 82
1+ : 66
500,000,000+ : 24
1,000,000,000+ : 20
0+ : 13
0 : 1


In [27]:
# Average number of installs per Android category: 'Category' is column 1 and
# 'Installs' is column 5.
android_category_freq = freq_table(android_clean, 1)

android_install_counts = {}

# The frequency table already holds the number of apps in each category, so it
# doubles as the denominator of the average.
for category, n_apps in android_category_freq.items():
    total = 0

    for app in android_clean:
        if app[1] == category:
            # Strip the thousands separators and the trailing '+',
            # e.g. '1,000,000+' -> 1000000.0
            total += float(app[5].replace(',', '').replace('+', ''))

    average_installs = total / n_apps
    android_install_counts[category] = average_installs

    print(category + " " + str(average_installs))

VIDEO_PLAYERS 24121489.079754602
HOUSE_AND_HOME 1331540.5616438356
MAPS_AND_NAVIGATION 3900634.7286821706
ART_AND_DESIGN 1887285.0
FOOD_AND_DRINK 1891060.2767857143
SHOPPING 6966908.880597015
WEATHER 4570892.658227848
ENTERTAINMENT 11375402.298850575
SOCIAL 22961790.384937238
PARENTING 525351.8333333334
MEDICAL 96944.49873417722
TRAVEL_AND_LOCAL 13218662.767123288
PRODUCTIVITY 15530942.008042896
BEAUTY 513151.88679245283
EVENTS 249580.640625
DATING 828971.2176470588
BUSINESS 1663758.627684964
NEWS_AND_MAGAZINES 9472807.04
FAMILY 3345018.516684607
AUTO_AND_VEHICLES 632501.3214285715
HEALTH_AND_FITNESS 3972300.388888889
LIFESTYLE 1369954.7774725275
EDUCATION 1782566.0377358492
LIBRARIES_AND_DEMO 630903.6904761905
TOOLS 9785955.211352658
PHOTOGRAPHY 16636241.267857144
SPORTS 3373767.6861538463
GAME 14256217.600635594
PERSONALIZATION 4086652.4853333333
COMICS 817657.2727272727
COMMUNICATION 35153714.17515924
BOOKS_AND_REFERENCE 7641777.871559633
FINANCE 1319851.4028985507


In [28]:
# Print the per-category install averages, highest average first.
table = android_install_counts
table_display = [(table[key], key) for key in table]

# Sorting (value, key) tuples in reverse orders by value, then by key.
table_sorted = sorted(table_display, reverse=True)
for entry in table_sorted:
    print(entry[1], ':', entry[0])

COMMUNICATION : 35153714.17515924
VIDEO_PLAYERS : 24121489.079754602
SOCIAL : 22961790.384937238
PHOTOGRAPHY : 16636241.267857144
PRODUCTIVITY : 15530942.008042896
GAME : 14256217.600635594
TRAVEL_AND_LOCAL : 13218662.767123288
ENTERTAINMENT : 11375402.298850575
TOOLS : 9785955.211352658
NEWS_AND_MAGAZINES : 9472807.04
BOOKS_AND_REFERENCE : 7641777.871559633
SHOPPING : 6966908.880597015
WEATHER : 4570892.658227848
PERSONALIZATION : 4086652.4853333333
HEALTH_AND_FITNESS : 3972300.388888889
MAPS_AND_NAVIGATION : 3900634.7286821706
SPORTS : 3373767.6861538463
FAMILY : 3345018.516684607
FOOD_AND_DRINK : 1891060.2767857143
ART_AND_DESIGN : 1887285.0
EDUCATION : 1782566.0377358492
BUSINESS : 1663758.627684964
LIFESTYLE : 1369954.7774725275
HOUSE_AND_HOME : 1331540.5616438356
FINANCE : 1319851.4028985507
DATING : 828971.2176470588
COMICS : 817657.2727272727
AUTO_AND_VEHICLES : 632501.3214285715
LIBRARIES_AND_DEMO : 630903.6904761905
PARENTING : 525351.8333333334
BEAUTY : 513151.88679245283
EV

## Android R