This project takes two datasets (AppleStore and googleplaystore) and analyses them to see what types of apps are likely to attract more users.

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its overall dimensions.

    Args:
        dataset: A sequence of rows that supports slicing and ``len()``.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of columns (measured on the first row).
    """
    for row in dataset[start:end]:
        print(row)
        # print() already ends with a newline, so printing '\n' leaves two
        # blank lines between consecutive rows.
        print('\n')

    if rows_and_columns:
        n_rows = len(dataset)
        # An empty dataset has no first row to measure; report 0 columns
        # instead of raising IndexError.
        n_columns = len(dataset[0]) if n_rows else 0
        print('Number of rows:', n_rows)
        print('Number of columns:', n_columns)

In [2]:
def open_dataset(file_name, encoding='utf8'):
    """Read a CSV file and return its contents as a list of rows.

    Args:
        file_name: Path of the CSV file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        A list with one entry per CSV record, each a list of strings. No row
        is treated specially, so a header line (if present) is row 0.
    """
    from csv import reader

    # newline='' hands line-ending handling to the csv module, which is what
    # it needs to parse quoted fields containing embedded newlines correctly.
    # The with-block guarantees the file is closed once it has been read.
    with open(file_name, encoding=encoding, newline='') as opened_file:
        return list(reader(opened_file))

In [3]:
# Load the App Store CSV as a list of rows; row 0 is the header row.
appledata = open_dataset('AppleStore.csv')

In [4]:
# Load the Google Play CSV as a list of rows; row 0 is the header row.
googdata = open_dataset('googleplaystore.csv')

In [5]:
# Spot-check a single App Store row (index 0 is the header, so this is a data row).
appledata[155]

['367623543',
 'Fox News',
 '64705536',
 'USD',
 '0.0',
 '132703',
 '394',
 '4.0',
 '3.0',
 '2.6',
 '4+',
 'News',
 '37',
 '5',
 '1',
 '0']

In [6]:
# Remove the row at index 10473 of the raw Google Play data.
# Deleting purely by position is not repeatable: running this cell a second
# time would silently remove whichever valid row has moved into that index.
# The delete is therefore guarded by a column-count check against the header.
# NOTE(review): assumes the row being dropped is malformed in a way that
# changes its number of columns -- confirm against the raw CSV.
if len(googdata[10473]) != len(googdata[0]):
    del googdata[10473]

In [7]:
# Show the row that now occupies index 10473 after the deletion.
googdata[10473]

['osmino Wi-Fi: free WiFi',
 'TOOLS',
 '4.2',
 '134203',
 '4.1M',
 '10,000,000+',
 'Free',
 '0',
 'Everyone',
 'Tools',
 'August 7, 2018',
 '6.06.14',
 '4.4 and up']

Let's look at duplicates. The Google Play dataset likely has a few. 

In [8]:
# Split app names (column 0) into first occurrences and repeats.
duplicate_apps = []
unique_apps = []
# Mirrors unique_apps but gives constant-time membership tests; checking a
# list inside the loop made the scan quadratic in the number of rows.
seen_names = set()

# This walks every row of googdata, including the header row at index 0.
for row in googdata:
    appname = row[0]
    if appname in seen_names:
        duplicate_apps.append(appname)
    else:
        seen_names.add(appname)
        unique_apps.append(appname)

print('The number of duplicate apps is:', len(duplicate_apps))
# Not echoed by the notebook: only the cell's last expression is displayed.
duplicate_apps[1:15]

googdata[10473]

The number of duplicate apps is: 1181


['osmino Wi-Fi: free WiFi',
 'TOOLS',
 '4.2',
 '134203',
 '4.1M',
 '10,000,000+',
 'Free',
 '0',
 'Everyone',
 'Tools',
 'August 7, 2018',
 '6.06.14',
 '4.4 and up']

In [9]:
# Map every app name to the largest review count found among its rows.
reviews_max = {}

for row in googdata[1:]:  # index 0 is the header row
    app_name, review_count = row[0], float(row[3])
    # Record the first sighting, or replace a smaller count seen earlier.
    if app_name not in reviews_max or reviews_max[app_name] < review_count:
        reviews_max[app_name] = review_count

len(reviews_max)
    

9659

In [10]:
# Keep exactly one row per app: the first row whose review count equals the
# maximum recorded for that app in reviews_max.
android_clean = []
already_added = []
# Same names as already_added, kept in a set so the per-row membership test
# is constant-time rather than a scan of the whole list.
added_names = set()

for row in googdata[1:]:  # index 0 is the header row
    name = row[0]
    n_reviews = float(row[3])
    # The name check stops several rows tied at the maximum from all being kept.
    if n_reviews == reviews_max[name] and name not in added_names:
        android_clean.append(row)
        already_added.append(name)
        added_names.add(name)

# Not echoed by the notebook: only the cell's last expression is displayed.
android_clean[1:10]
len(android_clean)
    


9659

In [11]:
def uncommon_remover(check_string):
    """Return True when ``check_string`` has at most three non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127. The
    scan stops and returns False as soon as a fourth such character is found.
    """
    allowance = 3  # non-ASCII characters still tolerated
    for char in check_string:
        if ord(char) <= 127:
            continue
        if allowance == 0:
            return False
        allowance -= 1
    return True

In [12]:
# Keep the de-duplicated Google Play rows whose app name (column 0) passes
# the non-ASCII character filter.
english_apps_android = [
    app for app in android_clean if uncommon_remover(app[0])
]

len(english_apps_android)

9614

In [13]:
# Keep the App Store rows (header skipped) whose track name (column 1)
# passes the non-ASCII character filter.
english_apps_apple = [
    app for app in appledata[1:] if uncommon_remover(app[1])
]

len(english_apps_apple)

6183

In [14]:
# The price is column 7 in the Google Play rows and column 4 in the
# App Store rows. This cell treats the exact strings '0' (Google Play) and
# '0.0' (App Store) as "free".
free_android_apps = [app for app in english_apps_android if app[7] == '0']

free_apple_apps = [app for app in english_apps_apple if app[4] == '0.0']

len(free_apple_apps)


3222

Why do we want an app that fits both the App Store and Google Play? Our plan is simple: build an Android version first and, if it gets a good response after six months, build an iOS version. It therefore makes sense to look for an app profile that works in both markets.

In [15]:
# Show the Google Play header row to look up column positions.
googdata[0]

['App',
 'Category',
 'Rating',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated',
 'Current Ver',
 'Android Ver']

In [16]:
# Show the App Store header row to look up column positions.
appledata[0]

['id',
 'track_name',
 'size_bytes',
 'currency',
 'price',
 'rating_count_tot',
 'rating_count_ver',
 'user_rating',
 'user_rating_ver',
 'ver',
 'cont_rating',
 'prime_genre',
 'sup_devices.num',
 'ipadSc_urls.num',
 'lang.num',
 'vpp_lic']

We have 'Category' and 'Genres' in the Google data, and 'prime_genre' in the Apple data. These columns are probably our best option for comparing the two datasets.

In [17]:
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of ``dataset``.

    Args:
        dataset: An iterable of rows, each indexable by ``index``.
        index: Position of the column whose values are tallied.

    Returns:
        A dict mapping each distinct value in that column to the percentage
        of rows holding it, with keys in order of first appearance. An empty
        dataset gives an empty dict.
    """
    counts = {}
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    # Every row contributed exactly one count, so the counts add up to the
    # number of rows.
    row_total = sum(counts.values())
    return {value: count / row_total * 100 for value, count in counts.items()}

In [18]:
def display_table(dataset, index):
    """Print the frequency table of one column, largest percentage first.

    Args:
        dataset: Rows passed straight through to ``freq_table``.
        index: Position of the column to summarise.

    Each output line has the form ``value : percentage``. Entries with equal
    percentages are ordered by value, descending, because the sort key is
    the ``(percentage, value)`` pair.
    """
    percentages = freq_table(dataset, index)
    ranked = sorted(
        ((share, value) for value, share in percentages.items()),
        reverse=True,
    )
    for share, value in ranked:
        print(value, ':', share)

In [19]:
# Percentage breakdown of the 'Category' column (index 1) for the free Google Play apps.
display_table(free_android_apps, 1)

FAMILY : 18.907942238267147
GAME : 9.724729241877256
TOOLS : 8.461191335740072
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.7937725631768955
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.1620036101083033
ENTERTAINMENT : 0.9589350180505415
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 

In [20]:
# Percentage breakdown of the 'Genres' column (index 9) for the free Google Play apps.
display_table(free_android_apps, 9)

Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7599277978339352
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.9250902527075

In [21]:
# Percentage breakdown of the 'prime_genre' column (index 11) for the free App Store apps.
display_table(free_apple_apps, 11)

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


The most common genre in the App Store is Games, by far. Entertainment and Photo & Video are the runners-up. Most apps are designed for fun: Games and Entertainment together make up more than 60% of the free apps. That being said, a large number of apps does not imply a large number of users per app. Likewise, this only speaks about free apps - perhaps games are more likely to be free.

For the Google Play set, the patterns are much less extreme. Family, Game and Tools lead the Category table, and Tools leads the Genres table; while entertainment is certainly there, entertainment apps dominate the store far less. Business and Productivity apps have a much higher relative frequency than on the App Store.

As it stands there is a dominating category in Apple - games. The only real recommendation to be made so far is for games, on the basis of how many such apps exist, which does not yet tell us how many users each app attracts.

In [33]:
# Average number of user ratings (column 5, 'rating_count_tot') per free
# App Store app, grouped by genre (column 11, 'prime_genre').
genre_table = freq_table(free_apple_apps, 11)

# Accumulate the sums and app counts for every genre in one pass instead of
# rescanning the whole dataset once per genre.
rating_count_totals = {}
apps_per_genre = {}
for app in free_apple_apps:
    app_genre = app[11]
    rating_count_totals[app_genre] = (
        rating_count_totals.get(app_genre, 0) + float(app[5])
    )
    apps_per_genre[app_genre] = apps_per_genre.get(app_genre, 0) + 1

# genre_table only sets the print order (genres in order of first appearance).
for genre in genre_table:
    avg_rating_count = rating_count_totals[genre] / apps_per_genre[genre]
    print(genre)
    print(avg_rating_count)
    

Finance
31467.944444444445
Education
7003.983050847458
Business
7491.117647058823
Entertainment
14029.830708661417
Reference
74942.11111111111
Productivity
21028.410714285714
Photo & Video
28441.54375
Catalogs
4004.0
Medical
612.0
Social Networking
71548.34905660378
Book
39758.5
Lifestyle
16485.764705882353
Food & Drink
33333.92307692308
Travel
28243.8
Utilities
18684.456790123455
Games
22788.6696905016
Weather
52279.892857142855
Shopping
26919.690476190477
News
21248.023255813954
Music
57326.530303030304
Health & Fitness
23298.015384615384
Sports
23008.898550724636
Navigation
86090.33333333333


Navigation, Social Networking, and Reference have some of the highest average numbers of user ratings per app, which we use as a stand-in for popularity because the App Store data has no install counts. This may be due to network effects (social networking is primarily used because of other people using said software), or due to superiority to other apps - shopping apps may be rivalrous in the sense that they offer different marketplaces, but one reference app may simply be a *better* offering than the other. Likewise for navigation. 

A reference app may be the way to go.

In [36]:
# Average number of installs (column 5, 'Installs') per free Google Play
# app, grouped by category (column 1, 'Category').
category_table = freq_table(free_android_apps, 1)

# Accumulate the sums and app counts for every category in one pass instead
# of rescanning the whole dataset once per category.
install_totals = {}
apps_per_category = {}
for app in free_android_apps:
    app_category = app[1]
    # Install counts are strings such as '10,000,000+'; strip the '+' and
    # the thousands separators before converting.
    installs = float(app[5].replace('+', '').replace(',', ''))
    install_totals[app_category] = install_totals.get(app_category, 0) + installs
    apps_per_category[app_category] = apps_per_category.get(app_category, 0) + 1

# category_table only sets the print order (categories in order of first appearance).
for category in category_table:
    avg_installs = install_totals[category] / apps_per_category[category]
    print(category)
    print(avg_installs)

COMICS
817657.2727272727
COMMUNICATION
38456119.167247385
BUSINESS
1712290.1474201474
FAMILY
3695641.8198090694
FINANCE
1387692.475609756
BEAUTY
513151.88679245283
ART_AND_DESIGN
1986335.0877192982
SOCIAL
23253652.127118643
LIBRARIES_AND_DEMO
638503.734939759
PERSONALIZATION
5201482.6122448975
DATING
854028.8303030303
PHOTOGRAPHY
17840110.40229885
PARENTING
542603.6206896552
VIDEO_PLAYERS
24727872.452830188
MEDICAL
120550.61980830671
MAPS_AND_NAVIGATION
4056941.7741935486
EDUCATION
1833495.145631068
TRAVEL_AND_LOCAL
13984077.710144928
ENTERTAINMENT
11640705.88235294
NEWS_AND_MAGAZINES
9549178.467741935
LIFESTYLE
1437816.2687861272
AUTO_AND_VEHICLES
647317.8170731707
BOOKS_AND_REFERENCE
8767811.894736841
GAME
15588015.603248259
PRODUCTIVITY
16787331.344927534
HOUSE_AND_HOME
1331540.5616438356
SPORTS
3638640.1428571427
SHOPPING
7036877.311557789
HEALTH_AND_FITNESS
4188821.9853479853
TOOLS
10801391.298666667
WEATHER
5074486.197183099
FOOD_AND_DRINK
1924897.7363636363
EVENTS
253542.2222222