### Profitable App Profiles for the App Store and Google Play Markets ###

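This notebook analyzes App Store and Google Play data to find app profiles that are likely to be profitable.

The goal is to suggest which kinds of apps should be built.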

In [1]:
from csv import reader

# Load the App Store data set.
# `with` closes the file handle as soon as the rows have been read.
# An explicit encoding keeps non-ASCII app names readable regardless of the
# platform default, and newline='' lets the csv reader handle line endings
# itself (as recommended by the csv module documentation).
with open('AppleStore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    ios = list(read_file)
ios_header = ios[0]  # first row holds the column names
ios = ios[1:]        # remaining rows are the apps

# Load the Google Play data set the same way.
with open('googleplaystore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    android = list(read_file)
android_header = android[0]
android = android[1:]

In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a data set and, optionally, its dimensions.

    Parameters
    ----------
    dataset : list of lists
        The rows to explore; must not include a header row if the column
        count is to reflect data rows.
    start, end : int
        Slice bounds; rows ``dataset[start:end]`` are printed.
    rows_and_columns : bool, optional
        When True, also print the total number of rows and the number of
        columns (taken from the first row).

    Returns
    -------
    None. Everything is written to standard output.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty data set has no first row to measure; report 0 columns
        # instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)

In [3]:
# Preview rows 1-4 of each store's data set together with its dimensions.
explore_data(dataset=ios, start=1, end=5, rows_and_columns=True)
print('\n')
explore_data(dataset=android, start=1, end=5, rows_and_columns=True)

['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


Number of rows: 7197
Number of columns: 16


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Dra

In [4]:
# Show the malformed row next to the header and a well-formed row so the
# column mismatch is easy to spot.
rows_to_compare = (
    android[10472],  # incorrect row
    android_header,  # header
)
for row in rows_to_compare:
    print(row)
    print('\n')
print(android[0])  # correct row

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


In [5]:
print(len(android))
# Row 10472 has fewer fields than the header (its 'Category' value is missing).
# Delete it only while it is still malformed, so that re-running this cell
# does not silently remove a valid row that has shifted into that position.
if len(android[10472]) != len(android_header):
    del android[10472]
print(len(android))

10841
10840


There are some duplicates in the Play Store dataset. We are going to remove them. Please see some examples below. 

In [6]:
duplicate_apps = []  # one entry per repeated occurrence of an app name
unique_apps = []     # each app name once, in first-seen order
seen_names = set()   # mirrors unique_apps to give O(1) membership tests

for app in android:
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps: ', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps: ', duplicate_apps[:10])

Number of duplicate apps:  1181


Examples of duplicate apps:  ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']


For each duplicated app, we are going to keep the entry with the most reviews and delete the others.

In [7]:
# Every duplicate occurrence will be dropped, so this is the row count we
# expect after cleaning.
expected_length = len(android) - len(duplicate_apps)
print('Expected length: ', expected_length)

Expected length:  9659


We now store the highest review count for each app name in a dict.

In [8]:
# Map each app name to the largest review count seen among its rows.
reviews_max = {}
for row in android:
    app_name = row[0]
    review_count = float(row[3])
    # Record the count for a new name, or replace a smaller stored count.
    if app_name not in reviews_max or reviews_max[app_name] < review_count:
        reviews_max[app_name] = review_count

print(len(reviews_max))

9659


Create two lists: one to hold the cleaned apps and one to track the names that have already been added.

In [9]:
# android_clean collects the de-duplicated rows; already_added tracks the
# app names that have been stored so far.
android_clean, already_added = [], []

Store an app in the clean list only when its review count matches the maximum stored in the dict and its name is not already in the added list.

In [10]:
# Set mirror of already_added: membership tests on the list would rescan it
# for every row, making the loop quadratic.
added_names = set(already_added)

for app in android:
    name = app[0]
    n_reviews = float(app[3])

    # Keep the row holding the maximum review count. The name check handles
    # several rows of the same app tying for that maximum.
    if n_reviews == reviews_max[name] and name not in added_names:
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)

In [11]:
explore_data(dataset=android_clean, start=1, end=5)
# Row counts after and before de-duplication, one per line.
print(len(android_clean), len(android), sep='\n')

['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']


9659
10840


Now we will try to filter out the non-English apps while keeping as many apps as possible whose names merely contain a few special characters.

In [12]:
def is_english(string, max_non_ascii=3):
    """Heuristically decide whether an app name is English.

    Characters whose code point is above 127 fall outside the ASCII range.
    A name is treated as English as long as it contains no more than
    ``max_non_ascii`` such characters, so names with a few symbols or
    emoji are still accepted.

    Parameters
    ----------
    string : str
        The text to check.
    max_non_ascii : int, optional
        Largest number of non-ASCII characters still accepted (default 3).

    Returns
    -------
    bool
        True when the name passes the check, False otherwise.
    """
    non_ascii_count = sum(1 for letter in string if ord(letter) > 127)
    return non_ascii_count <= max_non_ascii

In [13]:
# Spot-check the heuristic on a plain name, a non-English name, and two
# English names that contain a symbol or an emoji.
sample_names = (
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suit',
    'Instachat 😜',
)
for sample in sample_names:
    print(is_english(sample))

True
False
True
True


Now we remove the non-English apps from both data sets.

In [14]:
# Keep only the rows whose app name passes the is_english() check.
# The name is column 0 in the Google Play rows and column 1 in the App Store rows.
eng_android = [row for row in android_clean if is_english(row[0])]
eng_ios = [row for row in ios if is_english(row[1])]

explore_data(eng_android, 1, 5, True)
print('\n')
explore_data(eng_ios, 1, 5, True)


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']


Number of rows: 9614
Number of columns: 13


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5',

In [18]:
# Show both headers, separated by the same blank lines as before, to look
# up column positions.
print(ios_header, '\n', android_header, sep='\n')

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


Now we want to keep only the free apps.

In [22]:
# Free App Store apps: the price column (index 4) reads '0.0'.
ios_final = [row for row in eng_ios if row[4] == '0.0']
explore_data(ios_final, 1, 5, True)

# Free Google Play apps: the 'Price' column (index 7) reads '0'.
android_final = [row for row in eng_android if row[7] == '0']
explore_data(android_final, 1, 5, True)

['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


Number of rows: 3222
Number of columns: 16
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Num

We want to build an app that is profitable on both platforms. We will test the waters with an app for the Play Store first; then, after six months, we can move on to the App Store.

Now we want to see the frequency of each genre.

In [23]:
# Column names of both data sets, for picking the genre/category columns.
print(ios_header, end='\n\n\n')
print(android_header)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


In [49]:
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of a data set.

    Parameters
    ----------
    dataset : list of lists
        The rows to tally.
    index : int
        Position of the column whose values are counted.

    Returns
    -------
    dict
        Maps each distinct value (in first-seen order) to its share of the
        rows, expressed as a percentage rounded to three decimals.
    """
    counts = {}
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    total_apps = len(dataset)
    return {
        value: round(count/total_apps*100, 3)
        for value, count in counts.items()
    }

In [39]:
# Frequency table of column 11 (prime_genre) for the free English iOS apps.
ios_genre_freq_table = freq_table(dataset=ios_final, index=11)
print(ios_genre_freq_table)

{'Social Networking': 0.0329, 'Photo & Video': 0.04966, 'Games': 0.58163, 'Music': 0.02048, 'Reference': 0.00559, 'Health & Fitness': 0.02017, 'Weather': 0.00869, 'Utilities': 0.02514, 'Travel': 0.01241, 'Shopping': 0.02607, 'News': 0.01335, 'Navigation': 0.00186, 'Lifestyle': 0.01583, 'Entertainment': 0.07883, 'Food & Drink': 0.00807, 'Sports': 0.02142, 'Book': 0.00435, 'Finance': 0.01117, 'Education': 0.03662, 'Productivity': 0.01738, 'Business': 0.00528, 'Catalogs': 0.00124, 'Medical': 0.00186}


In [50]:
def display_table(dataset, index):
    """Print the frequency table of one column, largest share first.

    Parameters
    ----------
    dataset : list of lists
        The rows to tally.
    index : int
        Position of the column passed on to ``freq_table``.

    Each output line has the form ``value : percentage``.
    """
    percentages = freq_table(dataset, index)
    # (share, label) tuples sort by share first, then by label.
    ranked = [(share, label) for label, share in percentages.items()]
    ranked.sort(reverse=True)
    for share, label in ranked:
        print(label, ':', share)

In [51]:
# iOS: prime_genre (column 11).
display_table(ios_final, 11)
# Android: Category (column 1), then Genres (column 9), each preceded by
# a blank-line separator.
for column in (1, 9):
    print('\n')
    display_table(android_final, column)

Games : 58.163
Entertainment : 7.883
Photo & Video : 4.966
Education : 3.662
Social Networking : 3.29
Shopping : 2.607
Utilities : 2.514
Sports : 2.142
Music : 2.048
Health & Fitness : 2.017
Productivity : 1.738
Lifestyle : 1.583
News : 1.335
Travel : 1.241
Finance : 1.117
Weather : 0.869
Food & Drink : 0.807
Reference : 0.559
Business : 0.528
Book : 0.435
Navigation : 0.186
Medical : 0.186
Catalogs : 0.124


FAMILY : 18.908
GAME : 9.725
TOOLS : 8.461
BUSINESS : 4.592
LIFESTYLE : 3.903
PRODUCTIVITY : 3.892
FINANCE : 3.7
MEDICAL : 3.531
SPORTS : 3.396
PERSONALIZATION : 3.317
COMMUNICATION : 3.238
HEALTH_AND_FITNESS : 3.08
PHOTOGRAPHY : 2.944
NEWS_AND_MAGAZINES : 2.798
SOCIAL : 2.662
TRAVEL_AND_LOCAL : 2.335
SHOPPING : 2.245
BOOKS_AND_REFERENCE : 2.144
DATING : 1.861
VIDEO_PLAYERS : 1.794
MAPS_AND_NAVIGATION : 1.399
FOOD_AND_DRINK : 1.241
EDUCATION : 1.162
ENTERTAINMENT : 0.959
LIBRARIES_AND_DEMO : 0.936
AUTO_AND_VEHICLES : 0.925
HOUSE_AND_HOME : 0.824
WEATHER : 0.801
EVENTS : 0.711
PARE

**Most popular genres of App Store**

In [53]:
genres_ios = freq_table(ios_final, 11)

# For every genre, average the rating_count_tot column (index 5) over the
# apps whose prime_genre (index 11) matches.
for genre in genres_ios:
    ratings_sum = 0
    app_count = 0

    for row in ios_final:
        if row[11] != genre:
            continue
        ratings_sum += float(row[5])
        app_count += 1

    print(genre, ratings_sum / app_count)

Social Networking 71548.34905660378
Photo & Video 28441.54375
Games 22788.6696905016
Music 57326.530303030304
Reference 74942.11111111111
Health & Fitness 23298.015384615384
Weather 52279.892857142855
Utilities 18684.456790123455
Travel 28243.8
Shopping 26919.690476190477
News 21248.023255813954
Navigation 86090.33333333333
Lifestyle 16485.764705882353
Entertainment 14029.830708661417
Food & Drink 33333.92307692308
Sports 23008.898550724636
Book 39758.5
Finance 31467.944444444445
Education 7003.983050847458
Productivity 21028.410714285714
Business 7491.117647058823
Catalogs 4004.0
Medical 612.0


In [55]:
# List the name and total rating count of every app in the Reference genre.
reference_apps = (row for row in ios_final if row[11] == 'Reference')
for row in reference_apps:
    print(row[1], ': ', row[5])

Bible :  985920
Dictionary.com Dictionary & Thesaurus :  200047
Dictionary.com Dictionary & Thesaurus for iPad :  54175
Google Translate :  26786
Muslim Pro: Ramadan 2017 Prayer Times, Azan, Quran :  18418
New Furniture Mods - Pocket Wiki & Game Tools for Minecraft PC Edition :  17588
Merriam-Webster Dictionary :  16849
Night Sky :  12122
City Maps for Minecraft PE - The Best Maps for Minecraft Pocket Edition (MCPE) :  8535
LUCKY BLOCK MOD ™ for Minecraft PC Edition - The Best Pocket Wiki & Mods Installer Tools :  4693
GUNS MODS for Minecraft PC Edition - Mods Tools :  1497
Guides for Pokémon GO - Pokemon GO News and Cheats :  826
WWDC :  762
Horror Maps for Minecraft PE - Download The Scariest Maps for Minecraft Pocket Edition (MCPE) Free :  718
VPN Express :  14
Real Bike Traffic Rider Virtual Reality Glasses :  8
教えて!goo :  0
Jishokun-Japanese English Dictionary & Translator :  0


In [68]:
# Frequency table of column 1 (Category) for the free English Android apps.
categories_android = freq_table(dataset=android_final, index=1)
print(categories_android)

{'ART_AND_DESIGN': 0.643, 'AUTO_AND_VEHICLES': 0.925, 'BEAUTY': 0.598, 'BOOKS_AND_REFERENCE': 2.144, 'BUSINESS': 4.592, 'COMICS': 0.62, 'COMMUNICATION': 3.238, 'DATING': 1.861, 'EDUCATION': 1.162, 'ENTERTAINMENT': 0.959, 'EVENTS': 0.711, 'FINANCE': 3.7, 'FOOD_AND_DRINK': 1.241, 'HEALTH_AND_FITNESS': 3.08, 'HOUSE_AND_HOME': 0.824, 'LIBRARIES_AND_DEMO': 0.936, 'LIFESTYLE': 3.903, 'GAME': 9.725, 'FAMILY': 18.908, 'MEDICAL': 3.531, 'SOCIAL': 2.662, 'SHOPPING': 2.245, 'PHOTOGRAPHY': 2.944, 'SPORTS': 3.396, 'TRAVEL_AND_LOCAL': 2.335, 'TOOLS': 8.461, 'PERSONALIZATION': 3.317, 'PRODUCTIVITY': 3.892, 'PARENTING': 0.654, 'WEATHER': 0.801, 'VIDEO_PLAYERS': 1.794, 'NEWS_AND_MAGAZINES': 2.798, 'MAPS_AND_NAVIGATION': 1.399}


In [71]:
# For every category, average the Installs column (index 5) over the apps
# whose Category (index 1) matches.
for category in categories_android:
    installs_sum = 0
    app_count = 0

    for row in android_final:
        if row[1] != category:
            continue
        # Installs look like '1,000,000+'; strip '+' and ',' before converting.
        installs_text = row[5].replace('+', '').replace(',', '')
        installs_sum += float(installs_text)
        app_count += 1

    print(category, ': ', installs_sum / app_count)
            

ART_AND_DESIGN :  1986335.0877192982
AUTO_AND_VEHICLES :  647317.8170731707
BEAUTY :  513151.88679245283
BOOKS_AND_REFERENCE :  8767811.894736841
BUSINESS :  1712290.1474201474
COMICS :  817657.2727272727
COMMUNICATION :  38456119.167247385
DATING :  854028.8303030303
EDUCATION :  1833495.145631068
ENTERTAINMENT :  11640705.88235294
EVENTS :  253542.22222222222
FINANCE :  1387692.475609756
FOOD_AND_DRINK :  1924897.7363636363
HEALTH_AND_FITNESS :  4188821.9853479853
HOUSE_AND_HOME :  1331540.5616438356
LIBRARIES_AND_DEMO :  638503.734939759
LIFESTYLE :  1437816.2687861272
GAME :  15588015.603248259
FAMILY :  3695641.8198090694
MEDICAL :  120550.61980830671
SOCIAL :  23253652.127118643
SHOPPING :  7036877.311557789
PHOTOGRAPHY :  17840110.40229885
SPORTS :  3638640.1428571427
TRAVEL_AND_LOCAL :  13984077.710144928
TOOLS :  10801391.298666667
PERSONALIZATION :  5201482.6122448975
PRODUCTIVITY :  16787331.344927534
PARENTING :  542603.6206896552
WEATHER :  5074486.197183099
VIDEO_PLAYERS 