# App Market Analysis

We're trying to find insights that will help our dev and marketing teams figure out the best road to take with our apps.

The goal is to optimize resources by deriving knowledge from these data sets.

In [4]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows of ``dataset[start:end]`` and, optionally, its dimensions.

    Args:
        dataset: Sequence of rows (each row itself a sequence).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print ``len(dataset)`` and the
            length of the first row (raises IndexError if ``dataset`` is empty).
    """
    for record in dataset[start:end]:
        print(record)
        print('\n')  # leaves blank lines between consecutive rows

    if not rows_and_columns:
        return

    print('Number of rows:', len(dataset))
    # The column count is taken from the first row only.
    print('Number of columns:', len(dataset[0]))
        
from csv import reader
# Read both CSV exports fully into memory as lists of rows (row 0 is the header).
# The `with` blocks close the file handles as soon as the rows have been read;
# newline='' is what the csv module expects, and the explicit encoding keeps the
# result independent of the platform's default locale.
with open('AppleStore.csv', encoding='utf8', newline='') as data1:
    read_file1 = reader(data1)
    appStore = list(read_file1)

with open('googleplaystore.csv', encoding='utf8', newline='') as data2:
    read_file2 = reader(data2)
    gStore = list(read_file2)

In [5]:
# Preview the first 10 rows of each data set with the header row sliced off;
# the printed row/column counts therefore describe the header-less slice.
explore_data(appStore[1:], 0, 10, rows_and_columns=True)
explore_data(gStore[1:], 0, 10, rows_and_columns=True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


['429047995', 'Pinterest', '74778624', 'USD', '0.0', '1061624', '1814', '4.5', '4.0', '6.26', '12+', 'Social Networking', '37', '5', '27', '1']


['282935706', 'Bible', '92774400', 'USD', '0.0', '985920', '5320', '4.5', '5.0', '7.5.1', '4+', 'Reference', '37', '5', '45', '1']


['5538347

In [6]:
# Print the first row (the column names) of each data set, separated by blank lines.
print(appStore[0], "\n","\n", gStore[0])

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'] 
 
 ['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


Here we have the names of the different columns. We think Size, Price, 

In [7]:
# Here we extract the tags and index for each column

def indexTags(dataset):
    """Return a dict mapping each column position to its name.

    The names are read from ``dataset[0]``, i.e. the first row is treated as
    the header. Positions start at 0.
    """
    header = dataset[0]
    return dict(enumerate(header))

# Index -> column-name lookups for the App Store and Google Play data sets.
Acolumns = indexTags(appStore)
Gcolumns = indexTags(gStore)

In [8]:
# Show the index -> column-name mapping built from the App Store header row.
print("AppStore Columns")
Acolumns

AppStore Columns


{0: 'id',
 1: 'track_name',
 2: 'size_bytes',
 3: 'currency',
 4: 'price',
 5: 'rating_count_tot',
 6: 'rating_count_ver',
 7: 'user_rating',
 8: 'user_rating_ver',
 9: 'ver',
 10: 'cont_rating',
 11: 'prime_genre',
 12: 'sup_devices.num',
 13: 'ipadSc_urls.num',
 14: 'lang.num',
 15: 'vpp_lic'}

In [9]:
# Show the index -> column-name mapping built from the Google Play header row.
print("GStore Columns")
Gcolumns

GStore Columns


{0: 'App',
 1: 'Category',
 2: 'Rating',
 3: 'Reviews',
 4: 'Size',
 5: 'Installs',
 6: 'Type',
 7: 'Price',
 8: 'Content Rating',
 9: 'Genres',
 10: 'Last Updated',
 11: 'Current Ver',
 12: 'Android Ver'}

In [10]:
# Split the Google Play app names into first occurrences and repeats.
duplicate_apps = []
unique_apps = []
seen_names = set()  # constant-time membership test; the lists keep encounter order

for app in gStore[1:]:  # skip the header row so the column title 'App' is not treated as an app
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print("Number of duplicate apps: ", len(duplicate_apps))
print("\n")
print("Examples of duplicate apps: ", duplicate_apps[:15])

Number of duplicate apps:  1181


Examples of duplicate apps:  ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']


The Google Play Store data set contains many duplicate entries. Here are some of them:

In [11]:
# Print the first 15 names collected in duplicate_apps.
print("Examples of duplicate apps: ", duplicate_apps[:15])

Examples of duplicate apps:  ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']


We won't remove the duplicates *randomly*. Instead, we use a heuristic: among the duplicate entries of an app, we keep the one with the highest number of reviews, on the assumption that it is the most recent record.

In [12]:
def _review_count(raw):
    """Return the review count as an int, or -1 when the field is not a plain integer."""
    try:
        return int(raw)
    except ValueError:
        # Non-numeric fields rank below every real count instead of aborting the pass.
        return -1


# Map each app name to the review-count string of its most-reviewed entry.
# The values are kept as the original strings so they can be matched against
# the raw rows again with a plain equality test.
uniqueAppsList = {}

for app in gStore[1:]:
    name = app[0]
    reviews = app[3]
    # Compare numerically: comparing the raw strings would rank '9' above '10000'.
    # '>=' lets a later entry with an equal count replace the stored one,
    # mirroring the original '<=' test.
    if name not in uniqueAppsList or _review_count(reviews) >= _review_count(uniqueAppsList[name]):
        uniqueAppsList[name] = reviews

In [13]:
# Keep exactly one row per app name: the first row whose review field equals
# the value recorded for that name in uniqueAppsList.
androidClean = []
alreadyAdded = []
added_lookup = set()  # same names as alreadyAdded, but with constant-time membership tests

for app in gStore[1:]:
    name = app[0]
    reviews = app[3]
    # The lookup guards against duplicates that share the same review count.
    if reviews == uniqueAppsList[name] and name not in added_lookup:
        androidClean.append(app)
        alreadyAdded.append(name)
        added_lookup.add(name)

len(androidClean)

9660

In [14]:
def is_english(string):
    """Return False when ``string`` has more than three characters above code point 127.

    Up to three such characters (e.g. emoji or a trademark sign) are
    tolerated, so names with a little decoration still return True.
    """
    outside_ascii = sum(1 for symbol in string if ord(symbol) > 127)
    return outside_ascii <= 3

In [15]:
# Keep only the apps whose name passes is_english().
# The name is column 0 in the Google Play rows and column 1 in the App Store rows.
androEng = [record for record in androidClean if is_english(record[0])]
iosEng = [record for record in appStore[1:] if is_english(record[1])]

print(len(iosEng), len(androEng))


6183 9615


The price is stored in column 4 of the App Store data set and in column 7 of the Google Play data set.

In [16]:
# Keep only the free apps. The comparison is on the raw price strings:
# "0" in Google Play column 7 and "0.0" in App Store column 4.
andro = [record for record in androEng if record[7] == "0"]
ios = [record for record in iosEng if record[4] == "0.0"]

print(len(ios), len(andro))
        

3222 8862


We need to find apps that are successful in both stores because we want to minimize costs and maximize revenue.

We'll focus on genre to find the most successful kinds of apps. For iOS the genre is in column 11 (`prime_genre`); for Android we can use column 1 (`Category`) or column 9 (`Genres`).

In [17]:
def freq_table(dataset, index):
    """Return the percentage share of every distinct value in one column.

    Args:
        dataset: Iterable of rows.
        index: Position of the column to tabulate in each row.

    Returns:
        Dict mapping each value found at ``row[index]`` to its share of all
        rows, as a percentage rounded to two decimals. Keys appear in order
        of first occurrence; an empty dataset yields an empty dict.
    """
    counts = {}
    for record in dataset:
        key = record[index]
        counts[key] = counts.get(key, 0) + 1

    # Every row contributed exactly one count, so this equals the row total.
    n_rows = sum(counts.values())

    return {
        key: round((count / n_rows) * 100, 2)
        for key, count in counts.items()
    }

def display_table(dataset, index):
    """Print the frequency table of one column, largest share first.

    Builds the table with ``freq_table(dataset, index)`` and prints one
    ``value : percentage %`` line per entry. Entries with equal percentages
    are ordered by value, descending. Nothing is returned.
    """
    shares = freq_table(dataset, index)
    # Sorting (percentage, value) pairs orders by percentage first, then value.
    ranked = sorted(((share, label) for label, share in shares.items()), reverse=True)
    for share, label in ranked:
        print(label, ':', share, "%")
    


In [18]:
# display_table() only prints and returns None, so assigning its result would
# leave iosGenres empty. Keep the percentages from freq_table() and print the
# ranked table as a separate step.
iosGenres = freq_table(ios, 11)  # column 11 is 'prime_genre'
display_table(ios, 11)

Games : 58.16 %
Entertainment : 7.88 %
Photo & Video : 4.97 %
Education : 3.66 %
Social Networking : 3.29 %
Shopping : 2.61 %
Utilities : 2.51 %
Sports : 2.14 %
Music : 2.05 %
Health & Fitness : 2.02 %
Productivity : 1.74 %
Lifestyle : 1.58 %
News : 1.33 %
Travel : 1.24 %
Finance : 1.12 %
Weather : 0.87 %
Food & Drink : 0.81 %
Reference : 0.56 %
Business : 0.53 %
Book : 0.43 %
Navigation : 0.19 %
Medical : 0.19 %
Catalogs : 0.12 %


In [19]:
# display_table() only prints and returns None, so assigning its result would
# leave androGenres empty. Keep the percentages from freq_table() and print the
# ranked table as a separate step.
androGenres = freq_table(andro, 1)  # column 1 is 'Category'
display_table(andro, 1) #Category

FAMILY : 18.93 %
GAME : 9.69 %
TOOLS : 8.45 %
BUSINESS : 4.59 %
LIFESTYLE : 3.9 %
PRODUCTIVITY : 3.89 %
FINANCE : 3.7 %
MEDICAL : 3.52 %
SPORTS : 3.4 %
PERSONALIZATION : 3.32 %
COMMUNICATION : 3.24 %
HEALTH_AND_FITNESS : 3.08 %
PHOTOGRAPHY : 2.95 %
NEWS_AND_MAGAZINES : 2.8 %
SOCIAL : 2.66 %
TRAVEL_AND_LOCAL : 2.34 %
SHOPPING : 2.25 %
BOOKS_AND_REFERENCE : 2.14 %
DATING : 1.86 %
VIDEO_PLAYERS : 1.79 %
MAPS_AND_NAVIGATION : 1.4 %
FOOD_AND_DRINK : 1.24 %
EDUCATION : 1.17 %
ENTERTAINMENT : 0.96 %
LIBRARIES_AND_DEMO : 0.94 %
AUTO_AND_VEHICLES : 0.93 %
HOUSE_AND_HOME : 0.82 %
WEATHER : 0.8 %
EVENTS : 0.71 %
PARENTING : 0.65 %
ART_AND_DESIGN : 0.64 %
COMICS : 0.62 %
BEAUTY : 0.6 %


In [20]:
# Re-display the App Store column indices for reference.
Acolumns

{0: 'id',
 1: 'track_name',
 2: 'size_bytes',
 3: 'currency',
 4: 'price',
 5: 'rating_count_tot',
 6: 'rating_count_ver',
 7: 'user_rating',
 8: 'user_rating_ver',
 9: 'ver',
 10: 'cont_rating',
 11: 'prime_genre',
 12: 'sup_devices.num',
 13: 'ipadSc_urls.num',
 14: 'lang.num',
 15: 'vpp_lic'}

In [21]:
def installs_table(dataset, indexGenre, indexInstalls):
    """Return the average install count per genre.

    The previous version never stored anything in its per-genre table, so it
    always returned an empty dict. This version accumulates a sum and a row
    count for every genre.

    Args:
        dataset: Iterable of rows.
        indexGenre: Position of the genre/category column.
        indexInstalls: Position of the install-count column. Values may be
            numbers or strings such as ``'1,000,000+'``; ``+`` and ``,`` are
            stripped before conversion.

    Returns:
        Dict mapping each genre to its mean install count, rounded to two
        decimals, in order of first occurrence. Empty input gives ``{}``.

    Raises:
        ValueError: If an install value is not numeric after stripping.
    """
    totals = {}
    counts = {}

    for row in dataset:
        genre = row[indexGenre]
        raw = str(row[indexInstalls])
        installs = float(raw.replace('+', '').replace(',', ''))
        totals[genre] = totals.get(genre, 0) + installs
        counts[genre] = counts.get(genre, 0) + 1

    return {genre: round(totals[genre] / counts[genre], 2) for genre in totals}

def display_installs(dataset, indexGenre, indexInstalls):
    """Print the result of ``installs_table`` sorted by value, largest first.

    One ``genre : value`` line is printed per entry; entries with equal
    values are ordered by genre, descending. Nothing is returned.
    """
    averages = installs_table(dataset, indexGenre, indexInstalls)
    # Sorting (value, genre) pairs orders by value first, then genre.
    ranked = sorted(((value, genre) for genre, value in averages.items()), reverse=True)
    for value, genre in ranked:
        print(genre, ':', value)

In [31]:
# Re-display the Google Play column indices for reference.
Gcolumns

{0: 'App',
 1: 'Category',
 2: 'Rating',
 3: 'Reviews',
 4: 'Size',
 5: 'Installs',
 6: 'Type',
 7: 'Price',
 8: 'Content Rating',
 9: 'Genres',
 10: 'Last Updated',
 11: 'Current Ver',
 12: 'Android Ver'}

In [23]:
# Average number of user ratings (column 5) per genre for the free iOS apps.
# Index -5 addresses the genre column counted from the end of each row.
genres_ios = freq_table(ios, -5)

for genre in genres_ios:
    rating_counts = [float(app[5]) for app in ios if app[-5] == genre]
    avg_n_ratings = sum(rating_counts) / len(rating_counts)
    print(genre, ':', round(avg_n_ratings, 2))

Shopping : 26919.69
Sports : 23008.9
Catalogs : 4004.0
Health & Fitness : 23298.02
Weather : 52279.89
Book : 39758.5
Social Networking : 71548.35
Entertainment : 14029.83
Finance : 31467.94
Photo & Video : 28441.54
Lifestyle : 16485.76
Travel : 28243.8
Business : 7491.12
News : 21248.02
Reference : 74942.11
Medical : 612.0
Music : 57326.53
Education : 7003.98
Utilities : 18684.46
Food & Drink : 33333.92
Navigation : 86090.33
Productivity : 21028.41
Games : 22788.67


In [36]:
# Share of free Android apps in each install bucket (column 5, 'Installs').
display_table(andro,5)

1,000,000+ : 15.74 %
100,000+ : 11.55 %
10,000,000+ : 10.52 %
10,000+ : 10.2 %
1,000+ : 8.4 %
100+ : 6.92 %
5,000,000+ : 6.84 %
500,000+ : 5.57 %
50,000+ : 4.77 %
5,000+ : 4.51 %
10+ : 3.54 %
500+ : 3.25 %
50,000,000+ : 2.29 %
100,000,000+ : 2.12 %
50+ : 1.92 %
5+ : 0.79 %
1+ : 0.51 %
500,000,000+ : 0.27 %
1,000,000,000+ : 0.23 %
0+ : 0.05 %
0 : 0.01 %


In [39]:
# Average install count per category (column 1) for the free Android apps.
# Install values such as '1,000,000+' are stripped of '+' and ',' before conversion.
for category in freq_table(andro, 1):
    install_counts = [
        float(a[5].replace("+", "").replace(",", ""))
        for a in andro
        if a[1] == category
    ]
    avg_installs = sum(install_counts) / len(install_counts)
    print(category, ": ", round(avg_installs, 2))

WEATHER :  5074486.2
HEALTH_AND_FITNESS :  4188821.99
MAPS_AND_NAVIGATION :  4056941.77
GAME :  15560965.6
SOCIAL :  23253652.13
PRODUCTIVITY :  16787331.34
COMICS :  817657.27
LIBRARIES_AND_DEMO :  638503.73
SHOPPING :  7036877.31
FAMILY :  3694276.33
PHOTOGRAPHY :  17805627.64
SPORTS :  3638640.14
DATING :  854028.83
EDUCATION :  1820673.08
NEWS_AND_MAGAZINES :  9549178.47
LIFESTYLE :  1437816.27
FINANCE :  1387692.48
HOUSE_AND_HOME :  1331540.56
PARENTING :  542603.62
AUTO_AND_VEHICLES :  647317.82
FOOD_AND_DRINK :  1924897.74
TRAVEL_AND_LOCAL :  13984077.71
MEDICAL :  120616.49
ART_AND_DESIGN :  1986335.09
TOOLS :  10682301.03
PERSONALIZATION :  5201482.61
COMMUNICATION :  38456119.17
VIDEO_PLAYERS :  24727872.45
EVENTS :  253542.22
BEAUTY :  513151.89
BUSINESS :  1712290.15
ENTERTAINMENT :  11640705.88
BOOKS_AND_REFERENCE :  8767811.89


In [42]:
# List every free Android app in the COMMUNICATION category with its install bucket.
communication_rows = (row for row in andro if row[1] == "COMMUNICATION")
for row in communication_rows:
    print(row[0], ": ", row[5])

WhatsApp Messenger :  1,000,000,000+
Messenger for SMS :  10,000,000+
My Tele2 :  5,000,000+
imo beta free calls and text :  100,000,000+
Contacts :  50,000,000+
Call Free – Free Call :  5,000,000+
Web Browser & Explorer :  5,000,000+
Browser 4G :  10,000,000+
MegaFon Dashboard :  10,000,000+
ZenUI Dialer & Contacts :  10,000,000+
Cricket Visual Voicemail :  10,000,000+
TracFone My Account :  1,000,000+
Xperia Link™ :  10,000,000+
TouchPal Keyboard - Fun Emoji & Android Keyboard :  10,000,000+
Skype Lite - Free Video Call & Chat :  5,000,000+
My magenta :  1,000,000+
Android Messages :  100,000,000+
Google Duo - High Quality Video Calls :  500,000,000+
Seznam.cz :  1,000,000+
Antillean Gold Telegram (original version) :  100,000+
AT&T Visual Voicemail :  10,000,000+
GMX Mail :  10,000,000+
Omlet Chat :  10,000,000+
My Vodacom SA :  5,000,000+
Microsoft Edge :  5,000,000+
Messenger – Text and Video Chat for Free :  1,000,000,000+
imo free video calls and chat :  500,000,000+
Calls & Tex