# Dataquest guided project: Profitable App Profiles for the App Store and Google Play markets




Goal: To help developers understand what types of apps are likely to attract more users on Google Play and the App Store.

In [4]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` and, optionally, the dataset size.

    Args:
        dataset: A sequence of rows (each row itself a sequence of fields).
        start: Index of the first row to print (slice start).
        end: Index one past the last row to print (slice end).
        rows_and_columns: When true, also print the total number of rows and
            the number of columns, measured on the first row.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # blank spacing after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure, so report 0 columns
        # instead of raising IndexError on dataset[0].
        n_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns:', n_columns)

In [5]:
from csv import reader

# Read each CSV inside a context manager so the file handle is closed as soon
# as the rows have been materialised into a list. The encoding is stated
# explicitly so the result does not depend on the platform default, and
# newline='' is what the csv module recommends for files passed to reader().
with open('AppleStore.csv', encoding='utf8', newline='') as filename1:
    readfile1 = reader(filename1)
    dataiOS = list(readfile1)  # header row is kept at index 0

with open('googleplaystore.csv', encoding='utf8', newline='') as filename2:
    readfile2 = reader(filename2)
    data_google = list(readfile2)  # header row is kept at index 0


In [6]:
# Preview the header plus the first three iOS rows, then the dataset size.
explore_data(dataiOS, start=0, end=4, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7198
Number of columns: 16


In [7]:
# Preview the header plus the first three Google Play rows, then the dataset size.
explore_data(data_google, start=0, end=4, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10842
Number of columns: 13


Now we need to remove non-English apps as well as apps that are not free.

# Data cleaning time

The Android data has one row with an error, as well as duplicate entries.

In [8]:
# Print the row at index 10472 (the header occupies index 0) as a well-formed
# reference to compare against the next row.
print(data_google[10472])

['Xposed Wi-Fi-Pwd', 'PERSONALIZATION', '3.5', '1042', '404k', '100,000+', 'Free', '0', 'Everyone', 'Personalization', 'August 5, 2014', '3.0.0', '4.0.3 and up']


In [9]:
# Print the row at index 10473 to inspect it for a formatting error.
print(data_google[10473])

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [10]:
# The row at index 10473 has fewer fields than the header (its category value
# is missing), which shifts every later column. Delete it only while it is
# still malformed: an unconditional `del` would remove a valid row each time
# this cell is re-run.
if len(data_google[10473]) != len(data_google[0]):
    del data_google[10473]

In [11]:
# Print index 10473 again to confirm a well-formed row now occupies it.
print(data_google[10473])


['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']


Example of duplication:

In [12]:
# Print every Google Play row whose app name (column 0) is exactly 'Instagram'.
for row in data_google:
    if row[0] != 'Instagram':
        continue
    print(row)

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


In [13]:
# Split app names (column 0) into first occurrences and repeats, then report
# how many repeats were found.
duplicate_apps = []
unique_apps = []
for row in data_google:
    app_name = row[0]
    # A name seen before goes to the duplicates list; otherwise it is new.
    bucket = duplicate_apps if app_name in unique_apps else unique_apps
    bucket.append(app_name)
print(len(duplicate_apps))

1181


We now remove the duplicates, but not at random. The criterion we use is the number of reviews: the entry with the most reviews is the most recently collected one.
First we create a dictionary that maps each app name to its highest number of reviews, so that we can make sure to keep the most recent data.

In [14]:
# Map each app name to the largest review count seen for it (header skipped).
reviews_max = {}
for row in data_google[1:]:
    app_name, review_count = row[0], float(row[3])
    # Store the count for a new name, or when it beats the stored maximum.
    if app_name not in reviews_max or reviews_max[app_name] < review_count:
        reviews_max[app_name] = review_count
print(len(reviews_max))

9659


Then we use the dictionary to build a new, cleaned list that drops the redundant entries. A row is kept in the cleaned list (android_clean) only if its number of reviews equals the maximum recorded for that app and the app has not already been added.

In [15]:
# Keep one row per app: the first row whose review count equals the recorded
# maximum for that app name.
android_clean = []
already_added = []
for row in data_google[1:]:
    app_name = row[0]
    review_count = float(row[3])
    # Skip rows that are not the max-review entry, and skip apps already kept
    # (several rows can tie on the maximum).
    if review_count != reviews_max[app_name] or app_name in already_added:
        continue
    android_clean.append(row)
    already_added.append(app_name)
        

In [16]:
# Preview the first four de-duplicated Android rows, then the dataset size.
explore_data(android_clean, start=0, end=4, rows_and_columns=True)


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


Number of rows: 9659
Number of columns: 13


In [17]:
# Repeat the duplicate check on the iOS data, keyed on column 0 (the 'id' field).
duplicate_apps = []
unique_apps = []
for row in dataiOS:
    app_id = row[0]
    # A value seen before goes to the duplicates list; otherwise it is new.
    bucket = duplicate_apps if app_id in unique_apps else unique_apps
    bucket.append(app_id)
print(len(duplicate_apps))

0


This shows that the iOS data has no duplicate ids.
Next we will check whether each app has an English name.

In [18]:
def english_check(string):
    """Return True when every character of ``string`` has a code point <= 127.

    A single character above that range makes the result False; an empty
    string yields True.
    """
    return all(ord(ch) <= 127 for ch in string)
        

In [19]:
# Try the strict check on a few sample names, one result per line.
for sample in (
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
):
    print(english_check(sample))

True
False
False
False


In [20]:
def english_check_edit(string):
    """Return True unless ``string`` has more than three characters above code point 127.

    Up to three such characters (for example a trademark sign or an emoji)
    are tolerated; a fourth makes the result False.
    """
    non_ascii_count = sum(1 for ch in string if ord(ch) > 127)
    return non_ascii_count <= 3
        

In [21]:
# Try the relaxed check on the same sample names, one result per line.
for sample in (
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
):
    print(english_check_edit(sample))

True
False
True
True


In [22]:
# Keep only apps whose name passes the relaxed English check.
# The name is column 0 in the Android data and column 1 in the iOS data.
English_android_clean = [
    row for row in android_clean if english_check_edit(row[0])
]
print(len(English_android_clean))

# The iOS list still contains its header row, so skip index 0.
English_iOS_clean = [
    row for row in dataiOS[1:] if english_check_edit(row[1])
]
print(len(English_iOS_clean))

9614
6183


In [23]:
# Keep only free apps. The price is column 7 in the Android data and column 4
# in the iOS data; both are compared as strings.
Free_English_android_clean = [
    row for row in English_android_clean if row[7] in ('0', 'Free')
]
print(len(Free_English_android_clean))

Free_English_iOS_clean = [
    row for row in English_iOS_clean if row[4] == '0.0'
]
print(len(Free_English_iOS_clean))

8864
3222


Now we want to analyze the data and find apps that are successful in both markets. 
To minimize risks and overhead, our validation strategy for an app idea has three steps:

1. Build a minimal Android version of the app, and add it to Google Play.
2. If the app has a good response from users, we develop it further.
3. If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

Now we move on to finding an app profile that fits both markets.
The genre-related columns are at indices 1 (`Category`) and 9 (`Genres`) in the Android data, and at index 11 (`prime_genre`) in the iOS data.

In [24]:
# Print only the iOS header row to look up column positions.
explore_data(dataiOS, start=0, end=1, rows_and_columns=False)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']




In [25]:
def freq_table(dataset, index):
    """Count how often each value occurs in column ``index`` of ``dataset``.

    Returns a dict mapping each distinct value to its number of occurrences,
    with keys in order of first appearance.
    """
    counts = {}
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1
    return counts


In [26]:
def display_table(dataset, index):
    """Print the frequency table for column ``index``, highest count first.

    Each line has the form ``value : count``. Entries are sorted as
    ``(count, value)`` tuples in descending order, so ties on the count are
    ordered by value, also descending.
    """
    table = freq_table(dataset, index)
    ranked = sorted(
        ((count, value) for value, count in table.items()),
        reverse=True,
    )
    for count, value in ranked:
        print(value, ':', count)

In [28]:
# Frequency of each value in iOS column 11 (prime_genre).
display_table(Free_English_iOS_clean, index=11)

Games : 1874
Entertainment : 254
Photo & Video : 160
Education : 118
Social Networking : 106
Shopping : 84
Utilities : 81
Sports : 69
Music : 66
Health & Fitness : 65
Productivity : 56
Lifestyle : 51
News : 43
Travel : 40
Finance : 36
Weather : 28
Food & Drink : 26
Reference : 18
Business : 17
Book : 14
Navigation : 6
Medical : 6
Catalogs : 4


In [29]:
# Frequency of each value in Android column 1 (Category).
display_table(Free_English_android_clean, index=1)

FAMILY : 1676
GAME : 862
TOOLS : 750
BUSINESS : 407
LIFESTYLE : 346
PRODUCTIVITY : 345
FINANCE : 328
MEDICAL : 313
SPORTS : 301
PERSONALIZATION : 294
COMMUNICATION : 287
HEALTH_AND_FITNESS : 273
PHOTOGRAPHY : 261
NEWS_AND_MAGAZINES : 248
SOCIAL : 236
TRAVEL_AND_LOCAL : 207
SHOPPING : 199
BOOKS_AND_REFERENCE : 190
DATING : 165
VIDEO_PLAYERS : 159
MAPS_AND_NAVIGATION : 124
FOOD_AND_DRINK : 110
EDUCATION : 103
ENTERTAINMENT : 85
LIBRARIES_AND_DEMO : 83
AUTO_AND_VEHICLES : 82
HOUSE_AND_HOME : 73
WEATHER : 71
EVENTS : 63
PARENTING : 58
ART_AND_DESIGN : 57
COMICS : 55
BEAUTY : 53


In [30]:
# Frequency of each value in Android column 9 (Genres).
display_table(Free_English_android_clean, index=9)

Tools : 749
Entertainment : 538
Education : 474
Business : 407
Productivity : 345
Lifestyle : 345
Finance : 328
Medical : 313
Sports : 307
Personalization : 294
Communication : 287
Action : 275
Health & Fitness : 273
Photography : 261
News & Magazines : 248
Social : 236
Travel & Local : 206
Shopping : 199
Books & Reference : 190
Simulation : 181
Dating : 165
Arcade : 164
Video Players & Editors : 157
Casual : 156
Maps & Navigation : 124
Food & Drink : 110
Puzzle : 100
Racing : 88
Role Playing : 83
Libraries & Demo : 83
Auto & Vehicles : 82
Strategy : 81
House & Home : 73
Weather : 71
Events : 63
Adventure : 60
Comics : 54
Beauty : 53
Art & Design : 53
Parenting : 44
Card : 40
Casino : 38
Trivia : 37
Educational;Education : 35
Board : 34
Educational : 33
Education;Education : 30
Word : 23
Casual;Pretend Play : 21
Music : 18
Racing;Action & Adventure : 15
Puzzle;Brain Games : 15
Entertainment;Music & Video : 15
Casual;Brain Games : 12
Casual;Action & Adventure : 12
Arcade;Action & Advent

Using this information we can draw conclusions about free English apps on both platforms, but only about this subset, not about all apps in general. Now, let's determine the kinds of apps with the most users.

In [31]:
# Per-genre app counts for the free English iOS apps (column 11).
ios_freq = freq_table(Free_English_iOS_clean, index=11)

In [32]:
# Show the raw genre -> count dictionary built from iOS column 11.
print(ios_freq)

{'Social Networking': 106, 'Photo & Video': 160, 'Games': 1874, 'Music': 66, 'Reference': 18, 'Health & Fitness': 65, 'Weather': 28, 'Utilities': 81, 'Travel': 40, 'Shopping': 84, 'News': 43, 'Navigation': 6, 'Lifestyle': 51, 'Entertainment': 254, 'Food & Drink': 26, 'Sports': 69, 'Book': 14, 'Finance': 36, 'Education': 118, 'Productivity': 56, 'Business': 17, 'Catalogs': 4, 'Medical': 6}


In [35]:
# For each iOS genre, average column 5 (rating_count_tot) over the apps in
# that genre. Note the stored value is an average rating *count*, despite
# the variable's name.
average_user_rating_ios = {}
for genre in ios_freq:
    ratings_total = 0
    apps_in_genre = 0
    for row in Free_English_iOS_clean:
        if row[11] != genre:
            continue
        ratings_total += float(row[5])
        apps_in_genre += 1
    average_user_rating_ios[genre] = ratings_total / apps_in_genre
print(average_user_rating_ios)

{'Social Networking': 71548.34905660378, 'Photo & Video': 28441.54375, 'Games': 22788.6696905016, 'Music': 57326.530303030304, 'Reference': 74942.11111111111, 'Health & Fitness': 23298.015384615384, 'Weather': 52279.892857142855, 'Utilities': 18684.456790123455, 'Travel': 28243.8, 'Shopping': 26919.690476190477, 'News': 21248.023255813954, 'Navigation': 86090.33333333333, 'Lifestyle': 16485.764705882353, 'Entertainment': 14029.830708661417, 'Food & Drink': 33333.92307692308, 'Sports': 23008.898550724636, 'Book': 39758.5, 'Finance': 31467.944444444445, 'Education': 7003.983050847458, 'Productivity': 21028.410714285714, 'Business': 7491.117647058823, 'Catalogs': 4004.0, 'Medical': 612.0}


In [37]:
# Per-category app counts for the free English Android apps (column 1).
android_freq = freq_table(Free_English_android_clean, index=1)

In [48]:
# For each Android category, average the install counts (column 5) of the
# apps in that category. Despite its name, android_cat_freq maps
# category -> average installs, not a frequency.
android_cat_freq={}
for category in android_freq:
    total=0
    len_category=0

    for app in Free_English_android_clean:
        category_app=app[1]
        if category==category_app:
            # Installs look like '10,000+': strip '+' and ',' before converting.
            installs=app[5]
            installs=installs.replace('+','')
            installs=installs.replace(',','')
            total+=float(installs)
            len_category+=1
    android_cat_freq[category]=total/len_category
print(android_cat_freq)
values=android_cat_freq.values()
highest_num=max(values)
print(highest_num)
# Category with the highest average installs. The original referenced an
# undefined name `stats` here, which raised NameError.
max(android_cat_freq, key=android_cat_freq.get)

{'ART_AND_DESIGN': 1986335.0877192982, 'AUTO_AND_VEHICLES': 647317.8170731707, 'BEAUTY': 513151.88679245283, 'BOOKS_AND_REFERENCE': 8767811.894736841, 'BUSINESS': 1712290.1474201474, 'COMICS': 817657.2727272727, 'COMMUNICATION': 38456119.167247385, 'DATING': 854028.8303030303, 'EDUCATION': 1833495.145631068, 'ENTERTAINMENT': 11640705.88235294, 'EVENTS': 253542.22222222222, 'FINANCE': 1387692.475609756, 'FOOD_AND_DRINK': 1924897.7363636363, 'HEALTH_AND_FITNESS': 4188821.9853479853, 'HOUSE_AND_HOME': 1331540.5616438356, 'LIBRARIES_AND_DEMO': 638503.734939759, 'LIFESTYLE': 1437816.2687861272, 'GAME': 15588015.603248259, 'FAMILY': 3695641.8198090694, 'MEDICAL': 120550.61980830671, 'SOCIAL': 23253652.127118643, 'SHOPPING': 7036877.311557789, 'PHOTOGRAPHY': 17840110.40229885, 'SPORTS': 3638640.1428571427, 'TRAVEL_AND_LOCAL': 13984077.710144928, 'TOOLS': 10801391.298666667, 'PERSONALIZATION': 5201482.6122448975, 'PRODUCTIVITY': 16787331.344927534, 'PARENTING': 542603.6206896552, 'WEATHER': 50

Communication apps have the highest average number of installs per app on Google Play.