## Mobile App Store Data
This project aims to assist with analysis that provides valuable insights on the apps (Google Play and Apple App Store) that make the most business case.

Column | Description
:----|:----
id | App ID
track_name | App Name
size_bytes | app size
currency | currency type
price | app cost
rating_count_tot | User Rating counts (for all version)
rating_count_ver |  User Rating counts (for current version)
user_rating | Average User Rating value (for all version)
user_rating_ver | Average User Rating value (for current version)
ver |  Latest version code
cont_rating | Content Rating 
prime_genre | Primary Genre
sup_devices.num | Number of supporting devices
ipadSc_urls.num |Number of screenshots showed for display
lang.num | Number of supported languages
vpp_lic | Vpp Device Based Licensing Enabled




In [1]:
from csv import reader

### ios App Store dataset ###
opened_file = open('C:/Users/Owner/Documents/Data Analysis/Jupyter/Guided Project/AppleStore.csv', encoding='utf8')
read_file = reader(opened_file)
ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]

### GooglePlay dataset ###
opened_file = open('C:/Users/Owner/Documents/Data Analysis/Jupyter/Guided Project/googleplaystore.csv', encoding='utf8')
read_file = reader(opened_file)
android = list(read_file)
android_header = android[0]
android = android[1:]



In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))
        

        

In [3]:
### To find out incomplete rows or rows with missing data ###
def incomplete_data(data):
    for row in data:
        if len(row) < len(data[0]):
            print(row)
            number_of_columns = len(row)
            print('Number of columns is:', number_of_columns)
            print(data.index(row))
            del data[data.index(row)]
print(len(android)) 
### To check if there are incomplete rows in the android data ###

### To delete the incomplete row(s) ###
incomplete_data(android)


print(len(android))

10841
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
Number of columns is: 12
10472
10840


In [4]:
### To check if the android dataset has duplicate apps ###
duplicate_android_apps = []
unique_android_apps = []

for app in android:
    app_name = app[0]

    if app_name in unique_android_apps:
        duplicate_android_apps.append(app_name)
    else:
        unique_android_apps.append(app_name)
    
print('The number of  duplicate android apps is',len(duplicate_android_apps),'apps')



The number of  duplicate android apps is 1181 apps


In [5]:
### Here is proof of the existence of duplicate apps within the android dataset ###
print('Some duplicate apps are:',duplicate_android_apps[:20], '\n')



Some duplicate apps are: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software', 'MailChimp - Email, Marketing Automation', 'Crew - Free Messaging and Scheduling', 'Asana: organize team projects', 'Google Analytics', 'AdWords Express'] 



In [6]:
# The goal here is to use the criterion of the app duplicates with the highest reviews to eliminate duplicates
max_reviews = {}

for app in android:
    app_name = app[0]
    no_of_reviews = float(app[3])
    
    if app_name in 'max_reviews' and max_reviews[app] < no_of_reviews:
        max_reviews[app_name] = no_of_reviews
    elif app_name not in 'max_reviews':
        max_reviews[app_name] = no_of_reviews
        
print('Expected Length:',len(max_reviews))
print('Actual Length:',len(android) - len(duplicate_android_apps))

Expected Length: 9659
Actual Length: 9659


In [7]:
# The lines of code below will ultimately create a list of unique apps using information from the max_reviews list

android_unique = []
already_added_android = []

for app in android:
    app_name = app[0]
    no_of_reviews = float(app[3])
    
    if (max_reviews[app_name] == no_of_reviews) and (app_name not in already_added_android):
        android_unique.append(app)
        already_added_android.append(app_name)
        
print(len(android_unique))        

9659


In [8]:
explore_data(android_unique, 0, 5, True)


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']


Number of rows: 9659
Number of columns: 13


In [9]:
### This is a function to determine if an app is an english or non-english app

def english_app(string):
    non_english = 0
    
    for character in string:
        if ord(character) > 127:   
            non_english += 1
                
    if non_english > 3:
        return False
    else:
        return True
        
print(english_app('Instagram'))
print(english_app('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(english_app('Docs To Go™ Free Office Suite'))
print(english_app('Instachat 😜'))

True
False
True
True


In [10]:
###
'''This function analysis the clean datasets (i.e. android_unique and ios) and cleans them up to leave only the english
    apps. This is because the company we are conducting the feasibility analysis for is only interested in an english
    speaking audience'''
###

def language_checker(dataset, android=True):
    english_apps = []
      
    for app in dataset:
        android_app_name = app[0]
        ios_app_name = app[1]
        
        if not android:
            if english_app(ios_app_name):
                english_apps.append(app)
        
        if android:
            if english_app(android_app_name):
                english_apps.append(app)
                
    return english_apps
                

### English android apps ###    
android_english = language_checker(android_unique)



In [11]:
explore_data(android_english, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9614
Number of columns: 13


In [12]:
### English ios apps ###
ios_english = language_checker(ios, False)
explore_data(ios_english, 0, 3, True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 6183
Number of columns: 16


In [13]:
### Now we want to isolate the free apps as our company is only interested in the analysis of free apps

def free(dataset, ios_english=True):
    free_apps = []
    
    for app in dataset:
                    
        if not ios_english:
            android_price = float(app[7].strip('$'))
            if android_price == 0:
                free_apps.append(app)
        
        if ios_english:
            ios_price = float(app[4])
            if ios_price == 0:
                free_apps.append(app)
                
    return free_apps
              

android_final = free(android_english, False)
explore_data(android_final, 0, 4,True)    
    

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


Number of rows: 8864
Number of columns: 13


In [14]:
ios_final = free(ios_english)
explore_data(ios_final, 0, 4,True)    

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Number of rows: 3222
Number of columns: 16


In [15]:
''' Now we want to identify the types of apps that would make the most business case. '''

def freq(dataset, index):
    table = {}
    total = 0
    
    for app in dataset:
        total += 1
        value = app[index]
        
        if value in table:
            table[value] += 1
        else:
            table[value] = 1
            
    percentages = {}
    for key in table:
        percentage = (table[key] / total) * 100
        percentages[key] = percentage       
            
    return percentages

def display(dataset, index):
    table = freq(dataset, index)
    display_table = []
    
    for key in table:
        key_as_tuple = (table[key], key)
        display_table.append(key_as_tuple)
    
    table_sorted = sorted(display_table, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])
        
# Frequency of the various android app categories
display(android_final, 1)


FAMILY : 19.223826714801444
GAME : 9.510379061371841
TOOLS : 8.461191335740072
BUSINESS : 4.580324909747293
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.5424187725631766
SPORTS : 3.4183212996389893
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2490974729241873
HEALTH_AND_FITNESS : 3.068592057761733
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.782490974729242
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.128158844765343
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
ENTERTAINMENT : 0.8799638989169676
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 0

In [16]:
display(android_final, 9)

Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.580324909747293
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.5424187725631766
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2490974729241873
Action : 3.1024368231046933
Health & Fitness : 3.068592057761733
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.861462093862816
Video Players & Editors : 1.782490974729242
Casual : 1.7486462093862816
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.925090252707581

In [17]:
# Frequency of the various ios app categories
display(ios_final, 11)

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


In [18]:
# Calculating the averaging user rating per genre for ios apps

ios_genres = freq(ios_final, 11)

for genre in ios_genres:
    total = 0
    len_genre = 0
    
    for app in ios_final:
        app_genre = app[11]
        if app_genre == genre:
            no_ratings = float(app[5])
            total += no_ratings
            len_genre += 1
            
    avg_user_ratings = total/len_genre
    print(genre, ':', avg_user_ratings)

Social Networking : 71548.34905660378
Photo & Video : 28441.54375
Games : 22788.6696905016
Music : 57326.530303030304
Reference : 74942.11111111111
Health & Fitness : 23298.015384615384
Weather : 52279.892857142855
Utilities : 18684.456790123455
Travel : 28243.8
Shopping : 26919.690476190477
News : 21248.023255813954
Navigation : 86090.33333333333
Lifestyle : 16485.764705882353
Entertainment : 14029.830708661417
Food & Drink : 33333.92307692308
Sports : 23008.898550724636
Book : 39758.5
Finance : 31467.944444444445
Education : 7003.983050847458
Productivity : 21028.410714285714
Business : 7491.117647058823
Catalogs : 4004.0
Medical : 612.0


'''From the results above, the Navigation app genre has the highest average ratings. On this basis, the 
    Social Networking app genre may be put forward.'''

In [23]:
# To calculate average number of installs for each app genre

android_genres = freq(android_final, 1)

for genre in android_genres:
    total = 0
    len_genre = 0
    
    for app in android_final:
        app_category = app[1]
        
        if app_category == genre:
            installs = app[5]
            cleaned_installs = installs.replace('+','').replace(',', '')
            cleaned_installs = float(cleaned_installs)
            total += cleaned_installs
            len_genre += 1
            
    avg_no_installs = total/ len_genre
    avg_no_installs= (genre, ':', avg_no_installs)
    print(avg_no_installs, '\n')
   

('ART_AND_DESIGN', ':', 1986335.0877192982) 

('AUTO_AND_VEHICLES', ':', 647317.8170731707) 

('BEAUTY', ':', 513151.88679245283) 

('BOOKS_AND_REFERENCE', ':', 8767811.894736841) 

('BUSINESS', ':', 1704192.3399014778) 

('COMICS', ':', 817657.2727272727) 

('COMMUNICATION', ':', 38326063.197916664) 

('DATING', ':', 854028.8303030303) 

('EDUCATION', ':', 1768500.0) 

('ENTERTAINMENT', ':', 9146923.076923076) 

('EVENTS', ':', 253542.22222222222) 

('FINANCE', ':', 1387692.475609756) 

('FOOD_AND_DRINK', ':', 1924897.7363636363) 

('HEALTH_AND_FITNESS', ':', 4167457.3602941176) 

('HOUSE_AND_HOME', ':', 1331540.5616438356) 

('LIBRARIES_AND_DEMO', ':', 638503.734939759) 

('LIFESTYLE', ':', 1437816.2687861272) 

('GAME', ':', 12914435.883748516) 

('FAMILY', ':', 5180161.789906103) 

('MEDICAL', ':', 123064.7898089172) 

('SOCIAL', ':', 23253652.127118643) 

('SHOPPING', ':', 7036877.311557789) 

('PHOTOGRAPHY', ':', 17840110.40229885) 

('SPORTS', ':', 4274688.722772277) 

('TRAVEL_