# Mobile App Analysis

This project explores, cleans, and filters app data to find the best category in which to build a new, free app for both the Apple App Store and the Google Play store. Frequency tables show how common each category is, and average rating and install counts are used to estimate how many users each category attracts.

In [1]:
#open the two data sets and save as lists
from csv import reader

# Each file is read inside a `with` block so its handle is closed as soon as
# the rows have been loaded. The encoding is given explicitly so the result
# does not depend on the platform default, and newline='' is what the csv
# module documents for files passed to reader().
with open('AppleStore.csv', encoding='utf8', newline='') as apple_file:
    apple_data = list(reader(apple_file))   # row 0 is the header row

with open('googleplaystore.csv', encoding='utf8', newline='') as google_file:
    google_data = list(reader(google_file))  # row 0 is the header row

In [2]:
#Define explore_data and explore datasets
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its dimensions.

    Args:
        dataset: A list of rows, where each row is itself a list.
        start: Index of the first row to print (slice start).
        end: Index one past the last row to print (slice end).
        rows_and_columns: When true, also print the total number of rows in
            ``dataset`` and the number of columns, measured on its first row.

    Returns:
        None. Everything is written to standard output.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # instead of failing with an IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)

# Preview the first three data rows of each store; row 0 is the header, so it
# is sliced off before exploring.
for title, rows in (('APPLE DATA', apple_data), ('GOOGLE DATA', google_data)):
    print(title)
    explore_data(rows[1:], 0, 3, rows_and_columns=True)
    print('\n')

#print column names
print('APPLE COLUMN NAMES', apple_data[0], sep='\n')
print('\n')
print('GOOGLE COLUMN NAMES', google_data[0], sep='\n')

APPLE DATA
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16


GOOGLE DATA
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Ar

# Data set documentation links
- Apple Mobile App Store: [Link](https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps/home)
- Google Play Mobile App Store: [Link](https://www.kaggle.com/lava18/google-play-store-apps/home)

In [3]:
#CLEANING DATA
#delete row 10473 of google list with missing 'category' entry
# The guard makes this cell safe to re-run: the row is removed only while it
# is still malformed, i.e. it has a different number of fields than the header
# row. Once the bad row is gone, the row now at this index is well-formed and
# is left alone.
bad_row_index = 10473
if (bad_row_index < len(google_data)
        and len(google_data[bad_row_index]) != len(google_data[0])):
    print(google_data[bad_row_index])
    del google_data[bad_row_index]
#based on discussion forum apple store list is OK

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [4]:
#google store has some duplicate app names- identify duplicates
duplicate_apps = []   # one entry per repeated occurrence of a name
unique_apps = []      # each distinct name, in first-seen order
seen_names = set()    # same names as unique_apps, kept for O(1) membership tests

# Skip row 0: it is the header row, not an app.
for app in google_data[1:]:
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps:', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', duplicate_apps[:5])

Number of duplicate apps: 1181


Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings']


In [5]:
#discard duplicates: for each app name keep only the entry with the highest
#number of reviews (taken here as the most recent entry for that app)
max_reviews = {}   # app name -> highest review count seen for that name

for app in google_data[1:]:
    name = app[0]
    reviews = float(app[3])

    # Record the first count seen for a name, then only ever raise it.
    if name not in max_reviews or max_reviews[name] < reviews:
        max_reviews[name] = reviews

#now check to make sure we have the correct number of apps
# The expected size is derived from the duplicate count found earlier rather
# than a hard-coded number, so the check stays valid if the data changes.
print('Expected length:', len(google_data[1:]) - len(duplicate_apps))
print('Actual length:', len(max_reviews))

Expected length: 9659
Actual length: 9659


In [6]:
#create new datasets with duplicates removed
google_clean = []
already_added = []     # names already copied into google_clean, in order
added_names = set()    # same names as already_added, for O(1) membership tests

#using the max reviews dictionary to add only the app entry with
#the highest number of reviews
for app in google_data[1:]:
    name = app[0]
    reviews = float(app[3])
    # The name check handles multiple entries that share the same (maximum)
    # number of reviews: only the first of them is kept.
    if reviews == max_reviews[name] and name not in added_names:
        google_clean.append(app)
        added_names.add(name)
        already_added.append(name)

explore_data(google_clean,0,3,True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9659
Number of columns: 13


In [7]:
#now separate english from non english apps
def is_english(x, max_non_ascii=3):
    """Return True if ``x`` has at most ``max_non_ascii`` non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127. A small
    number of such characters is tolerated so that names containing a few
    symbols (for example a trademark sign or an emoji) are not rejected.

    Args:
        x: The string to check (here, an app name).
        max_non_ascii: Largest number of non-ASCII characters still accepted.
            Defaults to 3.

    Returns:
        bool: False when ``x`` has more than ``max_non_ascii`` non-ASCII
        characters, True otherwise (including for the empty string).
    """
    non_ascii_count = sum(1 for char in x if ord(char) > 127)
    return non_ascii_count <= max_non_ascii
        
# Keep only the apps whose name passes the is_english check.
# In the Google rows the app name is column 0.
google_en = [row for row in google_clean if is_english(row[0])]

# apple_data still includes its header row, so index 0 is skipped; in the
# Apple rows the app name is column 1.
apple_en = [row for row in apple_data[1:] if is_english(row[1])]

print('GOOGLE DATA')
explore_data(google_en, 0, 3, True)
print('\n')
print('APPLE DATA')
explore_data(apple_en, 0, 3, True)

GOOGLE DATA
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9614
Number of columns: 13


APPLE DATA
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '

In [20]:
#We just want the free apps, so now separate the free apps
# Prices are compared as strings, using the exact text each data set stores
# for a free app: '0' in the Google rows (column 7) and '0.0' in the Apple
# rows (column 4).
google_final = [row for row in google_en if row[7] == '0']
apple_final = [row for row in apple_en if row[4] == '0.0']

print('GOOGLE DATA')
explore_data(google_final, 0, 3, True)
print('\n')
print('APPLE DATA')
explore_data(apple_final, 0, 3, True)

GOOGLE DATA
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 8864
Number of columns: 13


APPLE DATA
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '

# Analysis Strategy

As we mentioned in the introduction, our aim is to determine the kinds of apps that are likely to attract more users because our revenue is highly influenced by the number of people using our apps.

To minimize risks and overhead, our validation strategy for an app idea is comprised of three steps:

1. Build a minimal Android version of the app, and add it to Google Play.
2. If the app has a good response from users, we develop it further.
3. If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

Because our end goal is to add the app on both Google Play and the App Store, we need to find app profiles that are successful on both markets. For instance, a profile that works well for both markets might be a productivity app that makes use of gamification.

Let's begin the analysis by getting a sense of what are the most common genres for each market. For this, we'll need to build frequency tables for a few columns in our data sets.

The relevant columns are `Genres` (index 9) and `Category` (index 1) in the Google Play data set, and `prime_genre` (index 11) in the App Store data set.

In [19]:
#define frequency table function
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of ``dataset``.

    Args:
        dataset: A list of rows, where each row is itself a list.
        index: Position of the column to tabulate within each row.

    Returns:
        dict: Maps each distinct value found in the column to the percentage
        of rows (0-100) holding that value. Keys appear in first-seen order.
        An empty ``dataset`` yields an empty dict.
    """
    # Count occurrences as integers first and convert to a percentage once per
    # value. Adding a fractional share for every row instead accumulates
    # floating-point rounding error.
    counts = {}
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    total_rows = len(dataset)
    return {value: count / total_rows * 100 for value, count in counts.items()}

#define display table function
def display_table(dataset, index):
    """Print the frequency table of one column, highest percentage first.

    Args:
        dataset: A list of rows, passed through to ``freq_table``.
        index: Position of the column to tabulate within each row.

    Returns:
        None. One line per distinct value is printed as ``value : percentage``.
    """
    percentages = freq_table(dataset, index)
    # Sorting (percentage, value) pairs orders by percentage first; equal
    # percentages fall back to comparing the values themselves.
    ranked = sorted(((pct, value) for value, pct in percentages.items()), reverse=True)
    for pct, value in ranked:
        print(value, ':', pct)
        
#display the frequency tables for the prime_genre, Genres, and Category columns
tables_to_show = [
    (' - Apple Primary Genres - ', apple_final, 11),      # apple prime genre
    ('\n - Google Play Genres - ', google_final, 9),      # google Genres
    ('\n - Google Play Categories - ', google_final, 1),  # google category
]
for heading, rows, column in tables_to_show:
    print(heading)
    display_table(rows, column)

 - Apple Primary Genres - 
Games : 58.1626319056464
Entertainment : 7.883302296710134
Photo & Video : 4.965859714463075
Education : 3.6623215394165176
Social Networking : 3.2898820608317867
Shopping : 2.6070763500931133
Utilities : 2.5139664804469306
Sports : 2.1415270018621997
Music : 2.048417132216017
Health & Fitness : 2.0173805090006227
Productivity : 1.7380509000620747
Lifestyle : 1.5828677839851035
News : 1.3345747982619496
Travel : 1.2414649286157668
Finance : 1.1173184357541899
Weather : 0.8690254500310364
Food & Drink : 0.8069522036002481
Reference : 0.558659217877095
Business : 0.5276225946617009
Book : 0.4345127250155184
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665

 - Google Play Genres - 
Tools : 8.449909747292507
Entertainment : 6.069494584837599
Education : 5.34747292418777
Business : 4.591606498194979
Productivity : 3.8921480144404565
Lifestyle : 3.8921480144404565
Finance : 3.7003610108303455
Medical : 3.5311371841155417
Spo

# Apple's prime_genre analysis

Most common: Games (58.2%)

Second most common: Entertainment (7.9%)

Most apps are designed for entertainment, so an entertainment app is likely the best choice.
We should look at the number of users to confirm.

# Google's Category and Genre analysis

Most common genres: Tools(8.4%) then Entertainment(6.1%)

Most common categories: Family(18.9%) then Game(9.7%)

Google still has a large share of entertainment apps but doesn't have quite as extreme a focus as Apple does. There are a fair number of informational and tool apps. Entertainment is still probably best, but an analysis of the number of users is necessary to confirm.

In [21]:
#now we identify the approximate number of users using ratings
# Frequency table of the prime_genre column (index 11) for the free English
# App Store apps. The loop that follows iterates over its keys only, i.e. the
# distinct genres; the percentages themselves are not read there.
prime_genre_freq = freq_table(apple_final,11)

In [11]:
# For every genre, average column 5 of the Apple rows (read as a number)
# across the apps whose prime genre (column 11) matches.
for genre in prime_genre_freq:
    rating_sum = 0
    app_count = 0
    for row in apple_final:
        if row[11] != genre:
            continue  # app belongs to a different genre
        rating_sum += float(row[5])
        app_count += 1
    mean_ratings = rating_sum / app_count
    print('          Genre: ',genre)
    print('Average Ratings: ',round(mean_ratings,1))
    print('\n')

          Genre:  Medical
Average Ratings:  612.0


          Genre:  Social Networking
Average Ratings:  71548.3


          Genre:  Health & Fitness
Average Ratings:  23298.0


          Genre:  Book
Average Ratings:  39758.5


          Genre:  Shopping
Average Ratings:  26919.7


          Genre:  Entertainment
Average Ratings:  14029.8


          Genre:  Productivity
Average Ratings:  21028.4


          Genre:  News
Average Ratings:  21248.0


          Genre:  Catalogs
Average Ratings:  4004.0


          Genre:  Photo & Video
Average Ratings:  28441.5


          Genre:  Finance
Average Ratings:  31467.9


          Genre:  Music
Average Ratings:  57326.5


          Genre:  Travel
Average Ratings:  28243.8


          Genre:  Utilities
Average Ratings:  18684.5


          Genre:  Reference
Average Ratings:  74942.1


          Genre:  Navigation
Average Ratings:  86090.3


          Genre:  Business
Average Ratings:  7491.1


          Genre:  Food & Drink
Average Ratings:  

# App profile recommendation

Reference, Navigation, Social Networking, Music, and Weather all have a high number of users. Social networking, navigation, and music are all largely skewed by a couple of leading apps. Weather requires information that is not easily accessed and doesn't present much opportunity for profit. Therefore, I'd recommend a reference app, especially if the developer has expertise in some area that they can effectively share through the app.

In [12]:
#category analysis for google
cat_freq = freq_table(google_final, 1)

for cat in cat_freq:
    # Install counts are stored as text such as '10,000+'; strip the '+' and
    # the thousands separators before converting to a number.
    cat_installs = [
        float(row[5].replace('+', '').replace(',', ''))
        for row in google_final
        if row[1] == cat
    ]
    avg_installs = sum(cat_installs) / len(cat_installs)
    print('Category: ',cat)
    print('Average Installs: ', avg_installs)
    print('\n')

Category:  AUTO_AND_VEHICLES
Average Installs:  647317.8170731707


Category:  COMICS
Average Installs:  817657.2727272727


Category:  DATING
Average Installs:  854028.8303030303


Category:  BUSINESS
Average Installs:  1712290.1474201474


Category:  EVENTS
Average Installs:  253542.22222222222


Category:  HEALTH_AND_FITNESS
Average Installs:  4188821.9853479853


Category:  MAPS_AND_NAVIGATION
Average Installs:  4056941.7741935486


Category:  FAMILY
Average Installs:  3695641.8198090694


Category:  WEATHER
Average Installs:  5074486.197183099


Category:  PARENTING
Average Installs:  542603.6206896552


Category:  NEWS_AND_MAGAZINES
Average Installs:  9549178.467741935


Category:  TOOLS
Average Installs:  10801391.298666667


Category:  PRODUCTIVITY
Average Installs:  16787331.344927534


Category:  ENTERTAINMENT
Average Installs:  11640705.88235294


Category:  SPORTS
Average Installs:  3638640.1428571427


Category:  MEDICAL
Average Installs:  120550.61980830671


Category:  S

# Google analysis results

- Results are similar to the Apple store
- Most categories with high install counts are skewed by a couple of giants
- A reference app for a popular book would be a good idea, since that category is relatively popular without being completely dominated by one or two apps