# Profitable App Profiles for the App Store and Google Play Markets 
### By Jorge Hernandez Leon

This project is to find mobile app profiles that are profitable for the App Store and Google Play markets. Our job is to enable our team of developers to make data-driven decisions with respect to the kind of apps they build.

At our company, we only build apps that are free to download and install, and our main source of revenue consists of in-app ads. This means that our revenue for any given app is mostly influenced by the number of users that use our app. Our goal for this project is to analyze data to help our developers understand what kinds of apps are likely to attract more users.


In [10]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [2]:
from csv import reader
def open_dataset(file_name):
    
    #file_name = 'AppleStore.csv'
    opened_file = open(file_name)
    read_file = reader(opened_file)
    data = list(read_file)
    
    return data


In [3]:
#my data has a header 
Appleplay = open_dataset('AppleStore2.csv')
googleplay = open_dataset('googleplaystore.csv')

In [4]:
explore_data(googleplay, 0, 3, True)
for n in Appleplay:
    del n[0]
explore_data(Appleplay, 0, 3, True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of rows: 10842
Number of columns: 13
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['281656475', 'PAC-MAN Premium', '100788224', 'USD', '3.99', '21292', '26', '4', '4.5', '6.3.5', '4+', 'Games', '38', '5', '10', '1']


['281796108', 'Evernote - stay organized', '158578688', 'USD', '0', '161

In [5]:
print(googleplay[10473])
print(len(googleplay[10473]))

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
12


In [6]:
# deleting data function that isnt complete
del googleplay[10473]

In [44]:
#googleplay has a few repeating apps
def duplicate_apps(list1, row_name): 
    duplicate_apps = []
    unique_apps = []
    for app in list1:
        name = app[row_name]
        if name in unique_apps:
            duplicate_apps.append(name)
            
        else:
            unique_apps.append(name)
    return print('Examples of duplicate apps:',len(duplicate_apps)),print('Unique Apps:', len(unique_apps))


In [45]:
duplicate_apps(googleplay, 0)



Examples of duplicate apps: 1181
Unique Apps: 9660


(None, None)

In [46]:
duplicate_apps(Appleplay, 1)


Examples of duplicate apps: 2
Unique Apps: 7196


(None, None)

In [47]:
for app in Appleplay:
    name = app[1] # 1 because apple has an Id number
    if name == 'VR Roller Coaster':
        print(app)

['952877179', 'VR Roller Coaster', '169523200', 'USD', '0', '107', '102', '3.5', '3.5', '2.0.0', '4+', 'Games', '37', '5', '1', '1']
['1089824278', 'VR Roller Coaster', '240964608', 'USD', '0', '67', '44', '3.5', '4', '0.81', '4+', 'Games', '38', '0', '1', '1']


In [18]:
for app in googleplay:
    name = app[0]
    if name == 'Box':
        print(app)

['Box', 'BUSINESS', '4.2', '159872', 'Varies with device', '10,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Box', 'BUSINESS', '4.2', '159872', 'Varies with device', '10,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Box', 'BUSINESS', '4.2', '159872', 'Varies with device', '10,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 31, 2018', 'Varies with device', 'Varies with device']


In [19]:
print('Expected length:', len(googleplay) - 1181-1)
# We won't remove rows randomly, but rather we'll keep the rows that have the highest number of reviews because the higher the number of reviews, the more reliable the ratings.
# To do that, we will:
# Create a dictionary where each key is a unique app name, and the value is the highest number of reviews of that app
# Use the dictionary to create a new data set, which will have only one entry per app (and we only select the apps with the highest number of reviews)
reviews_max = {}
for app in googleplay[1:]:
    name = app[0]
    n_reviews = float(app[3])
    
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    elif name not in reviews_max:
        reviews_max[name] = n_reviews
print('Actual length:', len(reviews_max))


Expected length: 9659
Actual length: 9659


In [20]:
reviews_max2 = {}
for app in Appleplay[1:]:
    name = app[0]
    n_reviews = float(app[5])
    
    if name in reviews_max2 and reviews_max2[name] < n_reviews:
        reviews_max2[name] = n_reviews
    elif name not in reviews_max2:
        reviews_max2[name] = n_reviews
print('Actual length:', len(reviews_max2))

Actual length: 7197


### Using th above dictionary I will remove duplicates
#### Create a new list and add to already added if number of reviews is lower

In [23]:
googleplay_clean = []
already_added = []
for row in googleplay[1:]: # due to I have a header
    name = row[0]
    n_reviews = float(row[3])
    
    if (reviews_max[name] == n_reviews) and (name not in already_added):
            googleplay_clean.append(row)
            already_added.append(name)
print('Length of googleplay without repeats',len(googleplay_clean))
Appleplay_clean = []
already_added2 = []
for row in Appleplay[1:]: # due to I have a header
    name = row[0]
    n_reviews = float(row[5])
    
    if (reviews_max2[name] == n_reviews) and (name not in already_added2):
            Appleplay_clean.append(row)
            already_added2.append(name)
print('Length of Appleplay without repeats',len(Appleplay_clean))


Length of googleplay without repeats 9659
Length of Appleplay without repeats 7197


In [24]:
explore_data(googleplay_clean, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9659
Number of columns: 13


# Remove Non-English apps

In [25]:
def english_app(string):   
    
    for character in string:
        if ord(character) > 127:
           return False 
        
    return True
print(english_app('Instagram'))
print(english_app('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(english_app('Docs To Go™ Free Office Suite'))
print(english_app('Instachat 😜'))

True
False
False
False


### How to remove If the input string has more than three characters that fall outside the ASCII range (0 - 127)

In [26]:
def english_app2(string):   
    non_eng = 0
    
    for character in string:
        if ord(character) > 127:
           non_eng += 1
    if non_eng > 3:
        return False
    else:    
        return True
print(english_app2('Instagram'))
print(english_app2('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(english_app2('Docs To Go™ Free Office Suite'))
print(english_app2('Instachat 😜'))

True
False
True
True


In [53]:
googleplay_cleaner = []
for row in googleplay_clean:
    name = row[0]
    if english_app2(name):
        googleplay_cleaner.append(row)
print(len(googleplay_cleaner))        

9614


In [55]:
Appleplay_cleaner = []
for row in Appleplay_clean:
    name = row[1]
    if english_app2(name):
        Appleplay_cleaner.append(row)
print(len(Appleplay_cleaner)) 
print(Appleplay_cleaner[0])

6183
['281656475', 'PAC-MAN Premium', '100788224', 'USD', '3.99', '21292', '26', '4', '4.5', '6.3.5', '4+', 'Games', '38', '5', '10', '1']


# Free Apps

In [62]:
header = Appleplay[0]
for i, item in enumerate(header, 0):
    print(i, '.'+ item, sep='', end='')
    print('\n')

0.id

1.track_name

2.size_bytes

3.currency

4.price

5.rating_count_tot

6.rating_count_ver

7.user_rating

8.user_rating_ver

9.ver

10.cont_rating

11.prime_genre

12.sup_devices.num

13.ipadSc_urls.num

14.lang.num

15.vpp_lic



In [64]:
header2 = googleplay[0]
for i, item in enumerate(header2, 0):
    print(i, '.'+ item, sep='', end='')
    print('\n')

0.App

1.Category

2.Rating

3.Reviews

4.Size

5.Installs

6.Type

7.Price

8.Content Rating

9.Genres

10.Last Updated

11.Current Ver

12.Android Ver



In [66]:
free_apps = []
for apps in Appleplay_cleaner:
    price_app = (apps[4])
    if price_app == 0:
        free_apps.append(apps)
print(len(free_apps))

3222


In [97]:
free_apps = []
for apps in googleplay_cleaner:
    price_app = (apps[7])
    if price_app == '0':
        free_apps.append(apps)
print(len(free_apps))
# when ran the first time a price was $4.99 so ran the next set oc code

8864


In [None]:
def freq_table(dataset, index):
    freq_table1 = {}
    