# Profitable App Profiles for the App Store and Google Play Markets

This project intends to guide developers in what kind of projects to invest their time.

The goal is to understand what constitues a great app in the sense of attracting more users.

## Data Exploration

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]
    for row in dataset_slice:
        print(row)
        print('\n')
    
    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [10]:
from csv import reader
with open('AppleStore.csv') as f:
    apple_store = list(reader(f))
apple_store_header = apple_store[0]
apple_store_examples = apple_store[1:]

with open('googleplaystore.csv') as f:
    google_play = list(reader(f))
google_play_header = google_play[0]
google_play_examples = google_play[1:]

In [12]:
print(apple_store_header)
print('\n')
explore_data(apple_store_examples, 0, 5, True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


Number of rows: 7197
Number of columns: 16


In [14]:
print(google_play_header)
print('\n')
explore_data(google_play_examples, 0, 5, True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Eve

In [15]:
header_length = len(google_play_header)
for row in google_play_examples:
    row_length = len(row)
    if row_length != header_length:
        print(row)
        print(google_play_examples.index(row))

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
10472


In [16]:
del google_play_examples[10472]

## Data Cleaning

In [25]:
# Checking duplicates
apps = []
duplicate_apps = []

for example in google_play_examples:
    name = example[0]
    if name in apps:
        duplicate_apps.append(name)
    else:
        apps.append(name)

In [26]:
print('Number of unique apps:', len(apps) - len(duplicate_apps))
print('Number of duplicate apps:', len(duplicate_apps))

Number of unique apps: 8478
Number of duplicate apps: 1181


In [31]:
# Let's see some examples of duplicates
first_name = duplicate_apps[0]
print(first_name)
for example in google_play_examples:
    name = example[0]
    if name == first_name:
        print(example)

Quick PDF Scanner + OCR FREE
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80804', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']


In [32]:
print('Expected length:', len(google_play_examples) - len(duplicate_apps))

Expected length: 9659


In [63]:
reviews_max = {}
for example in google_play_examples:
    name = example[0]
    n_reviews = float(example[3])
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    else:
        reviews_max[name] = n_reviews

In [64]:
print(len(reviews_max))

9659


In [65]:
android_clean = [] # the new dataset to be created without duplicates
already_added = [] # for securing that any examples with same names are added
for example in google_play_examples:
    name = example[0]
    n_reviews = float(example[3])
    # if the review for this example is the maximum, most recent
    # and this example was not already added
    if n_reviews == reviews_max[name] and name not in already_added:
        android_clean.append(example)
        already_added.append(name)

In [66]:
len(android_clean)

9659

In [54]:
# Checking non-English apps
def detect_english_app(string):
    count = 0
    for c in string:
        if ord(c) > 127:
            count += 1
        if count > 3:
            return False
    return True

print(detect_english_app('Instagram'))
print(detect_english_app('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(detect_english_app('Docs To Go™ Free Office Suite'))
print(detect_english_app('Instachat 😜'))

True
False
True
True


In [67]:
android = []
apple = []
for example in android_clean:
    name = example[0]
    if detect_english_app(name):
        android.append(example)

for example in apple_store_examples:
    name = example[1]
    if detect_english_app(name):
        apple.append(example)

In [68]:
len(android)

9614

In [81]:
len(apple)

6183