# Profitable App Profiles for the App Store and Google Play Markets

In [1]:
from csv import reader

# Apple App Store data
# The file is read inside a `with` block so its handle is closed as soon as
# the rows are in memory. The encoding is pinned because app names contain
# non-ASCII characters and the platform default encoding is not always UTF-8
# (assumes the Kaggle CSVs are UTF-8 encoded -- TODO confirm). `newline=''`
# is what the csv module asks for so quoted fields with embedded line breaks
# are parsed correctly.
with open('AppleStore.csv', encoding='utf8', newline='') as apple_file:
    apple = reader(apple_file)
    ios = list(apple)
ios_header = ios[0]  # first row holds the column names
ios_apps = ios[1:]   # every remaining row describes one app

# Google Play data
with open('googleplaystore.csv', encoding='utf8', newline='') as google_file:
    google = reader(google_file)
    android = list(google)
android_header = android[0]  # first row holds the column names
android_apps = android[1:]   # every remaining row describes one app

In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` and optionally report the size.

    Args:
        dataset: A list of rows (each row itself a list of values).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, return the dimensions of the WHOLE
            dataset (not of the printed slice).

    Returns:
        A ``(number_of_rows, number_of_columns)`` tuple when
        ``rows_and_columns`` is true, otherwise ``None``. The column count is
        taken from the first row; an empty dataset yields ``(0, 0)``.
    """
    for row in dataset[start:end]:
        print(row)
        # print('\n') writes the newline character plus print's own line
        # terminator, so two blank lines separate consecutive rows.
        print('\n')

    if rows_and_columns:
        n_rows = len(dataset)
        # An empty dataset has no first row to measure; report zero columns
        # instead of raising IndexError.
        n_cols = len(dataset[0]) if n_rows else 0
        return (n_rows, n_cols)

In [3]:
# Show the first five iOS rows and keep the (rows, columns) size of the
# whole ios_apps dataset for later display
ios_apps_size = explore_data(ios_apps, start=0, end=5, rows_and_columns=True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']




In [4]:
# Show the first five Google Play rows and keep the (rows, columns) size of
# the whole android_apps dataset for later display
android_apps_size = explore_data(
    android_apps, start=0, end=5, rows_and_columns=True
)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']




In [5]:
# Column names of the iOS dataset (the header row split off at load time).
# Left as the cell's final bare expression so the notebook renders the list.
ios_header

['id',
 'track_name',
 'size_bytes',
 'currency',
 'price',
 'rating_count_tot',
 'rating_count_ver',
 'user_rating',
 'user_rating_ver',
 'ver',
 'cont_rating',
 'prime_genre',
 'sup_devices.num',
 'ipadSc_urls.num',
 'lang.num',
 'vpp_lic']

In [6]:
# The size (#rows, #cols) of the ios_apps dataset as returned by
# explore_data, i.e. measured before any cleaning step has run
ios_apps_size

(7197, 16)

In [7]:
# Column names of the Google Play dataset (the header row split off at load
# time). Left as the cell's final bare expression so the notebook renders it.
android_header

['App',
 'Category',
 'Rating',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated',
 'Current Ver',
 'Android Ver']

In [8]:
# The size (#rows, #cols) of the android_apps dataset as returned by
# explore_data, i.e. measured before any cleaning step has run
android_apps_size

(10841, 13)

## Data Cleaning

#### Google Play Store
From the discussion on Kaggle.com, the app located at index 10472 ('Life Made Wi-Fi Touchscreen Photo Frame') is missing a value for the 'Category' column. Doing a little research, I was able to deduce that this is a 'Lifestyle' category app. 
See [discussion](https://www.kaggle.com/lava18/google-play-store-apps/discussion/66015).

Rather than deleting this datapoint, I will insert the value for 'Category'.

#### Apple App Store
After reading through some of the discussion found on Kaggle.com, it appears that the two pairs of apps that share a name (VR Roller Coaster and Mannequin Challenge) are in fact distinct, unique apps.
See [discussion](https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps/discussion/90409).

Rather than remove these two datapoints, I will keep both of them.

In [9]:
# Inspect the suspect row before repairing it. Per the recorded output it
# holds 12 values while the header lists 13 columns: the 'Category' value is
# absent, so the rating '1.9' sits where the category should be.
android_apps[10472]

['Life Made WI-Fi Touchscreen Photo Frame',
 '1.9',
 '19',
 '3.0M',
 '1,000+',
 'Free',
 '0',
 'Everyone',
 '',
 'February 11, 2018',
 '1.0.19',
 '4.0 and up']

In [10]:
# The row at index 10472 was exported without its 'Category' value, which
# leaves every later field one column to the left. Re-insert the category at
# position 1. The width guard makes the cell idempotent: running it a second
# time must not insert another value and push the row past the header width.
# NOTE(review): other rows spell categories in upper case (e.g.
# 'ART_AND_DESIGN'); confirm whether this should be 'LIFESTYLE' so the row is
# grouped with the rest of that category in later frequency counts.
if len(android_apps[10472]) < len(android_header):
    android_apps[10472].insert(1, 'Lifestyle')

In [11]:
# Display the same row again to check that the category now occupies
# position 1 and the row has the same number of values as the header.
android_apps[10472]

['Life Made WI-Fi Touchscreen Photo Frame',
 'Lifestyle',
 '1.9',
 '19',
 '3.0M',
 '1,000+',
 'Free',
 '0',
 'Everyone',
 '',
 'February 11, 2018',
 '1.0.19',
 '4.0 and up']

In [12]:
# function returns a list of duplicate apps by name
def duplicate_apps(data, name_index=0):
    """Return the app name of every row that repeats an earlier row's name.

    Args:
        data: A list of rows (each row a list of values).
        name_index: Column position of the app name within a row.

    Returns:
        A list of names in encounter order. The first occurrence of a name is
        not reported; each later occurrence adds the name once more, so a
        name appearing three times is listed twice.
    """
    # A set gives constant-time membership tests; the previous list-based
    # lookup rescanned every name seen so far for each row. Names must be
    # hashable, which holds for the strings produced by csv.reader.
    seen_names = set()
    dup_apps = []

    for app in data:
        name = app[name_index]
        if name in seen_names:
            dup_apps.append(name)
        else:
            seen_names.add(name)

    return dup_apps

In [13]:
# Google Play rows keep the app name in column 0
dup_apps_android = duplicate_apps(android_apps, name_index=0)
print('Google Play Store')
print('Number of duplicate apps: ', len(dup_apps_android))

Google Play Store
Number of duplicate apps:  1181


In [14]:
# App Store rows keep the app name ('track_name') in column 1
dup_apps_ios = duplicate_apps(ios_apps, name_index=1)
print('Apple App Store')
print('Number of duplicate apps: ', len(dup_apps_ios))

Apple App Store
Number of duplicate apps:  2


### Remove Duplicate Apps

#### Google Play Store
It appears that there are a lot of duplicate apps in the android_apps dataset. For each app name, I will remove the duplicate rows and keep only the one with the highest number of reviews.

#### Apple App Store
Since there are two sets of duplicates found in the ios_apps dataset, these must be the two sets of applications with the same names (as mentioned above). I will not remove these as they appear to be unique.

In [15]:
# function to remove duplicates by selecting the row with the
# highest number of reviews (i.e. the most up-to-date row)
# data is a 2D array (a list of lists)
# name_index and n_reviews_index are both integers
# returns a 2d array without any duplicates
def remove_duplicates(data, name_index, n_reviews_index):
    """Keep one row per app name: the one with the most reviews.

    Args:
        data: A list of rows (each row a list of values).
        name_index: Column position of the app name.
        n_reviews_index: Column position of the review count; the value must
            be parseable by ``float``.

    Returns:
        A new list holding, for each name, the first row whose review count
        equals that name's maximum. Rows keep their relative order from
        ``data`` and are the same row objects (not copies).

    Raises:
        ValueError: If a review count cannot be converted by ``float``.
    """
    # Pass 1: highest review count seen for each app name.
    reviews_max = {}
    for row in data:
        name = row[name_index]
        n_reviews = float(row[n_reviews_index])
        if name not in reviews_max or reviews_max[name] < n_reviews:
            reviews_max[name] = n_reviews

    # Pass 2: keep the first row that reaches its name's maximum. The set
    # covers ties (several rows sharing the maximum) and gives constant-time
    # membership tests, unlike the list used previously.
    clean_data = []
    already_added = set()
    for row in data:
        name = row[name_index]
        n_reviews = float(row[n_reviews_index])
        if n_reviews == reviews_max[name] and name not in already_added:
            clean_data.append(row)
            already_added.add(name)

    return clean_data

In [16]:
# De-duplicate the Google Play rows: the app name is column 0 and the
# review count is column 3
android_clean = remove_duplicates(
    android_apps, name_index=0, n_reviews_index=3
)
print('The new android dataset (w/o duplicates) has',
      len(android_clean), 'rows.')

The new android dataset (w/o duplicates) has 9660 rows.


In [17]:
# No de-duplication is applied to the iOS data. Note that ios_clean is a
# second name for the very same list object as ios_apps, not a copy, so an
# in-place change made through either name is visible through the other.
ios_clean = ios_apps
print('This new ios_clean dataset is the same as ios_apps dataset and has', 
      len(ios_clean), 'rows.')

This new ios_clean dataset is the same as ios_apps dataset and has 7197 rows.


### Remove Non-English Apps

The way we have chosen to remove non-English apps is by creating a rule in which we count the number of non-English characters (those whose code falls outside the 0-127 ASCII range) and, if the name of the app has more than 3 such characters, we remove it from our datasets. 

This rule is not perfect, since we may remove apps that are still English but have many emojis in their name. Or the reverse: we may keep apps that are in other languages but use mostly English characters (e.g. German or French). This rule should be good enough for the time being.

In [18]:
# function which takes a string and returns False if more than 3
# characters in the string fall outside the ASCII range (code points 0-127)
def english_detector(string, max_non_ascii=3):
    """Heuristically decide whether ``string`` looks like English text.

    Characters with a code point above 127 (outside ASCII) are counted as
    non-English. A few of them are tolerated so that names containing an
    emoji or a symbol such as a trademark sign are not rejected.

    Args:
        string: The text to check, e.g. an app name.
        max_non_ascii: Largest number of non-ASCII characters still accepted.
            Defaults to 3.

    Returns:
        ``True`` if at most ``max_non_ascii`` characters are non-ASCII,
        ``False`` otherwise. An empty string returns ``True``.
    """
    non_english_char_count = sum(1 for char in string if ord(char) > 127)
    return non_english_char_count <= max_non_ascii

In [27]:
# Keep only the Google Play rows whose app name (column 0) passes
# english_detector. The detector already returns a bool, so it is used
# directly as the filter condition instead of being compared with True.
android_clean_english = [
    row for row in android_clean if english_detector(row[0])
]

print('''The new android dataset (w/o duplicates and non-English names) 
has''', len(android_clean_english), 'rows.')

The new android dataset (w/o duplicates and non-English names) 
has 9615 rows.


In [28]:
# Keep only the App Store rows whose app name (column 1, 'track_name')
# passes english_detector. The detector already returns a bool, so it is
# used directly as the filter condition instead of being compared with True.
ios_clean_english = [
    row for row in ios_clean if english_detector(row[1])
]

print('''The new ios dataset (w/o duplicates and non-English names) 
has''', len(ios_clean_english), 'rows.')

The new ios dataset (w/o duplicates and non-English names) 
has 6183 rows.


### Remove Non-Free Apps

We are only concerned with free apps and will remove all apps that require payment to download.

In [24]:
# A Google Play app counts as free when its 'Type' field (column 6) reads
# exactly 'Free'
android_clean_english_free = [
    app for app in android_clean_english if app[6] == 'Free'
]

print('''The new android dataset (w/o duplicates, non-English names
and non-Free apps) has''', len(android_clean_english_free), 'rows.')

The new android dataset (w/o duplicates, non-English names
and non-Free apps) has 8864 rows.


In [25]:
# An App Store app counts as free when its 'price' field (column 4) parses
# to zero
ios_clean_english_free = [
    app for app in ios_clean_english if float(app[4]) == 0
]

print('''The new ios dataset (w/o duplicates and non-English names
and non-Free apps) has''', len(ios_clean_english_free), 'rows.')

The new ios dataset (w/o duplicates and non-English names
and non-Free apps) has 3222 rows.
