# Analyzing Mobile App Data
- The project is about mobile app data analysis for a company that builds Android and iOS mobile apps. 
- The goal for the project is to analyze data to help our developers understand what type of apps are likely to attract more users

In [1]:
from csv import reader

# iOS apps: read every row of the CSV, then split the header row from the data rows.
# The `with` block closes the file handle as soon as the rows are in memory.
# The encoding is given explicitly so decoding of non-ASCII app names does not
# depend on the platform default, and newline='' is what the csv module asks for
# so that newlines embedded in quoted fields are handled by the reader itself.
with open('AppleStore.csv', encoding='utf8', newline='') as opened_ios_file:
    read_ios_file = reader(opened_ios_file)
    ios_apps_data = list(read_ios_file)
ios_header = ios_apps_data[0]
ios_apps_data = ios_apps_data[1:]

# Android apps: same procedure for the Google Play file.
with open('googleplaystore.csv', encoding='utf8', newline='') as opened_android_file:
    read_android_file = reader(opened_android_file)
    android_apps_data = list(read_android_file)
android_header = android_apps_data[0]
android_apps_data = android_apps_data[1:]


In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its overall dimensions.

    Args:
        dataset: Sequence of rows, where each row is itself a sequence.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When True, also print the total number of rows and
            the number of columns (measured on the first row).

    Returns:
        None. Everything is written to standard output.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # add a new line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # instead of raising IndexError on dataset[0].
        n_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns:', n_columns)
print(ios_header)
print('\n')
explore_data(ios_apps_data, 0, 3, True)



['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16


In [3]:
print(android_header)
print('\n')
explore_data(android_apps_data, 0, 3, True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


# Data cleaning
- **Data Cleaning** is the process of preparing data before analysis. It includes removing or correcting wrong data, removing duplicate data, and modifying the data to fit the purpose of our analysis.

###     Delete incorrect data

In [4]:
print(android_apps_data[10472])
print(android_header)
print(android_apps_data[0])
# The row 10472 corresponds to the app Life Made WI-Fi Touchscreen Photo Frame, 
# and we can see that the rating is 19. This is clearly off because the maximum rating 
# for a Google Play app is 5 (as mentioned in the discussions section, this problem is 
# caused by a missing value in the 'Category' column). As a consequence, we'll delete this row.

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


In [5]:
print(len(android_apps_data))
# Row 10472 has fewer fields than the header (its 'Category' value is missing).
# Delete it only while it is still malformed: an unconditional `del` would
# remove a valid, unrelated row each time this cell is run again.
if len(android_apps_data[10472]) != len(android_header):
    del android_apps_data[10472]
print(len(android_apps_data))


10841
10840


### Finding Duplicate Entries
- In the Google Play dataset, some apps have duplicate entries.
- As a result, we need to remove the duplicates in order to get accurate results when analyzing.
- However, we do not want to remove them randomly. For instance, the Instagram entries differ only in the Reviews column. We should keep the entry with the highest number of reviews, since a higher review count indicates more recent data.
- Now, let's demonstrate it in the code:

In [6]:
# Show every row whose app name (column 0) is exactly 'Instagram'.
instagram_rows = [app for app in android_apps_data if app[0] == 'Instagram']
for row in instagram_rows:
    print(row)

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


In [7]:
# Count the duplicate entries: every occurrence of an app name after its first one.
duplicate_apps = []  # one entry per repeated occurrence of a name
unique_apps = []     # each distinct name once, in first-seen order
seen_names = set()   # mirrors unique_apps; gives constant-time membership checks
                     # instead of scanning the growing list for every row

for app in android_apps_data:
    name = app[0]
    if name in seen_names:  # this name has already been recorded
        duplicate_apps.append(name)  # so this row is a duplicate occurrence
    else:
        seen_names.add(name)
        unique_apps.append(name)
print('Number of duplicate apps: ', len(duplicate_apps))

Number of duplicate apps:  1181


### Removing duplicates
- To remove the duplicates, we will do the following:

    - Create a dictionary, where each dictionary key is a unique app name and the corresponding dictionary value is the highest number of reviews of that app.
    - Use the information stored in the dictionary and create a new dataset, which will have only one entry per app (and for each app, we'll only select the entry with the highest number of reviews).

In [8]:
# Map each app name to the largest review count found among its rows.
reviews_max = {}
for app in android_apps_data:
    name, n_reviews = app[0], float(app[3])
    # Record the count for a name seen for the first time, or replace the
    # stored count when this row has strictly more reviews.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
        

In [9]:
print(len(android_apps_data))
print(len(reviews_max))

10840
9659


- Now, let's use the reviews_max dictionary to remove the duplicates.
- For the duplicate cases, we'll only keep the entries with the highest number of reviews. In the code cell below:

    - We start by initializing two empty lists, android_clean and already_added.
    - We loop through the android data set, and for every iteration:
        - We isolate the name of the app and the number of reviews.
        - We add the current row (app) to the android_clean list, and the app name (name) to the already_added list if:
            - The number of reviews of the current app matches the number of reviews of that app as described in the reviews_max dictionary; and
            - The name of the app is not already in the already_added list. We need to add this supplementary condition to account for those cases where the highest number of reviews of a duplicate app is the same for more than one entry (for example, the Box app has three entries, and the number of reviews is the same). If we just check for reviews_max[name] == n_reviews, we'll still end up with duplicate entries for some apps.

In [10]:
# Keep exactly one row per app: the first row whose review count equals the
# maximum recorded for that app in reviews_max.
android_clean = []
already_added = []
added_lookup = set()  # same names as already_added; constant-time membership
                      # checks instead of scanning the list for every row
for app in android_apps_data:
    name = app[0]
    n_reviews = float(app[3])
    # The second condition skips later rows that tie on the maximum review count.
    if n_reviews == reviews_max[name] and name not in added_lookup:
        android_clean.append(app)
        already_added.append(name)
        added_lookup.add(name)
    

In [11]:
print(len(android_clean))
explore_data(android_clean, 0, 3, True)

9659
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9659
Number of columns: 13


### Removing Non-English Apps

In [12]:
def english_recognizer(a_string):
    """Return True only if every character's code point is 127 or lower.

    An empty string yields True because there is no character to reject.
    """
    return all(ord(char) <= 127 for char in a_string)
    

In [13]:
english_recognizer('Instagram')

True

In [14]:
english_recognizer('爱奇艺PPS -《欢乐颂2》电视剧热播')

False

In [15]:
english_recognizer('Docs To Go™ Free Office Suite')

False

In [16]:
english_recognizer('Instachat 😜')

False

- The problem with the **english_recognizer** function is that it rejects app names that contain an emoji or a symbol such as ™, even when the rest of the name is English
- Consequently, if we use it, we will lose useful data
- To reduce the impact of data loss, we will treat an app as English if its name contains no more than three emoji or other special (non-ASCII) characters

In [17]:
def is_english(a_string, max_non_ascii=3):
    """Decide whether ``a_string`` should be treated as an English name.

    Characters whose code point is above 127 are counted as non-ASCII. A few
    of them (emoji, ™, ...) are tolerated so that otherwise English names are
    not thrown away.

    Args:
        a_string: The text to inspect.
        max_non_ascii: Largest number of non-ASCII characters still accepted.
            Defaults to 3.

    Returns:
        False as soon as more than ``max_non_ascii`` non-ASCII characters have
        been seen, True otherwise (including for an empty string).
    """
    count = 0
    for char in a_string:
        if ord(char) > 127:
            count += 1
            if count > max_non_ascii:
                return False  # stop early; the rest of the string cannot change the answer
    return True
    

In [18]:
is_english('爱奇艺PPS -《欢乐颂2》电视剧热播')

False

In [19]:
is_english('Docs To Go™ Free Office Suite')

True

In [20]:
is_english('Instachat 😜')

True

In [21]:
# Android: the app name is in column 0.
english_android = [app for app in android_clean if is_english(app[0])]

# iOS: the app name is in column 1.
english_ios = [app for app in ios_apps_data if is_english(app[1])]

In [22]:
explore_data(english_android, 0, 3, True)


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9614
Number of columns: 13


In [23]:
explore_data(english_ios, 0, 3, True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 6183
Number of columns: 16


### Isolating the free apps
- The datasets contain both free and non-free apps, we'll need to isolate only the free apps for our analysis

In [24]:
# Free Android apps: column 7 ('Price') holds the string '0'.
free_android = [app for app in english_android if app[7] == '0']

# Free iOS apps: column 4 ('price') holds the string '0.0'.
free_ios = [app for app in english_ios if app[4] == '0.0']

print(len(free_android))
print(len(free_ios))


8864
3222


### Most Common Apps by Genre
- Recall the goal: determine the kinds of apps that are likely to attract more users
- To minimize risks and overhead, our validation strategy for an app idea is comprised of three steps:
    - Build a minimal Android version of the app, and add it to Google Play.
    - If the app has a good response from users, we develop it further.
    - If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

In [25]:
ios_header

['id',
 'track_name',
 'size_bytes',
 'currency',
 'price',
 'rating_count_tot',
 'rating_count_ver',
 'user_rating',
 'user_rating_ver',
 'ver',
 'cont_rating',
 'prime_genre',
 'sup_devices.num',
 'ipadSc_urls.num',
 'lang.num',
 'vpp_lic']

In [26]:
android_header

['App',
 'Category',
 'Rating',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated',
 'Current Ver',
 'Android Ver']

- For Android apps, the columns we can use to build frequency tables of the most common genres are Genres and Category
- Similarly, for the iOS dataset the relevant column is prime_genre

- We'll build two functions we can use to analyze the frequency tables:
    - One function to generate frequency tables that show percentages
    - Another function we can use to display percentages in a descending order
        - we will take advantage of sorted() function. The sorted() function works well if we transform the dictionary into a list of tuples, where each tuple contains a dictionary key along with its corresponding dictionary value. To ensure the sorting works right, the dictionary value comes first, and the dictionary key comes second

In [27]:
# A percentage needs two numbers: how many rows there are in total and how
# many of them carry each distinct value.
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of ``dataset``.

    Args:
        dataset: Iterable of rows, each indexable by ``index``.
        index: Position of the column to tabulate.

    Returns:
        A dict mapping each distinct value found at ``row[index]`` to the
        percentage (0-100, float) of rows that hold it. Keys appear in
        first-seen order. An empty dataset gives an empty dict.
    """
    counts = {}
    n_rows = 0
    for row in dataset:
        n_rows += 1
        cell = row[index]
        counts[cell] = counts.get(cell, 0) + 1

    return {cell: (count / n_rows) * 100 for cell, count in counts.items()}

# Print a frequency table, largest percentage first.
def display_table(dataset, index):
    """Print the percentage frequency table of one column in descending order.

    Args:
        dataset: Rows to tabulate; passed straight to ``freq_table``.
        index: Position of the column to tabulate.

    Each output line has the form ``value : percentage``. Entries are ordered
    as (percentage, value) tuples in reverse, so equal percentages are ordered
    by value, also descending.
    """
    percentages = freq_table(dataset, index)
    # The percentage goes first in each tuple so that it drives the ordering.
    ranked = [(pct, value) for value, pct in percentages.items()]
    ranked.sort(reverse=True)
    for pct, value in ranked:
        print(value, ':', pct)

In [28]:
display_table(free_ios, 11)

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


- What is the most common genre? What is the next most common?
    - Games is the most common genre. The next most common is Entertainment
- What other patterns do you see?
    - Photo & Video is 4.96% in total, next is Education
- What is the general impression — are most of the apps designed for practical purposes (education, shopping, utilities, productivity, lifestyle) or more for entertainment (games, photo and video, social networking, sports, music)?
    - The general impression is that most of the apps are designed for entertainment, such as games, photo and video, social networking, sports, and music.
    - However, the fact that fun apps are the most numerous doesn't imply that they also have the greatest number of users — the demand might not be the same as the offer.



- Below is the same frequency table for the Android dataset:

In [29]:
display_table(free_android, 1)

FAMILY : 18.907942238267147
GAME : 9.724729241877256
TOOLS : 8.461191335740072
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.7937725631768955
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.1620036101083033
ENTERTAINMENT : 0.9589350180505415
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 

### Most Popular Apps by Genre on the App Store
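- One way to find out which genres are the most popular (have the most users) is to calculate the average number of installs for each app genre. The App Store dataset has no installs column, so for iOS apps we use the total number of user ratings (rating_count_tot) as a proxy.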

In [30]:
# Average number of user ratings per genre for the free iOS apps.
IOS_RATING_COUNT_INDEX = 5  # 'rating_count_tot': number of ratings, not the rating itself
IOS_GENRE_INDEX = 11        # 'prime_genre'; same column display_table(free_ios, 11) uses

genres_ios = freq_table(free_ios, IOS_GENRE_INDEX)  # get unique app genres

# One pass over the apps accumulates every genre's totals, instead of
# rescanning the whole dataset once per genre.
rating_totals = {}  # genre -> sum of rating counts
app_counts = {}     # genre -> number of apps specific to that genre
for app in free_ios:
    genre_app = app[IOS_GENRE_INDEX]
    n_rating = float(app[IOS_RATING_COUNT_INDEX])
    rating_totals[genre_app] = rating_totals.get(genre_app, 0) + n_rating
    app_counts[genre_app] = app_counts.get(genre_app, 0) + 1

for genre in genres_ios:
    avg = rating_totals[genre] / app_counts[genre]
    print(genre, ': ', avg)
    


Social Networking :  71548.34905660378
Photo & Video :  28441.54375
Games :  22788.6696905016
Music :  57326.530303030304
Reference :  74942.11111111111
Health & Fitness :  23298.015384615384
Weather :  52279.892857142855
Utilities :  18684.456790123455
Travel :  28243.8
Shopping :  26919.690476190477
News :  21248.023255813954
Navigation :  86090.33333333333
Lifestyle :  16485.764705882353
Entertainment :  14029.830708661417
Food & Drink :  33333.92307692308
Sports :  23008.898550724636
Book :  39758.5
Finance :  31467.944444444445
Education :  7003.983050847458
Productivity :  21028.410714285714
Business :  7491.117647058823
Catalogs :  4004.0
Medical :  612.0


In [39]:
# Average number of installs per category for the free Android apps.
ANDROID_CATEGORY_INDEX = 1  # 'Category'
ANDROID_INSTALLS_INDEX = 5  # 'Installs', stored as text such as '10,000+'

category_table = freq_table(free_android, ANDROID_CATEGORY_INDEX)

# One pass over the apps accumulates every category's totals, instead of
# rescanning the whole dataset once per category.
install_totals = {}  # category -> sum of install counts
app_counts = {}      # category -> number of apps in that category
for app in free_android:
    category_app = app[ANDROID_CATEGORY_INDEX]
    # Strip the '+' suffix and the thousands separators so the text parses as an int.
    n_install = app[ANDROID_INSTALLS_INDEX].replace('+', '').replace(',', '')
    install_totals[category_app] = install_totals.get(category_app, 0) + int(n_install)
    app_counts[category_app] = app_counts.get(category_app, 0) + 1

for category in category_table:
    avg = install_totals[category] / app_counts[category]
    print(category, ': ', avg)
            

ART_AND_DESIGN :  1986335.0877192982
AUTO_AND_VEHICLES :  647317.8170731707
BEAUTY :  513151.88679245283
BOOKS_AND_REFERENCE :  8767811.894736841
BUSINESS :  1712290.1474201474
COMICS :  817657.2727272727
COMMUNICATION :  38456119.167247385
DATING :  854028.8303030303
EDUCATION :  1833495.145631068
ENTERTAINMENT :  11640705.88235294
EVENTS :  253542.22222222222
FINANCE :  1387692.475609756
FOOD_AND_DRINK :  1924897.7363636363
HEALTH_AND_FITNESS :  4188821.9853479853
HOUSE_AND_HOME :  1331540.5616438356
LIBRARIES_AND_DEMO :  638503.734939759
LIFESTYLE :  1437816.2687861272
GAME :  15588015.603248259
FAMILY :  3695641.8198090694
MEDICAL :  120550.61980830671
SOCIAL :  23253652.127118643
SHOPPING :  7036877.311557789
PHOTOGRAPHY :  17840110.40229885
SPORTS :  3638640.1428571427
TRAVEL_AND_LOCAL :  13984077.710144928
TOOLS :  10801391.298666667
PERSONALIZATION :  5201482.6122448975
PRODUCTIVITY :  16787331.344927534
PARENTING :  542603.6206896552
WEATHER :  5074486.197183099
VIDEO_PLAYERS 