# Google and Apple Store Analysis

Find out most downloaded genres + statistics in both stores

### Importing Datasets

In [1]:
import csv

ios = list(csv.reader(open("Datasets/DSAppleStore.csv", encoding="utf8")))
android = list(csv.reader(open("Datasets/DSgoogleplaystore.csv", encoding="utf8")))


In [2]:
print(len(android))
print(len(ios))

10842
7198


### Function to explore data

In [3]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]
    for row in dataset_slice:
        print(row)
        print('\n')
        
    if rows_and_columns:
        print("Number of rows: ", len(dataset))
        print("Number of columns: ", len(dataset[0]))

In [4]:

google = explore_data(android, 0, 2) 
apple = explore_data(ios, 0, 2)

# Take header off

android = android[1:]
ios = ios[1:]
print(len(android))
print(len(ios))

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


10841
7197


# Cleaning Data

In [5]:
del android[10472]

### Duplicates

In [6]:

def eliminate_dup(dataset, name_idx, rev_idx):
    max_rev_idx = {} #Store index of max rev
    max_rev = {}
    ds = list(dataset)
    counter = 0
    for row in dataset:
        app_name = row[name_idx]
        
        # If app not found before, add to max_rev with its nº of reviews
        if app_name not in max_rev:
            max_rev[app_name] = int(row[rev_idx])
            counter +=1
            max_rev_idx[app_name] = counter
        # If nº reviews of new found app > previous, update max nº rev and delet row with smallest review
        # stored app review < new app rev
        elif(int(max_rev[app_name]) < int(row[rev_idx])):
                max_rev[app_name] = row[rev_idx]
                # Delete old app row
                del ds[max_rev_idx[app_name]]
                max_rev_idx[app_name] = counter
        # stored app review > new app rev
        else:
            del ds[counter]
        
    return ds

In [7]:
android_clean = eliminate_dup(android,0, 3)
print(len(android_clean))

9659


### Eliminate non-English apps

In [8]:
print(android_clean[4412][0])

中国語 AQリスニング


In [9]:
def filter_english(dataset, name_idx=0):
    ds = list(dataset)
    for row in dataset:
        counter = 0
        #Check each character in each name
        for char in row[name_idx]:
            if(ord(char)>127):
                counter += 1
                if(counter>3):
                    del ds[ds.index(row)]
                    break
    return ds
    

In [10]:
# Check if function works
lst = [['Instagram'], ['爱奇艺PPS -《欢乐颂2》电视剧热播'], ['Docs To Go™ Free Office Suite'], ['Instachat 😜']]
print(filter_english(lst))


[['Instagram'], ['Docs To Go™ Free Office Suite'], ['Instachat 😜']]


In [11]:
print(len(android_clean))

android_clean = filter_english(android_clean)
print(len(android_clean))

ios_clean = filter_english(ios, name_idx=1)
print(len(ios_clean))

9659
9614
6183


### Filter Free apps

In [12]:
def filter_free(dataset, key_price, type_idx):
    ds = list(dataset)
    for row in dataset:
        if(row[type_idx] != key_price):
            del ds[ds.index(row)]
    return ds
            

In [13]:
print(len(android_clean))
android_clean = filter_free(android_clean, "0", 7)
print(len(android_clean))

print(len(ios_clean))
ios_clean = filter_free(ios_clean, "0.0" , 4)
print(len(ios_clean))

9614
8875
6183
3222


# Analyzing Data

1. We want to find frequency for each genre in play store and apple store.


### Frequency Play and App store

In [14]:
print(android[0])
print(android[1])
print('\n')
print(ios[0])
print(ios[1])

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']
['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']
['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


In [15]:
# Categories table
android_categories = {}
ios_categories = {}
android_genre = {}

# Categories total length
ios_categories_length = {}
android_categories_length = {}

def category_table(dataset, category_idx, categories, categories_length = {}, rating_idx=0, install_idx=0):
    for row in dataset:
        category = row[category_idx].lower()
        if category in categories:
            categories[category] +=1
        else:
            categories[category] = 1
            
        # Total length of all genres (ios)
        if(rating_idx != 0):
            if category in categories_length:
                categories_length[category] += float(row[rating_idx])
            else:
                categories_length[category] = float(row[rating_idx])
        
        # Total installs per genre (android)
        elif install_idx !=0:
            n_installs_str = row[install_idx]
            installs = n_installs_str.replace('+', '').replace(',', '')
            if category in categories_length:
                categories_length[category] += float(installs)
            else:
                categories_length[category] = float(installs)
    
    # Sort data descending order
    
    return sorted(categories.items(), key = lambda x: x[1], reverse=True)
            
# android category_idx=1
android_sorted = category_table(android_clean, 1, android_categories, categories_length = android_categories_length, install_idx=5)

# android genre: category_idx=9
android_genre_sorted = category_table(android_clean, 9, android_genre)

# ios category_idx=11
ios_sorted = category_table(ios_clean, 11, ios_categories, categories_length = ios_categories_length, rating_idx=5)


### Displaying Data

In [16]:
# Display all data + percentages
def display_data(data,dataset):
    print(len(dataset))
    for elem in data:
        print(elem[0] + ': ' + str(elem[1]))
    print("---------------")
    for elem in data:
        print(elem[0] + ': '+ str((elem[1]/len(dataset))*100))
        
    print('\n\n')
    
print(len(ios_sorted))
display_data(android_sorted, android_clean)
display_data(ios_sorted, ios_clean)
display_data(android_genre_sorted, android_clean)

23
8875
family: 1673
game: 878
tools: 747
business: 406
lifestyle: 344
productivity: 344
finance: 332
medical: 308
sports: 302
personalization: 294
communication: 292
health_and_fitness: 266
photography: 264
news_and_magazines: 249
social: 242
travel_and_local: 205
shopping: 194
books_and_reference: 191
dating: 162
video_players: 160
maps_and_navigation: 123
food_and_drink: 114
education: 105
entertainment: 91
auto_and_vehicles: 82
libraries_and_demo: 82
house_and_home: 75
weather: 72
parenting: 58
art_and_design: 57
comics: 55
events: 55
beauty: 53
---------------
family: 18.850704225352114
game: 9.892957746478872
tools: 8.416901408450704
business: 4.574647887323944
lifestyle: 3.876056338028169
productivity: 3.876056338028169
finance: 3.7408450704225356
medical: 3.470422535211268
sports: 3.4028169014084506
personalization: 3.3126760563380286
communication: 3.2901408450704226
health_and_fitness: 2.9971830985915493
photography: 2.9746478873239437
news_and_magazines: 2.8056338028169012
s

### Most used apps per genre

In [17]:
# ios_categories_length = {} Total ratings per category
# ios_sorted = {} Total occurrences per category

In [18]:
print(ios_categories_length)
print(ios_categories)

{'social networking': 7584125.0, 'photo & video': 4550647.0, 'games': 42705967.0, 'music': 3783551.0, 'reference': 1348958.0, 'health & fitness': 1514371.0, 'weather': 1463837.0, 'utilities': 1513441.0, 'travel': 1129752.0, 'shopping': 2261254.0, 'news': 913665.0, 'navigation': 516542.0, 'lifestyle': 840774.0, 'entertainment': 3563577.0, 'food & drink': 866682.0, 'sports': 1587614.0, 'book': 556619.0, 'finance': 1132846.0, 'education': 826470.0, 'productivity': 1177591.0, 'business': 127349.0, 'catalogs': 16016.0, 'medical': 3672.0}
{'social networking': 106, 'photo & video': 160, 'games': 1874, 'music': 66, 'reference': 18, 'health & fitness': 65, 'weather': 28, 'utilities': 81, 'travel': 40, 'shopping': 84, 'news': 43, 'navigation': 6, 'lifestyle': 51, 'entertainment': 254, 'food & drink': 26, 'sports': 69, 'book': 14, 'finance': 36, 'education': 118, 'productivity': 56, 'business': 17, 'catalogs': 4, 'medical': 6}


In [19]:
print(android_categories_length)
print(android_categories)

{'art_and_design': 108811100.0, 'auto_and_vehicles': 53080061.0, 'beauty': 27197050.0, 'books_and_reference': 795784260.0, 'business': 691802090.0, 'comics': 54975150.0, 'communication': 15080896201.0, 'dating': 157549757.0, 'education': 422340000.0, 'entertainment': 2086160000.0, 'events': 13571660.0, 'finance': 729163132.0, 'food_and_drink': 235538751.0, 'health_and_fitness': 1223248402.0, 'house_and_home': 112212461.0, 'libraries_and_demo': 62894810.0, 'lifestyle': 482484429.0, 'game': 19244369450.0, 'family': 6566695690.0, 'medical': 30272344.0, 'social': 4647761902.0, 'shopping': 1611838585.0, 'photography': 6155268815.0, 'sports': 1172080683.0, 'travel_and_local': 3030604086.0, 'tools': 8158443474.0, 'personalization': 1538235888.0, 'productivity': 8005629314.0, 'parenting': 31471010.0, 'weather': 365288520.0, 'video_players': 4530731720.0, 'news_and_magazines': 4848196260.0, 'maps_and_navigation': 502960780.0}
{'art_and_design': 57, 'auto_and_vehicles': 82, 'beauty': 53, 'books_

In [20]:
def ratio_data(categories, categories_length):
    for genre in categories:
        ratio = categories_length[genre] / categories[genre]
        print(genre + ': ' + str(ratio))

In [21]:
print(ratio_data(ios_categories, ios_categories_length))
print('\n')
print(ratio_data(android_categories, android_categories_length))

social networking: 71548.34905660378
photo & video: 28441.54375
games: 22788.6696905016
music: 57326.530303030304
reference: 74942.11111111111
health & fitness: 23298.015384615384
weather: 52279.892857142855
utilities: 18684.456790123455
travel: 28243.8
shopping: 26919.690476190477
news: 21248.023255813954
navigation: 86090.33333333333
lifestyle: 16485.764705882353
entertainment: 14029.830708661417
food & drink: 33333.92307692308
sports: 23008.898550724636
book: 39758.5
finance: 31467.944444444445
education: 7003.983050847458
productivity: 21028.410714285714
business: 7491.117647058823
catalogs: 4004.0
medical: 612.0
None


art_and_design: 1908966.6666666667
auto_and_vehicles: 647317.8170731707
beauty: 513151.88679245283
books_and_reference: 4166409.7382198954
business: 1703946.0344827587
comics: 999548.1818181818
communication: 51646904.79794521
dating: 972529.3641975309
education: 4022285.714285714
entertainment: 22924835.164835166
events: 246757.45454545456
finance: 2196274.49397590