# Profitable App Profiles for the App Store and Google Play Markets

Our goal for this project is to analyze data to help our developers understand what type of apps are likely to attract more users.

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [2]:
import csv
apple_file = open('AppleStore.csv')
apple_data = list(csv.reader(apple_file))

android_file = open('googleplaystore.csv')
android_data = list(csv.reader(android_file))

In [3]:
#explore_data(apple_data, 0, 5, rows_and_columns=True)
#explore_data(android_data, 0, 2, rows_and_columns=True)

Removing line 10473 because it is missing Category column

In [4]:
del android_data[10473]

Checking the number of duplicate apps.

In [5]:
duplicate_apps = []
unique_apps = []

for app in android_data:
    name = app[0]
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)
        
print("Number of duplicate apps: " + str(len(duplicate_apps)))

Number of duplicate apps: 1181


In [6]:
reviews_max = {}

for row in android_data[1:]:
    name = row[0]
    n_reviews = float(row[3])
    
    if ( name in reviews_max and reviews_max[name] < n_reviews ) or name not in reviews_max:
        reviews_max[name] = n_reviews

explore_data(android_data, 0, 2, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


In [7]:
android_clean = []
already_added = []

android_clean.append(android_data[0])

for row in android_data[1:]:
    name = row[0]
    n_reviews = float(row[3])
    
    if n_reviews == reviews_max[name] and name not in already_added:
        android_clean.append(row)
        already_added.append(name)
        
explore_data(android_clean, 0, 2, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


Number of rows: 9660
Number of columns: 13


In [8]:
def english_chars(s):
    """Check if there is only English chars"""
    non_english = 0
    for a in s:
        if ord(a) > 127:
            non_english += 1
        
    if non_english > 3:
        return False
    else:
        return True

In [9]:
apple_clean = []

for row in apple_data:
    track_name = row[1]
    if english_chars(track_name):
        apple_clean.append(row)
        
explore_data(apple_clean, 0, 2, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


Number of rows: 6184
Number of columns: 16


In [10]:
def filter_by_price(dataset, price_column, max_price):
    print("Length before filter: " + str(len(dataset)))
    filtered = [dataset[0]]
    for row in dataset[1:]:
        price = row[price_column]
        if price[0] == '$':
            price = price[1:]
            
        price = float(price)
        
        if price <= max_price:
            filtered.append(row)
            
    print("Length after filter: " + str(len(dataset)) +"\n")            
    
    return filtered

In [11]:
android_clean = filter_by_price(android_clean, 7, 0)

apple_clean = filter_by_price(apple_clean, 4, 0)

Length before filter: 9660
Length after filter: 9660

Length before filter: 6184
Length after filter: 6184



In [33]:
def freq_table(dataset, index):
    ft = {}
    
    for row in dataset[1:]:
        if row[index] in ft:
            ft[row[index]] += 1
        else:
            ft[row[index]] = 1
            
    total = len(dataset[1:])
    
    for k in ft:
        ft[k] = ft[k] / total * 100
        
    return ft

def display_table(dataset, index):
    print("Frequency table for " + dataset[0][index] + "\n")
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1] + ': ' + str(round(entry[0], 2))

SyntaxError: unexpected EOF while parsing (<ipython-input-33-8bfaaaca5199>, line 27)

In [32]:
# prime_genre
display_table(apple_clean, 11)

Frequency table for prime_genre



TypeError: can only concatenate str (not "float") to str

In [14]:
# Category
display_table(android_clean, 1)

# Genres
display_table(android_clean, 9)

Frequency table for Category

FAMILY : 18.97810218978102
GAME : 9.70241437394722
TOOLS : 8.433464345873105
BUSINESS : 4.581695676586187
LIFESTYLE : 3.9303761931499155
PRODUCTIVITY : 3.885457608085345
FINANCE : 3.6833239752947784
MEDICAL : 3.5148792813026386
SPORTS : 3.3801235261089273
PERSONALIZATION : 3.312745648512072
COMMUNICATION : 3.2341381246490735
HEALTH_AND_FITNESS : 3.065693430656934
PHOTOGRAPHY : 2.9421673217293653
NEWS_AND_MAGAZINES : 2.829870859067939
SOCIAL : 2.6501965188096577
TRAVEL_AND_LOCAL : 2.3245367770915215
SHOPPING : 2.2459292532285233
BOOKS_AND_REFERENCE : 2.1785513756316677
DATING : 1.8528916339135317
VIDEO_PLAYERS : 1.7967434025828188
MAPS_AND_NAVIGATION : 1.4149354295339696
FOOD_AND_DRINK : 1.235261089275688
EDUCATION : 1.167883211678832
ENTERTAINMENT : 0.9545199326221224
LIBRARIES_AND_DEMO : 0.9320606400898372
AUTO_AND_VEHICLES : 0.9208309938236946
HOUSE_AND_HOME : 0.8197641774284109
WEATHER : 0.7973048848961257
EVENTS : 0.7074677147669848
PARENTING : 0.65131

In [30]:
ft_apple = freq_table(apple_clean, 11)

avg_ratings = []

for genre in ft_apple:
    total = 0
    len_genre = 0
    for row in apple_clean[1:]:
        genre_app = row[11]
        if genre_app == genre:
            rating_count_tot = float(row[5])
            total += rating_count_tot
            len_genre += 1
            
    avg_rating = total / len_genre
    avg_rating = round(avg_rating, 2)
    avg_ratings.append((avg_rating, genre))
    
for avg in sorted(avg_ratings, reverse=True):
    row = avg[1] + ": " + str(avg[0])
    print(row)

Navigation: 86090.33
Reference: 74942.11
Social Networking: 71548.35
Music: 57326.53
Weather: 52279.89
Book: 39758.5
Food & Drink: 33333.92
Finance: 31467.94
Photo & Video: 28441.54
Travel: 28243.8
Shopping: 26919.69
Health & Fitness: 23298.02
Sports: 23008.9
Games: 22788.67
News: 21248.02
Productivity: 21028.41
Utilities: 18684.46
Lifestyle: 16485.76
Entertainment: 14029.83
Business: 7491.12
Education: 7003.98
Catalogs: 4004.0
Medical: 612.0


In [29]:
ft_android = freq_table(android_clean, 1)

avg_installs = []

for category in ft_android:
    total = 0
    len_category = 0
    
    for row in android_clean[1:]:
        category_app = row[1]
        if category_app == category:
            n_installs = row[5]
            n_installs = n_installs.replace('+', '')
            n_installs = n_installs.replace(',', '')
            n_installs = float(n_installs)
            total += n_installs
            len_category += 1
            
    avg = total / len_category
    avg = round(avg, 2)
    avg_installs.append((avg, category))
    
for c in sorted(avg_installs, reverse=True):
    print(c[1] + ": " + str(c[0]))

COMMUNICATION: 38322625.7
VIDEO_PLAYERS: 24573948.25
SOCIAL: 23253652.13
PHOTOGRAPHY: 17772018.76
PRODUCTIVITY: 16738957.55
GAME: 15551995.89
TRAVEL_AND_LOCAL: 13984077.71
ENTERTAINMENT: 11640705.88
TOOLS: 10787009.95
NEWS_AND_MAGAZINES: 9401635.95
BOOKS_AND_REFERENCE: 8587351.86
SHOPPING: 7001693.42
PERSONALIZATION: 5183850.81
WEATHER: 5074486.2
HEALTH_AND_FITNESS: 4188821.99
MAPS_AND_NAVIGATION: 3993339.6
FAMILY: 3668870.82
SPORTS: 3638640.14
ART_AND_DESIGN: 1952105.17
FOOD_AND_DRINK: 1924897.74
EDUCATION: 1825480.77
BUSINESS: 1708215.91
LIFESTYLE: 1436126.94
FINANCE: 1387692.48
HOUSE_AND_HOME: 1331540.56
DATING: 854028.83
COMICS: 803234.82
AUTO_AND_VEHICLES: 647317.82
LIBRARIES_AND_DEMO: 638503.73
PARENTING: 542603.62
BEAUTY: 513151.89
EVENTS: 253542.22
MEDICAL: 120550.62
