<a href="https://colab.research.google.com/github/julieklaessens-dotcom/data-analyst-formation/blob/main/Project_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Profitable App Profiles for the App Store and Google Play Markets

## About
For this project, we'll pretend we're working as data analysts for a company that builds Android and iOS mobile apps. We make our apps available on Google Play and in the App Store.
We only build apps that are free to download and install, and our main source of revenue consists of in-app ads. This means that the number of users of our apps determines our revenue for any given app — the more users who see and engage with the ads, the better.

## Goal
Analyze the data to help our developers understand which types of apps are likely to attract more users.

## Documentation
- `AppleStore.csv`: [App Store data set on Kaggle](https://www.kaggle.com/datasets/ramamet4/app-store-apple-data-set-10k-apps)
- `googleplaystore.csv`: [Google Play data set on Kaggle](https://www.kaggle.com/datasets/lava18/google-play-store-apps)


In [None]:
from google.colab import drive
#drive.mount('/content/drive')
from csv import reader

# Both CSV files are read from Google Drive, so the Drive must already be mounted
# under /content/drive before this cell runs.
DATA_DIR = '/content/drive/MyDrive/Data/App'

# The App Store data set
# `with` closes the file as soon as it has been read; newline='' lets csv.reader
# handle line endings itself, as the csv module documentation recommends.
with open(DATA_DIR + '/AppleStore.csv', encoding='utf-8', newline='') as opened_file_ios:
    read_file_ios = reader(opened_file_ios)
    apps_data_ios = list(read_file_ios)
apps_data_ios_header = apps_data_ios[0]  # first row holds the column names
apps_data_ios = apps_data_ios[1:]        # remaining rows are the apps

# The Google Play data set
with open(DATA_DIR + '/googleplaystore.csv', encoding='utf-8', newline='') as opened_file_android:
    read_file_android = reader(opened_file_android)
    apps_data_android = list(read_file_android)
apps_data_android_header = apps_data_android[0]  # first row holds the column names
apps_data_android = apps_data_android[1:]        # remaining rows are the apps

In [None]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a data set in a more readable way.

    Each row of ``dataset[start:end]`` is printed on its own line, followed by
    blank lines so that consecutive rows are visually separated.

    Parameters
    ----------
    dataset : list of lists
        The data set; each inner list is one row (no header row expected).
    start, end : int
        Bounds of the slice to print (``end`` is exclusive).
    rows_and_columns : bool, default False
        When True, also print the number of rows and the number of columns
        of the whole data set. The column count is taken from the first row.

    Returns
    -------
    None
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty data set has no first row to measure: report 0 columns
        # instead of raising IndexError.
        n_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns:', n_columns)

In [None]:
# Show the App Store column names, a blank separator, then the first five rows
# together with the dimensions of the data set.
print(apps_data_ios_header, end='\n\n\n')
explore_data(apps_data_ios, start=0, end=5, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


Number of rows: 7197
Number of columns: 16


In [None]:
# Show the Google Play column names, a blank separator, then the first five rows
# together with the dimensions of the data set.
print(apps_data_android_header, end='\n\n\n')
explore_data(apps_data_android, start=0, end=5, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite ‚Äì FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'E

In [None]:
# Row 10472 has one field fewer than the header: the category is missing for this
# app, so every later value is shifted one column to the left. We print the row,
# then delete it.
# The length check keeps this cell safe to re-run: once the malformed row is gone,
# index 10472 holds a valid row, which must not be deleted.
suspect_index = 10472
print(apps_data_android[suspect_index])
if len(apps_data_android[suspect_index]) != len(apps_data_android_header):
    del apps_data_android[suspect_index]
print(len(apps_data_android))

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
10840


In [None]:
# The Google Play data set lists some apps more than once under the same name.
# Instagram is one example: print every row whose name (column 0) is "Instagram".
for app in apps_data_android:
    if app[0] == "Instagram":
        print(app)

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


In [None]:
# Count how many Google Play rows repeat an app name that was already seen.
duplicate_app = []   # one entry per repeated occurrence of a name
unique_app = []      # each distinct name once, in order of first appearance
seen_names = set()   # same names as unique_app; a set makes each lookup O(1)
                     # instead of scanning the whole list for every row

for app in apps_data_android:
    name = app[0]
    if name in seen_names:
        duplicate_app.append(name)
    else:
        seen_names.add(name)
        unique_app.append(name)

print("Duplicate apps: ", len(duplicate_app))

# We will keep the duplicate app that has the max amount of reviews (column 3)
# and delete the other duplicate ones.


Duplicate apps:  1181


In [None]:
# Build a dictionary that maps each unique app name to the highest number of
# reviews (column 3) found among the rows carrying that name.
reviews_max = {}

for app in apps_data_android:
    name = app[0]
    n_reviews = float(app[3])
    # Store the count for a new name, or replace a smaller count seen earlier.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews

print(len(reviews_max))

# Print the first 5 entries of the dictionary to inspect the items
first_elements = list(reviews_max.items())[:5]
for key, value in first_elements:
    print(f"{key}: {value}")

# Use the dictionary to delete the duplicate apps: for every app name keep only
# the first row whose review count equals the maximum recorded in reviews_max.
android_clean = []     # new cleaned data set
already_added = []     # names of the apps already kept, in insertion order
added_names = set()    # same names as already_added; a set makes each lookup O(1)
                       # instead of scanning the whole list for every row

for app in apps_data_android:
    name = app[0]
    n_reviews = float(app[3])
    # The name check is required: several rows of the same app can share the
    # maximum review count, and only one of them may be kept.
    if name not in added_names and n_reviews == reviews_max[name]:
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)

explore_data(android_clean, 0, 5, True)


9659
Photo Editor & Candy Camera & Grid & ScrapBook: 159.0
Coloring book moana: 974.0
U Launcher Lite ‚Äì FREE Live Cool Themes, Hide Apps: 87510.0
Sketch - Draw & Paint: 215644.0
Pixel Draw - Number Art Coloring Book: 967.0
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite ‚Äì FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


['Paper flowers instructions

In [None]:
# Takes in a string and returns False if there's any character in the string
# that doesn't belong to the set of common English characters; otherwise, the function returns True
#def english(a_string):
#    for character in a_string:
#        if ord(character) > 127:
#            return False
#        return True

def english(a_string, max_non_ascii=3):
    """Tell whether a string looks like English text.

    A character counts as non-English when its code point falls outside the
    ASCII range (0 - 127). A small number of such characters is tolerated so
    that a name is not rejected for a few special symbols.

    Parameters
    ----------
    a_string : str
        The text to check (for example an app name).
    max_non_ascii : int, default 3
        Highest number of non-ASCII characters still accepted.

    Returns
    -------
    bool
        False if the string has more than ``max_non_ascii`` characters outside
        the ASCII range, True otherwise.
    """
    non_ascii = 0
    for character in a_string:
        if ord(character) > 127:
            non_ascii += 1
            # Stop as soon as the limit is exceeded; the rest of the string
            # cannot change the answer.
            if non_ascii > max_non_ascii:
                return False
    return True

# Spot-check english() on four sample app names.
# The output recorded for this cell is True, False, True, True.
print(english('Instagram'))
print(english('Áà±Â•áËâ∫PPS -„ÄäÊ¨¢‰πêÈ¢Ç2„ÄãÁîµËßÜÂâßÁÉ≠Êí≠'))
print(english("Suite bureautique gratuite ¬´ Docs To Go‚Ñ¢"))
print(english('Instachat üòú'))

True
False
True
True


In [None]:
# Keep only the apps whose name passes the english() check, in both data sets.

# Google Play: the app name is column 0
android_clean_english = [app for app in android_clean if english(app[0])]

explore_data(android_clean_english, 0, 5, True)

# Apple Store: the app name is column 1
apps_data_ios_english = [app for app in apps_data_ios if english(app[1])]

explore_data(apps_data_ios_english, 0, 5, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite ‚Äì FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']


Number of rows: 9614
Number of columns: 13
['284882215', 'Facebook', '389879808', 'USD', 

In [None]:
# Isolate the free apps of each data set in separate lists.
# Prices are stored as strings, so a free app is recognised by the exact text
# of its price field.

# Google Play: the price is column 7 and reads '0' for a free app
android_clean_english_free = [app for app in android_clean_english if app[7] == '0']

explore_data(android_clean_english_free, 0, 5, True)

# Apple Store: the price is column 4 and reads '0.0' for a free app
apps_data_ios_english_free = [app for app in apps_data_ios_english if app[4] == '0.0']

explore_data(apps_data_ios_english_free, 0, 5, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite ‚Äì FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']


Number of rows: 8864
Number of columns: 13
['284882215', 'Facebook', '389879808', 'USD', 

### Context

To minimize risks and overhead, our validation strategy for an app idea has three steps:

1. Build a minimal Android version of the app, and add it to Google Play.
2. If the app has a good response from users, we develop it further.
3. If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

Because our end goal is to add the app to both Google Play and the App Store, we need to find app profiles that are successful in both markets. For instance, a profile that works well for both markets might be a productivity app that makes use of gamification.

We will use the genre columns of each data set: the `prime_genre` column of the App Store data set, and the `Genres` and `Category` columns of the Google Play data set.

In [None]:
def freq_table(dataset, index):
    """Build the frequency table of one column, expressed as percentages.

    Parameters
    ----------
    dataset : iterable of rows
        Each row must support ``row[index]``.
    index : int
        Position of the column to summarise.

    Returns
    -------
    dict
        Maps each distinct value of the column to the percentage of rows that
        hold it. Keys appear in order of first occurrence. An empty data set
        gives an empty dictionary.
    """
    counts = {}
    total_rows = 0
    for record in dataset:
        total_rows += 1
        value = record[index]
        counts[value] = counts.get(value, 0) + 1

    # Turn the raw counts into percentages of the total number of rows.
    return {value: count / total_rows * 100 for value, count in counts.items()}


def display_table(dataset, index):
    """Print the frequency table of one column in descending order.

    The table comes from ``freq_table(dataset, index)``. Entries are printed
    one per line as ``value : percentage``, largest percentage first. Entries
    are sorted as ``(percentage, value)`` tuples in reverse order, so equal
    percentages are ordered by value, also descending.

    Parameters
    ----------
    dataset : iterable of rows
        The data set to summarise.
    index : int
        Position of the column to summarise.

    Returns
    -------
    list of tuple
        The sorted ``(percentage, value)`` pairs that were printed, so a caller
        that stores the result gets the table rather than None.
    """
    table = freq_table(dataset, index)
    # Put the percentage first in each tuple so that sorting orders by it.
    table_sorted = sorted(
        ((percentage, value) for value, percentage in table.items()),
        reverse=True,
    )
    for percentage, value in table_sorted:
        print(value, ':', percentage)
    return table_sorted

# Frequency tables of the genre-related columns for the free English apps:
# App Store `prime_genre` (column 11), then Google Play `Genres` (column 9)
# and `Category` (column 1), separated by blank lines.
prime_genre_tab = display_table(dataset=apps_data_ios_english_free, index=11)
print('\n')
genres_tab = display_table(dataset=android_clean_english_free, index=9)
print('\n')
category_tab = display_table(dataset=android_clean_english_free, index=1)

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.31678700

In [None]:
# Average number of user ratings per App Store genre.
prime_genre_freq = freq_table(apps_data_ios_english_free, 11)

# Gather both figures in a single pass over the data set instead of scanning
# the whole data set once per genre.
rating_totals = {}  # per genre: sum of the rating counts (column 5 is the number
                    # of ratings, not the rating values themselves)
genre_counts = {}   # per genre: number of apps in that genre

for item in apps_data_ios_english_free:
    genre_app = item[11]
    rating_totals[genre_app] = rating_totals.get(genre_app, 0) + float(item[5])
    genre_counts[genre_app] = genre_counts.get(genre_app, 0) + 1

# Report the genres in the order of the frequency table.
for genre in prime_genre_freq:
    avg = rating_totals[genre] / genre_counts[genre]
    print(genre, ": ", avg)

Social Networking :  71548.34905660378
Photo & Video :  28441.54375
Games :  22788.6696905016
Music :  57326.530303030304
Reference :  74942.11111111111
Health & Fitness :  23298.015384615384
Weather :  52279.892857142855
Utilities :  18684.456790123455
Travel :  28243.8
Shopping :  26919.690476190477
News :  21248.023255813954
Navigation :  86090.33333333333
Lifestyle :  16485.764705882353
Entertainment :  14029.830708661417
Food & Drink :  33333.92307692308
Sports :  23008.898550724636
Book :  39758.5
Finance :  31467.944444444445
Education :  7003.983050847458
Productivity :  21028.410714285714
Business :  7491.117647058823
Catalogs :  4004.0
Medical :  612.0


In [None]:
# Average number of installs per Google Play category.
category_freq = freq_table(android_clean_english_free, 1)

# Gather both figures in a single pass over the data set instead of scanning
# the whole data set once per category.
install_totals = {}   # per category: sum of the install figures (column 5)
category_counts = {}  # per category: number of apps in that category

for item in android_clean_english_free:
    category_app = item[1]
    # Installs are stored as text such as '10,000+': drop the '+' and the
    # thousands separators before converting to a number.
    car = item[5].replace('+', '').replace(',', '')
    install_totals[category_app] = install_totals.get(category_app, 0) + float(car)
    category_counts[category_app] = category_counts.get(category_app, 0) + 1

# Report the categories in the order of the frequency table.
for category in category_freq:
    avg = install_totals[category] / category_counts[category]
    print(category, ": ", avg)

ART_AND_DESIGN :  1986335.0877192982
AUTO_AND_VEHICLES :  647317.8170731707
BEAUTY :  513151.88679245283
BOOKS_AND_REFERENCE :  8767811.894736841
BUSINESS :  1712290.1474201474
COMICS :  817657.2727272727
COMMUNICATION :  38456119.167247385
DATING :  854028.8303030303
EDUCATION :  1833495.145631068
ENTERTAINMENT :  11640705.88235294
EVENTS :  253542.22222222222
FINANCE :  1387692.475609756
FOOD_AND_DRINK :  1924897.7363636363
HEALTH_AND_FITNESS :  4188821.9853479853
HOUSE_AND_HOME :  1331540.5616438356
LIBRARIES_AND_DEMO :  638503.734939759
LIFESTYLE :  1437816.2687861272
GAME :  15588015.603248259
FAMILY :  3695641.8198090694
MEDICAL :  120550.61980830671
SOCIAL :  23253652.127118643
SHOPPING :  7036877.311557789
PHOTOGRAPHY :  17840110.40229885
SPORTS :  3638640.1428571427
TRAVEL_AND_LOCAL :  13984077.710144928
TOOLS :  10801391.298666667
PERSONALIZATION :  5201482.6122448975
PRODUCTIVITY :  16787331.344927534
PARENTING :  542603.6206896552
WEATHER :  5074486.197183099
VIDEO_PLAYERS 