In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide display settings
pd.options.display.max_columns = 100  # same option as pd.set_option("display.max_columns", 100)

print("Libraries loaded")

Libraries loaded


In [29]:
# Project folders, all derived from the directory one level above the
# current working directory (treated as the project root).
MAIN = Path.cwd().parents[0]
RAW_DATA = MAIN.joinpath("data", "raw")
CLEAN_DATA = MAIN.joinpath("data", "clean")
OUTPUTS = MAIN.joinpath("outputs")

# Echo the resolved locations so a wrong working directory is easy to spot.
print("Project structure set:")
for label, folder in (
    ("Raw data -->", RAW_DATA),
    ("Clean data -->", CLEAN_DATA),
    ("Outputs -->", OUTPUTS),
):
    print(label, folder)


Project structure set:
Raw data --> /Users/juliaalie/Documents/app-market-analysis-1/data/raw
Clean data --> /Users/juliaalie/Documents/app-market-analysis-1/data/clean
Outputs --> /Users/juliaalie/Documents/app-market-analysis-1/outputs


In [30]:
# Locations of the two raw input files inside RAW_DATA
IOS_FILE = RAW_DATA.joinpath("AppleStore.csv")
print("ios -->", IOS_FILE)

ANDROID_FILE = RAW_DATA.joinpath("googleplaystore.csv")
print("android -->", ANDROID_FILE)


ios --> /Users/juliaalie/Documents/app-market-analysis-1/data/raw/AppleStore.csv
android --> /Users/juliaalie/Documents/app-market-analysis-1/data/raw/googleplaystore.csv


In [64]:
# Load the raw datasets.
# Malformed lines are still skipped so a single bad row cannot abort the load,
# but "warn" reports every skipped line instead of dropping it silently,
# which keeps the row counts printed below trustworthy.
ios = pd.read_csv(IOS_FILE, on_bad_lines="warn")
android = pd.read_csv(ANDROID_FILE, on_bad_lines="warn")

# Quick sanity report: (rows, columns) and column names of each frame
print("ios dataset: ", ios.shape)
print("android dataset: ", android.shape)
print("ios dataset columns: ", ios.columns)
print("android dataset columns: ", android.columns)


ios dataset:  (7197, 17)
android dataset:  (10841, 13)
ios dataset columns:  Index(['Unnamed: 0', 'id', 'track_name', 'size_bytes', 'currency', 'price',
       'rating_count_tot', 'rating_count_ver', 'user_rating',
       'user_rating_ver', 'ver', 'cont_rating', 'prime_genre',
       'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'],
      dtype='object')
android dataset columns:  Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')


In [32]:
# Preview the first five rows of the iOS frame
ios.head(5)


Unnamed: 0.1,Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,1,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
1,2,281796108,Evernote - stay organized,158578688,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
2,3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1
3,4,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.0,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1
4,5,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1


In [48]:
# Preview the first five rows of the Android frame
android.head(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [None]:
# DATA CLEANING
## Drop duplicate apps: rows sharing the same name, first occurrence kept
# NOTE: `ios` and `android` are rebound in this cell, so re-running it without
# reloading the data prints identical before/after counts.

# iOS -- the app name is stored in "track_name"
ios_before = len(ios)
ios = ios.drop_duplicates(subset=["track_name"])  # keep="first" is the default
ios_after = len(ios)

print(ios_before, ios_after)

# Android -- the app name is stored in "App"
android_before = len(android)
android = android.drop_duplicates(subset=["App"])  # keep="first" is the default
android_after = len(android)

print(android_before, android_after)

7195 7195
9660 10841


In [None]:
## Isolate free apps

# iOS: a price of exactly 0.0 marks a free app
ios_free = ios[ios["price"] == 0.0]
print(ios_free.shape[0])  # 4054 out of 7195 apps in the recorded run

# Android: compare prices as numbers rather than against the literal text "0".
# The text comparison only works while the column is parsed as strings and
# silently selects nothing if it is ever read as numeric. Values are cast to
# text, a leading "$" is stripped, and anything unparsable becomes NaN (never
# counted as free).
android_price = pd.to_numeric(
    android["Price"].astype(str).str.lstrip("$"),
    errors="coerce",
)
android_free = android[android_price == 0]
print(android_free.shape[0])  # 8903 out of 9660 apps in the recorded run


4054
8903


In [None]:
## Remove non-english apps (focusing only on english apps scope)

def is_english(name, max_non_ascii=3):
    """Heuristically decide whether an app name is written in English.

    A name is treated as English when it contains at most ``max_non_ascii``
    characters outside the ASCII range (code point above 127). The small
    allowance keeps English names that include a few emoji or symbols
    such as "™".

    Parameters
    ----------
    name : str
        App name to check.
    max_non_ascii : int, default 3
        Largest number of non-ASCII characters still accepted.

    Returns
    -------
    bool
        True when the name passes the check. Non-string values (for example
        a missing name read as NaN) return False instead of raising.
    """
    # A missing name cannot be iterated character by character; reject it.
    if not isinstance(name, str):
        return False
    non_ascii = sum(ord(character) > 127 for character in name)
    return non_ascii <= max_non_ascii

# iOS: keep the free apps whose name passes the is_english check
ios_name_ok = ios_free["track_name"].apply(is_english)
ios_english = ios_free.loc[ios_name_ok]
print(len(ios_english))  # 3220 in the recorded run

# Android: same filter, applied to the "App" name column
android_name_ok = android_free["App"].apply(is_english)
android_english = android_free.loc[android_name_ok]
print(len(android_english))  # 8862 in the recorded run




3220
8862


In [59]:
# SAVE CLEANED DATA

# to_csv does not create missing folders, so make sure the target exists
# (a fresh checkout may not contain data/clean yet).
CLEAN_DATA.mkdir(parents=True, exist_ok=True)

# Destination files for the cleaned datasets
ios_clean_data = CLEAN_DATA / "AppleStore_clean.csv"
android_clean_data = CLEAN_DATA / "googleplaystore_clean.csv"

# Write without the DataFrame index so it does not come back as an extra
# unnamed column when the files are read again.
ios_english.to_csv(ios_clean_data, index=False)
android_english.to_csv(android_clean_data, index=False)

print("clean file saved")

clean file saved
