# Data Analysis - Apps on Google Play & App Store
## The purpose of this project is to help app makers understand which apps are favored by most users

In [3]:
from csv import reader

# Load the App Store dataset: the first row is the header, the rest are apps.
# `with` closes the file as soon as the rows have been read. newline='' is what
# the csv module asks for so quoted fields containing newlines parse correctly,
# and the explicit encoding keeps the load independent of the machine's locale.
with open('AppleStore.csv', encoding='utf8', newline='') as opened_file_app:
    read_file_app = reader(opened_file_app)
    app_dataset = list(read_file_app)
app_header = app_dataset[0]
app_data = app_dataset[1:]

# Load the Google Play dataset the same way.
with open('googleplaystore.csv', encoding='utf8', newline='') as opened_file_goo:
    read_file_goo = reader(opened_file_goo)
    goo_dataset = list(read_file_goo)
goo_header = goo_dataset[0]
goo_data = goo_dataset[1:]


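*The `see_data` function below is used to explore the data. Its arguments are the dataset, the starting and ending row indices, and a flag that controls whether the number of rows and columns is also printed.*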

In [4]:
import re
def see_data(dataset, start, end, calc_rows_cols=False):
    """Print the rows ``dataset[start:end]`` and, optionally, the dataset size.

    Args:
        dataset: A sequence of rows; each row is itself a sequence of values.
        start: Index of the first row to print (inclusive).
        end: Index at which printing stops (exclusive).
        calc_rows_cols: When truthy, also print the total number of rows and
            the number of columns, measured on the first row. Defaults to
            False.
    """
    for row in dataset[start:end]:
        print(row)
        # print('\n') emits two newlines, leaving a blank gap between rows.
        print('\n')

    if calc_rows_cols:
        n_rows = len(dataset)
        # An empty dataset has no first row to measure; report 0 columns
        # instead of raising IndexError.
        n_cols = len(dataset[0]) if n_rows else 0
        print('Number of rows:', n_rows)
        print('Number of columns:', n_cols)

# Show the App Store column names, then the first two app rows together with
# the dataset's dimensions.
print(app_header, end='\n\n\n')
see_data(dataset=app_data, start=0, end=2, calc_rows_cols=True)
            
    

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7197
Number of columns: 16


In [5]:
# Show the Google Play column names, then the first three app rows together
# with the dataset's dimensions.
print(goo_header, end='\n\n\n')
see_data(dataset=goo_data, start=0, end=3, calc_rows_cols=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


In the Google Play Store dataset, the columns that might be useful for our analysis are 'App', 'Category', 'Reviews', 'Installs', 'Type', 'Price', and 'Genres'.

In the App Store dataset, the columns that seem interesting are 'track_name', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', and 'prime_genre'.

# Data Cleaning
### 1. Missing Data
### 2. Duplicate Data

In [6]:
def missing_data(dataset):
    """Print every row whose column count differs from the header row's.

    Args:
        dataset: A list of rows whose first element (index 0) is the header
            row. Each mismatching data row is printed, followed by its index
            in ``dataset``.
    """
    if not dataset:
        # Nothing to compare against, and nothing to report.
        return

    # Measure against the header of the dataset that was passed in, so the
    # function works for any dataset rather than only the Google Play one.
    header_len = len(dataset[0])

    # enumerate gives each row's true position; list.index would rescan the
    # list on every hit and return the first equal row, which is the wrong
    # index when the same malformed row occurs more than once.
    for index, row in enumerate(dataset[1:], start=1):
        if len(row) != header_len:
            print(row)
            print(index)
    

# Scan the full Google Play dataset. The header row is included in the list
# passed in, so any index printed is a position in goo_dataset, not goo_data.
missing_data(goo_dataset)

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
10473


In [7]:
# Remove every row whose column count differs from the header's. In this
# dataset that is the single malformed row at goo_data[10472], which is
# goo_dataset[10473] because goo_data excludes the header row.
# Filtering on the condition, rather than deleting by a hard-coded index,
# keeps the cell idempotent: running it again cannot delete a valid row.
# Slice assignment updates the existing list in place.
goo_data[:] = [row for row in goo_data if len(row) == len(goo_header)]

In [8]:
# Verify the clean-up: print any row whose length still differs from the
# header's, and confirm explicitly when none is found.
goo_header_len = len(goo_header)
ct = 0
for row in goo_data:
    if len(row) == goo_header_len:
        continue
    print(row)
    ct += 1
if not ct:
    print("No rows with missing column data found")

No rows with missing column data found


# Removing Duplicate Entries

In [9]:
# Finding out the duplicate entries
def dup_appls(data, name_index=0):
    """Print how many app names in ``data`` are repeated and show a sample.

    Args:
        data: An iterable of rows (header excluded).
        name_index: Position of the app name within each row. Defaults to 0;
            pass a different index for datasets that keep the name elsewhere.

    The first occurrence of a name counts as unique; every later occurrence
    is counted as a duplicate. Names must be hashable.
    """
    seen_names = set()  # set membership is O(1); a list made this O(n^2)
    duplicate_apps = []
    for row in data:
        name = row[name_index]
        if name in seen_names:
            duplicate_apps.append(name)
        else:
            seen_names.add(name)
    print("Number of duplicate apps "+str(len(duplicate_apps)))
    print("Number of unique apps "+str(len(seen_names)))
    print( "Some of the duplicate apps \n", duplicate_apps[:10])
    
# Report duplicate app names among the Google Play rows (header excluded).
dup_appls(goo_data)



Number of duplicate apps 1181
Number of unique apps 9659
Some of the duplicate apps 
 ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']


In [10]:
# Excluding the duplicate entries.
# First, record the highest review count seen for each app name.
# The counts come out of the CSV as strings, so they are converted before
# comparing: as strings '967' > '87510' is True (character-by-character
# order), which would store the wrong "maximum". The stored value stays the
# original string so later cells can keep calling float() on it.
reviews_max = {}
for row in goo_data:
    name = row[0]
    n_reviews = row[3]
    if name not in reviews_max or float(n_reviews) > float(reviews_max[name]):
        reviews_max[name] = n_reviews

print(len(reviews_max))
        

9659


In [11]:
# Now keep exactly one row per app: the one whose review count equals the
# maximum recorded for that name in reviews_max.
google_clean = []
already_added = set()  # names already kept; a set gives O(1) membership tests
for row in goo_data:
    name = row[0]
    n_reviews = float(row[3])
    # The name check is needed because several duplicate rows can share the
    # same maximum review count; only the first of them is kept.
    if float(reviews_max[name]) == n_reviews and name not in already_added:
        google_clean.append(row)
        already_added.add(name)

len(google_clean)
         

9659

In [12]:
# Print rows 4 through 9 of the de-duplicated list, plus its dimensions.
see_data(google_clean,4,10, True)

['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']


['Smoke Effect Photo Maker - Smoke Editor', 'ART_AND_DESIGN', '3.8', '178', '19M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'April 26, 2018', '1.1', '4.0.3 and up']


['Infinite Painter', 'ART_AND_DESIGN', '4.1', '36815', '29M', '1,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'June 14, 2018', '6.1.61.1', '4.2 and up']


['Garden Coloring Book', 'ART_AND_DESIGN', '4.4', '13791', '33M', '1,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'September 20, 2017', '2.9.2', '3.0 and up']


['Kids Paint Free - Drawing Fun', 'ART_AND_DESIGN', '4.7', '121', '3.1M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'July 3, 2018', '2.8', '4.0.3 and up']


['Text on Photo - Fonteee', 'ART_AND_DESIGN', '4.4', '13880', '28M', '1,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'October 27, 2017', '1.0.4

In [13]:
# Removing apps with Non-english names
def isEnglish(string, max_non_ascii=3):
    """Heuristically decide whether ``string`` is an English app name.

    Args:
        string: The text to check.
        max_non_ascii: Largest number of characters with a code point above
            127 that is still accepted. Defaults to 3, so names containing a
            few symbols or emoji are not rejected.

    Returns:
        True when ``string`` has at most ``max_non_ascii`` characters outside
        the ASCII range (code points 0-127), False otherwise.
    """
    non_ascii = sum(1 for ch in string if ord(ch) > 127)
    return non_ascii <= max_non_ascii

# Try the check on one string with five emoji and one plain ASCII string.
for sample in ("Is this english😜😜😜😜😜", "Is this english"):
    print(isEnglish(sample))

            

False
True


In [14]:
# Keep only the apps whose name passes the isEnglish check. The app name is
# column 0 in the Google Play rows and column 1 in the App Store rows.
goo_play_english = [row for row in google_clean if isEnglish(row[0])]
app_store_english = [row for row in app_data if isEnglish(row[1])]

# Preview two rows of each filtered dataset along with its dimensions.
see_data(goo_play_english, 1, 3, True)
see_data(app_store_english, 1, 3, True)

['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9614
Number of columns: 13
['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 6183
Number of columns: 16


# Separating the free apps


In [18]:
# Collect the free apps of each store. The price is compared as the raw CSV
# string: it sits in column 7 and reads '0' for free Google Play apps, and in
# column 4 reading '0.0' for free App Store apps.
google_play_final = []
apple_store_final = []

for source_rows, free_rows, price_col, free_price in (
    (goo_play_english, google_play_final, 7, '0'),
    (app_store_english, apple_store_final, 4, '0.0'),
):
    for row in source_rows:
        price = row[price_col]
        if price == free_price:
            free_rows.append(row)

for free_rows in (google_play_final, apple_store_final):
    print(len(free_rows))


8862
3222
