# Mobile-app-analysis

Analysis of Apple App Store and Google Play apps to identify common features among the most downloaded apps.

In [2]:
from csv import reader

In [27]:
# Read both CSV exports fully into memory.
# - `with` closes each file handle as soon as its rows have been read.
# - newline='' is what the csv module asks for so that line breaks inside
#   quoted fields are parsed correctly.
# - An explicit encoding keeps the result independent of the platform default.
with open('app-store-apple-data-set-10k-apps/AppleStore.csv', 'r',
          encoding='utf8', newline='') as apple_csv:
    apple_rows = list(reader(apple_csv))

with open('google-play-store-apps/googleplaystore.csv', 'r',
          encoding='utf8', newline='') as android_csv:
    android_rows = list(reader(android_csv))

# The first row of each file holds the column names; the rest are app records.
apple_header = apple_rows[0]
apple_file = apple_rows[1:]

android_header = android_rows[0]
android_file = android_rows[1:]

In [9]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its dimensions.

    Args:
        dataset: Sequence of rows (for example a list of lists).
        start: Index of the first row to print.
        end: Index one past the last row to print (slice semantics).
        rows_and_columns: When true, also print the total number of rows and
            the number of columns, measured on the first row.

    Returns:
        None. Everything is written to standard output.
    """
    for row in dataset[start:end]:
        print(row)
        # print() appends its own newline to the '\n' given here, so this
        # leaves two blank lines after each row.
        print('\n')

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report zero columns
        # instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if len(dataset) > 0 else 0)

## Exploring the data sets

#### Apple store header and first row

In [12]:
# Column names taken from the first row of AppleStore.csv.
apple_header

['',
 'id',
 'track_name',
 'size_bytes',
 'currency',
 'price',
 'rating_count_tot',
 'rating_count_ver',
 'user_rating',
 'user_rating_ver',
 'ver',
 'cont_rating',
 'prime_genre',
 'sup_devices.num',
 'ipadSc_urls.num',
 'lang.num',
 'vpp_lic']

In [10]:
# Print the first Apple record followed by the dataset's row and column counts.
explore_data(apple_file, 0, 1, rows_and_columns=True)

['1', '281656475', 'PAC-MAN Premium', '100788224', 'USD', '3.99', '21292', '26', '4', '4.5', '6.3.5', '4+', 'Games', '38', '5', '10', '1']


Number of rows: 7197
Number of columns: 17


#### Google play header and first row

In [13]:
# Column names taken from the first row of googleplaystore.csv.
android_header

['App',
 'Category',
 'Rating',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated',
 'Current Ver',
 'Android Ver']

In [14]:
# Print the first Google Play record followed by the dataset's row and column counts.
explore_data(android_file, 0, 1, rows_and_columns=True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


## Cleaning the data

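As discussed [here](https://www.kaggle.com/lava18/google-play-store-apps/discussion/66015#latest-600082), the value for the "Category" column is missing in the row at index 10472 (not counting the header), so every later value in that row is shifted one column to the left. I therefore removed this row from the dataset.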

In [30]:
# Print only the row at index 10472 (the end index is exclusive), without the size summary.
explore_data(android_file, 10472, 10473, rows_and_columns=False)

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']




In [32]:
# Delete the malformed row only while it is still present: it has fewer fields
# than the header. The guard makes this cell safe to re-run, because a second
# run finds a well-formed row at this index and leaves it alone.
if len(android_file) > 10472 and len(android_file[10472]) != len(android_header):
    del android_file[10472]

### Duplicate Entries
The Google Play dataset has some duplicate entries, probably because the data was collected at different times.
First, I'll remove the duplicate lines, keeping only one entry for each duplicate. As the lines are identical, no information will be lost. After that, I will look at apps with duplicate names but with differences in other columns.

In [79]:
def _freeze(value):
    """Return a hashable stand-in for ``value`` when it is a list.

    Lists are converted (recursively) to tuples tagged with ``list`` so that
    whole rows can be stored in a set while a list still never compares equal
    to a plain tuple holding the same items. Any other value is returned as is.
    """
    if isinstance(value, list):
        return (list, tuple(_freeze(item) for item in value))
    return value


def _mark_seen(key, hashed, unhashed):
    """Record ``key`` and report whether it had already been recorded.

    Hashable keys are kept in the set ``hashed`` for constant-time lookups;
    keys that cannot be hashed fall back to an equality scan of the list
    ``unhashed``.
    """
    try:
        if key in hashed or key in unhashed:
            return True
        hashed.add(key)
    except TypeError:
        # Unhashable key: compare by equality against the fallback list.
        if key in unhashed:
            return True
        unhashed.append(key)
    return False


def find_duplicate(dataset, multi_index=True):
    """Return the values that occur more than once in ``dataset``.

    Args:
        dataset: Iterable of items to scan.
        multi_index: When true, every item is treated as a row and rows are
            compared by their first field (``row[0]``). When false, the items
            themselves are compared.

    Returns:
        A list with each repeated value exactly once, ordered by the position
        of its second occurrence. It holds first fields when ``multi_index``
        is true and whole items otherwise.
    """
    seen = (set(), [])
    reported = (set(), [])
    duplicates = []

    for row in dataset:
        value = row[0] if multi_index else row
        key = _freeze(value)
        # A value is a duplicate from its second occurrence onward, and it is
        # reported only the first time that happens.
        if _mark_seen(key, *seen) and not _mark_seen(key, *reported):
            duplicates.append(value)

    return duplicates

In [85]:
# Collect every value of the first column that occurs on more than one row.
duplicate_apps = find_duplicate(android_file, multi_index=True)

In [86]:
# Number of distinct first-column values (app names) that appear on more than one row.
len(duplicate_apps)

798

In [87]:
# NOTE(review): duplicate_apps already holds each repeated name exactly once
# (find_duplicate never reports a value twice), so this check can only return
# []. If the goal is to find fully identical rows, it presumably should scan
# android_file instead — confirm.
duplicate = find_duplicate(duplicate_apps,False)

In [88]:
# Show what the previous cell's check found.
duplicate

[]