# SBM assignment - Data Cleaning and Inspection

### Importing packages

In [41]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import time
from datetime import datetime
import numpy as np

### Loading dataset

In [28]:
# Read the raw Google Play export; the file is decoded as cp1252.
google_last_df = pd.read_csv(
    "../Data/google_data_last.csv",
    encoding="cp1252",
)
# Uncomment to iterate on a reproducible 1,000-row sample instead of the full file:
# google_last_df = google_last_df.sample(n=1000, random_state=1)

In [29]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
data = google_last_df.copy(deep=True)

In [30]:
# Show which columns the raw dataset provides.
data.columns

Index(['Unnamed: 0', 'my_app_id', 'date_published', 'privacy_policy',
       'rating_app', 'nb_rating', 'num_downloads', 'content_rating_app',
       'developer', 'categ_app', 'in_app', 'has_ads', 'price_gplay',
       'operating_system', 'software_version', 'interactive_element',
       'in_app_product', 'developer_name', 'nb_screenshots', 'description',
       'whats_new', 'email_to', 'developer_info', 'similar_apps_top15',
       'visit_website', 'more_from_developer', 'family_library',
       'permissions'],
      dtype='object')

In [81]:
# Frequency of each distinct operating_system value, most common first.
data['operating_system'].value_counts()

4.1 and up      183575
4.0.3 and up    134671
4.0 and up      131608
2.3 and up       76108
4.4 and up       67405
                 ...  
2.1 - 7.0            1
2.1 - 5.1            1
2.1 - 4.0.2          1
2.3 - 4.2.2          1
2.1 - 4.2.2          1
Name: operating_system, Length: 188, dtype: int64

### Cleaning the data

In [112]:
# Keep only the variables used in the cleaning steps below; .copy() detaches the
# selection from `data` so later column assignments do not touch the original frame.
google_df = data.loc[
    :,
    [
        'my_app_id',
        'rating_app',
        'nb_rating',
        'num_downloads',
        'price_gplay',
        'has_ads',
        'in_app',
        'categ_app',
        'date_published',
        'software_version',
        'content_rating_app',
        'more_from_developer',
    ],
].copy()

In [113]:
# Inspect the missing values in rating and number of ratings.
# display() is used explicitly: Jupyter only renders a cell's final expression, and this
# cell ends with an assignment, so bare expressions here would be evaluated and discarded.
display(google_df[google_df.rating_app.isna()][['rating_app']].describe())
# Here we conclude that in most cases the missing values in rating_app & nb_rating are caused
# by no ratings being present, so for RQ1 we can safely discard the missing values.
# Now we can look at the missing values in number of downloads.

# Inspect the rows with a missing number of downloads.
missing_downloads = google_df[google_df.num_downloads.isna()]
display(missing_downloads.head())
display(missing_downloads[['nb_rating']].describe())
# We can see that a missing value in number of downloads often pairs with a missing value in
# rating, number of ratings and price. The datapoints with a missing number of downloads were
# either never downloaded, or are a result of an error whilst gathering the data. We can never
# be sure about the first case, so we cannot convert them all to 0. Therefore we assume they
# are errors and delete these datapoints.
# NOTE(review): the line below only drops rows whose rating_app is missing; rows that lack
# num_downloads but do have a rating are kept. Confirm this matches the intent stated above.

google_df = google_df.dropna(subset=['rating_app'])  # comment this line out for the "good" dataset (translated from the original Dutch note)

In [114]:
# Create the publication year and the app's age.
# The year is taken as the last four characters of date_published. Entries matching
# 'span' are mapped to 2012 (NOTE(review): presumably scraping residue -- confirm why 2012).
google_df['year_published'] = (
    google_df['date_published']
    .map(lambda raw_date: str(raw_date)[-4:])
    .replace({'span': 2012}, regex=True)
    .astype(float)
)
google_df = google_df.drop(columns=['date_published'])
# Age in years relative to 2020.
google_df['age'] = 2020 - google_df['year_published']

# Content rating of the app, one-hot encoded.
# Labels are cut to their first seven characters; the text 'nan' (what str() yields for a
# missing value) is renamed to 'nan_content' so the dummy column has a descriptive name.
google_df['content_rating_app'] = google_df['content_rating_app'].map(lambda label: str(label)[:7])
google_df['content_rating_app'] = google_df['content_rating_app'].replace('nan', 'nan_content', regex=True)
OH_content = pd.get_dummies(google_df['content_rating_app'])
google_df = google_df.drop(columns=['content_rating_app']).join(OH_content)

# Number of apps per developer.
def _apps_by_developer(other_apps):
    """Return 1 for a float entry, otherwise the number of commas in the entry plus 2.

    NOTE(review): a float here is presumably NaN (no other apps listed), and the
    "+ 2" presumably counts (commas + 1) listed apps plus the app itself -- confirm.
    """
    if type(other_apps) == float:
        return 1
    return other_apps.count(',') + 2


google_df['nb_apps_developer'] = data['more_from_developer'].apply(_apps_by_developer)

# Version indicator: the first character of software_version, kept as a string.
# NOTE(review): non-numeric or missing entries also contribute their first character
# (e.g. str(NaN) starts with 'n') -- confirm this is acceptable downstream.
google_df['version'] = data['software_version'].map(lambda raw_version: str(raw_version)[:1])
google_df = google_df.drop(columns=['software_version'])

# Minimal operating system required by the app, one-hot encoded.
# Only the first three characters of operating_system are kept, so e.g. '4.0.3 and up'
# falls into the '4.0' column and non-version texts are reduced to a three-letter stub.
# The text 'nan' (str() of a missing value) is renamed to 'nan_os'.
google_df['min_operating_system'] = data['operating_system'].map(lambda os_label: str(os_label)[:3])
google_df['min_operating_system'] = google_df['min_operating_system'].replace('nan', 'nan_os', regex=True)
OH_OS = pd.get_dummies(google_df['min_operating_system'])
google_df = google_df.drop(columns=['min_operating_system']).join(OH_OS)

# Clean num_downloads: strip the decoration characters '+', '>', ',' and ' ' and convert
# the remaining digits to float (missing values stay NaN).
# The pattern is an explicit character class with regex=True. The previous calls passed
# '+' without a `regex` argument, which depends on the pandas version: the default for
# `regex` changed between major versions, and '+' on its own is a regex metacharacter.
google_df['num_downloads'] = (
    google_df['num_downloads']
    .str.replace(r'[+>, ]', '', regex=True)
    .astype('float')
)

# Add a new column with the number of downloads as an ordered category.
# pd.cut builds right-closed intervals, so without include_lowest=True the first bin
# would be (0, 99] and apps with exactly 0 downloads would get NaN instead of '0 - 99'.
category = pd.cut(
    google_df['num_downloads'],
    bins=[0, 99, 999, 9999, 99999, 999999, 99999999999],
    labels=['0 - 99', '100 - 999', '1000 - 9999', '10000 - 99999', '100000 - 999999', '1000000 +'],
    include_lowest=True,
)
# Placed at position 8 of the current column order.
# NOTE: DataFrame.insert raises if the column already exists, so this step cannot be
# re-run on an already processed frame.
google_df.insert(8, 'num_downloads_cat', category)

# Make a new boolean column: free (True) vs. paid (False).
# Every textual "zero price" variant is normalised to the integer 0 first.
zero_values = [
    '0', '0 USD', '0 SEK', '0 GBP', '0 MXN', '0 HKD', '0 KRW', '0 CZK', '0 CAD', '0 AED',
    '0 DKK', '0 IDR', '0 TWD', '0 INR', '0 CLP', '0 SAR', '0 ILS', '0 RUB', '0 AUD', '0 PLN',
    '0 CHF', '0 MYR', '0 TRY', '0 BRL', '0 BGN', '0 UAH', '0 JPY', '0 EUR',
]
google_df['price_gplay'] = google_df['price_gplay'].replace(zero_values, 0)
# Anything that is not exactly 0 counts as paid.
# NOTE(review): a missing price also compares unequal to 0 and is therefore flagged as paid -- confirm.
google_df['free_app'] = google_df['price_gplay'].eq(0)

# Disabled code: the triple-quoted string below is a bare string expression, so nothing in
# it is executed. It would label every row with one of eight combinations of free_app,
# has_ads and in_app by looking each row up with .loc, and it would overwrite the
# `category` variable assigned earlier in this cell.
# NOTE(review): presumably switched off because the per-row lookups are slow -- confirm
# before re-enabling.
'''
#Make a variable that has all possible catagories resulting from combinations of the booleans: free_app, has_ads and in_app
category = [     'free_noads_noinapp' if     google_df.loc[row].free_app and not google_df.loc[row].has_ads and not google_df.loc[row].in_app
            else 'free_noads_inapp'   if     google_df.loc[row].free_app and not google_df.loc[row].has_ads and     google_df.loc[row].in_app
            else 'free_ads_noinapp'   if     google_df.loc[row].free_app and     google_df.loc[row].has_ads and not google_df.loc[row].in_app
            else 'free_ads_inapp'     if     google_df.loc[row].free_app and     google_df.loc[row].has_ads and     google_df.loc[row].in_app
            else 'paid_noads_noinapp' if not google_df.loc[row].free_app and not google_df.loc[row].has_ads and not google_df.loc[row].in_app 
            else 'paid_noads_inapp'   if not google_df.loc[row].free_app and not google_df.loc[row].has_ads and     google_df.loc[row].in_app
            else 'paid_ads_noinapp'   if not google_df.loc[row].free_app and     google_df.loc[row].has_ads and not google_df.loc[row].in_app
            else 'paid_ads_inapp'     for row in google_df.index]
google_df['category'] = category
'''

# Clean the number-of-ratings variable: remove the thousands separators, then convert to float.
google_df['nb_rating'] = (
    google_df['nb_rating']
    .str.replace(',', '', regex=False)
    .astype('float')
)

# Create the interaction variables between free_app and the two monetisation flags.
for flag_column, interaction_column in (('has_ads', 'freeApp_x_hasAds'), ('in_app', 'freeApp_x_inApp')):
    google_df[interaction_column] = google_df['free_app'] & google_df[flag_column]
# Three-way interaction: free AND has ads AND offers in-app purchases.
google_df['freeApp_x_hasAds_inApp'] = google_df['freeApp_x_hasAds'] & google_df['in_app']

# Turn the categorical app category into one-hot columns and swap them in for the original column.
OH_categ_app = pd.get_dummies(google_df['categ_app'])
google_df = google_df.drop(columns=['categ_app']).join(OH_categ_app)

In [115]:
# Show every column of the cleaned frame as a plain list (avoids the truncated Index repr).
list(google_df.columns)

['my_app_id',
 'rating_app',
 'nb_rating',
 'num_downloads',
 'price_gplay',
 'has_ads',
 'in_app',
 'num_downloads_cat',
 'more_from_developer',
 'year_published',
 'age',
 'PEGI 12',
 'PEGI 16',
 'PEGI 18',
 'PEGI 3',
 'PEGI 7 ',
 'Parenta',
 'Unrated',
 'nan_content',
 'nb_apps_developer',
 'version',
 '1.0',
 '1.1',
 '1.5',
 '1.6',
 '2.0',
 '2.1',
 '2.2',
 '2.3',
 '3.0',
 '3.1',
 '3.2',
 '4.0',
 '4.1',
 '4.2',
 '4.3',
 '4.4',
 '5.0',
 '5.1',
 '6.0',
 '7.0',
 '7.1',
 '8.0',
 'Var',
 'nan_os',
 'free_app',
 'freeApp_x_hasAds',
 'freeApp_x_inApp',
 'freeApp_x_hasAds_inApp',
 'Action',
 'Action & Adventure',
 'Adventure',
 'Arcade',
 'Art & Design',
 'Auto & Vehicles',
 'Beauty',
 'Board',
 'Books & Reference',
 'Brain Games',
 'Business',
 'Card',
 'Casino',
 'Casual',
 'Comics',
 'Communication',
 'Creativity',
 'Dating',
 'Education',
 'Educational',
 'Entertainment',
 'Events',
 'Finance',
 'Food & Drink',
 'Health & Fitness',
 'House & Home',
 'Libraries & Demo',
 'Lifestyle',
 'M

# Wat er nog moet gebeuren aan features:
- Developer moet one-hot encoded worden, maar er zijn 250.000 verschillende developers
- Age is nu aantal jaar, kan misschien aantal dagen worden
- Operating system pakt nu alleen de eerste drie tekens, dus bijv. 4.0.3 wordt 4.0

### Save the file

In [10]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
google_df.to_csv(path_or_buf='../Data/cleaned.csv', index=False)