# SBM assignment - Data Cleaning and Inspection

### Importing packages

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import time

### Loading dataset

In [3]:
# Read the raw dataset from disk; the CSV is decoded as cp1252.
google_last_df = pd.read_csv(
    "Data/google_data_last.csv",
    encoding="cp1252",
)
# Uncomment to run on a reproducible 1000-row sample instead of the full data:
# google_last_df = google_last_df.sample(n=1000, random_state=1)

In [8]:
# Work on a deep copy so the freshly loaded frame is never modified.
data = google_last_df.copy(deep=True)

### Cleaning the data

In [10]:
# Keep only the variables used in the analysis. The explicit copy makes
# `google_df` independent of `data`, so the in-place cleaning below does not
# write back into it.
selected_columns = [
    'my_app_id',
    'rating_app',
    'nb_rating',
    'num_downloads',
    'price_gplay',
    'has_ads',
    'in_app',
    'categ_app',
]
google_df = data[selected_columns].copy()

# --- Missing values in rating and number of ratings ---
# Summarise rating_app for the rows where nb_rating is missing.
# NOTE: the expressions in this section are not assigned or printed, so their
# output is only shown when they are evaluated on their own.
rating_count_missing = google_df['nb_rating'].isna()
google_df.loc[rating_count_missing, ['rating_app']].describe()
# Conclusion: in most cases the missing values in rating_app and nb_rating are
# caused by the app having no ratings at all, so for RQ1 the missing values can
# safely be discarded.

# --- Missing values in number of downloads ---
downloads_missing = google_df['num_downloads'].isna()
google_df.loc[downloads_missing]
google_df.loc[downloads_missing, ['nb_rating']].describe()
# A missing number of downloads often coincides with a missing rating, number
# of ratings and price. Such rows were either never downloaded or result from
# an error while gathering the data. The first case can never be confirmed, so
# the values cannot all be converted to 0; they are therefore treated as errors
# that should be deleted.
# NOTE(review): no rows are actually dropped in this section - confirm the
# deletion happens elsewhere.

# Clean num_downloads: strip the '+', '>', ' ' and ',' decorations from the raw
# text and convert what is left to float. Missing values stay NaN.
#
# The characters are removed with one explicit regex character class instead of
# four separate `str.replace` calls. The default of the `regex` argument differs
# between pandas versions and a bare '+' is a regex quantifier, so relying on
# the default is fragile; inside [...] every character is matched literally.
google_df['num_downloads'] = (
    google_df['num_downloads']
    .str.replace(r'[+> ,]', '', regex=True)
    .astype('float')
)

# Add a column that buckets the number of downloads by order of magnitude.
#
# pd.cut intervals are right-closed and left-open by default, which would turn
# apps with exactly 0 downloads into NaN even though the first label is
# '0 - 99'. `include_lowest=True` closes the first interval on the left so 0 is
# included. The open-ended top bucket uses infinity instead of a large sentinel
# so no value can fall above the last edge.
download_bin_edges = [0, 99, 999, 9999, 99999, 999999, float('inf')]
download_bin_labels = [
    '0 - 99',
    '100 - 999',
    '1000 - 9999',
    '10000 - 99999',
    '100000 - 999999',
    '1000000 +',
]
category = pd.cut(
    google_df['num_downloads'],
    bins=download_bin_edges,
    labels=download_bin_labels,
    include_lowest=True,
)
# Plain assignment appends the column after the existing ones (the same place
# the former hard-coded position pointed to) and, unlike DataFrame.insert, does
# not fail when the column already exists.
google_df['num_downloads_cat'] = category

# Derive a free-vs-paid boolean from the price column.
#
# A zero price appears either as '0' or as '0 <currency code>'. All of these
# spellings are normalised to the number 0 in a single `replace` call (one pass
# over the column instead of one pass per spelling), after which a free app is
# simply a row whose price equals 0.
zero_values = [
    '0',
    '0 USD', '0 SEK', '0 GBP', '0 MXN', '0 HKD', '0 KRW', '0 CZK',
    '0 CAD', '0 AED', '0 DKK', '0 IDR', '0 TWD', '0 INR', '0 CLP',
    '0 SAR', '0 ILS', '0 RUB', '0 AUD', '0 PLN', '0 CHF', '0 MYR',
    '0 TRY', '0 BRL', '0 BGN', '0 UAH', '0 JPY', '0 EUR',
]
google_df['price_gplay'] = google_df['price_gplay'].replace(zero_values, 0)
# NOTE(review): a missing price compares unequal to 0, so rows without a price
# end up with free_app == False - confirm that this is intended.
google_df['free_app'] = google_df['price_gplay'] == 0

# Disabled code: the triple-quoted string below is a bare string literal that is
# never assigned or executed. It preserves an earlier approach that built one
# label per row from the free_app / has_ads / in_app booleans (eight possible
# combinations) by looking each row up with google_df.loc[row].
'''
#Make a variable that has all possible catagories resulting from combinations of the booleans: free_app, has_ads and in_app
category = [     'free_noads_noinapp' if     google_df.loc[row].free_app and not google_df.loc[row].has_ads and not google_df.loc[row].in_app
            else 'free_noads_inapp'   if     google_df.loc[row].free_app and not google_df.loc[row].has_ads and     google_df.loc[row].in_app
            else 'free_ads_noinapp'   if     google_df.loc[row].free_app and     google_df.loc[row].has_ads and not google_df.loc[row].in_app
            else 'free_ads_inapp'     if     google_df.loc[row].free_app and     google_df.loc[row].has_ads and     google_df.loc[row].in_app
            else 'paid_noads_noinapp' if not google_df.loc[row].free_app and not google_df.loc[row].has_ads and not google_df.loc[row].in_app 
            else 'paid_noads_inapp'   if not google_df.loc[row].free_app and not google_df.loc[row].has_ads and     google_df.loc[row].in_app
            else 'paid_ads_noinapp'   if not google_df.loc[row].free_app and     google_df.loc[row].has_ads and not google_df.loc[row].in_app
            else 'paid_ads_inapp'     for row in google_df.index]
google_df['category'] = category
'''

# Clean the number-of-ratings variable: remove the thousands separators from
# the raw text, then store the values as floats.
rating_counts_text = google_df['nb_rating'].str.replace(',', '')
google_df['nb_rating'] = rating_counts_text.astype(float)

# Create the interaction variables: element-wise ANDs of the free_app flag with
# the has_ads and in_app flags.
is_free = google_df['free_app']
shows_ads = google_df['has_ads']
sells_in_app = google_df['in_app']

free_with_ads = is_free & shows_ads
google_df['freeApp_x_hasAds'] = free_with_ads
google_df['freeApp_x_inApp'] = is_free & sells_in_app
# The three-way term reuses the two-way result: (free & ads) & in_app.
google_df['freeApp_x_hasAds_inApp'] = free_with_ads & sells_in_app

### Save the file

In [11]:
# Persist the cleaned frame as CSV; the row index is not written.
google_df.to_csv(path_or_buf='Data/cleaned.csv', index=False)