# Import Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Show up to 500 columns so wide frames are not truncated when displayed.
pd.options.display.max_columns = 500

In [None]:
gg_data = pd.read_csv('data/googleplaystore.csv')
# new_data = pd.read_csv('data/new_app_data.csv')

In [None]:
data = gg_data.copy()

In [None]:
# Print the relative frequency of every value, one column at a time.
separator = '-------------------------------'
for column_name, column_values in data.items():
    print(column_name)
    print(column_values.value_counts(normalize=True))
    print(separator)

**Observations**
1. **App - there seem to be a few duplicate apps; these need to be removed.**
2. **Category - need to look into '1.9'.**
3. **Rating - the rating is from 1-5, but there's a rating of 19. Need to delete that one.**
4. **Reviews - might drop the apps with '0' reviews or filter on the number of reviews.**
5. **Size - the 'Varies with device' entries need to be replaced with the average size, and the "M" suffix needs to be removed.**
6. **Installs - need to get rid of the '+' and maybe filter a min-max scale.**
7. **Change Type into a numeric column**
8. **Change last update into days.**

# Cleaning

## App Duplicates

In [None]:
# let's check to make sure we have duplicates
data[data['App'] == 'ROBLOX']

In [None]:
data[data['App'] == 'ESPN']

In [None]:
data[data['App'] == 'Candy Crush Saga']

**Looking at some of the duplicates, it seems that the only difference is the number of reviews and/or the category name. For now I'll keep the first occurrence and drop the rest.**

In [None]:
# Keep one row per app name; the first occurrence wins (the default `keep`).
data = data.drop_duplicates(subset='App')

## Category

In [None]:
# Show the row(s) whose category is the stray value '1.9'.
data.loc[data['Category'].eq('1.9')]

Since this is one entry I can easily find the correct info and plug it in.
Category = Lifestyle
Rating = 1.9
Reviews = 19.0
Size = 3.0M
Installs = 1,000
Type = Free
Price = 0
Content Rating = Everyone

In [None]:
# Repair the row whose category reads '1.9'. The row is located by that bogus
# category instead of a hard-coded index label: assigning through
# `data.at[10472, ...]` would silently append a brand-new row if that label
# were ever absent (different export, earlier filtering, reset index).
shifted_row = data['Category'] == '1.9'

# Corrected values for that entry, keyed by column name.
corrections = {
    "Category": "LIFESTYLE",
    "Rating": 1.9,
    "Reviews": "19.0",
    "Size": "3.0M",
    "Installs": "1,000+",
    "Type": "Free",
    "Price": 0,
    "Content Rating": "Everyone",
    "Last Updated": "February 11, 2018",
}

# The mask was computed before any assignment, so overwriting `Category`
# first does not affect which row the remaining columns are written to.
for column, value in corrections.items():
    data.loc[shifted_row, column] = value

In [None]:
# Display the repaired entry to confirm the corrected values.
data.loc[data['App'].eq('Life Made WI-Fi Touchscreen Photo Frame')]

## Last Updated

In [None]:
# Parse dates such as "February 11, 2018"; anything that does not match the
# format becomes NaT because of errors='coerce'.
last_updated = pd.to_datetime(data['Last Updated'], format="%B %d, %Y", errors='coerce')
data['Last Updated'] = last_updated

# Day offset relative to the most recent update in the data set. The latest
# date is subtracted from each date, so values are zero or negative.
most_recent_update = last_updated.max()
data['lastupdate'] = (last_updated - most_recent_update).dt.days

In [None]:
# data['Date'] = pd.to_datetime(data['Last Updated'])
# release_month = data['Last Updated'].dt.month
# data['Month'] = release_month

In [None]:
# The raw date column is no longer needed once `lastupdate` exists.
data = data.drop(columns=['Last Updated'])

## Rating

In [None]:
# replacing the NAN values with the average rating
data['Rating'] = data['Rating'].fillna(data['Rating'].mean())

In [None]:
data.info()

## Add new data

In [None]:
new_data = pd.read_csv('data/new_app_data.csv')
new_data.info()

In [None]:
app_data = new_data.copy()
app_data

In [None]:
app_data = app_data.dropna()
app_data

In [None]:
df = pd.concat([data, app_data])
df.info()

In [None]:
df['App'].value_counts(normalize=True)

In [None]:
df = df.drop_duplicates(subset=['App'], keep='first')

In [None]:
df= df.drop(columns={'Price','Current Ver', 'Android Ver'}, axis=1)

In [None]:
df

## Reviews

In [None]:
df.shape

In [None]:
df = df.dropna().reset_index(drop =True)

In [None]:
df.shape

In [None]:
df.Reviews.value_counts(normalize=True)

In [None]:
df['Reviews'] = pd.to_numeric(df['Reviews'])

## Size

In [None]:
df[df['Size'] == "Varies with device"]

In [None]:
df['Size'].value_counts(normalize=True)

In [None]:
# create a variable to hold a certain value
mask = df['Size'].str.startswith('V')

In [None]:
#replace that value with zero
df.loc[mask, 'Size'] = 0

In [None]:
# convert the K and M to thousands and millions using regex
# df.Size = (df.Size.replace(r'[kMG]+$', '', regex=True).astype(float) * \
#            df.Size.str.extract(r'[\d\.]+([kMG]+)', expand=False).fillna(1)
#            .replace(['k','M', 'G'], [2**10, 2**20, 2**30]).astype(int))

#df['A'] = (df['A'] / 100).round(2)
# df['Size'] = (df['Size'] / 2**20).round(3)

In [None]:
# Multiplier that turns a value carrying the given unit suffix into megabytes.
SIZE_UNIT_TO_MB = {'k': 1 / 1024, 'M': 1.0, 'G': 1024.0}


def size_to_megabytes(raw_size):
    """Convert one size entry to a float number of megabytes.

    Accepts values such as '19M', '201k', '1.2G' or the numeric placeholder 0.
    Simply stripping the suffix would put '201k' and '201M' on the same scale,
    so the suffix is used to rescale the number instead. Values without a
    recognised suffix are returned as plain floats.

    Raises:
        ValueError: if the numeric part cannot be parsed as a float.
    """
    text = str(raw_size).strip()
    factor = SIZE_UNIT_TO_MB.get(text[-1:])
    if factor is None:
        # No unit suffix (e.g. the 0 placeholder): keep the number as it is.
        return float(text)
    return float(text[:-1]) * factor


# After this step every size is a float expressed in megabytes.
df['Size'] = df['Size'].apply(size_to_megabytes)

In [None]:
df['Size'] = df['Size'].astype('float')

In [None]:
#to retain some data replace zeros with the column's avearge
mean_size = df['Size'].mean()
df['Size'] = df.Size.mask(df.Size == 0.0,mean_size)

In [None]:
df.head(50)

## Installs

In [None]:
# remove the + sign from columns
df['Installs'] = df['Installs'].str.replace('+','').str.replace(',','')

In [None]:
df['Installs'] = pd.to_numeric(df['Installs'])

## Free Apps

In [None]:
df.info()

In [None]:
df.Type.value_counts(normalize=True)

In [None]:
df['is_free'] = df['Type'].map(lambda x: 1 if x == 'Free' else 0)

In [None]:
# df['Price'] = df['Price'].apply(lambda x : str(x).strip('$')).astype(float)
# df['Price'] = df['Price'].round()

In [None]:
# from sklearn import preprocessing
# def labelencode(col, df):
#     le = preprocessing.LabelEncoder()
#     df[col] = le.fit_transform(df[col])

In [None]:
# le = preprocessing.LabelEncoder()
# df['Content Rating'] = le.fit_transform(df['Content Rating'])

# le = preprocessing.LabelEncoder()
# df['Genres'] = le.fit_transform(df['Genres'])

In [None]:
main_df = df.copy()

In [None]:
main_df.head()

In [None]:
main_df = main_df.drop('Type', axis=1)

# Cleaned Data

In [None]:
main_df.info()

In [None]:
main_df.to_csv('data_cleaned.csv')