This notebook processes the raw `googleplaystore` dataset into train and test datasets that are in line with the ones HackerRank uses for its Data Scientist Assessment.

In [15]:
from pathlib import Path

import numpy as np
import pandas as pd

# Read the raw Google Play Store export into `df` and preview it.
df = pd.read_csv(filepath_or_buffer="../data/input/raw/googleplaystore.csv")
df.head()  # head() shows the first 5 rows by default

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [16]:
# Create the 'app_id' column: every distinct app name gets one integer id,
# so rows that share the same 'App' value also share the same id.
apps_by_name = df.groupby('App')
df['app_id'] = apps_by_name.ngroup()
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,app_id
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,6963
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,2632
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8657
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,7828
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,7023


In [17]:
# Drop rows where 'Rating' is not given; the class label below is derived
# from it. `.copy()` makes the filtered frame independent of the original, so
# the column assignment that follows does not write into a slice (which is
# what triggers pandas' SettingWithCopyWarning).
df = df[df['Rating'].notna()].copy()

# Convert 'Rating' to a binary class: 'High' for ratings of 4 and above,
# 'Low' for everything else.
# NOTE(review): there is no upper-bound check, so any rating above the
# expected scale would also be labelled 'High' — confirm the raw data
# contains no such rows.
df['popularity'] = np.where(df['Rating'] >= 4, 'High', 'Low')

In [18]:
# Drop the columns that are not needed in the processed dataset.
# Reassigning the result (instead of `inplace=True`) leaves the frame this
# cell received untouched and avoids in-place mutation of a filtered frame.
unused_columns = ['App', 'Rating', 'Type', 'Genres', 'Android Ver']
df = df.drop(columns=unused_columns)

df.head()

Unnamed: 0,Category,Reviews,Size,Installs,Price,Content Rating,Last Updated,Current Ver,app_id,popularity
0,ART_AND_DESIGN,159,19M,"10,000+",0,Everyone,"January 7, 2018",1.0.0,6963,High
1,ART_AND_DESIGN,967,14M,"500,000+",0,Everyone,"January 15, 2018",2.0.0,2632,Low
2,ART_AND_DESIGN,87510,8.7M,"5,000,000+",0,Everyone,"August 1, 2018",1.2.4,8657,High
3,ART_AND_DESIGN,215644,25M,"50,000,000+",0,Teen,"June 8, 2018",Varies with device,7828,High
4,ART_AND_DESIGN,967,2.8M,"100,000+",0,Everyone,"June 20, 2018",1.1,7023,High


In [19]:
# Give the multi-word columns their final names. The result is reassigned
# rather than renamed with `inplace=True`, so the cell produces a new frame
# instead of mutating the existing one.
df = df.rename(
    columns={
        "Content Rating": "suitable_for",
        "Last Updated": "last_update",
        "Current Ver": "latest_ver",
    }
)

In [20]:
# Lower-case every column label (e.g. 'Category' -> 'category'); the cell
# values themselves are left as they are.
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,category,reviews,size,installs,price,suitable_for,last_update,latest_ver,app_id,popularity
0,ART_AND_DESIGN,159,19M,"10,000+",0,Everyone,"January 7, 2018",1.0.0,6963,High
1,ART_AND_DESIGN,967,14M,"500,000+",0,Everyone,"January 15, 2018",2.0.0,2632,Low
2,ART_AND_DESIGN,87510,8.7M,"5,000,000+",0,Everyone,"August 1, 2018",1.2.4,8657,High
3,ART_AND_DESIGN,215644,25M,"50,000,000+",0,Teen,"June 8, 2018",Varies with device,7828,High
4,ART_AND_DESIGN,967,2.8M,"100,000+",0,Everyone,"June 20, 2018",1.1,7023,High


In [21]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
train, test = train_test_split(df, test_size=0.33, random_state=42)

# Make sure the output folder exists before writing: `to_csv` does not create
# missing directories, so this cell would otherwise fail on a fresh checkout.
output_dir = Path('../data/input/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the pandas row index out of the written files.
train.to_csv(output_dir / 'train.csv', index=False)
test.to_csv(output_dir / 'test.csv', index=False)