In [1]:
import pandas as pd
import numpy as np
import os
from utils.eda import (get_data_shape, get_numeric_summary, get_value_counts_prop, 
                      get_missing_value_counts_prop)

In [2]:
# Raw inputs live side by side under data/raw: features in the x_ file, targets in the y_ file.
RAW_DATA_DIR = os.path.join("data", "raw")
TRAIN_PATH = os.path.join(RAW_DATA_DIR, "x_google_playstore_train.csv")
Y_TRAIN_PATH = os.path.join(RAW_DATA_DIR, "y_google_playstore_train.csv")

# Name of the target column in the y_ file.
TARGET = "Rating"
# Number of rows pandas reads per chunk when streaming the CSVs.
CHUNKSIZE = 150000

### Columns

In [3]:
# Open the training CSV as a chunked reader (CHUNKSIZE rows per chunk) and pull
# only the first chunk; `sample` is reused by later cells to pick columns by dtype.
chunks = pd.read_csv(TRAIN_PATH, chunksize=CHUNKSIZE)
sample = next(chunks)
# Display the column labels found in the first chunk.
sample.columns

Index(['App Name', 'App Id', 'Category', 'Rating Count', 'Installs',
       'Minimum Installs', 'Maximum Installs', 'Free', 'Price', 'Currency',
       'Size', 'Minimum Android', 'Developer Id', 'Developer Website',
       'Developer Email', 'Released', 'Last Updated', 'Content Rating',
       'Privacy Policy', 'Ad Supported', 'In App Purchases', 'Editors Choice',
       'Scraped Time'],
      dtype='object')

### Data types

In [4]:
# Dtypes as pandas inferred them for the first chunk only (`sample`), not the whole file.
sample.dtypes

App Name              object
App Id                object
Category              object
Rating Count         float64
Installs              object
Minimum Installs     float64
Maximum Installs       int64
Free                    bool
Price                float64
Currency              object
Size                  object
Minimum Android       object
Developer Id          object
Developer Website     object
Developer Email       object
Released              object
Last Updated          object
Content Rating        object
Privacy Policy        object
Ad Supported            bool
In App Purchases        bool
Editors Choice          bool
Scraped Time          object
dtype: object

### Data Shape

In [5]:
# Re-open the reader so the helper receives a fresh iterator over the training file.
chunks = pd.read_csv(TRAIN_PATH, chunksize=CHUNKSIZE)
# get_data_shape comes from utils.eda; presumably it walks the chunks and
# returns (n_rows, n_columns) -- TODO confirm against the helper.
data_shape = get_data_shape(chunks)
data_shape

(33476613, 23)

### Min, Max, Approx mean of numeric columns of training data

In [6]:
# Numeric columns = float64 AND int64 dtypes, as inferred from the first chunk (`sample`).
cols_to_use = list(sample.select_dtypes(["float64", "int64"]).columns)
chunks = pd.read_csv(TRAIN_PATH, chunksize=CHUNKSIZE)

# get_numeric_summary comes from utils.eda; presumably it aggregates min / max /
# approximate mean per column across the chunks -- TODO confirm against the helper.
min_, max_, approx_mean_ = get_numeric_summary(chunks, cols_to_use)

# One row per numeric column.
pd.DataFrame(
    {
        'columns': cols_to_use,
        'min': min_,
        'max': max_,
        'approx_mean': approx_mean_,
    }
)

Unnamed: 0,columns,min,max,approx_mean
0,Rating Count,0.0,138557600.0,2831.014895
1,Minimum Installs,0.0,10000000000.0,183220.868605
2,Maximum Installs,0.0,12057630000.0,320067.57825
3,Price,0.0,400.0,0.104749


### Value Counts and proportions of categorical Variables in training data

In [7]:
# Object and bool columns (dtypes taken from the first chunk, `sample`) are the
# ones passed to the value-count helper.
cols_to_use = list(sample.select_dtypes(["O", "bool"]).columns)

# Fresh reader over the training file for the helper to consume.
chunks = pd.read_csv(TRAIN_PATH, chunksize=CHUNKSIZE)
value_counts, value_prop = get_value_counts_prop(chunks, cols_to_use)

print(cols_to_use)

['App Name', 'App Id', 'Category', 'Installs', 'Free', 'Currency', 'Size', 'Minimum Android', 'Developer Id', 'Developer Website', 'Developer Email', 'Released', 'Last Updated', 'Content Rating', 'Privacy Policy', 'Ad Supported', 'In App Purchases', 'Editors Choice', 'Scraped Time']


In [8]:
# Build the Content Rating table under its own name. Rebinding `value_counts`
# here would discard the per-column results computed by the previous cell and
# make this cell fail with a KeyError when it is run a second time.
content_rating_summary = pd.concat(
    [
        pd.Series(value_counts['Content Rating']),
        pd.Series(value_prop['Content Rating']),
    ],
    axis=1,
    keys=['Count', "Proportion"],  # names the two resulting columns
)
content_rating_summary

Unnamed: 0,Count,Proportion
Everyone,29268150,0.874286
Teen,2842066,0.084897
Mature 17+,871556,0.026035
Everyone 10+,490693,0.014658
Unrated,2200,6.6e-05
Adults only 18+,1948,5.8e-05


### Missing Value Counts of Variables in training data

In [9]:
# Only the header row is needed to list the columns, so read zero data rows
# instead of parsing a whole first chunk and leaving a half-consumed reader behind.
cols_to_use = [*pd.read_csv(TRAIN_PATH, nrows=0).columns]
chunks = pd.read_csv(TRAIN_PATH, chunksize=CHUNKSIZE)

# get_missing_value_counts_prop comes from utils.eda; presumably it walks the chunks
# and returns missing counts and proportions keyed by column -- TODO confirm.
missing_count, missing_prop = get_missing_value_counts_prop(chunks, cols_to_use)

# Put counts and proportions side by side, one row per reported column.
missing_count = pd.concat([pd.Series(missing_count), pd.Series(missing_prop)], axis=1)
missing_count.columns = ['Count', "Proportion"]
missing_count

Unnamed: 0,Count,Proportion
Developer Website,11011028,0.328917
Privacy Policy,6090392,0.1819298
Released,1028483,0.03072243
Rating Count,331339,0.009897626
Minimum Android,94801,0.002831858
Size,2832,8.459637e-05
Currency,2002,5.980294e-05
Installs,1588,4.74361e-05
Minimum Installs,1588,4.74361e-05
Developer Id,463,1.383055e-05


### Target Distribution

In [10]:
# The target lives in its own file; only the TARGET column is passed to the helper.
cols_to_use = [TARGET]
chunks = pd.read_csv(Y_TRAIN_PATH, chunksize=CHUNKSIZE)
value_counts, value_prop = get_value_counts_prop(chunks, cols_to_use)

# Side-by-side table: counts first, proportions second.
value_counts = pd.concat(
    [
        pd.Series(value_counts[TARGET]),
        pd.Series(value_prop[TARGET]),
    ],
    axis=1,
)
value_counts.columns = ['Count', "Proportion"]
value_counts

Unnamed: 0,Count,Proportion
0.0,15671800,0.468142
5.0,1448486,0.043269
4.2,1272901,0.038024
4.4,1247794,0.037274
4.3,1204962,0.035994
4.6,1133435,0.033858
4.5,1110385,0.033169
4.1,1008516,0.030126
4.0,974300,0.029104
4.7,900842,0.02691


### Target Missing Values

In [11]:
# Run the missing-value helper on the target file, restricted to the TARGET column.
cols_to_use = [TARGET]
chunks = pd.read_csv(Y_TRAIN_PATH, chunksize=CHUNKSIZE)
missing_count, missing_prop = get_missing_value_counts_prop(chunks, cols_to_use)

# Side-by-side table: counts first, proportions second.
missing_count = pd.concat(
    [
        pd.Series(missing_count),
        pd.Series(missing_prop),
    ],
    axis=1,
)
missing_count.columns = ['Count', "Proportion"]
missing_count

Unnamed: 0,Count,Proportion
Rating,0,0.0
