# Introduction

The goal of this notebook is to explore a dataset of users who have installed our mobile app, together with their associated features, and to predict whether a user will listen to podcasts.

In [2]:
# Libraries
# Standard library
import os

# Data handling libraries
import numpy as np
import pandas as pd

# Statistics and modeling libraries
import sklearn

In [5]:
# Inputs

# Location of the pipe-delimited user dataset. Set the DS_PRACTICAL_PATH
# environment variable to point at a different copy of the file; when it is
# not set, the original local path is used so existing runs are unaffected.
path_to_data = os.environ.get(
    'DS_PRACTICAL_PATH',
    '/Users/juliameo/dataIntelligence_exercise/part2_ML/data/ds_practical.txt',
)

## Reading in Data and Prepping

In [8]:
# Load the pipe-delimited source file into a DataFrame
raw_df = pd.read_csv(path_to_data, delimiter='|')

In [15]:
# Preview the first five rows of the loaded frame
raw_df.head(n=5)

Unnamed: 0,anon_person_id,target,state,presence_of_child_ind,streaming_media_subs,streaming_entertainment_subs,marital_status,gender,income,number_of_news_subs,age
0,1,0,CA,0,N,0,M,Female,$175k-199k,0,43.0
1,2,0,CA,0,N,0,U,Male,$225k-249k,0,90.0
2,3,1,CA,0,Y,1,S,Female,$150k-174k,0,46.0
3,4,0,CA,1,N,0,U,Female,$125k-149k,0,25.0
4,5,0,CA,0,N,0,U,Female,$175k-199k,0,62.0


In [16]:
# (rows, columns) of the frame as loaded, before any de-duplication
raw_df.shape

(1000000, 11)

In [20]:
# Drop rows that are exact duplicates across every column.
# The result is reassigned rather than using inplace=True, which pandas
# discourages: it offers no performance benefit and prevents method chaining.
raw_df = raw_df.drop_duplicates()

In [21]:
# (rows, columns) after dropping exact duplicate rows
raw_df.shape

(999500, 11)

In [22]:
# Count distinct user ids; compare against the row count to confirm one row per user
raw_df['anon_person_id'].nunique()

999500

In [26]:
# Number of missing values in each column
raw_df.isna().sum()

anon_person_id                      0
target                              0
state                           10024
presence_of_child_ind               0
streaming_media_subs                0
streaming_entertainment_subs        0
marital_status                      0
gender                              0
income                              0
number_of_news_subs                 0
age                             19601
dtype: int64

In [30]:
# Inspect the dtype pandas assigned to each column
raw_df.dtypes

anon_person_id                    int64
target                            int64
state                            object
presence_of_child_ind             int64
streaming_media_subs             object
streaming_entertainment_subs      int64
marital_status                   object
gender                           object
income                           object
number_of_news_subs               int64
age                             float64
dtype: object

In [31]:
# Encode streaming_media_subs as a 0/1 integer flag: 'Y' -> 1, anything else -> 0.
# The dtype guard keeps this cell safe to re-run: once the column is numeric,
# comparing it against 'Y' again would silently reset every value to 0.
if not pd.api.types.is_numeric_dtype(raw_df['streaming_media_subs']):
    raw_df['streaming_media_subs'] = np.where(raw_df['streaming_media_subs'] == 'Y', 1, 0)

In [32]:
# Plotting out distributions of the data across the different fields


Unnamed: 0,anon_person_id,target,state,presence_of_child_ind,streaming_media_subs,streaming_entertainment_subs,marital_status,gender,income,number_of_news_subs,age
0,1,0,CA,0,0,0,M,Female,$175k-199k,0,43.0
1,2,0,CA,0,0,0,U,Male,$225k-249k,0,90.0
2,3,1,CA,0,1,1,S,Female,$150k-174k,0,46.0
3,4,0,CA,1,0,0,U,Female,$125k-149k,0,25.0
4,5,0,CA,0,0,0,U,Female,$175k-199k,0,62.0


### High Level Summary of the Data
- 1M raw records; after dropping exact duplicates, 999,500 records remain, one per unique user (999,500 distinct `anon_person_id` values).
- Most fields have full coverage, but user age and user state have missing values (~2% and ~1% of records, respectively).

In [29]:
# Share of records with a missing age. Computed from the frame itself rather
# than from counts copied out of earlier outputs, so the figure stays correct
# if the input data or the cleaning steps change.
raw_df['age'].isnull().mean()

0.01961080540270135