# Predicting User Churn in Digital Music Services

Notebook to document data exploration and development of ML algorithm to identify at risk customers in digital music services.

# Apache Spark on IBM Watson Setup

### Imports

In [19]:
# imports

# pyspark sql
from pyspark.sql import SparkSession, Window

# python
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
import plotly.express as px

### setup

In [20]:
# Build Spark session
spark = SparkSession.builder.appName("user_churn").getOrCreate()

In [21]:
# Read in data from IBM Cloud
data_df = spark.read.json("/Users/jacobmarlow/Documents/DataAnalytics/GitHub/user-churn-prediction-with-apache-spark/data/medium-sparkify-event-data.json")

# Visualisation

In [22]:
data_df.head()

Row(artist='Martin Orford', auth='Logged In', firstName='Joseph', gender='M', itemInSession=20, lastName='Morales', length=597.55057, level='free', location='Corpus Christi, TX', method='PUT', page='NextSong', registration=1532063507000, sessionId=292, song='Grand Designs', status=200, ts=1538352011000, userAgent='"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"', userId='293')

# ...

In [30]:
vis_df = data_df.limit(500000000).toPandas()
vis_df.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Martin Orford,Logged In,Joseph,M,20,Morales,597.55057,free,"Corpus Christi, TX",PUT,NextSong,1532064000000.0,292,Grand Designs,200,1538352011000,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",293
1,John Brown's Body,Logged In,Sawyer,M,74,Larson,380.21179,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1538070000000.0,97,Bulls,200,1538352025000,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",98
2,Afroman,Logged In,Maverick,M,184,Santiago,202.37016,paid,"Orlando-Kissimmee-Sanford, FL",PUT,NextSong,1535953000000.0,178,Because I Got High,200,1538352118000,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",179
3,,Logged In,Maverick,M,185,Santiago,,paid,"Orlando-Kissimmee-Sanford, FL",PUT,Logout,1535953000000.0,178,,307,1538352119000,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",179
4,Lily Allen,Logged In,Gianna,F,22,Campos,194.53342,paid,"Mobile, AL",PUT,NextSong,1535931000000.0,245,Smile (Radio Edit),200,1538352124000,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,246


### Users Subscription Level

In [31]:
import pandas as pd
user_level = pd.DataFrame(vis_df.groupby(['userId'])['level'].agg(pd.Series.mode))
user_level['count'] = [1]*user_level.shape[0]

In [36]:
level_count_df = user_level.groupby('level').count()
level_count_df['Percentage (%)'] = (level_count_df['count'].values/sum(level_count_df['count'].values))*100
level_count_df.drop('count', axis=1,inplace=True)
level_count_df = level_count_df.round(1)
level_count_df.head()

Unnamed: 0_level_0,Percentage (%)
level,Unnamed: 1_level_1
free,40.3
paid,59.7


In [37]:

fig = px.bar(level_count_df, template='plotly_dark')

fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    yaxis=dict(
        title_text='Percentage of Users (%)',
        tickmode="array",
        titlefont=dict(size=14),
    ),
    xaxis=dict(
    title_text='User Subscription Level',
    tickmode="array",
    titlefont=dict(size=14),
    ))

fig.show()

In [29]:
# import chart_studio
# username = 'jhmarlow' # your username
# api_key = 'b6yhP8ncVquhQVSPISyV' # your api key - go to profile > settings > regenerate key
# chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
# import chart_studio.plotly as py
# py.plot(fig, filename = 'Sparkify user subscription level', auto_open=True)

'https://plotly.com/~jhmarlow/16/'

### User Interactions Subscription Level

In [1]:
interations_count = vis_df.groupby('level').count()['gender']
interactions_level = pd.DataFrame(interations_count)
interactions_level['Percentage (%)'] = (interactions_level['gender'].values/sum(interactions_level['gender'].values))*100
interactions_level.drop('gender', axis=1,inplace=True)
interactions_level.head()

NameError: name 'vis_df' is not defined

In [41]:
fig = px.bar(interactions_level, template='plotly_dark')

fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    yaxis=dict(
        title_text='Percentage of events (%)',
        tickmode="array",
        titlefont=dict(size=14),
    ),
    xaxis=dict(
    title_text='User Subscription Level',
    tickmode="array",
    titlefont=dict(size=14),
    ))

fig.show()

In [32]:
import chart_studio
username = 'jhmarlow' # your username
api_key = 'b6yhP8ncVquhQVSPISyV' # your api key - go to profile > settings > regenerate key
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
import chart_studio.plotly as py
py.plot(fig, filename = 'Sparkify user interactions level', auto_open=True)

'https://plotly.com/~jhmarlow/18/'

### User Sessions

In [33]:
avg_sessions = vis_df.groupby('userId').max()['sessionId']
avg_sessions.head()

userId
          4790
10           9
100       4759
100001      96
100002     293
Name: sessionId, dtype: int64

In [34]:
level_sessions = user_level.join(avg_sessions)

In [35]:
level_session_avg = level_sessions.groupby('level').mean()
level_session_avg.head()

Unnamed: 0_level_0,count,sessionId
level,Unnamed: 1_level_1,Unnamed: 2_level_1
free,1.0,2166.19337
paid,1.0,2841.533582


In [36]:

level_session_avg_plot = level_session_avg.drop('count', axis=1)
level_session_avg_plot.head()

Unnamed: 0_level_0,sessionId
level,Unnamed: 1_level_1
free,2166.19337
paid,2841.533582


In [37]:

fig = px.bar(level_session_avg_plot, template='plotly_dark')

fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    yaxis=dict(
        title_text='Avg. Number of Sessions',
        tickmode="array",
        titlefont=dict(size=14),
    ),
    xaxis=dict(
    title_text='Subscription Level',
    tickmode="array",
    titlefont=dict(size=14),
    ))

fig.show()


In [38]:
import chart_studio
username = 'jhmarlow' # your username
api_key = 'b6yhP8ncVquhQVSPISyV' # your api key - go to profile > settings > regenerate key
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
import chart_studio.plotly as py
py.plot(fig, filename = 'Sparkify avg number session', auto_open=True)

'https://plotly.com/~jhmarlow/20/'