# Predicting User Churn in Digital Music Services

Notebook to document data exploration and development of ML algorithm to identify at risk customers in digital music services.

### Data Definition

From Exploratory Data Analysis (EDA): 
#### Useful:
- *location*: location of user, seems to append each new state (location, state)
- *gender*: user gender (M/F/None)

- *page*: what page the user is on during event (pages)
- *level*: subscription level check uniqueness (free or paid)
- *auth*: authenication (logged in/out)
- *length*: time spent on page, max 50 mins on NextSong (if song paused??)

- *registration*: unknown (registration unixtime)
- *ts*: timestamp of event in ms (event unixtime)

- *userId*: unique (userId val)
- *sessionId*: unique sessionId per user?
- *itemInSession*: lcounter for the number of items in a single session (item listened to in session)


#### Not Useful:
- *firstName*: users first name (not important, remove)
- *lastName*: users lastname
- *artist*: song artist
- *song*: songname
- *userAgent*: device/browser (not important for us, remove)
- *method*: API PUT/GET http request (not important for us, remove)
- *status*: http status

# Apache Spark on IBM Watson Setup

### Imports

In [1]:
# imports
#import ibmos2spark

# pyspark sql
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import from_unixtime, udf, col, when, isnan, desc
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql import functions as F

# python
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

### setup

In [2]:
# # config
# # @hidden_cell
# credentials = {
#     'endpoint': 'https://s3.eu-geo.objectstorage.service.networklayer.com',
#     'service_id': 'iam-ServiceId-147e1161-7da9-41fe-ac00-c144730def00',
#     'iam_service_endpoint': 'https://iam.cloud.ibm.com/oidc/token',
#     'api_key': 'kAtvjdC8VIYYUmU3gDaOYIK2fCvP3nkjYYlDiNuu4gw6'
# }

# configuration_name = 'os_76774389dfa04fb5acbb1640b3e11704_configs'
# cos = ibmos2spark.CloudObjectStorage(sc, credentials, configuration_name, 'bluemix_cos')

In [2]:
# Build Spark session
spark = SparkSession.builder.appName("user_churn").getOrCreate()

# Read in data from IBM Cloud
# data_df = spark.read.json(cos.url('medium-sparkify-event-data.json', 'sparkify-donotdelete-pr-fnqu5byx41gcai'))

user_log_validuser_log_valid = spark.read.parquet("../data/03_primary/medium-sparkify-event-data-wrangled.parquet")

# Exploratory Data Analysis

In [3]:
user_log_valid.printSchema()

root
 |-- authLevel: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- sessionLength_s: double (nullable = true)
 |-- subLevel: string (nullable = true)
 |-- page: string (nullable = true)
 |-- unixRegistrationTS: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- unixEventTS: long (nullable = true)
 |-- userId: string (nullable = true)
 |-- timestampDatetime: string (nullable = true)
 |-- registrationDatetime: string (nullable = true)
 |-- usStateAbbr: string (nullable = true)



# ML Feature Engineering

### Flag user Cancellations and Create Phase

In [4]:
def create_phase_feature(user_log_valid):
    """Use the cancellation to identify churned users.
    """
    flag_cancellation_event = udf(lambda x: 1 if x == "Cancellation Confirmation" else 0, IntegerType())
    user_log_valid = user_log_valid.withColumn("churn", flag_cancellation_event("page"))
    windowval = Window.partitionBy("userId").orderBy(desc("ts")).rangeBetween(Window.unboundedPreceding, 0)
    user_log_valid = user_log_valid.withColumn("label", Fsum("churn").over(windowval))
    
user_log_valid = create_phase_feature(user_log_valid)

NameError: name 'user_log_valid' is not defined

In [None]:
user_log_valid.head()

In [None]:
user_log_valid.filter(user_log_valid['userId']==100010).head(50000)

### Calculate Hours Since Registration

In [None]:
def hours_since_reg(user_log_valid):

    # hours since registration
    user_log_valid = user_log_valid.withColumn('hours_since_registration',
                                         (user_log_valid['ts'] - user_log_valid['registration']) / (1000 *3600))
    return user_log_valid.withColumn("hours_since_registration", user_log_valid["hours_since_registration"].cast(IntegerType()))

user_log_valid = hours_since_reg(user_log_valid)

### Calculate Hour in the Day of Event

In [None]:
def hour_in_day(user_log_valid):

    # hour in the day of event
    get_hour = udf(lambda x:  int(datetime.datetime.fromtimestamp(x / 1000.0).hour)) 
    user_log_valid = user_log_valid.withColumn("hour", get_hour(user_log_valid.ts))
    return user_log_valid

user_log_valid = hour_in_day(user_log_valid)

In [None]:
def avg_user_items_in_sesh(user_log_valid):
    # calculate average listening time
    windowval = Window.partitionBy("userId").orderBy("ts").rangeBetween(Window.unboundedPreceding, 0)
    return user_log_valid.withColumn('itemInSession_rolling_average', F.avg("itemInSession").over(windowval))
    
user_log_valid = avg_user_items_in_sesh(user_log_valid)

In [None]:
def avg_user_listening_time(user_log_valid)
    # calculate average listening time
    windowval = Window.partitionBy("userId").orderBy("ts").rangeBetween(Window.unboundedPreceding, 0)
    return user_log_valid.withColumn('length_rolling_average', F.avg("length").over(windowval))

user_log_valid = avg_user_listening_time(user_log_valid)

In [None]:
user_log_valid.filter(user_log_valid['userId']==293).select("sessionId","length","length_rolling_average").head(5)

In [None]:
def num_neg_user_events(user_log_valid):
    # Number of Positive Events
    return user_log_valid.withColumn("positive_event",
                                         when((user_log_valid["page"] == 'Add to Playlist') |\
                                              (user_log_valid["page"] == 'Add Friend') |\
                                              (user_log_valid["page"] == 'Thumbs Up'),
                                              1).otherwise(0))

user_log_valid = num_neg_user_events(user_log_valid)

In [None]:
def num_pos_user_events(user_log_valid):
    # Number of Negative Events
    return user_log_valid.withColumn("negative_event",
                                         when((user_log_valid["page"] == 'Thumbs Down') |\
                                              (user_log_valid["page"] == 'Help') |\
                                              (user_log_valid["page"] == 'Error'),
                                              1).otherwise(0))

user_log_valid = num_pos_user_events(user_log_valid)

In [None]:
user_log_valid.head(1)

In [None]:
pd_features = features_df.toPandas()

In [None]:
fig = plt.figure(figsize=(30,25))
ax = fig.gca()
h = pd_features.hist(ax=ax)