# Predicting User Churn with Apache Spark and AWS EMR

This notebook contains data cleaning, feature creation and ML optimization for the Sparkify dataset.

## Setup

In [1]:
%%configure -f 
{"driverMemory": "6000M"}

In [2]:
#conf.set("spark.executor.heartbeatInterval","3600s")

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2,application_1597600389056_0003,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# imports

# pyspark sql
from pyspark.sql import SparkSession, Window, functions as F
from pyspark.sql.functions import from_unixtime, udf, col, when, isnan, desc
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql.functions import isnan, when, count, col

# pyspark ml
from pyspark.ml.feature import VectorAssembler, Normalizer, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# python imports
import datetime
from typing import NewType
pysparkdf = NewType('pysparkdf', object)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
spark.sparkContext.getConf().get('spark.driver.memory')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'6000M'

In [6]:
# Build Spark session
spark = SparkSession.builder.appName("Sparkify").getOrCreate()

event_data = "s3n://udacity-dsnd/sparkify/mini_sparkify_event_data.json"
#event_data = "s3n://udacity-dsnd/sparkify/sparkify_event_data.json"

data_df = spark.read.json(event_data)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
data_df.head()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(artist='Martha Tilston', auth='Logged In', firstName='Colin', gender='M', itemInSession=50, lastName='Freeman', length=277.89016, level='paid', location='Bakersfield, CA', method='PUT', page='NextSong', registration=1538173362000, sessionId=29, song='Rockpools', status=200, ts=1538352117000, userAgent='Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0', userId='30')

In [8]:
from pyspark.sql.functions import countDistinct

# count users
data_df.agg(countDistinct("userId")).collect()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Row(count(DISTINCT userId)=226)]

In [9]:
# count events
print((data_df.count(), len(data_df.columns)))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(286500, 18)

In [10]:
from pyspark.sql.functions import min, max
data_df.select(min("ts"), max("ts")).first()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(min(ts)=1538352117000, max(ts)=1543799476000)

## Data Wrangling

In [11]:
def clean_df(data_df):
    """Remove non useful columns and drop missing values for user and session.
    """
    # lets remove some of the columns we don't think will be useful from data exploration
    cols_to_drop = ['firstName', 'lastName','artist', 'song', 'method', 'status', 'userAgent']
    user_log_df = data_df.drop(*cols_to_drop)
    
    # drop rows with missing info
    return user_log_df.dropna(how = "any", subset = ["userId", "sessionId"])

def unix_to_datetime(data_df):
    """ onvert unix timestamps to datetime.
    """
    # event unix to datetime
    data_df = data_df.withColumn("timestampDatetime",
                                         from_unixtime(user_log_valid.ts/1000,
                                                       format='yyyy-MM-dd HH:mm:ss'))
    # registration unix to datetime
    data_df = data_df.withColumn("registrationDatetime",
                                         from_unixtime(user_log_valid.registration/1000,
                                                       format='yyyy-MM-dd HH:mm:ss'))
    return data_df

def create_us_states(data_df):
    """Create US states column from location.
    """
    # we don't really want to drop these rows as the col isn't vital 
    # so replace missing values to allow split
    data_df = data_df.fillna({'location':'unknown'})

    # define UDFs
    # create state column
    loc_split = udf(lambda x: x.split(', ')[-1], StringType())

    # Sates seem to be appended, so take latest
    state_split = udf(lambda x: x.split('-')[-1], StringType())

    # apply UDFs
    data_df = data_df.withColumn("usStateAbbr",
                                 when(data_df.location.isNotNull(),
                                      loc_split(data_df.location)).otherwise(''))
    data_df = data_df.withColumn("usStateAbbr",
                                 when(data_df.usStateAbbr.isNotNull(),
                                      state_split(data_df.usStateAbbr)).otherwise(''))
    return data_df

def replace_missing_gender(data_df):
    """Replace missing gender with 'unknown'.
    """
    return data_df.fillna({'gender':'unknown'})

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
user_log_valid = clean_df(data_df)
user_log_valid = unix_to_datetime(user_log_valid)
user_log_valid = create_us_states(user_log_valid)
user_log_valid = replace_missing_gender(user_log_valid)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
wrangled_data = user_log_valid.withColumnRenamed("auth", "authLevel").\
                               withColumnRenamed("length", "sessionLength_s").\
                               withColumnRenamed("level", "subLevel").\
                               withColumnRenamed("ts", "unixEventTS").\
                               withColumnRenamed("registration", "unixRegistrationTS")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Feature Engineering

In [14]:
def create_phase_feature(data_df: pysparkdf) -> pysparkdf:
    """Use the cancellation to identify churned users.
    """
    # flag any cancellation confirmation events in pages
    flag_cancellation_event = udf(lambda x: 1 if x == "Cancellation Confirmation" else 0, IntegerType())
    data_df = data_df.withColumn("churn", flag_cancellation_event("page"))

    # search for flags and fill user rows as churned
    windowval = Window.partitionBy("userId").orderBy(desc("unixEventTS")).rangeBetween(Window.unboundedPreceding, 0)
    return data_df.withColumn("label", Fsum("churn").over(windowval))

def avg_items_in_session(data_df: pysparkdf)-> pysparkdf:
    """Calculate avg items in session for each user.
    """
    # calculate metric and join dfs back together
    return data_df.join(data_df.groupBy('userId').avg('itemInSession'),
                               on='userId')

def avg_user_listening_time(data_df: pysparkdf) -> pysparkdf:
    """Calculate average listening time.
    """
    return data_df.join(data_df.groupBy('userId').avg('sessionLength_s'),
                               on='userId')

def recommendation_performance_good(data_df: pysparkdf) -> pysparkdf:
    """Number of Positive Events.
    """
    # flag events
    data_df = data_df.withColumn("recc_performance_good_events",
                                 when((data_df["page"] == 'Add to Playlist') |\
                                      (data_df["page"] == 'Add Friend') |\
                                      (data_df["page"] == 'Thumbs Up'),
                                       1).otherwise(0))
    # calculate number
    return data_df.join(data_df.groupBy('userId').sum('recc_performance_good_events'),
                               on='userId')

def recommendation_performance_bad(data_df: pysparkdf) -> pysparkdf:
    """Number of Positive Events.
    """
    # flag events
    data_df = data_df.withColumn("recc_performance_bad_events",
                                 when((data_df["page"] == 'Thumbs Down'),
                                       1).otherwise(0))
    # calculate number
    return data_df.join(data_df.groupBy('userId').sum('recc_performance_bad_events'),
                               on='userId')

def system_performance_bad(data_df: pysparkdf) -> pysparkdf:
    """Number of Positive Events.
    """
    # flag events
    data_df = data_df.withColumn("sys_performance_bad",
                                 when((data_df["page"] == 'Help') |\
                                      (data_df["page"] == 'Upgrade') |\
                                      (data_df["page"] == 'Error'),
                                       1).otherwise(0))
    # calculate number                                
    return data_df.join(data_df.groupBy('userId').sum('sys_performance_bad'),
                               on='userId')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# create features
wrangled_data = create_phase_feature(wrangled_data)
wrangled_data = avg_items_in_session(wrangled_data)
wrangled_data = avg_user_listening_time(wrangled_data)
wrangled_data = recommendation_performance_good(wrangled_data)
wrangled_data = recommendation_performance_bad(wrangled_data)
wrangled_data = system_performance_bad(wrangled_data)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# create ml features df
ml_features_data = wrangled_data.select('label',
                                        'userId',
                                        'gender',
                                        'usStateAbbr',
                                        col('avg(itemInSession)').alias("avg_item_in_session"),
                                        col('avg(sessionLength_s)').alias('avg_session_length'),
                                        col('sum(recc_performance_good_events)').alias('num_good_recc'),
                                        col('sum(recc_performance_bad_events)').alias('num_bad_recc'),
                                        col('sum(sys_performance_bad)').alias('num_bad_sys'))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
ml_features_data = ml_features_data.drop_duplicates()  # remove duplicates as features will be created on each event row

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
ml_features_data.head(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Row(label=1, userId='100005', gender='M', usStateAbbr='LA', avg_item_in_session=29.61111111111111, avg_session_length=240.50079227272732, num_good_recc=13, num_bad_recc=3, num_bad_sys=6), Row(label=1, userId='100014', gender='M', usStateAbbr='PA', avg_item_in_session=34.538709677419355, avg_session_length=263.43763455252935, num_good_recc=30, num_bad_recc=3, num_bad_sys=2), Row(label=1, userId='100025', gender='F', usStateAbbr='PA', avg_item_in_session=131.2808988764045, avg_session_length=242.92456783673484, num_good_recc=32, num_bad_recc=7, num_bad_sys=7), Row(label=0, userId='154', gender='F', usStateAbbr='PA', avg_item_in_session=26.56779661016949, avg_session_length=245.95266559523813, num_good_recc=15, num_bad_recc=0, num_bad_sys=1), Row(label=1, userId='18', gender='M', usStateAbbr='KS', avg_item_in_session=59.14035087719298, avg_session_length=253.32141729603728, num_good_recc=44, num_bad_recc=1, num_bad_sys=1)]

In [19]:
# clean up to free up space!
data_df.unpersist()
wrangled_data.unpersist()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[userId: string, authLevel: string, gender: string, itemInSession: bigint, sessionLength_s: double, subLevel: string, location: string, page: string, unixRegistrationTS: bigint, sessionId: bigint, unixEventTS: bigint, timestampDatetime: string, registrationDatetime: string, usStateAbbr: string, churn: int, label: bigint, avg(itemInSession): double, avg(sessionLength_s): double, recc_performance_good_events: int, sum(recc_performance_good_events): bigint, recc_performance_bad_events: int, sum(recc_performance_bad_events): bigint, sys_performance_bad: int, sum(sys_performance_bad): bigint]

In [20]:
print((ml_features_data.count(), len(ml_features_data.columns)))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(226, 9)

In [21]:
def onehot_encoder(ml_features_data):
    ## https://stackoverflow.com/questions/32277576/how-to-handle-categorical-features-with-spark-ml

    cols = ['gender', 'usStateAbbr']

    indexers = [
        StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
        for c in cols
    ]

    encoders = [
        OneHotEncoder(
            inputCol=indexer.getOutputCol(),
            outputCol="{0}_encoded".format(indexer.getOutputCol())) 
        for indexer in indexers
    ]

    assembler = VectorAssembler(
        inputCols=[encoder.getOutputCol() for encoder in encoders],
        outputCol="features2"
    )

    pipeline = Pipeline(stages=indexers + encoders + [assembler])
    return pipeline.fit(ml_features_data).transform(ml_features_data)

ml_features_data = onehot_encoder(ml_features_data)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
def vector_assembler(ml_features_data):

    # this vector is created in prep for ml
    assembler = VectorAssembler(inputCols=["avg_item_in_session",
                                           "avg_session_length",
                                           "num_good_recc",
                                           "num_bad_recc",
                                           "num_bad_sys",
                                           'gender_indexed',
                                           'usStateAbbr_indexed'],
                                outputCol="raw_features",
                                handleInvalid="skip")

    return assembler.transform(ml_features_data)

ml_features_data = vector_assembler(ml_features_data)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [23]:
def feature_scaler(ml_features_data):
    """apply scaler
    """
    scaler = Normalizer(inputCol="raw_features", outputCol="features")
    return scaler.transform(ml_features_data)

ml_df = feature_scaler(ml_features_data)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Machine Learning

In [24]:
def evaluate_model(results):
    """Customer function to evaluate result.
    """
    # Generic evaluation
    total_results = results.count()
    correct_pred = results.filter(results.label == results.prediction).count()
    incorrect_pred = results.filter(results.label != results.prediction).count()
    print('Total user events predicted correctly: {}'.format(correct_pred))
    print('Total user events predicted wrongly: {}'.format(incorrect_pred))
    print("Percentage predicted correct (%): {} \n".format((correct_pred/total_results)))
    
    # Correct churn predictions
    churn_correct = results.filter((results.label == 1) & (results.prediction == 1)).count()
    actual_churned_users = results.filter(results.label == 1).count()
    print('User churned and predicted to churn: {}'.format(churn_correct))
    print('User churned : {}'.format(actual_churned_users))
    print('Percent churned user events predicted correctly(%): {}\n'.format((churn_correct/actual_churned_users)))
    
    # Incorrect churn predictions
    print('Number of events predicted to churn but didnt: {}'.format(results.filter((results.label == 0) & (results.label == 1)).count()))
    churn_incorrect = results.filter((results.label == 0) & (results.prediction == 1)).count()
    print('User did not churn and predicted to: {}'.format(churn_incorrect))
    print('Percent churned user events predicted correctly(%): {}\n'.format(churn_incorrect/total_results))
    

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# train test split for ML validation
train, test =  ml_df.randomSplit([0.6, 0.4], seed=42)  # more equal fit to combat overfitting

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
def train_set_evaluator(train):
    """Check for data skew
    """
    print("{} churned user events".format(train.filter(train['label']==1).count()))
    print("{} non-churned user events".format(train.filter(train['label']==0).count()))
    print("{} ratio of churned/non-churned user events".format(train.filter(train['label']==1).count()/train.filter(train['label']==0).count()))
    
train_set_evaluator(train)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

33 churned user events
99 non-churned user events
0.3333333333333333 ratio of churned/non-churned user events

In [27]:
# print('Baseline Logistic Regression Model')
# lr_model = LogisticRegression().fit(train)
# results = lr_model.transform(test)
# evaluate_model(results)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
# print('Baseline Random Forest Classifier Model')
# rfc_model = RandomForestClassifier().fit(train)
# results = rfc_model.transform(test)
# evaluate_model(results)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [29]:
# print('Baseline GBT Classifier Model')
# gbt_model = GBTClassifier().fit(train)
# results = gbt_model.transform(test)
# evaluate_model(results)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [34]:
# pipeline, just running it on classifier no transformations

gbt_model = GBTClassifier()

pipeline = Pipeline(stages=[gbt_model])

# set up param grid to iterate over
paramGrid = ParamGridBuilder() \
.addGrid(gbt_model.maxDepth, [2, 4]) \
.addGrid(gbt_model.maxBins, [15, 40]) \
.build()
#.addGrid(gbt_model.stepSize, [0.02, 0.2]) \
# set up crossvalidator to tune parameters and optimize
crossval = CrossValidator(estimator=pipeline,
                         estimatorParamMaps=paramGrid,
                         evaluator=MulticlassClassificationEvaluator(metricName='f1'),
                         numFolds=3)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
cvModel = crossval.fit(train)  # train model
results = cvModel.transform(test)  # apply model on test data

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Exception in thread cell_monitor-34:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/site-packages/awseditorssparkmonitoringwidget-1.0-py3.7.egg/awseditorssparkmonitoringwidget/cellmonitor.py", line 178, in cell_monitor
    job_binned_stages[job_id][stage_id] = all_stages[stage_id]
KeyError: 781



In [32]:
evaluate_model(results)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

name 'results' is not defined
Traceback (most recent call last):
NameError: name 'results' is not defined



In [33]:
cvModel.avgMetrics  # look at model scoring metrics

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

name 'cvModel' is not defined
Traceback (most recent call last):
NameError: name 'cvModel' is not defined

