In [25]:
# import libraries
from pyspark.sql import SparkSession
from pathlib import Path # better file paths# import libraries

# Setup spark session

In [26]:
# Obtain (or create) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('Sparkify')
    .getOrCreate()
)
spark  # last expression of the cell: display the session

# Read in data

In [27]:
# Per-user CSV, located under ./data relative to the current working directory.
user_data = Path.cwd().joinpath("data", "TRANSFORMED_sparkify_event_data.csv")
# header='true' takes the column names from the first row. No schema is passed,
# so every column is loaded as a string.
user_df = spark.read.csv(path=str(user_data), header='true')
user_df.head()

Row(userId='1000280', churn='1', gender='M', subscription_level='paid', page_upgraded='1', page_downgraded='1', auth_logged_in_cnt='1316', auth_logged_out_cnt='0', auth_guest_cnt='0', status_404_cnt='3', status_307_cnt='119', page_next_song_cnt='0', page_thumbs_up_cnt='53', page_thumbs_down_cnt='33', page_playlist_cnt='25', page_friend_cnt='14', page_roll_ad_cnt='74', page_logout_cnt='15', page_help_cnt='8', artist_cnt='767', song_cnt='945', session_cnt='22')

In [34]:
# Print column names and types. read.csv was called without a schema, so every
# column, including the *_cnt counters, is reported as string.
user_df.printSchema()

root
 |-- userId: string (nullable = true)
 |-- churn: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- subscription_level: string (nullable = true)
 |-- page_upgraded: string (nullable = true)
 |-- page_downgraded: string (nullable = true)
 |-- auth_logged_in_cnt: string (nullable = true)
 |-- auth_logged_out_cnt: string (nullable = true)
 |-- auth_guest_cnt: string (nullable = true)
 |-- status_404_cnt: string (nullable = true)
 |-- status_307_cnt: string (nullable = true)
 |-- page_next_song_cnt: string (nullable = true)
 |-- page_thumbs_up_cnt: string (nullable = true)
 |-- page_thumbs_down_cnt: string (nullable = true)
 |-- page_playlist_cnt: string (nullable = true)
 |-- page_friend_cnt: string (nullable = true)
 |-- page_roll_ad_cnt: string (nullable = true)
 |-- page_logout_cnt: string (nullable = true)
 |-- page_help_cnt: string (nullable = true)
 |-- artist_cnt: string (nullable = true)
 |-- song_cnt: string (nullable = true)
 |-- session_cnt: string (nulla

In [29]:
# pandas copy of the Spark DataFrame, used by the plotting cells below.
# Values are still text at this point (the Spark schema is all string), which is
# why later filters compare against '0' / '1' rather than integers.
# NOTE(review): toPandas() materializes every row locally - fine at this size
# (22,277 rows per describe()), but confirm before reusing on larger data.
pdf = user_df.toPandas()
pdf.head()

Unnamed: 0,userId,churn,gender,subscription_level,page_upgraded,page_downgraded,auth_logged_in_cnt,auth_logged_out_cnt,auth_guest_cnt,status_404_cnt,...,page_thumbs_up_cnt,page_thumbs_down_cnt,page_playlist_cnt,page_friend_cnt,page_roll_ad_cnt,page_logout_cnt,page_help_cnt,artist_cnt,song_cnt,session_cnt
0,1000280,1,M,paid,1,1,1316,0,0,3,...,53,33,25,14,74,15,8,767,945,22
1,1002185,0,F,paid,0,1,2080,0,0,2,...,92,14,49,25,1,18,12,1205,1570,17
2,1017805,0,F,paid,0,1,320,0,0,0,...,7,4,5,13,6,7,1,223,245,3
3,1030587,0,F,paid,1,1,1752,0,0,1,...,66,16,46,23,0,17,10,1071,1335,11
4,1033297,0,M,paid,1,1,299,0,0,0,...,10,3,7,4,10,5,2,215,234,5


In [40]:
# Summary of every column. All columns are still text here, so describe()
# reports count / unique / top / freq rather than mean / std / quantiles.
pdf.describe()

Unnamed: 0,userId,churn,gender,subscription_level,page_upgraded,page_downgraded,auth_logged_in_cnt,auth_logged_out_cnt,auth_guest_cnt,status_404_cnt,...,page_thumbs_up_cnt,page_thumbs_down_cnt,page_playlist_cnt,page_friend_cnt,page_roll_ad_cnt,page_logout_cnt,page_help_cnt,artist_cnt,song_cnt,session_cnt
count,22277,22277,22277,22277,22277,22277,22277,22277,22277,22277,...,22277,22277,22277,22277,22277,22277,22277,22277,22277,22277
unique,22277,2,2,2,2,2,4303,1,1,18,...,446,110,239,166,164,124,67,2545,3397,113
top,1353091,0,M,paid,1,1,86,0,0,0,...,5,0,1,0,0,1,0,74,117,5
freq,1,17274,11651,16184,16151,15209,38,22277,22277,11005,...,574,2246,1019,1972,2209,1505,4034,50,49,1463


# Additional EDA on a few more features

In [41]:
# Toy example: a histogram of one column, drawn as a separate colored series
# for each distinct value of a second column.
import plotly.express as px

df = px.data.tips()  # sample dataset bundled with plotly express
fig = px.histogram(data_frame=df, x="total_bill", color="sex")
fig.show()

In [42]:
# Toy example - horizontal box plots of two synthetic samples
import plotly.graph_objects as go
import numpy as np

# Seed the generator so the figure is identical on every Restart & Run All;
# the unseeded np.random.randn calls produced a different plot on each run.
rng = np.random.default_rng(42)
x0 = rng.standard_normal(50)
x1 = rng.standard_normal(50) + 2  # shift mean

fig = go.Figure()
# Use x instead of y argument for horizontal plot.
# Name each trace so the legend does not fall back to "trace 0" / "trace 1".
fig.add_trace(go.Box(x=x0, name="standard normal"))
fig.add_trace(go.Box(x=x1, name="mean shifted by +2"))

fig.show()

In [43]:
# artist_cnt was read from the CSV as text. Cast it so the x-axis is binned
# numerically; depending on plotly version/template, numeric-looking strings can
# otherwise be treated as categories. churn is left as a string so each value
# gets its own discrete color (0 = not churned, 1 = churned).
fig = px.histogram(
    pdf.astype({"artist_cnt": "float64"}),
    x="artist_cnt",
    color="churn",
    title="artist_cnt by churn (0 = not churned, 1 = churned)",
)
fig.show()

In [47]:
# Rows for users who did not churn. churn holds the strings '0' / '1', so the
# comparison is against text, not an integer.
pdf.loc[pdf['churn'].eq('0')]

Unnamed: 0,userId,churn,gender,subscription_level,page_upgraded,page_downgraded,auth_logged_in_cnt,auth_logged_out_cnt,auth_guest_cnt,status_404_cnt,...,page_thumbs_up_cnt,page_thumbs_down_cnt,page_playlist_cnt,page_friend_cnt,page_roll_ad_cnt,page_logout_cnt,page_help_cnt,artist_cnt,song_cnt,session_cnt
1,1002185,0,F,paid,0,1,2080,0,0,2,...,92,14,49,25,1,18,12,1205,1570,17
2,1017805,0,F,paid,0,1,320,0,0,0,...,7,4,5,13,6,7,1,223,245,3
3,1030587,0,F,paid,1,1,1752,0,0,1,...,66,16,46,23,0,17,10,1071,1335,11
4,1033297,0,M,paid,1,1,299,0,0,0,...,10,3,7,4,10,5,2,215,234,5
5,1057724,0,M,paid,1,1,4669,0,0,1,...,200,29,135,76,67,47,19,2157,3141,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22271,1921570,0,F,paid,1,1,3088,0,0,1,...,228,29,74,45,31,32,18,1515,2121,36
22272,1930015,0,F,paid,1,1,1184,0,0,3,...,64,9,18,19,24,22,5,747,894,13
22273,1955029,0,M,paid,1,1,567,0,0,0,...,23,3,14,6,12,7,4,390,440,9
22274,1960992,0,F,free,1,0,120,0,0,0,...,3,3,1,4,4,2,0,90,91,4


In [48]:
fig = go.Figure()
# artist_cnt was read from the CSV as text; cast to numbers so the boxes are
# drawn on a numeric axis. One .loc[mask, column] call replaces the chained
# .loc[mask][column] lookup.
x0 = pdf.loc[pdf['churn'] == '0', "artist_cnt"].astype("float64")
x1 = pdf.loc[pdf['churn'] == '1', "artist_cnt"].astype("float64")
# Use x instead of y argument for horizontal plot.
# Name the traces so the legend itself says which group is which
# (0 is not churn, 1 is churn).
fig.add_trace(go.Box(x=x0, name="churn = 0 (not churned)"))
fig.add_trace(go.Box(x=x1, name="churn = 1 (churned)"))
fig.update_layout(title="artist_cnt by churn", xaxis_title="artist_cnt")
fig.show()

In [49]:
# song_cnt was read from the CSV as text. Cast it so the x-axis is binned
# numerically; depending on plotly version/template, numeric-looking strings can
# otherwise be treated as categories. churn is left as a string so each value
# gets its own discrete color (0 = not churned, 1 = churned).
fig = px.histogram(
    pdf.astype({"song_cnt": "float64"}),
    x="song_cnt",
    color="churn",
    title="song_cnt by churn (0 = not churned, 1 = churned)",
)
fig.show()

In [50]:
fig = go.Figure()
# song_cnt was read from the CSV as text; cast to numbers so the boxes are
# drawn on a numeric axis. One .loc[mask, column] call replaces the chained
# .loc[mask][column] lookup.
x0 = pdf.loc[pdf['churn'] == '0', "song_cnt"].astype("float64")
x1 = pdf.loc[pdf['churn'] == '1', "song_cnt"].astype("float64")
# Use x instead of y argument for horizontal plot.
# Name the traces so the legend itself says which group is which
# (0 is not churn, 1 is churn).
fig.add_trace(go.Box(x=x0, name="churn = 0 (not churned)"))
fig.add_trace(go.Box(x=x1, name="churn = 1 (churned)"))
fig.update_layout(title="song_cnt by churn", xaxis_title="song_cnt")
fig.show()

In [51]:
# session_cnt was read from the CSV as text. Cast it so the x-axis is binned
# numerically; depending on plotly version/template, numeric-looking strings can
# otherwise be treated as categories. churn is left as a string so each value
# gets its own discrete color (0 = not churned, 1 = churned).
fig = px.histogram(
    pdf.astype({"session_cnt": "float64"}),
    x="session_cnt",
    color="churn",
    title="session_cnt by churn (0 = not churned, 1 = churned)",
)
fig.show()

In [52]:
fig = go.Figure()
# session_cnt was read from the CSV as text; cast to numbers so the boxes are
# drawn on a numeric axis. One .loc[mask, column] call replaces the chained
# .loc[mask][column] lookup.
x0 = pdf.loc[pdf['churn'] == '0', "session_cnt"].astype("float64")
x1 = pdf.loc[pdf['churn'] == '1', "session_cnt"].astype("float64")
# Use x instead of y argument for horizontal plot.
# Name the traces so the legend itself says which group is which
# (0 is not churn, 1 is churn).
fig.add_trace(go.Box(x=x0, name="churn = 0 (not churned)"))
fig.add_trace(go.Box(x=x1, name="churn = 1 (churned)"))
fig.update_layout(title="session_cnt by churn", xaxis_title="session_cnt")
fig.show()

In [22]:
# page_friend_cnt was read from the CSV as text. Cast it so the x-axis is binned
# numerically; depending on plotly version/template, numeric-looking strings can
# otherwise be treated as categories. churn is left as a string so each value
# gets its own discrete color (0 = not churned, 1 = churned).
fig = px.histogram(
    pdf.astype({"page_friend_cnt": "float64"}),
    x="page_friend_cnt",
    color="churn",
    title="page_friend_cnt by churn (0 = not churned, 1 = churned)",
)
fig.show()

In [53]:
fig = go.Figure()
# page_friend_cnt was read from the CSV as text; cast to numbers so the boxes
# are drawn on a numeric axis. One .loc[mask, column] call replaces the chained
# .loc[mask][column] lookup.
x0 = pdf.loc[pdf['churn'] == '0', "page_friend_cnt"].astype("float64")
x1 = pdf.loc[pdf['churn'] == '1', "page_friend_cnt"].astype("float64")
# Use x instead of y argument for horizontal plot.
# Name the traces so the legend itself says which group is which
# (0 is not churn, 1 is churn).
fig.add_trace(go.Box(x=x0, name="churn = 0 (not churned)"))
fig.add_trace(go.Box(x=x1, name="churn = 1 (churned)"))
fig.update_layout(title="page_friend_cnt by churn", xaxis_title="page_friend_cnt")
fig.show()

In [23]:
# page_help_cnt was read from the CSV as text. Cast it so the x-axis is binned
# numerically; depending on plotly version/template, numeric-looking strings can
# otherwise be treated as categories. churn is left as a string so each value
# gets its own discrete color (0 = not churned, 1 = churned).
fig = px.histogram(
    pdf.astype({"page_help_cnt": "float64"}),
    x="page_help_cnt",
    color="churn",
    title="page_help_cnt by churn (0 = not churned, 1 = churned)",
)
fig.show()

In [54]:
fig = go.Figure()
# page_help_cnt was read from the CSV as text; cast to numbers so the boxes
# are drawn on a numeric axis. One .loc[mask, column] call replaces the chained
# .loc[mask][column] lookup.
x0 = pdf.loc[pdf['churn'] == '0', "page_help_cnt"].astype("float64")
x1 = pdf.loc[pdf['churn'] == '1', "page_help_cnt"].astype("float64")
# Use x instead of y argument for horizontal plot.
# Name the traces so the legend itself says which group is which
# (0 is not churn, 1 is churn).
fig.add_trace(go.Box(x=x0, name="churn = 0 (not churned)"))
fig.add_trace(go.Box(x=x1, name="churn = 1 (churned)"))
fig.update_layout(title="page_help_cnt by churn", xaxis_title="page_help_cnt")
fig.show()

* No obvious difference between churned and non-churned users is visible in these charts.

# Random Forest

* The previous model used Logistic Regression; this time we will try a Random Forest model and see whether the AUC improves.

In [None]:
# Feature importance

In [None]:
# Cross-validation

In [None]:
# Output model

In [None]:
# Export predictions to a database