# Sparkify Project Workspace
This workspace contains a tiny subset (128MB) of the full dataset available (12GB). Feel free to use this workspace to build your project, or to explore a smaller subset with Spark before deploying your cluster on the cloud. Instructions for setting up your Spark cluster is included in the last lesson of the Extracurricular Spark Course content.

You can follow the steps below to guide your data analysis and model building portion of this project.

In [10]:
# import libraries
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import udf, last, when, sum, mean, col, ceil, struct
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, LongType, DoubleType
from pyspark.ml import Pipeline
from pyspark.sql.functions import avg, col, concat, count, desc, explode, lit, min, max, split, stddev, udf
from pyspark.ml.feature import StandardScaler, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import re

In [2]:
def create_spark_session(local=True):
    if local:
        # create a local spark session
        spark = SparkSession.builder \
                .master("local") \
                .appName("Sparkify") \
                .getOrCreate()
    else:
        # create an AWS spark session 
        spark = SparkSession \
                .builder \
                .appName("Sparkify") \
                .getOrCreate()
    return spark

In [3]:
spark = create_spark_session(local=True)
spark

# Load and Clean Dataset


In [None]:
def load_data(session=spark, small=True):
    # specify file path of the dataset
    if session.sparkContext.getConf().getAll()[0][1] == "local": 
        if small:
            sparkify_data = "mini_sparkify_event_data.json"
        else:
            sparkify_data = "medium_sparkify_event_data.json"
    else:
        if small:
            sparkify_data = "s3n://udacity-dsnd/sparkify/sparkify_event_data.json"
    
        else:
            sparkify_data = "s3n://udacity-dsnd/sparkify/mini_sparkify_event_data.json"
    
    # read the dataset
    df = session.read.json(sparkify_data)
    return df

In [None]:
def clean_data(df):
    '''
    Values where NA-values must be prevented:
    - userId -> identifying the user
    - page -> which page was accessed: e.g. downgrade/upgrade/next song:
    - registration: the date of the registration 
    - ts -> play-time: needed for analyzing the user activity during the day 
    - level -> free/paid: needed for analyzing the churn rate
    - status -> HTTP status codes: check if user is unsatisfied with Sparkify's availability 
    - method -> HTTP request type: check if user is unsatisfied with Sparkify's availability
    - gender -> check if there is a different behaviour for different gender
    - sessionId -> assures the session was valid

    usefull but not necessary relevant:
    - auth -> "Cancalled / Logged In"
    - firstName -> first name of te user
    - lastName -> last name of the user
    - location -> user geographical location
    - userAgent -> difference between Win/Mac users
    - itemInSession

    Depending in the page event NA-valus in the follwing columns are acceptable:
    - song -> name of the played song
    - artist -> name of the artist 
    - length -> length of the song (do not remove NA's here or the column page only contains "next page")
    '''
    
    size_orig = df.count()
    
    df_clean = df.dropna(how = "any", subset = ["userId", "sessionId", "method", "page", "ts", 
                                                "registration", "level", "userAgent", "method",
                                                "status"])
    # cleaning the gender column - assuming the null values can be either null or a other gender
    # later this groups will be converted into numerical values
    df_clean = df_clean.fillna("null/other", subset=["gender"])

    # filter userIds with an empty string
    df_clean = df_clean.filter(df_clean["userId"] != "")

    # user defined function to extract the play hour from the timestamp
    get_hour = udf(lambda x: datetime.fromtimestamp(x / 1000.0).hour, IntegerType())
    # create the a new column "hour" this holds the hour when a user interacted with the system
    df_clean = df_clean.withColumn("hour", get_hour(df.ts))
    
    size_clean = df_clean.count()
    
    print("Dataset inital size: {}".format(size_orig))
    print("Dataset size after cleaning process: {} (removed {} entries)".format(size_clean, size_orig - size_clean))
    
    return df_clean

In [None]:
def load_and_clean_data(spark, small=True):
    df = load_data(spark, small)
    df = clean_data(df)
    return df

In [None]:
# load dataset and clean the data
df = load_and_clean_data(spark, small=True)

# Exploratory Data Analysis
When you're working with the full dataset, perform EDA by loading a small subset of the data and doing basic manipulations within Spark. In this workspace, you are already provided a small subset of data you can explore.


### Preliminary Analysis
The following part covers a basic analysis of Sparkify's user data.

In [None]:
# take a first quick look at the dataset schema
df.printSchema()

In [None]:
# get the number of records
total_records = df.count()
print("The dataset contains {} records.".format(total_records))

In [None]:
# number of users
num_users = df.select("userId").dropDuplicates().count()
print("The dataset contains {} users.".format(num_users))

#### User Groups (paid and unpaid memberships)
In this section the difference between paid and unpaid memberhips are shown. Since the users can change its membership status over time here all entries of the dataset are used.

In [None]:
# get membership data
pd_user_level = df.groupby("level").count().toPandas()
pd_user_level

In [None]:
# plot memberhsip data
plt.figure();
plt.bar(pd_user_level.level.values, pd_user_level["count"], color=["lightgreen", "lightcoral"]);
plt.xlabel("Membership type");
plt.ylabel("Number of members");
plt.title("Sparkify's membership overview");

In [None]:
# calculate the percentages
free_percentage = pd_user_level["count"][0] / total_records
paid_percentage = 1 - free_percentage
print("Membership overview:"+
      "\n free: {:.2f}% ({})\n paid: {:.2f}% ({})".format(free_percentage, pd_user_level["count"][0], 
                                                          paid_percentage, pd_user_level["count"][1]))

#### User groups: gender
Are Sparkify's user equally distributed between all genders:

In [None]:
# get Sparkify users grouped by their gender
pd_gender = df.groupby("gender").count().toPandas()
pd_gender.gender = pd_gender.gender.apply(lambda x: "female" if x == "F" else "male")

In [None]:
plt.figure();
plt.bar(pd_gender["gender"], pd_gender["count"], color=["lightgray", "darkgray"]);
plt.ylabel("Number of users");
plt.xlabel("Gender");
plt.title("Spakify's users by gender");

In [None]:
# percentage of female users
female_users = pd_gender["count"][0]/ total_records
# percentage of male users
male_users = pd_gender["count"][1] / total_records

print("male users: {:.2f}%".format(male_users*100))
print("female users: {:.2f}%".format(female_users*100))
print("difference in gender: {:.2f}%".format((female_users - male_users)*100))

#### Playtime over the day
Discover the playtime over the day in respect to different groups:

* all users
* gender
* payed / unpaid

In [None]:
pd_playtime_all = df.groupby("hour").count().sort("hour").toPandas()
pd_playtime_paid = df.where("level == 'paid'").groupby("hour").count().sort("hour").toPandas()
pd_playtime_unpaid = df.where("level != 'paid'").groupby("hour").count().sort("hour").toPandas()
pd_playtime_men = df.where("gender == 'M'").groupby("hour").count().sort("hour").toPandas()
pd_playtime_female = df.where("gender == 'F'").groupby("hour").count().sort("hour").toPandas()

In [None]:
plt.figure(figsize=[20,10]);
plt.plot(pd_playtime_all["count"], label="all users");
plt.plot(pd_playtime_paid["count"], label="paid users");
plt.plot(pd_playtime_unpaid["count"], label="unpaid users");
plt.plot(pd_playtime_men["count"], label="usergroup: men");
plt.plot(pd_playtime_female["count"], label="female")
plt.xlabel("Hours during the day");
plt.ylabel("number of song plays per hour");
plt.title("Number of song plays during the day")
plt.legend();

#### Memberhship downgrades/upgrades

Exploring the membership up- and downgrades.

In [None]:
# count page event for each entry in the log file
df.groupby("page").count().show()

In [None]:
# number of downgrades
num_downgrades = df.where("page = 'Submit Downgrade'").count()
num_downgrades_unique = df.where("page = 'Submit Downgrade'").dropDuplicates().count()
print("The total number of downgrades is {} from {} different users".format(num_downgrades,
                                                                            num_downgrades_unique))

In [None]:
# number of upgrades
num_upgrades = df.where("page = 'Submit Upgrade'").count()
num_upgrades_unique = df.where("page = 'Submit Upgrade'").dropDuplicates().count()
print("The total number of upgrades is {} from {} different users".format(num_upgrades,
                                                                          num_upgrades_unique))

#### Location
This section covers a quick look at the location of Sparkify's users

In [None]:
# use only the last two characters from the location indicating the state
split_location = udf(lambda x: re.split(" ", x)[-1])

In [None]:
pd_states = df.dropDuplicates(["userId"]).withColumn("state", split_location(df.location))\
                                         .groupby("state").count().toPandas()

In [None]:
plt.figure(figsize=[20,10]);
plt.bar(pd_states.state, pd_states["count"], color="green", alpha=0.8);
plt.xlabel("US state(s)");
plt.ylabel("Number of users");
plt.title("Sparkify's user loacation in the US");
plt.xticks(rotation=90);

In [None]:
pd_states["count"].describe()

The users declared 58 different locations in the US - some location consits of several US states. Are are three areas with an higher user occurence.

* California on the west coast
* East coast area: New York, New Jersy and Pennsylvania
* South states with Texas and Florida

#### Artist and song 
Most played artist and song:

In [None]:
# most played artists
df.where(col("artist").isNotNull()).groupby("artist").count().sort("count", ascending=False).show()

In [None]:
# get the most played songs
df.where(col("song").isNotNull()).groupby("song").count().sort("count", ascending=False).show()

In [None]:
# get the artist of the most played song
df.select(["artist", "song"]).filter(df.song.like("You're The One")).dropDuplicates().show()

Note: There are a lot of entries without an artist or song title, more precisely 50046 entries (in the small dataset). These entries represent the page event except "Next song".

In [None]:
# number of page events where the artist and song are null
num_page_event = total_records - df.filter(df.artist.isNotNull()).count()
num_page_event

In [None]:
# number of page events where the artist is not null
num_artist_events = df.dropna(how = "any", subset = ["artist"]).groupby("page").count().head()[1]
num_artist_events

In [None]:
# check if the artist and page events sum up to total record number -> true
num_artist_events + num_page_event == total_records

#### Technical issues
Take a look how often there are technical proplems on the Sparify platform - based on the http status codes

In [None]:
pd_http_status = df.groupby("status").count().toPandas()
pd_http_status

In [None]:
plt.figure();
plt.bar(["307", "404", "200"] ,pd_http_status["count"], color=["lightyellow", "red", "lightgreen"]);

In [None]:
print("Percentage of HTTP 404 Erros: {:.10f}".format(pd_http_status["count"][1]/ total_records))

Result: There is no need to care about technical issues. It seems that sparkify is a stable platform.

#### Registration time

In [None]:
# user defined function for extracting the year from the timestamp
get_year = udf(lambda x: datetime.fromtimestamp(x / 1000.0).year, IntegerType())
# create the a new column "hour" this holds the hour when a user interacted with the system
df.withColumn("year", get_year(df.registration)).groupby("year").count().show()

In [None]:
# user defined function for extracting the month from the timestamp
get_month = udf(lambda x: datetime.fromtimestamp(x / 1000).month, IntegerType())
# user defined function for extracting the day from the timestamp
get_day = udf(lambda x: datetime.fromtimestamp(x / 1000).day, IntegerType())

df_register = df.withColumn("month", get_month(df.registration))
df_register = df_register.withColumn("day", get_day(df_register.registration))
pd_reg_date = df_register.groupby(["month", "day"]).count().sort(["month", "day"]).toPandas()

In [None]:
plt.figure();
plt.bar(pd_reg_date.month, pd_reg_date["count"]);
plt.xlabel("Month from January to December");
plt.ylabel("Number of registrations");
plt.title("Sparkify's user registration in 2018");

### Feature creation

In this part the first features are created. The most important one is the churn rate.

### Define Churn
Once you've done some preliminary analysis, create a column Churn to use as the label for your model. I suggest using the Cancellation Confirmation events to define your churn, which happen for both paid and free users. As a bonus task, you can also look into the Downgrade events.

#### Feature: Churn

In [None]:
def get_churn_feature(df):
    
    # user defined function for set a churn indicator
    get_churn = udf(lambda x: 1 if x == "Cancellation Confirmation" else 0, IntegerType())

    # create new column churn
    df_churn = df.withColumn("churnEvent", get_churn(df.page))

    # create window with userId - this is needed to extract the churn users
    user_window = Window.partitionBy("userId").rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)

    # create a new column churn, where a unique churn value is assigned to each user 
    df_churn = df_churn.withColumn("churn", F.max("churnEvent").over(user_window))

    return df_churn.select(["userId", "churn"]).dropDuplicates(["userId"])

#### Feature: Downgrade

In [None]:
def get_downgrade_feature(df):  
    # user defined function for indicating a downgrade event
    get_downgrade = udf(lambda x: 1 if x == "Downgrade" else 0, IntegerType())

    # create temporary downgradeEvent column
    df_downgrade = df.withColumn("downgradeEvent", get_downgrade(df.page)) 

    # create window with userId - this is needed to extract the churn users
    user_window = Window.partitionBy("userId").rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    
    # create new column downgrade 
    df_downgrade = df_downgrade.withColumn("downgrade", F.max("downgradeEvent").over(user_window))
  
    return df_downgrade.select(["userId", "downgrade"]).dropDuplicates(["userId"])

#### Feature: Browser

In [None]:
def extract_os(userAgent):    
    if "Windows" in userAgent:
        return "Windows"
    elif "Macintosh" in userAgent:
        return "MacOS"
    elif "iPhone" in userAgent:
        return "iPhone"
    elif "iPad" in userAgent:
        return "iPad"
    elif "Linux" in userAgent:
        return "Linux"
    else:
        np.nan

In [None]:
def get_os_feature(df):
    get_os = udf(lambda x: extract_os(x))
    return df.withColumn("os", get_os(df.userAgent)).select(["userId", "os"]).dropDuplicates(["userId"]) 

#### Feature: Operating system/device

In [None]:
def extract_browser(userAgent):
    if "Firefox" in userAgent:
        return "Firefox"
    elif "Chrome" in userAgent:
        return "Chrome"
    elif "Safari" in userAgent:
        return "Safari"
    elif "Trident" in userAgent:
        return "InternetExplorer"
    else:
        return np.nan
    

In [None]:
def get_browser_feature(df):
    get_browser = udf(lambda x: extract_browser(x))
    return df.withColumn("browser", get_browser(df.userAgent)).select(["userId", "browser"]).dropDuplicates(["userId"])

In [None]:
def create_features(df):
    df_churn = get_churn_feature(df)
    df_downgrade = get_downgrade_feature(df)
    df_browser = get_browser_feature(df)
    df_os = get_os_feature(df)
    
    df = df.join(df_churn, on="userId", how="inner")\
           .join(df_downgrade, on="userId", how="inner")\
           .join(df_browser, on="userId", how="inner")\
           .join(df_os, on="userId", how="inner")
    return df

In [None]:
# add basic features (churn, downgrade, browser and os)
df = create_features(df)

In [None]:
# calculate the churn rate
churn_rate = df.dropDuplicates(["userId"]).where("churn = 1").count() / df.dropDuplicates(["userId"]).count()
print("The churn rate is {:.2f}%".format(churn_rate*100))

In [None]:
# calcuate the downgrade rate
downgrade_rate = df.dropDuplicates(["userId"]).where("downgrade = 1").count() / df.dropDuplicates(["userId"]).count()
print("The downgrade rate is {:.2f}%".format(downgrade_rate*100))

### Explore Data
Once you've defined churn, perform some exploratory data analysis to observe the behavior for users who stayed vs users who churned. You can start by exploring aggregates on these two groups of users, observing how much of a specific action they experienced per a certain time unit or number of songs played.

#### Churn in level (paid/unpaid)

In [None]:
# show the account level (paid/unpaid) regarding the churn rate
df.dropDuplicates(["userId"]).groupby(["churn", "level"]).count().sort("churn").show()

#### Churn in gender

In [None]:
# show the gender of the users grouped by churn
df.dropDuplicates(["userId"]).groupby(["churn", "gender"]).count().sort("churn").show()

#### Churn in user authentication

In [None]:
# check if the churn users were logged in 
df.dropDuplicates(["userId"]).groupby(["churn", "auth"]).count().sort("churn").show()

#### Churn in location

In [None]:
# get user location grouped by chrun
pd_states_churn = df.withColumn("state", split_location(df.location))\
                    .dropDuplicates(["userId"]).groupby(["churn", "state"])\
                    .count().sort("churn").toPandas()

In [None]:
# unstack grouped variables for plotting the data
pd_states_plot = pd_states_churn.groupby(["state", "churn"]).sum().unstack().fillna(0)

In [None]:
# create plot

# get required data
n_groups = pd_states_plot.shape[0]
user_state_churn = pd_states_plot["count"][1]
user_state = pd_states_plot["count"][0]
index_labels = pd_states_plot.index

# create plot
fig, ax = plt.subplots(figsize=[15,10])
index = np.arange(n_groups)
bar_width = 0.5
opacity = 0.8
plt.bar(index, user_state, bar_width, alpha=opacity, color="green", label="Non Churn Users")
plt.bar(index + bar_width, user_state_churn, bar_width, alpha=opacity, color="red", label="Churn Users")
plt.xlabel("US State(s)")
plt.ylabel("Number of users")
plt.title("Sparkify's user location")
plt.xticks(index + bar_width, index_labels, rotation=90)
plt.legend()
plt.tight_layout()
plt.show()

#### Time between registration and unsubsrcribe request

In [None]:
# user defined function of calculating difference of two timestamps
udf_time = udf(lambda x: (x[1] - x[0]) if x[0] < x[1] else 0, IntegerType())

In [None]:
# add new column "churntime" -> hours between registration and confirmed cancellation
df_churn_time = df.where("page = 'Cancellation Confirmation'").select(["userId", "registration", "ts"])\
                        .withColumn("churntime", udf_time(struct('registration', 'ts')))\
                        .withColumn("churntime", F.abs(F.ceil(col("churntime")/1000/60/60))).drop("ts")

In [None]:
# calculate descriptives
df_churn_time.dropDuplicates(["userID"]).select("churntime").describe().show()


The average churn time is 307.17 hours, which equals about 12.8 days. The longest churn time is only 2 houers where in contrast the longest duration is about 23.45 days

#### Playtime

In [None]:
# calculate playtime in minutes grouped by user churn
df.where(df.length.isNotNull()).groupby("length", "churn").count()\
.withColumn("time", (df.length * col("count"))/60).groupby("churn").count().show()

In [None]:
print("Total playtime for churn users: {:.2f} hours.".format(9195 / 60))
print("Total playtime for non-churn users: {:.2f} hours.".format(14336 / 60))

#### Page roll advert

In [None]:
df.where("page = 'Roll Advert'").groupby(["page", "churn"]).count().show()

#### Number of "NextSong" events

In [None]:
df.where("page = 'NextSong'").select(["page", "churn"]).groupby(["page", "churn"]).count().show()

#### Number of added Friends on Sparkify

In [None]:
df.where("page = 'Add Friend'").select(["userId", "page", "churn"]).groupby(["page", "churn"]).count().show()

#### Number of thumbs up / thumbs down

In [None]:
df.where("page = 'Thumbs Up' OR page = 'Thumbs Down'").groupby(["page", "churn"]).count().show()

#### Number of songs added to Playlist

In [None]:
df.where("page = 'Add to Playlist'").groupby(["page", "churn"]).count().show()

# TODO Browser + OS

# Feature Engineering
Once you've familiarized yourself with the data, build out the features you find promising to train your model on. To work with the full dataset, you can follow the following steps.
- Write a script to extract the necessary features from the smaller subset of data
- Ensure that your script is scalable, using the best practices discussed in Lesson 3
- Try your script on the full data set, debugging your script if necessary

If you are working in the classroom workspace, you can just extract features based on the small subset of data contained here. Be sure to transfer over this work to the larger dataset when you work on your Spark cluster.

#### Features
The following features are created in the following section:

Categorical features:
* Level (paid / unpaid membership)
* Gender
* Downgrade (user performed a downgrade)
* Location (state)
* Page (event)

Numerical features:
* Number of friends
* Number of Thumbs Up
* Number of Thumbs Down
* Number add to playlist
* Ratio like (Thumbs Up / Thumbs down)
* Number artists
* Number of songs per user
* Churntime (time from registation to cancelling event)

#### Feature Creation

In [None]:
def get_feature_dataframe(df):
    '''
    INPUT: 
    df - (pyspark dataframe) cleaned dataframe
    
    OUTPUT:
    df_result - (pyspark dataframe) dataframe with numerical features
    
    Description:
    This function computes several numerical features:
    - friends: Number of friends per Sparkify user
    - dislike: Number of dislikes
    - like: Number of likes
    - like-ration: ratio between likes and dislikes
    - churn-time: time from registration to churn in days
    - avg-songs-in-session: average amount of songs a user played in a session
    - advert: number of advert
    '''
    
#     df_downgrade = get_downgrade_feature(df)
    
#     df_browser = get_browser_feature(df)
     
#     df_os = get_os_feature(df)
        
    df_friends = df.where("page = 'Add Friend'").groupby("userId").count().dropDuplicates(["userId"])\
                   .withColumn("numFriends", col("count")).drop("count")
        
    df_dislike = df.where("page = 'Thumbs Down'").groupby("userId").count().dropDuplicates(["userId"])\
                   .withColumn("numDislikes", col("count")).drop("count")

    df_like = df.where("page = 'Thumbs Up'").groupby("userId").count().dropDuplicates(["userId"])\
                .withColumn("numLikes", col("count")).drop("count")
    
    df_playlist = df.where("page = 'Add to Playlist'").groupby("userId").count().dropDuplicates(["userId"])\
                    .withColumn("playlistSize", col("count")).drop("count")
    
    df_likeRatio = df_dislike.join(df_like, on="userId", how="left")\
                             .withColumn("like-ratio", col("numLikes")/col("numDislikes"))
    
    df_time = df.groupby("userId").agg(F.max("ts").alias("last_access"))
    ms_per_day = 86400000 # milliseconds per day
    df_time = df_time.join(df, on="userId").withColumn("days", ((col("last_access")-col("registration"))/ms_per_day)\
                                                             .cast(IntegerType())).dropDuplicates(["userId"])\
                                                             .dropDuplicates(["userId"]).select("userId", "days")

    df_songs_in_session = df.groupby("userId", "ts").agg(max("itemInSession")).groupby("userId")\
                                                    .avg("max(itemInSession)")\
                                                    .withColumnRenamed("avg(max(itemInSession))", "avg_song_session")
    
    df_advert = df.where("page = 'Roll Advert'").groupby("userId").count().dropDuplicates(["userId"])\
                  .withColumn("numAdvert", col("count")).drop("count")
    
    df_result = df_friends.join(df_likeRatio, on="userId", how="inner")\
                          .join(df_playlist, on="userId", how="inner")\
                          .join(df_time, on="userId", how="inner")\
                          .join(df_songs_in_session, on="userId", how="inner")\
                          .join(df_advert, on="userId", how="inner")
    
    # add data for categorical features and add churn value
    df_result = df_result.join(df.select(["userId", "churn", "gender", "level", "downgrade", "os", "browser"]), on="userId", how="inner")

    return df_result.dropDuplicates(["userId"])

In [None]:
# calculate numerical features
df_features = get_feature_dataframe(df)

In [None]:
# print dataframe schema
df_features.printSchema()

# Modeling
Split the full dataset into train, test, and validation sets. Test out several of the machine learning methods you learned. Evaluate the accuracy of the various models, tuning parameters as necessary. Determine your winning model based on test accuracy and report results on the validation set. Since the churned users are a fairly small subset, I suggest using F1 score as the metric to optimize.

Classification problem
* Logistic Regression
* Decision Trees
* Gradient Boosting Trees
* SVM
* Naive Bayes

#### Create dataset for the Model
Convert the dataset ```df_features``` into a dataset which can directly be used in ML models using a Vector Assembler and Features Scaler (using StandardScaler).

In [None]:
# list of numerical features
numerical_features = ["numFriends", "numDislikes", "numLikes", "like-ratio", "playlistSize", "days", 
                      "avg_song_session", "numAdvert", "downgrade"] 

# list of categorical features
categorical_features = ["genderFeat", "levelFeat", "downgradeFeat", "osFeat", "browserFeat"]

In [None]:
def create_categorical_features(df, columns_list):
    '''
    INPUT:
    columns_list - (list) column names of categorical features 
    df - (pyspark dataframe) dataframe
    
    OUTPUT:
    df - (pyspark dataframe) dataframe with categorical features
    
    Desciption:
    For each element in the columns list a categorical feature will
    be created using a StringIndexer.
    The original columns in columns_list will be replaced with the
    feature.
    '''
    for col in columns_list:
        indexer = StringIndexer(inputCol=col[:-4], outputCol=col)
        df = indexer.fit(df).transform(df)
    return df

In [None]:
def create_preprocessing_pipeline(numerical_features = numerical_features, categorical_features = categorical_features):
    '''
    INPUT:
    numerical_features - (list) a list of strings naming the columns of numerical features
    categorical_features - (list) a list of strings naming the columns of categorical features
    
    OUTPUT:
    pipeline - (pyspark pipeline) a pipeline which includes all necessary processing steps, including a 
               VetorAssembler and a Standard Scaler
               
    Description:
    Build a pyspark pipeline for transforming the data in the desired layout.
    Important note: the categorical features were already converted using a String Indexer in a previous step.
    '''
    # numieric features: feature vectorizer and scaler 
    assembler_num = VectorAssembler(inputCols = numerical_features, 
                                    outputCol = "numerical_features")
    
    feature_scaler_num = StandardScaler(withMean = True, withStd = True, 
                                   inputCol = "numerical_features", 
                                   outputCol = "scaled_features")

    # categorical features: 
    assembler_cat = VectorAssembler(inputCols = categorical_features+["scaled_features"], 
                                    outputCol = "features")
    
    # create pipeline
    pipeline = Pipeline(stages = [assembler_num, feature_scaler_num, assembler_cat])
    
    return pipeline

In [None]:
def create_model_dataset(df, numerical_features = numerical_features, categorical_features = categorical_features):
    '''
    INPUT:
    df_features - (pyspark dataframe) a dataframe which includes all features
    numerical_features - (list) a list of strings naming the columns of numerical features
    categorical_features - (list) a list of strings naming the columns of categorical features
    
    OUTPUT:
    df - (pyspark dataframe) which includes all columns from the input dataset, but extendet 
         with the features in an additional representation (defined with pipeline)
    '''
    # create categorical features
    df = create_categorical_features(df, categorical_features)
    
    # create the pipeline
    pipeline = create_preprocessing_pipeline(numerical_features, categorical_features)

    # fit and transform the data
    df = pipeline.fit(df).transform(df)
    
    # rename column churn into label (label is more common usage in ML algorithms) 
    df = df.withColumnRenamed("churn", "label")
    
    return df

In [None]:
df_model = create_model_dataset(df_features, numerical_features, categorical_features)

In [None]:
df_model.printSchema()

In [None]:
df_model.select("features").head(1)

In [None]:
df_model.select(["label", "features"]).rdd.saveAsPickleFile("model_data")

In [4]:
pickleRdd = spark.sparkContext.pickleFile("model_data").collect()
df_model = spark.createDataFrame(pickleRdd)

In [5]:
df_model.printSchema()

root
 |-- label: long (nullable = true)
 |-- features: vector (nullable = true)



In [6]:
df_model.count()

177

#### Model creation

In [7]:
def create_and_evaluate_model(classifier, train, test):

    if classifier == "LogisticRegression":
        clf = LogisticRegression()
        
        paramGrid = ParamGridBuilder() \
                        .addGrid(clf.maxIter, [1, 10]) \
                        .addGrid(clf.regParam, [0.1, 0.01]) \
                        .build()
    
    elif classifier == "RandomForestClassifier":
        clf = RandomForestClassifier()
        paramGrid = ParamGridBuilder() \
                        .addGrid(clf.maxDepth, [2, 4]) \
                        .addGrid(clf.numTrees, [5, 10]) \
                        .build()
        
    elif classifier == "GBTClassifier":
        clf = GBTClassifier()
        
        paramGrid = ParamGridBuilder() \
                        .addGrid(clf.maxIter, [1,5]) \
                        .addGrid(clf.maxDepth, [2, 4]) \
                        .build()
 
    elif classifier == "LinearSVC":
        clf = LinearSVC()
        
        paramGrid = ParamGridBuilder() \
                        .addGrid(clf.maxIter, [10, 100]) \
                        .build()
 
    else:
        return "Error: Invalid model."
    
    # create Cross-Validator with current classifier and parameter grid
    crossval = CrossValidator(estimator=clf,
                              estimatorParamMaps=paramGrid,
                              evaluator=MulticlassClassificationEvaluator(),
                              numFolds=2)   

    # fit (train) model
    model = crossval.fit(train)

    # make prediction for unknown test data
    predictions = model.transform(test)
    
    # Select (prediction, true label) and compute test error
    # based on pyspark documentation: https://spark.apache.org/docs/latest/ml-classification-regression.html
    # https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.MulticlassClassificationEvaluator.html
    evaluator = MulticlassClassificationEvaluator(metricName = "f1",
                                                  labelCol = "label",
                                                  predictionCol = "prediction")
    
    # compute f1 score
    f1_score = evaluator.evaluate(predictions)
    
    # compute accuracy
    evaluator.setMetricName("accuracy")
    accuracy = evaluator.evaluate(predictions)

    print("Classifier: {} - F1-Score: {:.2f} - accuracy: {:.2f}".format(classifier, f1_score, accuracy))
    
    return f1_score, accuracy
    

In [8]:
# split the model data into train and test-set
train, test = df_model.randomSplit([0.8, 0.2], seed=42)

In [9]:
create_and_evaluate_model("LinearSVC", train, test)

NameError: name 'ParamGridBuilder' is not defined

In [None]:
create_and_evaluate_model("GBTClassifier", train, test)

In [None]:
create_and_evaluate_model("RandomForestClassifier", train, test)

In [None]:
create_and_evaluate_model("LogisticRegression", train, test)

# Final Steps
Clean up your code, adding comments and renaming variables to make the code easier to read and maintain. Refer to the Spark Project Overview page and Data Scientist Capstone Project Rubric to make sure you are including all components of the capstone project and meet all expectations. Remember, this includes thorough documentation in a README file in a Github repository, as well as a web app or blog post.