In [1]:
# Setting the environment variables

In [3]:
import os
import sys
os.environ["PYSPARK_PYTHON"]="/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"]="/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"]="notebook --no-browser"
os.environ["JAVA_HOME"] = "/usr/java/jdk1.8.0_161/jre"
os.environ["SPARK_HOME"] = "/home/ec2-user/spark-2.4.4-bin-hadoop2.7"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

# Ecommerce Churn Assignment

The aim of the assignment is to build a model that predicts whether a person purchases an item after it has been added to the cart or not. Being a classification problem, you are expected to use your understanding of all the three models covered till now. You must select the most robust model and provide a solution that predicts the churn in the most suitable manner. 

For this assignment, you are provided the data associated with an e-commerce company for the month of October 2019. Your task is to first analyse the data, and then perform multiple steps towards the model building process.

The broad tasks are:
- Data Exploration
- Feature Engineering
- Model Selection
- Model Inference

### Data description

The dataset stores the information of a customer session on the e-commerce platform. It records the activity and the associated parameters with it.

- **event_time**: Date and time when user accesses the platform
- **event_type**: Action performed by the customer
            - View
            - Cart
            - Purchase
            - Remove from cart
- **product_id**: Unique number to identify the product in the event
- **category_id**: Unique number to identify the category of the product
- **category_code**: Stores primary and secondary categories of the product
- **brand**: Brand associated with the product
- **price**: Price of the product
- **user_id**: Unique ID for a customer
- **user_session**: Session ID for a user


### Initialising the SparkSession

The dataset provided is 5 GBs in size. Therefore, it is expected that you increase the driver memory to a greater number. You can refer to notebook 1 for the steps involved here.

In [4]:
# Spark environment
from pyspark import SparkConf
from pyspark.sql import SparkSession

In [5]:
# initialising the session with 14 GB driver memory
MAX_MEMORY = "14G"

spark = SparkSession \
    .builder \
    .appName("demo") \
    .config("spark.driver.memory", MAX_MEMORY) \
    .getOrCreate()

spark

In [6]:
# Spark session with 14 GB driver memory
spark.sparkContext.getConf().get('spark.driver.memory')

'14G'

In [7]:
#importing required libraries
import pandas as pd
from matplotlib import pyplot as plt
from pyspark.sql.functions import *

In [8]:
# Loading the clean and transformed data
df= spark.read.parquet("final_df.parquet")

## Task 3: Model Selection
3 models for classification:	
- Logistic Regression
- Decision Tree
- Random Forest

### Model 3: Random Forest

#### Train-test split

In [9]:
# Splitting the data into train and test (Remember you are expected to compare the model later)
train_data, test_data = df.randomSplit([0.7,0.3], seed=100)

In [10]:
# Number of rows in train data
train_data.count()

548387

In [11]:
# Number of rows in test data
test_data.count()

235974

In [12]:
# Importing the RandomForestClassifier
from pyspark.ml.classification import RandomForestClassifier

In [13]:
# Specifying the RF model
rf= RandomForestClassifier(featuresCol='features',labelCol='label')

In [14]:
# Importing all the required libraries
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Numpy will help add steps in ParamGridBuilder
import numpy as np

#### Model Fitting

In [15]:
# Building the model with hyperparameter tuning
# Create ParamGrid for Cross Validation
# np.linspace will help in taking multiple values within the specified range
from pyspark.ml.tuning import ParamGridBuilder

rf_paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [int(x) for x in np.linspace(start = 10, stop = 50, num = 3)]) \
    .addGrid(rf.maxDepth, [int(x) for x in np.linspace(start = 10, stop = 20, num = 3)]) \
    .build()


In [16]:
# Model evaluation parameters
# Default metric - area under ROC
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",labelCol="label")

In [17]:
# Create 3-fold CrossValidator
rf_cv = CrossValidator(estimator = rf,
                      estimatorParamMaps = rf_paramGrid,
                      evaluator = evaluator,
                      numFolds = 3)

In [18]:
# Fitting the RF model on train data
cv_Model = rf_cv.fit(train_data)

In [19]:
# Number of trees in the best model
cv_Model.bestModel.explainParam('numTrees')

'numTrees: Number of trees to train (>= 1) (default: 20, current: 50)'

In [20]:
# Levels in the best model
cv_Model.bestModel.explainParam('maxDepth')

'maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 5, current: 20)'

In [21]:
# Transforming the train data
predictions_train = cv_Model.transform(train_data)

#### Model Analysis

Required Steps:
- Fit on test data
- Performance analysis
    - Appropriate Metric with reasoning

In [22]:
# Transforming the test data
predictions_test = cv_Model.transform(test_data)

In [23]:
# cvModel uses the best model found from the Cross Validation
print('Area under ROC for training set:', evaluator.evaluate(predictions_train))
print('Area under ROC for test set:', evaluator.evaluate(predictions_test))

Area under ROC for training set: 0.8189397967747892
Area under ROC for test set: 0.7833135775004997


In [24]:
# Distribution of label values
predictions_test.groupby("label").count().show()

+-----+------+
|label| count|
+-----+------+
|    1|146232|
|    0| 89742|
+-----+------+



In [25]:
# Distribution of predicted values
predictions_test.groupby("prediction").count().show()

+----------+------+
|prediction| count|
+----------+------+
|       0.0| 46572|
|       1.0|189402|
+----------+------+



In [26]:
# Confusion Matrix
from pyspark.sql.types import FloatType
from pyspark.mllib.evaluation import MulticlassMetrics
import pyspark.sql.functions as F

preds_and_labels=predictions_test.select(['prediction','label']).withColumn('label',F.col('label').cast(FloatType())).orderBy('prediction')

preds_and_labels=preds_and_labels.select(['prediction','label'])
metrics=MulticlassMetrics(preds_and_labels.rdd.map(tuple))

print(metrics.confusionMatrix().toArray())

[[ 35094.  54648.]
 [ 11478. 134754.]]


In [33]:
# accuracy score
accuracy=(35094+134754)/(35094+54648+11478+134754)
print("Accuarcy Score =",accuracy)

Accuarcy Score = 0.7197742124132319


In [34]:
# Precision value
precision=(134754)/(134754+54648)
print("Precision =",precision)

Precision = 0.7114708398010581


In [35]:
# Recall value
recall=(134754)/(134754+11478)
print("Recall =",recall)

Recall = 0.9215082881995733


In [36]:
# Fscore
fscore=2 * ((precision*recall) / (precision+recall))
print("F Score =",fscore)

F Score = 0.8029818194819357


#### Summary of the best Random Forest model

**Random Forest** model gives the best performance out of all three. 
<br>Eventhough decision tree model gave a recall of about 0.85, this one gives a better recall of **0.92** and all other metrics do come up as better with comparision to other two models.

## Task 4: Model Inference

- Feature Importance
- Model Inference
- Feature exploration

Best performed model: **Random Forest**

Decision Tree and Random forest models performed better than the logistic regression. The evaluation metrics reflect the same. If a choice has to be mad between random forest and decision trees, we can go with random forest.The model performed well on both training and test sets. The area under the ROC curve shows that.

Also the **recall** of the Random Forest model is by far the best value. 

In [37]:
x=cv_Model.bestModel

In [38]:
# Feature Importance
imp_features = x.featureImportances

In [39]:
# Defining a function to extract features along with the feature importance score
import pandas as pd
def ExtractFeatureImp(featureImp,dataset,featuresCol):
    list_extract=[]
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract=list_extract+dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist=pd.DataFrame(list_extract)
    varlist['score']=varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score',ascending=False))

In [45]:
# Printing the feature importance scores
ExtractFeatureImp(cv_Model.bestModel.featureImportances,predictions_test,"features").head(10)

Unnamed: 0,idx,name,score
1,70,user_product,0.479572
5,74,user_activity_count,0.10922
2,71,user_category_2,0.098883
4,73,user_session_count,0.05924
0,69,price,0.042779
3,72,user_mean_spend,0.04194
26,20,category_2_enc_smartphone,0.018303
9,3,brand_enc_xiaomi,0.010335
6,0,brand_enc_samsung,0.009834
72,66,hour_bin_enc_3,0.009604


Best features are mentioned and explored.