In [1]:
# Starter code
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
from pyspark.sql.functions import isnan, count,lit, when, col, desc, udf, col, sort_array, asc, avg, lag
from pyspark.sql.window import Window
from pyspark.sql.functions import sum as Fsum
# Create (or reuse) the Spark session this notebook works with
spark = (
    SparkSession.builder
    .appName("Sparkify")
    .getOrCreate()
)

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1557090982442_0001,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# Load the complete Sparkify event log and look at its first record
event_data = "s3n://udacity-dsnd/sparkify/sparkify_event_data.json"
df = spark.read.format("json").load(event_data)
df.head()

VBox()

Row(artist=u'Popol Vuh', auth=u'Logged In', firstName=u'Shlok', gender=u'M', itemInSession=278, lastName=u'Johnson', length=524.32934, level=u'paid', location=u'Dallas-Fort Worth-Arlington, TX', method=u'PUT', page=u'NextSong', registration=1533734541000, sessionId=22683, song=u'Ich mache einen Spiegel - Dream Part 4', status=200, ts=1538352001000, userAgent=u'"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"', userId=u'1749042')

To automate the cleaning and transformation process, an ETL function was written; it is based on an investigation of the "mini" version of the dataset. The function below does the following:
1. Cleans dataframe to remove any NaN from "Gender" columns (by removing userid '1261737' who is the only user without information about gender)
2. Prepares timestamps and calculates time differences between user interactions with app
3. Creates a userid aggregated table with 7 features defined in the study of "mini" dataset

After the ETL function has been applied, the resulting data is stored in a .json file to avoid repeating the time-consuming ETL in the future. The "df_features" table is then read back from that .json file to speed up the ML section.

## ETL Pipeline

In [3]:
def ETL(df, long_gap_seconds=600000):
    '''
    Function that implements the ETL pipeline on the raw data and
    aggregates it into one row of features per user.

    Input:
        df (pyspark dataframe) - raw data with user logs; the columns
            userId, ts, registration, page and gender are read
        long_gap_seconds (number) - a gap between two consecutive events of
            a user is counted in 'diff_dates_session_week' when it is
            strictly larger than this value. Gaps are expressed in the unit
            of ts / 1000 (seconds, assuming ts holds epoch milliseconds);
            the default of 600000 is then a little under seven days.
    Output:
        df_features (pyspark dataframe) - user aggregated dataframe with the
            columns userId, number_songs, Churn, gender, reg_length,
            diff_dates_max, diff_dates_mean, diff_dates_session_week and
            Thumbs_Down. Only users with at least one 'NextSong' event are
            contained, because all other tables are left-joined onto the
            song counts.
    '''
    # Clean dataset: drop the user that has no gender information
    df = df.filter(df.userId != '1261737')

    # Prepare timestamps with native column arithmetic. A Python udf without
    # an explicit return type produces a string column, so the window
    # ordering and max() below would compare timestamps as text; plain
    # division keeps them numeric, lets nulls pass through instead of
    # failing inside a lambda, and avoids the Python round trip per row.
    df = df.withColumn('new_ts', F.col('ts') / 1000).drop('ts')
    df = df.withColumn('new_reg', F.col('registration') / 1000).drop('registration')

    # Calculate time difference between consecutive interactions of a user.
    # The first event of every user has no predecessor and gets a gap of 0.
    my_window = Window.partitionBy('userId').orderBy('new_ts')
    gap = F.col('new_ts') - F.lag('new_ts').over(my_window)
    df = df.withColumn('diff_dates', F.coalesce(gap, F.lit(0.0)))

    def _count_page(page, name):
        # Number of events of one page type per user, stored in column `name`
        return df.filter(df.page == page) \
            .groupby('userId') \
            .count() \
            .withColumnRenamed('count', name)

    # Feature engineering
    df_features = _count_page('NextSong', 'number_songs')
    df_users_cancelled = _count_page('Cancellation Confirmation', 'Churn')
    df_thumbs_down = _count_page('Thumbs Down', 'Thumbs')

    # gender encoded as 1 for 'M' and 0 for everything else
    is_male = F.when(F.col('gender') == 'M', 1).otherwise(0).cast(IntegerType())
    df_gender = df.dropDuplicates(['userId']) \
        .select('userId', is_male.alias('gender'))

    # span between the registration and the last recorded event of a user
    df_reg = df.groupby('userId') \
        .agg((F.max('new_ts') - F.max('new_reg')).alias('reg_length'))

    # longest and average gap between consecutive events of a user
    df_diff_dates = df.groupby('userId') \
        .agg(F.max('diff_dates').alias('diff_dates_max'),
             F.avg('diff_dates').alias('diff_dates_mean'))

    # number of gaps that exceed the configured threshold
    df_diff_dates_week = df.filter(df.diff_dates > long_gap_seconds) \
        .groupby('userId') \
        .count() \
        .withColumnRenamed('count', 'diff_dates_session_week')

    # Joins: every table is keyed by userId, so joining on the column name
    # keeps a single userId column and needs no rename/drop of helper keys
    for df_part in (df_users_cancelled, df_gender, df_thumbs_down,
                    df_reg, df_diff_dates, df_diff_dates_week):
        df_features = df_features.join(df_part, on='userId', how='left')

    # users without a matching row in a count table simply have zero events
    df_features = df_features.fillna(0, subset=['Churn','Thumbs','diff_dates_session_week'])

    # thumbs-down events relative to the number of songs played
    df_features = df_features.withColumn('Thumbs_Down', F.expr('Thumbs / number_songs')).drop('Thumbs')

    return df_features

VBox()

In [4]:
# Run the ETL pipeline on the raw event log to get the per-user feature table
df_features = ETL(df)

VBox()

In [5]:
# Persist the feature table so that the expensive ETL does not have to be repeated.
# 'overwrite' replaces an earlier export; with 'append' every re-run of this cell
# would add all users a second time to the files that are read back below.
path_wr = 's3n://vk1009bucket1/Spark_try1/df_features.json'
df_features.write.mode('overwrite').json(path_wr)

VBox()

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 42976)
----------------------------------------
Traceback (most recent call last):
  File "/usr/lib64/python2.7/SocketServer.py", line 293, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib64/python2.7/SocketServer.py", line 321, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib64/python2.7/SocketServer.py", line 334, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib64/python2.7/SocketServer.py", line 655, in __init__
    self.handle()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 266, in handle
    poll(authenticate_and_accum_updates)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 254, in authe

In [2]:
# Load the previously exported per-user feature table
path_wr = 's3n://vk1009bucket1/Spark_try1/df_features.json'
df_features = spark.read.format('json').load(path_wr)

VBox()

In [3]:
# Show the first row of the loaded feature table as a sanity check
df_features.head()

VBox()

Row(Churn=1, Thumbs_Down=0.016736401673640166, diff_dates_max=1686725.0, diff_dates_mean=6337.923076923077, diff_dates_session_week=1, gender=0, number_songs=239, reg_length=14986090.0, userId=u'1000353')

## Machine Learning

### Prepare for ML

##### Load Libraries

In [4]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import RegexTokenizer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np

VBox()

##### Data Transformation / TrainTest Split

In [5]:
# Define columns to use: everything except the user id and the label.
# The list is built fresh instead of calling .remove() on X.schema.names,
# which would modify the list owned by the schema object.
X = df_features.drop('userId')
X_cols = [name for name in X.schema.names if name != 'Churn']

# Vector Assembler
assembler = VectorAssembler(inputCols=X_cols, outputCol='features_vec')
X = assembler.transform(X)

# Train Test Split - done before scaling and with a fixed seed, so that the
# split and every score derived from it can be reproduced
(trainingData, testData) = X.randomSplit([0.7, 0.3], seed=42)

# Standard Scaler - fitted on the training rows only, so that no statistics
# of the test rows leak into the features the models are trained on
scaler = StandardScaler(inputCol='features_vec', outputCol='sc_features',
                        withStd=True, withMean=False)

ScalerModel = scaler.fit(trainingData)

trainingData = ScalerModel.transform(trainingData)
testData = ScalerModel.transform(testData)

# Scaled version of the full table; the robustness section re-splits X
X = ScalerModel.transform(X)

VBox()

### Search for Best ML Classifier

In order to find the best classifier 3 classifiers available in spark.ml library were tested: Logistic Regression, Random Forest Classifier and Gradient Boost Classifier.

Using Parameter Grid the hyperparameters of all above mentioned classifiers were tuned to achieve the best model. 

**Logistic Regression**: 
- maxIter: maximum number of iterations to run the optimization algorithm. Here I used the values of 5,10 and 20.
- regParam: regularisation term. Here I used the values 0,0.1,1,10. For regularisation parameters the 10 fold difference between neighbor values is a common choice.

**Random Forest** 
- numTrees: number of trees in the Random Forest classifier. I used values of 1,10 and 20. 1 corresponds to a Decision Tree. I didn't choose the value >20 in order to not slow down the program.
- maxDepth: maximum depth of a single tree in the forest. I used 2,5,10. The values were chosen based on the size of the features: 7. 

**GBT Classifier** 
- maxIter: maximum number of iterations to run the optimization algorithm. Here I used the values of 5 and 15.
- maxDepth: maximum depth of a single tree in the model. I used 2 and 4. The values were chosen based on the size of the features: 7. 

To evaluate the models, the F1 score was used. F1 represents a balance between precision and recall, and I cannot see a clear preference for either of them here. On the one hand, we want to identify as many customers at risk of churning as possible (which requires high recall); on the other hand, we do not want to offer discounts to too many customers who would have stayed anyway (which requires high precision). So the balance between precision and recall is important here.  
The F1 score is also more informative than accuracy when the classes are unbalanced, as they are here. Accuracy is therefore not the best choice.

##### Logistic Regression

In [6]:
# Initialize a model
lr = LogisticRegression(labelCol="Churn", featuresCol="sc_features")

# Define Parameter Grid (regParam is a float parameter, so floats are passed)
paramGrid_lr = ParamGridBuilder() \
    .addGrid(lr.maxIter,[5, 10, 20]) \
    .addGrid(lr.regParam,[0.0, 0.1, 1.0, 10.0]) \
    .build()

# One evaluator serves both the model selection and the final test score.
# NOTE(review): metricName="f1" of this evaluator is presumably the F1 score
# weighted over both classes, not the F1 of the churn class - confirm that
# this is the intended metric.
evaluator = MulticlassClassificationEvaluator(labelCol="Churn", predictionCol="prediction", metricName="f1")

# Define CV; the seed fixes the assignment of rows to folds, so the selected
# parameters do not change between runs on the same training data
crossval_lr = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid_lr,
                          evaluator=evaluator,
                          numFolds=3,
                          seed=42)
# Fit & predict
model_lr = crossval_lr.fit(trainingData)

pred_lr = model_lr.transform(testData)

# Evaluate
f1 = evaluator.evaluate(pred_lr)
print('The f1 score achieved using Logistic Regression after Hyperparameter Tuning is {}'.format(round(f1,2)))

VBox()

The f1 score achieved using Logistic Regression after Hyperparameter Tuning is 0.82

What are the best parameters for LR Classifier

In [20]:
# Read the winning hyperparameters from the cross-validated LR model
best_model_lr = model_lr.bestModel

best_max_iter, best_reg_param = (
    best_model_lr._java_obj.getMaxIter(),
    best_model_lr._java_obj.getRegParam(),
)
print(
    'Optimal parameters for LR Classifier: maxIter {}, regParam {}'.format(
        best_max_iter, best_reg_param
    )
)

VBox()

Optimal parameters for LR Classifier: maxIter 10, regParam 0.0

##### Random Forest

In [21]:
# Initialize a model
rf = RandomForestClassifier(labelCol="Churn", featuresCol="sc_features")

# Define Parameter Grid
paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.numTrees,[1, 10, 20]) \
    .addGrid(rf.maxDepth,[2, 5, 10]) \
    .build()

# One evaluator serves both the model selection and the final test score
evaluator = MulticlassClassificationEvaluator(labelCol="Churn", predictionCol="prediction", metricName="f1")

# Define CV; the seed fixes the assignment of rows to folds, so the selected
# parameters do not change between runs on the same training data
crossval_rf = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid_rf,
                          evaluator=evaluator,
                          numFolds=3,
                          seed=42)
# Fit & predict
model_rf = crossval_rf.fit(trainingData)

pred_rf = model_rf.transform(testData)

# Evaluate
f1 = evaluator.evaluate(pred_rf)
print('The f1 score achieved using Random Forest Classifier after Hyperparameter Tuning is {}'.format(round(f1,2)))

VBox()

The f1 score achieved using Random Forest Classifier after Hyperparameter Tuning is 0.86

What are the best tuning Parameters for RF Classifier:

In [14]:
# Read the winning hyperparameters from the cross-validated RF model
best_model_rf = model_rf.bestModel

best_num_trees, best_max_depth = (
    best_model_rf._java_obj.getNumTrees(),
    best_model_rf._java_obj.getMaxDepth(),
)
print(
    'Optimal parameters for RF Classifier: numTrees {}, maxDepth {}'.format(
        best_num_trees, best_max_depth
    )
)

VBox()

Optimal parameters for RF Classifier: numTrees 20, maxDepth 10

##### GBT Classifier

In [8]:
# Initialize a model
gbt = GBTClassifier(labelCol="Churn", featuresCol="sc_features")

# Define Parameter Grid
paramGrid_gbt = ParamGridBuilder() \
    .addGrid(gbt.maxIter,[5, 15]) \
    .addGrid(gbt.maxDepth,[2, 4]) \
    .build()

# One evaluator serves both the model selection and the final test score
evaluator = MulticlassClassificationEvaluator(labelCol="Churn", predictionCol="prediction", metricName="f1")

# Define CV; the seed fixes the assignment of rows to folds, so the selected
# parameters do not change between runs on the same training data
crossval_gbt = CrossValidator(estimator=gbt,
                          estimatorParamMaps=paramGrid_gbt,
                          evaluator=evaluator,
                          numFolds=3,
                          seed=42)
# Fit & predict
model_gbt = crossval_gbt.fit(trainingData)

pred_gbt = model_gbt.transform(testData)

# Evaluate
f1 = evaluator.evaluate(pred_gbt)
print('The f1 score achieved using GBT Classifier after Hyperparameter Tuning is {}'.format(round(f1,2)))

VBox()

Exception in thread cell_monitor-7:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/site-packages/awseditorssparkmonitoringwidget-1.0-py3.6.egg/awseditorssparkmonitoringwidget/cellmonitor.py", line 178, in cell_monitor
    job_binned_stages[job_id][stage_id] = all_stages[stage_id]
KeyError: 1958



The f1 score achieved using GBT Classifier after Hyperparameter Tuning is 0.86

What are the best parameters for GBT Classifier

In [13]:
# Read the winning hyperparameters from the cross-validated GBT model
best_model_gbt = model_gbt.bestModel

best_max_iter, best_max_depth = (
    best_model_gbt._java_obj.getMaxIter(),
    best_model_gbt._java_obj.getMaxDepth(),
)

print(
    'Optimal parameters for GBT Classifier: maxIter {}, maxDepth {}'.format(
        best_max_iter, best_max_depth
    )
)

VBox()

Optimal parameters for GBT Classifier: maxIter 15, maxDepth 4

##### Conclusion

In order to choose the best ML classification algorithm, three approaches were tested: Logistic Regression, Random Forest Classifier and GBT Classifier. Each algorithm was tuned via a grid search over at least two hyperparameters. The F1 score was used as the evaluation metric.

Result:
1. Random Forest (F1 = 0.86)
2. GBT Classifier (F1 = 0.86)
3. Logistic Regression (F1 = 0.82)

As expected, the ensemble methods achieved a clearly better F1 score than Logistic Regression. However, the difference between the two ensemble algorithms is tiny. Therefore, in the following parts both algorithms are investigated for robustness and feature importance.

### Model Robustness investigation

##### Random Forest Classifier

In this section the robustness of the best RF model is investigated. To test it, the data was randomly split into train and test sets 10 times, and each time a model with the best RF hyperparameters was trained and evaluated. As a result we get 10 F1 scores each for the train and the test data.

In [15]:
# Initialise lists for stat calculation
f1_list_train = []
f1_list_test = []

# The best hyperparameters, the estimator built from them and the evaluator
# are the same for every split, so they are set up once outside the loop.
# recall best RF Model from CV Model implemented above
best_model_rf = model_rf.bestModel

# get the best parameters for a model
best_num_trees = best_model_rf._java_obj.getNumTrees()
best_max_depth = best_model_rf._java_obj.getMaxDepth()

# Initialize a RF Model with the best parameters set
rf_best = RandomForestClassifier(labelCol="Churn", featuresCol="sc_features", numTrees=best_num_trees, maxDepth=best_max_depth)

# Initialize an evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Churn", predictionCol="prediction", metricName="f1")

# make 10 simulations
for i in range(10):

    # random split of train/test data; the run index is used as seed, which
    # gives a different split per run that is identical on every re-execution
    (trainingData, testData) = X.randomSplit([0.7, 0.3], seed=i)

    # Train a model
    model_fin = rf_best.fit(trainingData)

    # Calculate f1 for train dataset
    pred_rf_train = model_fin.transform(trainingData)
    f1_train = evaluator.evaluate(pred_rf_train)

    # Calculate f1 for test dataset
    pred_rf_test = model_fin.transform(testData)
    f1_test = evaluator.evaluate(pred_rf_test)

    # Append the f1 scores to corresponding lists
    f1_list_train.append(f1_train)
    f1_list_test.append(f1_test)

    print('Dataset Split {}: Train f1 score {}, Test f1 score {}'.format(i+1,round(f1_train,5), round(f1_test,5)))

VBox()

Dataset Split 1: Train f1 score 0.8991, Test f1 score 0.86112
Dataset Split 2: Train f1 score 0.89564, Test f1 score 0.8654
Dataset Split 3: Train f1 score 0.89961, Test f1 score 0.85693
Dataset Split 4: Train f1 score 0.89709, Test f1 score 0.8611
Dataset Split 5: Train f1 score 0.90084, Test f1 score 0.85926
Dataset Split 6: Train f1 score 0.89829, Test f1 score 0.8633
Dataset Split 7: Train f1 score 0.90139, Test f1 score 0.86606
Dataset Split 8: Train f1 score 0.89626, Test f1 score 0.86567
Dataset Split 9: Train f1 score 0.8971, Test f1 score 0.8597
Dataset Split 10: Train f1 score 0.89775, Test f1 score 0.8659

Calculate some statistics

In [26]:
# Mean and standard deviation of the collected F1 scores, rounded to 3 digits
train_mean, train_std = (round(stat(f1_list_train), 3) for stat in (np.mean, np.std))
test_mean, test_std = (round(stat(f1_list_test), 3) for stat in (np.mean, np.std))

summary = 'Results after 10 simulations: Train Mean {}, Train Std {}; Test Mean {}, Test Std {}'
print(summary.format(train_mean, train_std, test_mean, test_std))

VBox()

Results after 10 simulations: Train Mean 0.898, Train Std 0.002; Test Mean 0.862, Test Std 0.003

As we can see, both the train and the test F1 scores remain very stable over the 10 random splits. Thus the model is robust. The difference of 0.036 between the train and test scores is, in my view, acceptable. Therefore I conclude that this model is not overfitted.

##### GBT Classifier

In this section the robustness of the best GBT model is investigated. To test it, the data was randomly split into train and test sets 10 times, and each time a model with the best GBT hyperparameters was trained and evaluated. As a result we get 10 F1 scores each for the train and the test data.

In [7]:
# Initialise lists for stat calculation
f1_list_train = []
f1_list_test = []

# The best hyperparameters, the estimator built from them and the evaluator
# are the same for every split, so they are set up once outside the loop.
# recall best GBT Model from CV Model implemented above
best_model_gbt = model_gbt.bestModel

# get the best parameters for a model
best_max_iter = best_model_gbt._java_obj.getMaxIter()
best_max_depth = best_model_gbt._java_obj.getMaxDepth()

# Initialize a GBT Model with the best parameters set
gbt_best = GBTClassifier(labelCol="Churn", featuresCol="sc_features", maxIter = best_max_iter, maxDepth=best_max_depth)

# Initialize an evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Churn", predictionCol="prediction", metricName="f1")

# make 10 simulations
for i in range(10):

    # random split of train/test data; the run index is used as seed, which
    # gives a different split per run that is identical on every re-execution
    (trainingData, testData) = X.randomSplit([0.7, 0.3], seed=i)

    # Train a model
    model_fin_gbt = gbt_best.fit(trainingData)

    # Calculate f1 for train dataset
    pred_gbt_train = model_fin_gbt.transform(trainingData)
    f1_train = evaluator.evaluate(pred_gbt_train)

    # Calculate f1 for test dataset
    pred_gbt_test = model_fin_gbt.transform(testData)
    f1_test = evaluator.evaluate(pred_gbt_test)

    # Append the f1 scores to corresponding lists
    f1_list_train.append(f1_train)
    f1_list_test.append(f1_test)

    print('Dataset Split {}: Train f1 score {}, Test f1 score {}'.format(i+1,round(f1_train,5), round(f1_test,5)))

VBox()

Dataset Split 1: Train f1 score 0.85686, Test f1 score 0.85263
Dataset Split 2: Train f1 score 0.86212, Test f1 score 0.85002
Dataset Split 3: Train f1 score 0.85956, Test f1 score 0.8535
Dataset Split 4: Train f1 score 0.86038, Test f1 score 0.86126
Dataset Split 5: Train f1 score 0.85902, Test f1 score 0.85804
Dataset Split 6: Train f1 score 0.85939, Test f1 score 0.85436
Dataset Split 7: Train f1 score 0.86245, Test f1 score 0.85262
Dataset Split 8: Train f1 score 0.85329, Test f1 score 0.84343
Dataset Split 9: Train f1 score 0.86521, Test f1 score 0.852
Dataset Split 10: Train f1 score 0.85876, Test f1 score 0.84737

Calculate some statistics

In [8]:
# Mean and standard deviation of the collected F1 scores, rounded to 3 digits
train_mean, train_std = (round(stat(f1_list_train), 3) for stat in (np.mean, np.std))
test_mean, test_std = (round(stat(f1_list_test), 3) for stat in (np.mean, np.std))

summary = 'Results after 10 simulations: Train Mean {}, Train Std {}; Test Mean {}, Test Std {}'
print(summary.format(train_mean, train_std, test_mean, test_std))

VBox()

Results after 10 simulations: Train Mean 0.86, Train Std 0.003; Test Mean 0.853, Test Std 0.005

As we can see, both the train and the test F1 scores remain very stable over the 10 random splits. Thus the model is robust. The difference of 0.007 between the train and test scores is tiny. Therefore I conclude that this model is not overfitted.

### Get feature Importance from RF Classifier

In [14]:
# Refit a Random Forest with the cross-validated hyperparameters so that its
# feature importances can be inspected
best_model_rf = model_rf.bestModel
best_num_trees = best_model_rf._java_obj.getNumTrees()
best_max_depth = best_model_rf._java_obj.getMaxDepth()

rf_best = RandomForestClassifier(
    labelCol="Churn",
    featuresCol="sc_features",
    numTrees=best_num_trees,
    maxDepth=best_max_depth,
)

# training uses whatever split `trainingData` currently holds
model_fin = rf_best.fit(trainingData)

VBox()

In [15]:
# Importance of each feature in the refitted Random Forest, indexed by position in X_cols
model_fin.featureImportances

VBox()

SparseVector(7, {0: 0.0836, 1: 0.2483, 2: 0.1167, 3: 0.1083, 4: 0.0129, 5: 0.177, 6: 0.2532})

In [16]:
# Feature names in the order they were assembled, to map the importance indices above to columns
X_cols

VBox()

['Thumbs_Down', 'diff_dates_max', 'diff_dates_mean', 'diff_dates_session_week', 'gender', 'number_songs', 'reg_length']

### Get feature Importance from GBT Classifier

In [17]:
# Refit a GBT classifier with the cross-validated hyperparameters so that its
# feature importances can be inspected
best_model_gbt = model_gbt.bestModel
best_max_iter = best_model_gbt._java_obj.getMaxIter()
best_max_depth = best_model_gbt._java_obj.getMaxDepth()

gbt_best = GBTClassifier(
    labelCol="Churn",
    featuresCol="sc_features",
    maxIter=best_max_iter,
    maxDepth=best_max_depth,
)

# training uses whatever split `trainingData` currently holds
model_fin_gbt = gbt_best.fit(trainingData)

VBox()

In [18]:
# Importance of each feature in the refitted GBT model, indexed by position in X_cols
model_fin_gbt.featureImportances

VBox()

SparseVector(7, {0: 0.1041, 1: 0.2062, 2: 0.2697, 3: 0.0375, 4: 0.0011, 5: 0.2676, 6: 0.1138})

In [19]:
# Feature names in the order they were assembled, to map the importance indices above to columns
X_cols

VBox()

['Thumbs_Down', 'diff_dates_max', 'diff_dates_mean', 'diff_dates_session_week', 'gender', 'number_songs', 'reg_length']

##### Conclusion

In order to investigate the feature importances, the best sets of hyperparameters from both the RF and the GBT cross-validation were used to fit the final versions of the models.

Result

Random Forest Classifier's top features:
1. 'reg_length' 0.253
2. 'diff_dates_max' 0.248
3. 'number_songs' 0.177

GBT Classifier's top features:
1. 'diff_dates_mean' 0.27
2. 'number_songs' 0.268
3. 'diff_dates_max' 0.206

So both classifiers agree that the number of songs a user listened to and the time differences between a user's interactions with the app are among the most influential features for whether the user stays or cancels the subscription.