In [1]:
'''!pip install dask[dataframe] --upgrade --quiet
!pip install dask-ml -- quiet
!pip install aiohttp
!pip install joblib
!pip install dask distributed -- upgrade'''

'!pip install dask[dataframe] --upgrade --quiet\n!pip install dask-ml -- quiet\n!pip install aiohttp\n!pip install joblib\n!pip install dask distributed -- upgrade'

In [13]:
#create a client for later ML use
from dask.distributed import Client, progress

client = Client(n_workers=4, threads_per_worker=2, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:34307  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 8.00 GB


In [2]:
#import libraries
import dask.dataframe as dd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import roc_auc_score
import joblib
from dask_ml.model_selection import train_test_split
import pandas as pd

In [3]:

#import data set
data = dd.read_csv('https://assets.datacamp.com/production/repositories/1765/datasets/ae888d00f9b36dd7d50a4afbc112761e2db766d2/turnover.csv')

In [4]:
data.head(10)

Unnamed: 0,satisfaction,evaluation,number_of_projects,average_montly_hours,time_spend_company,work_accident,churn,promotion,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
5,0.41,0.5,2,153,3,0,1,0,sales,low
6,0.1,0.77,6,247,4,0,1,0,sales,low
7,0.92,0.85,5,259,5,0,1,0,sales,low
8,0.89,1.0,5,224,5,0,1,0,sales,low
9,0.42,0.53,2,142,3,0,1,0,sales,low


In [5]:
#check for datatypes
data.dtypes

satisfaction            float64
evaluation              float64
number_of_projects        int64
average_montly_hours      int64
time_spend_company        int64
work_accident             int64
churn                     int64
promotion                 int64
department               object
salary                   object
dtype: object

In [6]:
#check for class balance
data.churn.value_counts().compute()

0    11428
1     3571
Name: churn, dtype: int64

Machine Learning Prep

In [22]:
# This is the feature set
X = data[['satisfaction', 'evaluation', 'number_of_projects', 'average_montly_hours', 'time_spend_company']]
# This is the target variable
y = data['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

Random Forest Model with Dask

In [23]:
rf_model = RandomForestClassifier()

with joblib.parallel_backend('dask'):
    scores = cross_validate(rf_model, X_train.compute(), y_train.compute(), cv=4)
    
scores

{'fit_time': array([4.5639782 , 5.34859324, 5.01502275, 5.18656492]),
 'score_time': array([0.35821724, 0.11410356, 0.13657904, 0.12441707]),
 'test_score': array([0.98703888, 0.9893617 , 0.99268617, 0.98670213])}

In [24]:
rf_params = {"max_depth": [2, 4, 8, 16]}

rf_model = RandomForestClassifier()

grid_search_rf = GridSearchCV(rf_model, 
                              param_grid=rf_params, 
                              return_train_score=True, 
                              cv=4, 
                              scoring='roc_auc')

In [25]:
with joblib.parallel_backend('dask'):
  grid_search_rf.fit(X_train.compute(), y_train.compute())

In [26]:
print("The best value is: ", grid_search_rf.best_params_)
print("The test AUC score is: ", grid_search_rf.score(X_test.compute(), y_test.compute()))

The best value is:  {'max_depth': 16}
The test AUC score is:  0.9935697412684278


Lets do anohter ML, Logistic Regression

In [27]:
from dask_ml.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(X_train.to_dask_array(lengths=True), y_train.to_dask_array(lengths=True))

LogisticRegression()

In [28]:
preds_train = lr.predict(X_train.to_dask_array(lengths=True))
preds_test = lr.predict(X_test.to_dask_array(lengths=True))

print("Training score is: ", roc_auc_score(y_train.compute(), preds_train))
print("Test score is: ", roc_auc_score(y_test.compute(), preds_test))

Training score is:  0.5933426546884122
Test score is:  0.5970466013653902


Lastly, I will do and XGBoost

In [29]:
!pip install dask_xgboost --quiet

In [30]:
from dask_ml.xgboost import XGBClassifier

xgb = XGBClassifier(n_estimators=100)

xgb.fit(X_train.to_dask_array(lengths=True), y_train.to_dask_array(lengths=True))

XGBClassifier()

In [31]:
preds_train = xgb.predict(X_train.to_dask_array(lengths=True))
preds_test = xgb.predict(X_test.to_dask_array(lengths=True))

print("Training score is: ", roc_auc_score(y_train.compute(), preds_train))
print("Test score is: ", roc_auc_score(y_test.compute(), preds_test))

Training score is:  0.958620678326841
Test score is:  0.954209953497576


Overall the Random Forest performed the best. 