In [1]:
#%%time
#Create the Dask scheduler and worker remote containers.
#This takes at least one minute: delays are added on purpose to give the containers time to spawn.
#On a successful run the scheduler URL is printed.

#!python3 daskmaster.py

In [2]:
# Module-level cache for the Dask scheduler URL. The empty string means no
# scheduler has been launched yet; dask_distributed_launch() fills it in and
# dask_stop_workers() resets it.
# NOTE(review): re-running this cell clears the cached URL even if workers are
# still running — confirm that is intended.
daskschurl = ""

In [3]:
import cdsw
import os
import time

def dask_distributed_launch(nworkers=1):
    """Launch a Dask scheduler and worker engines on CDSW and return the scheduler URL.

    The URL is cached in the module-level ``daskschurl``. When that cache is
    already non-empty nothing is launched and the cached URL is returned.

    Args:
        nworkers: Number of Dask worker engines to launch (default 1).

    Returns:
        The scheduler URL as a string, e.g. ``tcp://<ip>:8786``.

    Raises:
        RuntimeError: If no scheduler engine was launched, or if its IP address
            cannot be found in ``cdsw.list_workers()`` after the wait. The
            cached URL is left untouched in that case.
    """
    # modify global copy
    global daskschurl

    # Already launched from this session: reuse the cached URL.
    if daskschurl != "":
        print(" Dask Scheduler Already Launched " + daskschurl)
        return daskschurl

    # Launch CDSW workers. These are engines that will run in
    # the same project, execute a given code or script, and exit.
    # Scheduler engine will keep running in background until session is closed
    dask_scheduler = cdsw.launch_workers(n=1, cpu=2, memory=4,
                                         kernel="python3", script="daskschedular.py")
    if not dask_scheduler:
        raise RuntimeError("cdsw.launch_workers did not return a scheduler engine")

    # IP of launched container comes up unknown for a while.
    # Wait for a while so IP is available in data structure.
    time.sleep(30)

    # Look up the scheduler IP by matching the launched engine id.
    schedulerid = dask_scheduler[0]["id"]
    schedulerip = None
    for worker in cdsw.list_workers():
        if worker["id"] == schedulerid:
            schedulerip = worker["ip_address"]
            break

    # Fail loudly instead of hitting an unbound variable or caching a URL
    # that has no host part.
    if not schedulerip:
        raise RuntimeError(
            "Could not determine the IP address of Dask scheduler engine "
            + str(schedulerid)
        )

    print(" Scheduler IP: " + schedulerip)

    # Scheduler protocol and port - defaults from Dask
    schproto = "tcp://"
    schport = ":8786"

    schloc = schproto + schedulerip + schport
    print(" Scheduler URL: " + schloc)

    # Launch the Dask worker engines; each receives the scheduler URL through
    # the DASKSCHURL environment variable.
    cdsw.launch_workers(n=nworkers, cpu=2, memory=4,
                        kernel="python3", script="daskworker.py",
                        env={"DASKSCHURL": schloc})

    # wait for a while until the container is launched successfully
    time.sleep(30)

    # Cache the URL only once everything above has succeeded.
    daskschurl = schloc

    # return scheduler URL
    return schloc


def dask_stop_workers():
    """Stop the CDSW workers and clear the cached Dask scheduler URL.

    Calls ``cdsw.stop_workers()`` and then resets the module-level
    ``daskschurl`` to the empty string, so that a later
    ``dask_distributed_launch()`` starts fresh engines.
    """
    global daskschurl

    # Stop first; the cache is only cleared if this call returns normally.
    cdsw.stop_workers()

    # An empty URL marks "nothing launched".
    daskschurl = ""
    

In [4]:
def dask_test():
    """Run a small Dask array computation and print its result.

    Builds a 40000 x 40000 random array in 1000 x 1000 chunks, sums
    ``exp(x)`` over it, and prints the computed value.
    """
    import dask.array as da
    x = da.random.random((40000, 40000), chunks=(1000, 1000))
    y = da.exp(x).sum()
    # compute() returns a number, so convert it before joining it to the
    # message; plain str + number raises TypeError.
    print(" Result of DASK distributed array test " + str(y.compute()))


In [5]:
#stop any previous dask distributed containers
#dask_stop_workers()

In [6]:
# Launch the scheduler and worker container(s); pass nworkers=<count> to
# request more than the default single worker.
# To relaunch, call dask_stop_workers() first and then run this cell again;
# otherwise the cached scheduler URL is simply returned.
dask_distributed_launch()

 Scheduler IP: 10.10.176.240
 Scheduler URL: tcp://10.10.176.240:8786


'tcp://10.10.176.240:8786'

In [7]:
# Check that the module-level daskschurl now holds the scheduler URL.
# It is used below to register a Dask client.
print(daskschurl)

tcp://10.10.176.240:8786


In [8]:
# List the CDSW worker engines for this session.
# NOTE(review): the saved output includes user and cluster details; consider
# clearing it before sharing the notebook.
cdsw.list_workers()

[{'adhoc': False,
  'app_url': 'https://291u8i1nqe04sndi.ml-94ebb181-c15.all-se-e.ylcu-atmi.cloudera.site/',
  'assets_cdn_root': 'https://consoles.ml-94ebb181-c15.all-se-e.ylcu-atmi.cloudera.site/0/388/291u8i1nqe04sndi/',
  'batch': True,
  'batch_run_id': None,
  'biller': {'html_url': 'https://ml-94ebb181-c15.all-se-e.ylcu-atmi.cloudera.site/harshal',
   'id': 52,
   'name': 'Harshal Sharadchandra Patil',
   'url': 'https://ml-94ebb181-c15.all-se-e.ylcu-atmi.cloudera.site/api/v1/users/harshal',
   'username': 'harshal'},
  'cluster_id': 1,
  'cost': 0.0024477833333333334,
  'cpu': 2,
  'created_at': '2020-01-09T12:34:54.412Z',
  'creator': {'html_url': 'https://ml-94ebb181-c15.all-se-e.ylcu-atmi.cloudera.site/harshal',
   'id': 52,
   'name': 'Harshal Sharadchandra Patil',
   'url': 'https://ml-94ebb181-c15.all-se-e.ylcu-atmi.cloudera.site/api/v1/users/harshal',
   'username': 'harshal'},
  'deleted': False,
  'engine_image': {'description': 'daskcdsw',
   'id': 21,
   'repository':

In [8]:
%%time
# Register a Dask client against the launched scheduler and run a smoke test:
# sum exp(x) over a 40000 x 40000 random array split into 1000 x 1000 chunks.
from dask.distributed import Client
client = Client(daskschurl)
import dask.array as da
x = da.random.random((40000,40000),chunks=(1000,1000))
y = da.exp(x).sum()
# Last expression of the cell: triggers the computation and displays the result.
y.compute()

CPU times: user 7.54 s, sys: 1.1 s, total: 8.64 s
Wall time: 12.9 s


In [9]:
# Load the wine dataset from scikit-learn.
from sklearn.datasets import load_wine
data = load_wine()

from dask import dataframe as dd
# data['data'] is array data rather than a pandas object, so build the Dask
# dataframe with dd.from_array instead of df = dd.from_pandas(data['data']).
# Dask has various ways to convert NumPy and pandas objects to Dask dataframes.
df = dd.from_array(data['data'])
df.columns = data['feature_names']

# Print the first few rows of the feature dataframe.
print("\n Dataframe: ")
print(df.head())

# Build the target variable.
# NOTE(review): data['target'] is presumably 1-D, in which case from_array may
# yield a Series and assigning .columns may have no effect — confirm.
dt = dd.from_array(data['target'])
dt.columns = ["target"]

# Print an example of the target classes.
print("\n Target: ")
print(dt.head())

# Train/test split with a fixed random_state.
from dask_ml.model_selection import train_test_split
train, test, train_labels, test_labels = train_test_split(df,dt,random_state=123)

# XGBoost classifier from dask_ml, with default parameters.
from dask_ml.xgboost import XGBClassifier
est = XGBClassifier()

# Fit the model on the training split.
model = est.fit(train, train_labels)

# Which features contribute most: tabulate the fitted feature importances
# next to the feature names and sort in descending order.
import pandas as pd
featureimp = pd.DataFrame(model.feature_importances_)
featureimp.columns = ['classifier_feature_importance']
featureimp["variable"] = data['feature_names']
print("\n\n === Xgboost Classifier Feature Importance: === ")
print(featureimp.sort_values(by="classifier_feature_importance", ascending=False))
#featureimp.to_csv()


# Predictions for the test split.
ypred = model.predict(test)

# Show the first five predictions.
print("\n Sample initial five predictions: ")
print(ypred[[0,1,2,3,4]].compute())

# Ensure the model predicts classes other than 0.
print("\n Check classes other than zero predicted: ")
print(ypred[ypred>0].compute())

# Accuracy on the test set; note that predict() is called again here rather
# than reusing ypred.
from dask_ml import metrics
print("\n\n Model Accuracy: ")
print(metrics.accuracy_score(test_labels,model.predict(test)))

print("\n === End Dask Xgboost === \n")



 Dataframe: 
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  
0    