In [1]:
#%%time
#Create the Dask scheduler and worker remote containers.
#This takes at least one minute because delays are added on purpose to let the containers spawn.
#On a successful run you should see the scheduler URL printed.

#!python3 daskmaster.py

In [2]:
# Module-level scheduler URL shared by the launch/stop/test helpers.
# The empty string means "no scheduler launched": dask_distributed_launch()
# checks for it before launching, and dask_stop_workers() resets to it.
daskschurl = ""

In [3]:
import cdsw
import os
import time

def dask_distributed_launch(nworkers=1,ncpu=2,nmemory=4):
    """Launch a Dask scheduler plus workers as CDSW worker engines.

    If the global ``daskschurl`` already holds a URL, nothing is launched;
    a notice is printed and the stored URL is returned.

    Otherwise one scheduler engine is started (always with cpu=2, memory=4,
    running ``daskschedular.py``), its IP address is looked up through
    ``cdsw.list_workers()``, and ``nworkers`` worker engines are started one
    at a time running ``daskworker.py`` with the scheduler URL passed in the
    ``DASKSCHURL`` environment variable.

    Args:
        nworkers: number of Dask worker engines to launch; a value <= 0
            launches the scheduler only.
        ncpu: ``cpu`` value passed to ``cdsw.launch_workers`` per worker.
        nmemory: ``memory`` value passed to ``cdsw.launch_workers`` per worker.

    Returns:
        The scheduler URL, ``"tcp://<scheduler ip>:8786"``. The same value is
        stored in the global ``daskschurl``.

    Raises:
        RuntimeError: if CDSW returns no scheduler engine, or the scheduler's
            IP address is still unavailable after the wait and retries. The
            global ``daskschurl`` is left empty in that case; call
            dask_stop_workers() to clean up before retrying.
    """
    # modify global copy
    global daskschurl

    # check if already running
    if daskschurl != "":
        print(" Dask Scheduler Already Launched " + daskschurl)
        return daskschurl

    # Seconds to wait before the first lookup, then between re-checks.
    ip_initial_wait = 30
    ip_retry_wait = 5
    ip_max_retries = 6

    # Launch CDSW workers. These are engines that will run in
    # the same project, execute a given code or script, and exit.
    # Scheduler engine will keep running in background until session is closed
    dask_scheduler = cdsw.launch_workers(n=1, cpu=2, memory=4,
                                         kernel="python3", script="daskschedular.py")
    if not dask_scheduler:
        raise RuntimeError("CDSW did not return a launched Dask scheduler engine")
    schedulerid = dask_scheduler[0]["id"]

    # IP of launched container comes up unknown for a while.
    # Wait so the IP is available in the data structure, then re-check a
    # bounded number of times instead of assuming a single wait is enough.
    time.sleep(ip_initial_wait)
    schedulerip = None
    for attempt in range(ip_max_retries + 1):
        for engine in cdsw.list_workers():
            if engine["id"] == schedulerid:
                schedulerip = engine["ip_address"]
        if schedulerip:
            break
        if attempt < ip_max_retries:
            time.sleep(ip_retry_wait)

    if not schedulerip:
        raise RuntimeError(
            "Could not determine the IP address of Dask scheduler engine "
            + str(schedulerid)
            + "; call dask_stop_workers() before retrying"
        )

    print(" Scheduler IP: " + schedulerip)

    # Scheduler protocol and port - defaults from Dask
    schproto = "tcp://"
    schport = ":8786"

    schloc = schproto + schedulerip + schport
    print(" Scheduler URL: " + schloc)

    # Launch the Dask workers one engine at a time.
    for _ in range(nworkers):
        cdsw.launch_workers(n=1, cpu=ncpu, memory=nmemory,
                            kernel="python3", script="daskworker.py",
                            env={"DASKSCHURL": schloc})

        # wait for a while until the container is launched successfully
        time.sleep(10)

    # Record the scheduler URL only after everything above succeeded.
    daskschurl = schloc

    # return scheduler URL
    return schloc


def dask_stop_workers():
    """Stop the CDSW worker engines and forget the scheduler URL.

    Calls ``cdsw.stop_workers()`` with no arguments and then resets the
    global ``daskschurl`` to the empty string, so that a later
    dask_distributed_launch() call launches a new scheduler.
    """
    global daskschurl
    cdsw.stop_workers()
    # Reached only when the stop call above did not raise.
    daskschurl = ""
    

In [4]:
def dask_test():
    """Smoke-test the distributed Dask cluster with a random-array reduction.

    Connects a ``dask.distributed.Client`` to the URL in the global
    ``daskschurl``, builds a 40000 x 40000 random array in 1000 x 1000
    chunks, sums ``exp(x)`` over it and prints the result.

    Raises:
        RuntimeError: if ``daskschurl`` is empty, i.e. no scheduler has been
            launched (or it has been stopped) in this session.
    """
    # Fail early with a clear message instead of handing "" to Client.
    if not daskschurl:
        raise RuntimeError(
            "No Dask scheduler URL is set; call dask_distributed_launch() first"
        )

    # Imported lazily so the notebook's launch helpers work without dask loaded.
    from dask.distributed import Client
    import dask.array as da

    # NOTE(review): the client is intentionally left open; later cells appear
    # to rely on it remaining registered for their .compute() calls - confirm.
    client = Client(daskschurl)
    x = da.random.random((40000,40000),chunks=(1000,1000))
    y = da.exp(x).sum()
    print(" Result of DASK distributed array test: " + str(y.compute()) + "\n")


In [5]:
# Stop any previously launched Dask containers and reset daskschurl to "",
# so the launch cell that follows starts a fresh scheduler.
dask_stop_workers()

In [6]:
# Launch the scheduler and worker container(s).
# Arguments are (nworkers, ncpu, nmemory): here 1 worker with cpu=4, memory=8.
# To relaunch, first call dask_stop_workers() and then call this again;
# otherwise the already-stored scheduler URL is returned and nothing new starts.
dask_distributed_launch(1,4,8)

 Scheduler IP: 10.10.7.245
 Scheduler URL: tcp://10.10.7.245:8786


'tcp://10.10.7.245:8786'

In [7]:
# Check that the global variable holds the scheduler URL set by the launch call.
# dask_test() passes this value to Client() to register a client.
print(daskschurl)

tcp://10.10.7.245:8786


In [8]:
import pandas as pd
import cdsw
# Previously stopped pods may still be listed (e.g. with status "failed");
# only the ones whose status is "running" are shown below.
workers_list = cdsw.list_workers()
print(" === List of launched running pods === ")
print(" ===      (scheduler + workers)    === ")

# Collect the relevant fields, one row per pod, in a single pass.
workersl = [
    [pod["id"], pod["ip_address"], pod["cpu"], pod["memory"], pod["status"]]
    for pod in workers_list
]

workersactive = pd.DataFrame(workersl,columns=["Id","IP Address","CPUs","Memory","Status"])
print(workersactive[workersactive.Status=="running"])

 === List of launched running pods === 
 ===      (scheduler + workers)    === 
                 Id   IP Address  CPUs  Memory   Status
0  ocphcw9dh377rj4h  10.10.16.13     4       8  running
1  14q2ncjinpwm6khn  10.10.7.245     2       4  running


In [9]:
%%time
# Check that distributed Dask is working.
# dask_test() connects to daskschurl, so the launch cell must have run first.
dask_test()

# Inline equivalent of dask_test(), kept commented out for reference:
#from dask.distributed import Client
#client = Client(daskschurl)
#import dask.array as da
#x = da.random.random((40000,40000),chunks=(1000,1000))
#y = da.exp(x).sum()
#print("DASK test result: ") 
#print(y.compute())

 Result of DASK distributed array test: 2749263366.92

CPU times: user 6.35 s, sys: 48.4 ms, total: 6.4 s
Wall time: 12.3 s


In [27]:
%%time
#Read the loans file from S3.
#Configure access by running `aws configure` in a terminal;
#this needs `pip3 install awscli` to work.

#Dask dataframe is poor at inferring column types,
#so the problematic columns need to be fixed manually (see dtype= below).
#Options can be modified in ~/.config/dask/distributed.yaml


import dask.dataframe as dd
import s3fs
# Lazily define the dataframe over the S3 CSV. The three columns listed in
# dtype are pinned to object rather than left to dask's type inference.
df = dd.read_csv(
    "s3://harshalpatil-s3/loans_accepted_2007_to_2018Q4.csv",
    blocksize="25MB",
    sample=10000000,
    dtype={
        'id': 'object',
        'sec_app_earliest_cr_line': 'object',
        'desc': 'object',
    },
)

CPU times: user 516 ms, sys: 69.6 ms, total: 585 ms
Wall time: 3.55 s


In [28]:
# Check how many partitions the dataframe has
# (read_csv above was called with blocksize="25MB").

# Experiments left commented out:
#df.known_divisions
#df.set_index("grade")
#df = df.repartition(npartitions=10000)
df.npartitions


68

In [45]:
# Count non-null values per column; .compute() triggers the actual read of
# the CSV. Relies on `pd` having been imported by an earlier cell.
nonnullcnt=pd.DataFrame(df.count().compute())

In [47]:
# Render the per-column non-null counts as an HTML table.
from IPython.display import display, HTML
display(HTML(nonnullcnt.to_html()))

Unnamed: 0,0
id,2260701
member_id,0
loan_amnt,2260668
funded_amnt,2260668
funded_amnt_inv,2260668
term,2260668
int_rate,2260668
installment,2260668
grade,2260668
sub_grade,2260668


In [29]:
# Preview the first rows of the loans dataframe.
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


In [31]:
# member_id has 0 non-null values in the count table above, so drop it.
# NOTE(review): this rebinds df, so re-running the cell will presumably fail
# because the column is already gone - confirm.
df=df.drop(columns=["member_id"])

In [32]:
# Summary statistics for the dataframe; .compute() runs the calculation.
df.describe().compute()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,fico_range_low,fico_range_high,...,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,settlement_amount,settlement_percentage,settlement_term
count,2260668.0,2260668.0,2260668.0,2260668.0,2260668.0,2260664.0,2258957.0,2260639.0,2260668.0,2260668.0,...,10917.0,10917.0,10917.0,10917.0,8651.0,10917.0,10917.0,34246.0,34246.0,34246.0
mean,15046.93,15041.66,15023.44,13.09283,445.8068,77992.43,18.8242,0.3068792,698.5882,702.5884,...,3.0,155.045981,3.0,13.743886,454.798089,11636.883942,193.994321,5010.664267,47.780365,13.191322
std,9190.245,9188.413,9192.332,4.832138,267.1735,112696.2,14.18333,0.8672303,33.01038,33.01124,...,0.0,129.040594,0.0,9.671178,375.3855,7625.988281,198.629496,3693.12259,7.311822,8.15998
min,500.0,500.0,0.0,5.31,4.93,0.0,-1.0,0.0,610.0,614.0,...,3.0,0.64,3.0,0.0,1.92,55.73,0.01,44.21,0.2,0.0
25%,9000.0,9000.0,9000.0,11.47,293.905,50000.0,12.88,0.0,685.0,689.0,...,3.0,84.145,3.0,10.5,245.61,7356.245,100.4425,3235.13,45.0,12.0
50%,15000.0,15000.0,15000.0,14.33,439.97,70000.0,19.05,0.0,705.0,709.0,...,3.0,154.71,3.0,17.0,457.47,12819.92,197.95,5443.0,50.0,18.0
75%,30000.0,30000.0,30000.0,17.77,697.9,122750.0,26.04,1.0,735.0,739.0,...,3.0,402.07,3.0,24.0,1213.83,29664.02,1045.41,13610.0,65.06,24.0
max,40000.0,40000.0,40000.0,30.99,1719.83,110000000.0,999.0,58.0,845.0,850.0,...,3.0,943.94,3.0,37.0,2680.89,40306.41,1407.86,33601.0,521.35,181.0


In [17]:
# Dask-ML XGBoost demo on the scikit-learn wine dataset.
# Steps: load data -> dask dataframes -> train/test split -> fit -> inspect
# feature importances -> predict -> accuracy.
from sklearn.datasets import load_wine
data = load_wine()

from dask import dataframe as dd
# data['data'] is a numpy array, so dd.from_array is used instead of
# dd.from_pandas(data['data']).
# NOTE: this rebinds `df`, which earlier cells use for the S3 loans dataframe.
df = dd.from_array(data['data'])
df.columns = data['feature_names']

# Print the first few rows of the features.
print("\n Dataframe: ")
print(df.head())

# Build the target variable.
# NOTE(review): if data['target'] is 1-D, from_array presumably yields a
# Series, in which case assigning .columns may not rename anything - confirm.
dt = dd.from_array(data['target'])
dt.columns = ["target"]

# Print a sample of the target classes.
print("\n Target: ")
print(dt.head())

# Train and test split (fixed random_state so the split is repeatable).
from dask_ml.model_selection import train_test_split
train, test, train_labels, test_labels = train_test_split(df,dt,random_state=123)

# XGBoost classifier from dask-ml, default parameters.
from dask_ml.xgboost import XGBClassifier
est = XGBClassifier()

# Fit the model on the training split.
model = est.fit(train, train_labels)

# Which features contribute most: one importance value per feature name.
import pandas as pd
featureimp = pd.DataFrame(model.feature_importances_)
featureimp.columns = ['classifier_feature_importance']
featureimp["variable"] = data['feature_names']
print("\n\n === Xgboost Classifier Feature Importance: === ")
print(featureimp.sort_values(by="classifier_feature_importance", ascending=False))
#featureimp.to_csv()


# Predictions on the test split.
ypred = model.predict(test)

# Show the first five predictions.
print("\n Sample initial five predictions: ")
print(ypred[[0,1,2,3,4]].compute())

# Ensure the model predicts classes other than 0.
print("\n Check classes other than zero predicted: ")
print(ypred[ypred>0].compute())

# Check accuracy on the test set.
# Note: model.predict(test) is called again here rather than reusing ypred.
from dask_ml import metrics
print("\n\n Model Accuracy: ")
print(metrics.accuracy_score(test_labels,model.predict(test)))

print("\n === End Dask Xgboost === \n")



 Dataframe: 
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  
0    