# Getting Started with Dask

I installed Dask via conda :-)

In [None]:
# Import libraries
import numpy as np
import pandas as pd

import dask.dataframe as dd
import dask.array as da
import dask
from dask.distributed import Client

In [None]:
# Start a Dask distributed client with four workers. No scheduler address is
# passed here; to use an existing cluster you could connect to it instead.
client = Client(n_workers=4)
# Dashboard URL, handy for watching the computations in the cells below
client.dashboard_link

In [None]:
# Load the bankruptcy dataset from the local data/ folder as a Dask dataframe.
# These two columns get an explicit float64 dtype instead of an inferred one
# (presumably because inference picks the wrong type for them -- TODO confirm).
column_dtypes = {
    ' Research and development expense rate': 'float64',
    ' Total Asset Growth Rate': 'float64',
}
df = dd.read_csv('data/taiwanese-bankruptcy.csv', dtype=column_dtypes)

In [None]:
def start_pipeline(df: "dd.DataFrame") -> "dd.DataFrame":
    """Return a copy of ``df`` to serve as the starting point of a pipeline.

    Subsequent pipeline steps then work on the copy rather than on the
    object the caller passed in.
    """
    return df.copy()

def rename_columns(df: "dd.DataFrame") -> "dd.DataFrame":
    """Return ``df`` with surrounding whitespace stripped from every column name.

    Each existing column name is mapped to its ``str.strip()``-ed form and the
    mapping is handed to ``rename``; the renamed dataframe is returned.
    """
    stripped_names = {name: name.strip() for name in df.columns}
    return df.rename(columns=stripped_names)


# Run the cleaning steps in order: copy the raw dataframe, then strip the
# whitespace from its column names.
working_copy = start_pipeline(df)
cleaned_dataset = rename_columns(working_copy)

# The cleaning is done, so persist the result; later cells can then reuse it
# instead of recomputing it every time.
# Try skipping this line and running the following cells to compare the CPU usage.
cleaned_dataset = cleaned_dataset.persist()

In [None]:
# Define both means; nothing is computed at this point.
after_tax_net_interest_rate = cleaned_dataset['After-tax net Interest Rate']
operating_gross_margin = cleaned_dataset['Operating Gross Margin']
mean_after_tax_net_interestrate = after_tax_net_interest_rate.mean()
mean_operating_gross_margin = operating_gross_margin.mean()

# Printing now shows the not-yet-computed objects rather than numbers.
print(mean_after_tax_net_interestrate, mean_operating_gross_margin)

# Compute both means with a single dask.compute call, then print the results.
computed_means = dask.compute(
    mean_after_tax_net_interestrate,
    mean_operating_gross_margin,
)
mean_after_tax_net_interestrate_computed, mean_operating_gross_margin_computed = computed_means
print(mean_after_tax_net_interestrate_computed, mean_operating_gross_margin_computed)

In [None]:
# Calculate the correlation matrix repeatedly so the work runs long enough to
# be watched in the dashboard. The results are discarded on purpose.
N_CORR_REPETITIONS = 200
for _repetition in range(N_CORR_REPETITIONS):
    cleaned_dataset.corr().compute()

In [None]:
# All computations are finished, so close the client created at the top.
client.close()