# Computation API

In [None]:
from kywy.client.kawa_client import KawaClient as K

# Instantiate the KAWA client through `load_client_from_environment`
# (presumably configured from environment variables — confirm against the kywy docs)
kawa = K.load_client_from_environment()
# Keep a short handle on the client's `commands` attribute
cmd = kawa.commands

In [None]:
# Creates a sample dataset to illustrate the API behaviour
import pandas as pd
import zoneinfo

from uuid import uuid4
from datetime import date, datetime

def utc(year, month, day, hour, minute, second):
    """Return a timezone-aware datetime in the UTC zone for the given calendar components."""
    utc_zone = zoneinfo.ZoneInfo('UTC')
    return datetime(
        year=year,
        month=month,
        day=day,
        hour=hour,
        minute=minute,
        second=second,
        tzinfo=utc_zone,
    )

# Five sample orders mixing int, bool, str, float, date and
# timezone-aware datetime values. Each row tuple is zipped with the
# column names to produce one dict per order.
sample_orders_df = pd.DataFrame([
    dict(zip(
        ('id', 'flag', 'comment', 'price', 'order_date', 'update', 'client'),
        order,
    ))
    for order in (
        (1, True, 'Sold 10 items', 1.124, date(2035, 1, 1), utc(2035, 1, 1, 23, 45, 6), 'Wayne Enterprises'),
        (2, False, 'Sold 20 items', 2.228, date(2035, 1, 2), utc(2035, 1, 2, 3, 45, 6), 'Wayne Enterprises'),
        (3, False, 'Sold 100 items', 10.124, date(2035, 3, 3), utc(2035, 3, 3, 5, 2, 1), 'Wonka'),
        (4, False, 'Sold 1 items', 0.1, date(2035, 3, 6), utc(2035, 1, 2, 3, 45, 6), 'Cyberdyne Systems'),
        (5, False, 'Sold 1 items', 0.1, date(2035, 3, 7), utc(2035, 3, 7, 7, 2, 7), 'Cyberdyne Systems'),
    )
])

# Bind the sample dataframe to a datasource named 'Computation API sandbox'
loader = kawa.new_data_loader(df=sample_orders_df, datasource_name='Computation API sandbox')

# Declare `id` as the primary key of the datasource
loader.create_datasource(primary_keys=['id'])

# Load the dataframe; the trailing semicolon keeps the notebook from echoing the return value
loader.load_data(create_sheet=True, reset_before_insert=True);

## 1. The computation DSL

This will perform computations on a given sheet, inheriting all the row/column level security, as well as the datasource and sheet level filters.

 ### 1.a Simple example

In its simplest form, this DSL can be used to select all the rows and all the columns of a sheet as below:

In [None]:
# Describe the query: all the columns of the sandbox sheet, with the timezone forced to UTC.
# No computation is done at this point.
query = kawa.sheet(sheet_name='Computation API sandbox', force_tz='UTC').select(K.cols())

# Sends the query to KAWA and returns the result as a pandas df
query.compute()

### 1.b using group_by and aggregations

The `group_by` operator lets you specify which columns you want to group your data on.

When using this operator, you can specify aggregations on the columns you select.

The available aggregations are the following:
The __first__ aggregation is available for all types.

__decimal and integer:__
- sum 
- avg, median
- min, max, min_abs, max_abs
- var_sample, var_pop, std_dev_sample, std_dev_pop
- lowest_decile, lowest_quartile, highest_decile, highest_quartile

__text and boolean:__
- count, count_unique, percent_filled, percent_empty, count_empty
- identical, identical_ignore_empty

__date and date_time:__
- min, max
- identical
- count_unique

In [None]:
# One level of grouping: rows are grouped by client and
# every selected column carries its own aggregation
(
    kawa.sheet(sheet_name='Computation API sandbox', force_tz='UTC')
    .select(
        K.col('id').count(),
        K.col('price').sum(),
        K.col('update').max(),
        K.col('order_date').first(),
    )
    .group_by('client')
    .compute()
)

In [None]:
# Two levels of grouping: by client, then by flag.
# Notice the grouping(N) columns that are added to the resulting dataframe.
# NOTE(review): with use_group_names=True these columns are presumably named
# after the grouped columns instead of grouping(N) — confirm against the kywy docs.
(
    kawa.sheet(sheet_name='Computation API sandbox', force_tz='UTC')
    .select(
        K.col('id').count(),
        K.col('price').sum(),
        K.col('update').max(),
        K.col('order_date').first(),
    )
    .group_by('client', 'flag')
    .compute(use_group_names=True)
)

### 1.c using upsampling and data binning

When grouping by __date__, __date_time__, __integer__ or __decimal__ columns, you can specify upsampling/binning.

Here are the available samplers:


__date:__ 
- WEEK
- MONTH
- QUARTER
- SEMESTER
- YEAR
- YEAR_AND_WEEK
- YEAR_AND_MONTH
- YEAR_AND_QUARTER
- YEAR_AND_SEMESTER
- DAY_OF_YEAR
- DAY_OF_WEEK

__date_time:__ 
Same as for date plus:
- DAY
- TWELVE_HOURS
- SIX_HOURS
- HOUR
- THIRTY_MINUTES
- TWENTY_MINUTES
- FIFTEEN_MINUTES
- TEN_MINUTES
- FIVE_MINUTES
- MINUTE
- THIRTY_SECONDS

Those are computed in your local timezone by default or in the timezone you specify when building the sheet object.

__decimal and integer:__ (data binning)
- FIXED_NUMBER_OF_BINS (with extra argument: `how_many_buckets`)
- LIST_OF_BINS (with extra argument: `buckets`)
- FIXED_SIZE_BINS (with extra argument: `bucket_size`)



In [None]:
# Upsample the order date per year and month, then group on it
(
    kawa.sheet(sheet_name='Computation API sandbox', force_tz='UTC')
    .select(
        K.col('id').count(),
        K.col('price').sum(),
        K.col('order_date').max(),
        K.col('flag').first(),
    )
    .sample(sampler='YEAR_AND_MONTH', column_name='order_date')
    .group_by('order_date')
    .compute()
)

In [None]:
# When doing date_time upsampling, using force_tz is recommended
# to explicitly define in which timezone the sampling is computed.
(
    kawa.sheet(sheet_name='Computation API sandbox', force_tz='UTC')
    .select(
        K.col('id').count(),
        K.col('price').avg(),
        K.col('order_date').max(),
        K.col('flag').first(),
    )
    .sample(sampler='TWENTY_MINUTES', column_name='update')
    .group_by('update')
    .compute()
)

In [None]:
# Data binning on the price column, with an explicit list of buckets
(
    kawa.sheet(sheet_name='Computation API sandbox', force_tz='UTC')
    .select(
        K.col('id').count(),
        K.col('price').avg(),
        K.col('order_date').max(),
        K.col('flag').first(),
    )
    .sample(sampler='LIST_OF_BINS', column_name='price', buckets=[0, 1, 10])
    .group_by('price')
    .compute()
)

### 1.d using order_by and limit

These two operators help you select the TOP or WORST performers according to a given metric.
Please note that if you omit the `limit` operator, it will be automatically set to 100.
If you want to load data without limit, use the `no_limit()` operator (or alternatively, specify a negative number in the limit operator).

In [None]:
# Top 1 week for total price: weekly upsampling on the order date,
# ordered by descending price and limited to a single row
(
    kawa.sheet(sheet_name='Computation API sandbox', force_tz='UTC')
    .select(
        K.col('id').count(),
        K.col('price').sum(),
        K.col('order_date').max(),
        K.col('flag').first(),
    )
    .sample(sampler='YEAR_AND_WEEK', column_name='order_date')
    .group_by('order_date')
    .limit(1)
    .order_by(column_name='price', ascending=False)
    .compute()
)

### 1.e special syntax: group_by('1')

Use this syntax to retrieve the global aggregation of the entire dataset.
You can also use aliases when you select the same column multiple times with different aggregations.

In [None]:
# Global aggregation of the whole dataset: the same column is selected
# three times, each aggregation being distinguished by its alias
(
    kawa.sheet(sheet_name='Computation API sandbox', force_tz='UTC')
    .select(
        K.col('price').sum().alias('total price'),
        K.col('price').avg().alias('avg price'),
        K.col('price').median().alias('median price'),
    )
    .group_by('1')
    .compute()
)