## ORMA -> Recipe Update:
1. Separate facets before applying transformations.
2. Return the dependency relationships.
3. Use color changes to mark what is affected.

In [1]:
import json
from ORMA.extra_info import generate_recipe as gen
import re
import pandas as pd
from utils import *
from itertools import product

In [2]:
from graphviz import Digraph

In [3]:
# Load the recipe: a JSON list with one dict per operation.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which differs across OSes).
with open('demo_recipes/depen_analysis_exp2.json', 'r', encoding='utf-8') as json_f:
    json_data = json.load(json_f)
json_data

[{'op': 'core/column-removal',
  'columnName': 'Youtube',
  'description': 'Remove column Youtube'},
 {'op': 'core/text-transform',
  'engineConfig': {'facets': [], 'mode': 'row-based'},
  'columnName': 'State',
  'expression': 'value.toLowercase()',
  'onError': 'keep-original',
  'repeat': False,
  'repeatCount': 10,
  'description': 'Text transform on cells in column State using expression value.toLowercase()'},
 {'op': 'core/text-transform',
  'engineConfig': {'facets': [], 'mode': 'row-based'},
  'columnName': 'County',
  'expression': 'value.trim()',
  'onError': 'keep-original',
  'repeat': False,
  'repeatCount': 10,
  'description': 'Text transform on cells in column County using expression value.trim()'},
 {'op': 'core/text-transform',
  'engineConfig': {'facets': [], 'mode': 'row-based'},
  'columnName': 'State',
  'expression': 'value.toTitlecase()',
  'onError': 'keep-original',
  'repeat': False,
  'repeatCount': 10,
  'description': 'Text transform on cells in column Sta

In [4]:
# `gen` is ORMA.extra_info.generate_recipe (see the import cell); its result is
# unpacked into the enhanced recipe and a sized collection of schemas.
# NOTE(review): project_id is hard-coded — presumably the id of a project that
# exists only in the author's environment; confirm before re-running elsewhere.
enhanced_recipe, schemas = gen(project_id=1689182305388)
# Number of schemas returned (13 in the recorded run, for a 12-step recipe).
print(len(schemas))

13


### Read transformation data into model 
> trans_model: (step_id, transformation, from_schema, to_schema)

In [5]:
# Build the transformation model: one 4-tuple per recipe step,
# (step_id, transformation, from_schema, to_schema).
trans_data = model_process(schemas, json_data)

In [6]:
# Inspect the per-step tuples produced by model_process.
trans_data

[(1, 'core/column-removal', ['Youtube'], ['null']),
 (2, 'core/text-transform', ['State'], ['State']),
 (3, 'core/text-transform', ['County'], ['County']),
 (4, 'core/text-transform', ['State'], ['State']),
 (5, 'core/column-rename', ['city'], ['City']),
 (6, 'core/column-addition', ['State', 'City'], ['Place']),
 (7,
  'core/column-split',
  ['Season1Date'],
  ['Season1Date 1', 'Season1Date 2', 'Season1Date 3']),
 (8, 'core/column-rename', ['Season1Date 1'], ['Season1Date_from']),
 (9, 'core/column-rename', ['Season1Date 2'], ['Season1Date_to']),
 (10,
  'core/column-addition',
  ['Season1Date_from'],
  ['valid_Season1Date_from_flag']),
 (11, 'core/text-transform', ['Season1Date_from'], ['Season1Date_from']),
 (12, 'core/text-transform', ['State'], ['State'])]

In [7]:
# Tabulate the process model: one row per step, one column per tuple field.
process_df = pd.DataFrame(
    data=trans_data,
    columns=["step_id", "transformation", "from_schema", "to_schema"],
)

In [8]:
# Display the process table (one row per recipe step).
process_df

Unnamed: 0,step_id,transformation,from_schema,to_schema
0,1,core/column-removal,[Youtube],[null]
1,2,core/text-transform,[State],[State]
2,3,core/text-transform,[County],[County]
3,4,core/text-transform,[State],[State]
4,5,core/column-rename,[city],[City]
5,6,core/column-addition,"[State, City]",[Place]
6,7,core/column-split,[Season1Date],"[Season1Date 1, Season1Date 2, Season1Date 3]"
7,8,core/column-rename,[Season1Date 1],[Season1Date_from]
8,9,core/column-rename,[Season1Date 2],[Season1Date_to]
9,10,core/column-addition,[Season1Date_from],[valid_Season1Date_from_flag]


In [9]:
# facts file: process(step_id, op_name, from_nodes, to_nodes)
# dep(x,y) :- process(x, op_name1, from_nodes1, to_nodes1), process(y, op_name2, to_nodes1, to_nodes2), y>x.
# dep(x,y) :- process(x, op_name1, from_nodes1, to_nodes1), process(z, op_name2, to_nodes1, to_nodes2), z>x, dep(z,y).

## Create a Graph to Save Dependency Relationships at Step/Operation Level:
> Return the dependency relationships at the step/operation level as a dictionary for easy lookup:
>>  {Step_x: [steps that depend on Step_x, including Step_x itself], Step_y: [steps that depend on Step_y, including Step_y itself], ...}

In [10]:
# Compute the step-level dependency dictionary from the process table.
# NOTE(review): the "three modes" (modify / remove / insert) are not passed to
# depend_step here; they presumably describe the `mode` argument of the later
# exe_descendants call — confirm.
ops_dep_dict = depend_step(process_df)
# In the recorded output the keys run 0..11, i.e. they look like 0-based row
# positions of process_df rather than its 1-based step_id values — confirm.
ops_dep_dict

0                                     [(Youtube, null)]
1                                      [(State, State)]
2                                    [(County, County)]
3                                      [(State, State)]
4                                        [(city, City)]
5                       [(State, Place), (City, Place)]
6     [(Season1Date, Season1Date 1), (Season1Date, S...
7                   [(Season1Date 1, Season1Date_from)]
8                     [(Season1Date 2, Season1Date_to)]
9     [(Season1Date_from, valid_Season1Date_from_flag)]
10               [(Season1Date_from, Season1Date_from)]
11                                     [(State, State)]
Name: dependency, dtype: object


{1: [11, 1, 3, 5],
 3: [11, 3, 5],
 4: [4, 5],
 6: [6, 7, 8, 9, 10],
 7: [9, 10, 7],
 0: [0],
 2: [2],
 5: [5],
 8: [8],
 9: [9],
 10: [10],
 11: [11]}

## Mermaid-js Play Graph in Jupyter Notebook 

In [11]:
import base64
from IPython.display import Image, display
import matplotlib.pyplot as plt

def mm(graph):
    """Render a Mermaid diagram inline through the mermaid.ink image service.

    The diagram source is base64-encoded into the image URL, and the resulting
    remote image is shown with IPython's ``display``.

    Args:
        graph: Mermaid diagram source text (e.g. starting with ``graph TD``).

    Returns:
        None. The image is displayed as a side effect.
    """
    # UTF-8 rather than ASCII so that non-ASCII node labels do not raise
    # UnicodeEncodeError.
    graph_bytes = graph.encode("utf8")
    # URL-safe alphabet: standard base64 may emit '+' and '/', which are not
    # safe inside a URL path segment.
    encoded = base64.urlsafe_b64encode(graph_bytes).decode("ascii")
    display(Image(url="https://mermaid.ink/img/" + encoded))

# Hand-written Mermaid flowchart of the recipe: each "step N" node is linked
# from the column(s) it reads and to the column(s) it produces, followed by a
# proposed "new step 6" whose outputs are matched to the original split
# columns. Dotted id->id edges chain the steps in order.
# The County column was previously mislabelled "Country" in this diagram.
mm("""

graph TD 
   col0(Youtube) --> id0[step 0]
   id0 --> col1(Null);
   style col0 fill:#ffd
   style id0 fill:#afe1af
   style col1 fill:#FFBF00

   col2(State) --> id1[step 1]
   id1[step 1] --> col3(State1);
   style col2 fill:#ffd
   style id1 fill:#afe1af
   style col3 fill:#ffd


   col4(County) --> id2[step 2]
   id2 --> col5(County1);
   style id2 fill:#afe1af
   style col4 fill:#ffd
   style col5 fill:#ffd


   col3 --> id3[step 3]
   id3[step 3] --> col6(State2);
   style id3 fill:#afe1af
   style col6 fill:#ffd



   col7(city) --> id4[step 4]
   id4[step 4]--> col8(City)
   style col7 fill:#ffd
   style id4 fill:#afe1af
   style col8 fill:#FFBF00


   col6 --> id5[step 5]
   id5[step 5] --> col9(Place)
   col8 --> id5[step 5]
   style id5 fill:#afe1af
   style col9 fill:#FFBF00


   col10(Season1Date) --> id6[step 6]
   id6[step 6] --> col11(Season1Date 1)
   id6[step 6] --> col12(Season1Date 2)
   id6[step 6] --> col13(Season1Date 3)
   style id6 fill:#afe1af
   style col10 fill:#ffd
   style col11 fill:#FFBF00
   style col12 fill:#FFBF00
   style col13 fill:#FFBF00

   col11 --> id7[step 7]
   id7[step 7] --> col14(Season1Date_from)
   style id7 fill:#afe1af
   style col14 fill:#FFBF00

   col12 --> id8[step 8]
   id8[step 8] --> col15(Season1Date_to)
   style id8 fill:#afe1af
   style col15 fill:#FFBF00

   col14 --> id9[step 9]
   id9[step 9] --> col16(valid_Season1Date_from_flag)
   style id9 fill:#afe1af
   style col16 fill:#FFBF00

   col14 --> id10[step 10]
   id10[step 10] --> col17(Season1Date_from1)
   style id10 fill:#afe1af
   style col17 fill:#ffd


   col6 --> id11[step 11]
   id11[step 11] --> col18(State3)
   style id11 fill:#afe1af
   style col18 fill:#ffd


   col10(Season1Date) --> id12[new step 6]
   id12 --> col19(new Season1Date 1)
   id12 --> col20(new Season1Date 2)
   style id12 fill:#a9a
   style col19 fill:#fff
   style col20 fill:#fff

   col19 -. matching&replace.->col11
   col20 -. matching&replace.->col12

   id0 -.-> id1
   id1 -.-> id2
   id2 -.-> id3
   id3 -.-> id4
   id4 -.-> id5
   id5 -.-> id6
   id6 -.-> id7
   id7 -.-> id8
   id8 -.-> id9
   id9 -.-> id10
   id10 -.-> id11
   id11 -.-> id12

   """)


## Use ORMA to Generate Graph to Help Execute Dependency Information at Column Level

In [12]:
# Column-level dependency analysis. depend_col returns a pair:
#   dep_cols - dict keyed by versioned column label (e.g. 'State_0'); in the
#              recorded output each value lists that label followed by the
#              labels derived from it.
#   df       - a DataFrame that, as displayed below, has process_df's columns
#              plus dependency, from_schema_label and to_schema_label.
dep_cols, df = depend_col(process_df)
dep_cols

{'Youtube_0': ['Youtube_0', 'null'],
 'State_0': ['State_0', 'State_1', 'State_2', 'Place_0', 'State_3'],
 'County_0': ['County_0', 'County_1'],
 'State_1': ['State_1', 'State_2', 'Place_0', 'State_3'],
 'city_0': ['city_0', 'City_0', 'Place_0'],
 'State_2': ['State_2', 'Place_0', 'State_3'],
 'City_0': ['City_0', 'Place_0'],
 'Season1Date_0': ['Season1Date_0',
  'Season1Date 1_0',
  'Season1Date_from_0',
  'valid_Season1Date_from_flag_0',
  'Season1Date_from_1',
  'Season1Date 2_0',
  'Season1Date_to_0',
  'Season1Date 3_0'],
 'Season1Date 1_0': ['Season1Date 1_0',
  'Season1Date_from_0',
  'valid_Season1Date_from_flag_0',
  'Season1Date_from_1'],
 'Season1Date 2_0': ['Season1Date 2_0', 'Season1Date_to_0'],
 'Season1Date_from_0': ['Season1Date_from_0',
  'valid_Season1Date_from_flag_0',
  'Season1Date_from_1'],
 'null': ['null'],
 'County_1': ['County_1'],
 'Place_0': ['Place_0'],
 'Season1Date 3_0': ['Season1Date 3_0'],
 'Season1Date_to_0': ['Season1Date_to_0'],
 'valid_Season1Date_f

### Explore affected dependent columns  
> Input: the affected step_id (transformation) and how it changes: modified / inserted / removed

> Return: the affected steps and columns
  -  Challenge: the status of each affected column still needs to be clarified

In [13]:
# DataFrame that stores the dependency information among the data transformations
df

Unnamed: 0,step_id,transformation,from_schema,to_schema,dependency,from_schema_label,to_schema_label
0,1,core/column-removal,[Youtube],[null],"[(Youtube, null)]",[Youtube_0],[null]
1,2,core/text-transform,[State],[State],"[(State, State)]",[State_0],[State_1]
2,3,core/text-transform,[County],[County],"[(County, County)]",[County_0],[County_1]
3,4,core/text-transform,[State],[State],"[(State, State)]",[State_1],[State_2]
4,5,core/column-rename,[city],[City],"[(city, City)]",[city_0],[City_0]
5,6,core/column-addition,"[State, City]",[Place],"[(State, Place), (City, Place)]","[City_0, State_2]",[Place_0]
6,7,core/column-split,[Season1Date],"[Season1Date 1, Season1Date 2, Season1Date 3]","[(Season1Date, Season1Date 1), (Season1Date, S...",[Season1Date_0],"[Season1Date 1_0, Season1Date 3_0, Season1Date..."
7,8,core/column-rename,[Season1Date 1],[Season1Date_from],"[(Season1Date 1, Season1Date_from)]",[Season1Date 1_0],[Season1Date_from_0]
8,9,core/column-rename,[Season1Date 2],[Season1Date_to],"[(Season1Date 2, Season1Date_to)]",[Season1Date 2_0],[Season1Date_to_0]
9,10,core/column-addition,[Season1Date_from],[valid_Season1Date_from_flag],"[(Season1Date_from, valid_Season1Date_from_flag)]",[Season1Date_from_0],[valid_Season1Date_from_flag_0]


In [14]:
# Ask which steps and columns are affected when a new text-transform on
# column State is inserted at step_id=4.
# The operator is still passed as a JSON string, but is now serialized from a
# dict: this keeps "repeat" a JSON boolean (as in the loaded recipe) instead of
# the string "false", and avoids embedding line-continuation indentation in
# the payload.
# NOTE(review): assumes exe_descendants parses `operator` as JSON — confirm.
desc_steps, desc_cols = exe_descendants(
    df,
    ops_dep_dict,
    dep_cols,
    mode="insert",
    operator=json.dumps({
        "op": "core/text-transform",
        "engineConfig": {"facets": [], "mode": "row-based"},
        "columnName": "State",
        "expression": "value.trim()",
        "onError": "keep-original",
        "repeat": False,
        "repeatCount": 10,
        "description": "Text transform on cells in column State using expression value.trim()",
    }),
    step_id=4,
)