## ORMA -> History Update:
1. Separate facets before applying transformations
2. Return dependency relationships
3. Color changes mark the nodes that are affected

In [1]:
import json
from ORMA.extra_info import generate_recipe as gen
import re
import pandas as pd
from utils import *
from itertools import product

In [2]:
from graphviz import Digraph

In [3]:
# Toy mapping used as the example input for get_node/get_edges when the
# base graph is drawn.
r1 = dict(d='g', g='t', t='e')

In [None]:
# Draw r1
base_graph = Digraph('Base-Graph', filename='Figure/exp.gv')
# Default node attributes must be declared BEFORE the nodes are added: a DOT
# `node [...]` statement only applies to nodes defined after it, so setting
# it last (as this cell used to do) leaves every node unstyled.
base_graph.attr('node', shape=feature_data['shape'], style=feature_data['style'], fillcolor=feature_data['fillcolor'])
for base_node in get_node(r1):
    base_graph.node(base_node)  # add nodes to base graph
base_graph = get_edges(r1, base_graph)  # add edges to base graph
# base_graph.view()
base_graph

In [None]:
# If the change is value-level: use color 1
# If the change is schema-level: use color 2
# Recolor the affected nodes accordingly

In [2]:
# Load recipe
# Read the file as UTF-8 explicitly: JSON text is UTF-8 by specification, and
# the platform default encoding (used when none is given) may differ.
with open('demo_recipes/depen_analysis_exp2.json', 'r', encoding='utf-8') as json_f:
    json_data = json.load(json_f)
json_data

[{'op': 'core/column-removal',
  'columnName': 'Youtube',
  'description': 'Remove column Youtube'},
 {'op': 'core/text-transform',
  'engineConfig': {'facets': [], 'mode': 'row-based'},
  'columnName': 'State',
  'expression': 'value.toLowercase()',
  'onError': 'keep-original',
  'repeat': False,
  'repeatCount': 10,
  'description': 'Text transform on cells in column State using expression value.toLowercase()'},
 {'op': 'core/text-transform',
  'engineConfig': {'facets': [], 'mode': 'row-based'},
  'columnName': 'County',
  'expression': 'value.trim()',
  'onError': 'keep-original',
  'repeat': False,
  'repeatCount': 10,
  'description': 'Text transform on cells in column County using expression value.trim()'},
 {'op': 'core/text-transform',
  'engineConfig': {'facets': [], 'mode': 'row-based'},
  'columnName': 'State',
  'expression': 'value.toTitlecase()',
  'onError': 'keep-original',
  'repeat': False,
  'repeatCount': 10,
  'description': 'Text transform on cells in column Sta

In [3]:
# Build the enhanced recipe and the list of schema snapshots for one project
# via ORMA.extra_info.generate_recipe (imported as `gen`).
# NOTE(review): presumably project_id identifies an OpenRefine project - confirm.
enhanced_recipe, schemas = gen(project_id=1689182305388)
# Number of schema snapshots; the transformation loop below pairs
# schemas[i] with schemas[i + 1], one pair per recipe step.
print(len(schemas))

13


### Read transformation data into model 
> trans_model: (step_id, transformation, from_schema, to_schema)

In [4]:
# Save each step as a 4-tuple: (step_id, transformation, from_schema, to_schema).
# schemas[idx] is the column list before step idx + 1 and schemas[idx + 1] the
# column list after it; json_data[idx] is the operation applied in between.
trans_data = []

# Column references inside a GREL expression, in both spellings:
#   cells.State.value            -> 'State'
#   cells["Column 1"].value      -> 'Column 1'
# The dots are escaped so that only a literal '.' separates the parts.
dot_ref = re.compile(r'cells\.(\w+)\.value')
bracket_ref = re.compile(r'cells\[\"(\w+\s*\d*\w*)\"\]\.value')

for idx, schema in enumerate(schemas[1:]):
    step_id = idx + 1
    cur_col_list = schema['schema']
    prev_col_list = schemas[step_id - 1]['schema']
    recipe_step = json_data[idx]
    op = recipe_step['op']
    # Reset per step so an unhandled operation can never silently reuse the
    # columns computed for the previous step.
    from_node = None
    to_nodes = None

    if len(cur_col_list) < len(prev_col_list):
        # Fewer columns: only a plain column removal is understood here.
        if op != 'core/column-removal':
            raise ValueError(
                f"step {step_id}: the schema lost columns but the operation "
                f"is {op!r}, not 'core/column-removal'"
            )
        from_node = [recipe_step['columnName']]
        to_nodes = ['null']
    elif len(cur_col_list) > len(prev_col_list):
        if op == 'core/column-split':
            from_node = [recipe_step['columnName']]
            # The split outputs are the columns that did not exist before.
            to_nodes = [x for x in cur_col_list if x not in prev_col_list]
        elif op == 'core/column-addition':
            # we only consider decoding grel expression now...
            exp = recipe_step['expression']
            if exp.split(':')[0] == 'grel':
                # Dotted references win; bracketed ones are tried only when
                # no dotted reference is present.
                from_node = dot_ref.findall(exp) or bracket_ref.findall(exp)
                if not from_node:
                    # No explicit cell reference: the expression works on the
                    # operation's own column. Prefer 'columnName' when the
                    # recipe provides it, otherwise use 'baseColumnName' (the
                    # key the non-GREL branch below already relies on).
                    if 'columnName' in recipe_step:
                        from_node = [recipe_step['columnName']]
                    else:
                        from_node = [recipe_step['baseColumnName']]
            else:
                from_node = [recipe_step['baseColumnName']]
            to_nodes = [recipe_step['newColumnName']]
    elif cur_col_list == prev_col_list:
        # Schema untouched: the step rewrites values of one column in place.
        from_node = [recipe_step['columnName']]
        to_nodes = from_node
    elif op == 'core/column-rename':
        # Same number of columns but different names.
        from_node = [recipe_step['oldColumnName']]
        to_nodes = [recipe_step['newColumnName']]

    if from_node is None or to_nodes is None:
        raise ValueError(
            f"step {step_id}: cannot derive source/target columns for "
            f"operation {op!r}"
        )
    trans_data.append((step_id, op, from_node, to_nodes))

In [5]:
# Inspect the collected (step_id, transformation, from_schema, to_schema) tuples.
trans_data

[(1, 'core/column-removal', ['Youtube'], ['null']),
 (2, 'core/text-transform', ['State'], ['State']),
 (3, 'core/text-transform', ['County'], ['County']),
 (4, 'core/text-transform', ['State'], ['State']),
 (5, 'core/column-rename', ['city'], ['City']),
 (6, 'core/column-addition', ['State', 'City'], ['Place']),
 (7,
  'core/column-split',
  ['Season1Date'],
  ['Season1Date 1', 'Season1Date 2', 'Season1Date 3']),
 (8, 'core/column-rename', ['Season1Date 1'], ['Season1Date_from']),
 (9, 'core/column-rename', ['Season1Date 2'], ['Season1Date_to']),
 (10,
  'core/column-addition',
  ['Season1Date_from'],
  ['valid_Season1Date_from_flag']),
 (11, 'core/text-transform', ['Season1Date_from'], ['Season1Date_from']),
 (12, 'core/text-transform', ['State'], ['State'])]

In [6]:
# Process model as a dataframe: one row per recipe step, default integer index.
process_df = pd.DataFrame(
    trans_data,
    columns=['step_id', 'transformation', 'from_schema', 'to_schema'],
)

In [7]:
# Display the process dataframe.
process_df

Unnamed: 0,step_id,transformation,from_schema,to_schema
0,1,core/column-removal,[Youtube],[null]
1,2,core/text-transform,[State],[State]
2,3,core/text-transform,[County],[County]
3,4,core/text-transform,[State],[State]
4,5,core/column-rename,[city],[City]
5,6,core/column-addition,"[State, City]",[Place]
6,7,core/column-split,[Season1Date],"[Season1Date 1, Season1Date 2, Season1Date 3]"
7,8,core/column-rename,[Season1Date 1],[Season1Date_from]
8,9,core/column-rename,[Season1Date 2],[Season1Date_to]
9,10,core/column-addition,[Season1Date_from],[valid_Season1Date_from_flag]


In [None]:
# facts file: process(step_id, op_name, from_nodes, to_nodes)
# dep(x,y) :- process(x, op_name1, from_nodes1, to_nodes1), process(y, op_name2, to_nodes1, to_nodes2), y>x.
# dep(x,y) :- process(x, op_name1, from_nodes1, to_nodes1), process(z, op_name2, to_nodes1, to_nodes2), z>x, dep(z,y).

## Create a Graph to Save Dependency Relationships at Step/Operation Level:
> Return Dependency Relationships at Step/Operation Level Saved in a Dictionary for Easy Query: 
>>  {Step_x: [list of dependencies of Step_x], Step_y: [list of dependencies of Step_y],...}

In [19]:
def depend_step(json_data, df):
    """Compute the dependency relationships between steps.

    Side effects: adds a 'dependency' column to ``df`` in place and prints
    both that column and the resulting dictionary.

    ``graph_op_model`` and ``dfs`` are not defined in this notebook
    (presumably they come from ``from utils import *``).

    @params json_data: recipe data in JSON format (not used in this body)
    @params df: process data model in pandas dataframe; must provide the
        'from_schema' and 'to_schema' columns
    @return: dictionary of dependency relationships at Step Level, keyed by
        the index labels of ``df`` (not by the 'step_id' column)
    """
    def _column_pairs(row):
        # Every (source column, target column) combination of one step.
        return list(product(row['from_schema'], row['to_schema']))

    df['dependency'] = df.apply(_column_pairs, axis=1)
    pairs_per_step = df['dependency']
    print(pairs_per_step)

    step_labels = list(pairs_per_step.index)
    graph_steps = graph_op_model(step_labels, pairs_per_step)

    # Replace each step's entry with the de-duplicated result of the
    # traversal started at that step.
    for label in step_labels:
        visited = dfs(graph_steps, label)
        graph_steps[label] = list(set(visited))

    print(graph_steps)
    return graph_steps

In [20]:
# Three modes: modify; remove; insert
# Compute the step-level dependencies. The call also adds a 'dependency'
# column to process_df and prints its intermediate results.
depend_step(json_data, process_df)

0                                     [(Youtube, null)]
1                                      [(State, State)]
2                                    [(County, County)]
3                                      [(State, State)]
4                                        [(city, City)]
5                       [(State, Place), (City, Place)]
6     [(Season1Date, Season1Date 1), (Season1Date, S...
7                   [(Season1Date 1, Season1Date_from)]
8                     [(Season1Date 2, Season1Date_to)]
9     [(Season1Date_from, valid_Season1Date_from_flag)]
10               [(Season1Date_from, Season1Date_from)]
11                                     [(State, State)]
Name: dependency, dtype: object
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
0 >>> 1
0 >>> 2
0 >>> 3
0 >>> 4
0 >>> 5
0 >>> 6
0 >>> 7
0 >>> 8
0 >>> 9
0 >>> 10
0 >>> 11
1 >>> 2
1 >>> 3
1 >>> 4
1 >>> 5
1 >>> 6
1 >>> 7
1 >>> 8
1 >>> 9
1 >>> 10
1 >>> 11
2 >>> 3
2 >>> 4
2 >>> 5
2 >>> 6
2 >>> 7
2 >>> 8
2 >>> 9
2 >>> 10
2 >>> 11
3 >>> 4
3 >

{1: [11, 1, 3, 5],
 3: [11, 3, 5],
 4: [4, 5],
 6: [6, 7, 8, 9, 10],
 7: [9, 10, 7],
 0: [0],
 2: [2],
 5: [5],
 8: [8],
 9: [9],
 10: [10],
 11: [11]}

### Explore affected dependencies 
> Given: the affected step_id (transformation) and how it changed: modified/inserted/removed

> Return: Affected steps and columns

In [9]:
# Try extract_col (not defined in this notebook; presumably from utils) on a
# column-rename operation; the recorded output for this input is 'apple'.
res = extract_col({'op':'core/column-rename', 'oldColumnName': 'apple'})
res

'apple'

In [None]:
# col_node = extract_col(operator)
"""Given different mode, determine the json data that required to be updated"""
# NOTE(review): json_data is sliced by position while df is filtered on its
# 'step_id' column; the trans_data cell builds step_id as position + 1, so the
# two selections may differ by one step - confirm which convention step_id
# follows here.
is_insert = mode == 'insert'
# Note: default insert type: insert before the step, so the slice starts at
# step_id itself; every other mode starts one position later.
recipe_start = step_id if is_insert else step_id + 1
repair_recipe = json_data[recipe_start:]
if is_insert:
    step_filter = df['step_id'] >= step_id
else:
    step_filter = df['step_id'] > step_id
repair_df = df[step_filter]