In [1]:
import re, glob, os, sys, gc
import pandas as pd
import numpy as np
import numba

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the element table from parquet; the trailing expression shows (rows, columns).
angular_df = pd.read_parquet(path='dataset/df/angular.parquet')
angular_df.shape

(6447, 19)

In [3]:
# Every row must carry a distinct element_id, otherwise the id -> parent
# mappings built from this frame would silently overwrite entries.
assert len(angular_df) == angular_df.element_id.nunique(), 'Duplicated nodes'
print("Tree is ok")

Tree is ok


In [4]:
def get_leafs(df):
    """Return the set of leaf ids: ``element_id`` values that never occur as a ``parent_id``.

    ``df`` must expose ``element_id`` and ``parent_id`` columns.
    """
    referenced_as_parent = set(df.parent_id.values)
    return {
        node_id
        for node_id in df.element_id.values
        if node_id not in referenced_as_parent
    }

In [5]:
# Keep only rows that are both leaves (never referenced as a parent) and displayed.
displayed_leafs_df = angular_df.loc[
    angular_df.element_id.isin(get_leafs(angular_df)) & angular_df.displayed
]
# (number of displayed leaves, number of displayed nodes overall)
len(displayed_leafs_df), len(angular_df.loc[angular_df.displayed])

(2338, 4961)

In [6]:
# pg.displayed_leafs_df.groupby('tag_name').size()

In [7]:
# Inspect a single displayed leaf by position (all columns of row 111).
displayed_leafs_df.iloc[111, :]

parent_id                              7839ed69-b14a-42a0-865f-878ca50161b6
element_id                             c52342c3-37a2-4bf7-b668-db5d0f2e99a5
tag_name                                                                div
x                                                                       256
y                                                                      1720
height                                                                 16.0
width                                                                  16.0
displayed                                                              True
enabled                                                                True
selected                                                              False
text                                                                   None
is_hover                                                               True
base64png_before_hover    iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAA...
base64png_af

In [8]:
# Plain Python on purpose: the body only does dict lookups and list appends on
# Python objects, which numba could run in object mode at best (no speed-up
# expected there), so the function is not wrapped in a numba.jit decorator.
def get_parents_list(tree_dict:dict, element_id:str, paternts_list:list=None):
    """Collect the ancestors of ``element_id``, nearest parent first.

    Parameters
    ----------
    tree_dict : dict
        Maps a node id to its parent id. The walk stops as soon as a lookup
        yields ``None`` (the id is not a key, or its stored parent is ``None``).
    element_id : str
        Node to start from. It is not included in the result.
    paternts_list : list, optional
        Accumulator the ancestor ids are appended to, in place. When omitted
        (or ``None``) a fresh list is created for this call, so results never
        leak from one call into the next.

    Returns
    -------
    list
        ``paternts_list`` itself when one was supplied, otherwise the new list,
        holding the ancestor ids ordered from direct parent to the top-most one.

    Raises
    ------
    ValueError
        If the parent links form a cycle, which would otherwise never terminate.
    """
    if paternts_list is None:
        paternts_list = []

    # Iterative walk: deep trees cannot exhaust the interpreter recursion limit.
    visited = {element_id}
    parent_id = tree_dict.get(element_id)
    while parent_id is not None:
        if parent_id in visited:
            raise ValueError(
                f'Cycle detected in tree_dict: {parent_id!r} is its own ancestor'
            )
        visited.add(parent_id)
        paternts_list.append(parent_id)
        parent_id = tree_dict.get(parent_id)
    return paternts_list
    

In [9]:
# element_id -> parent_id lookup used to walk up the tree.
tree_dict = {
    child_id: parent_id
    for child_id, parent_id in zip(angular_df.element_id.values, angular_df.parent_id.values)
}

In [10]:
list_of_parents = list()
el_id = 'c52342c3-37a2-4bf7-b668-db5d0f2e99a5'

# The accumulator is filled in place and also returned, so the cell displays it.
get_parents_list(tree_dict, el_id, list_of_parents)

['7839ed69-b14a-42a0-865f-878ca50161b6',
 '48b64ba2-98d7-4cd2-9d1f-663296683175',
 '16d3bd3c-8f7a-41dd-a95c-b0793c968cec',
 '7930decd-39b2-4c8b-b0f3-99cbf0ed90a8',
 '5ae4fca9-4d82-4b60-8e7d-a802a71d0366',
 'cbedcdf8-4a6e-4ec6-a9d9-03ca49034499',
 '1e10782a-0399-4c42-953a-184d1a07c0df',
 'ff4e087b-7c4a-444e-8ec4-cedcaad7bf68',
 'ce4f5657-7d10-43aa-91f8-25495162c942',
 '1b39fc83-f378-48b0-b34e-37dabddbead5',
 'dee6f6bb-f389-4684-b1c0-da2f854c8515',
 '91df6dab-a0b3-4a93-9851-fb1c6ac64640']

In [11]:
# One element_id -> value lookup table per column of interest.
tag_name_dict, width_dict, height_dict, displayed_dict, x_dict, y_dict = (
    dict(zip(angular_df.element_id.values, angular_df[column]))
    for column in ('tag_name', 'width', 'height', 'displayed', 'x', 'y')
)



In [12]:
# Ancestors of `el_id` (nearest first), each rendered as "tag:width:height".
ancestor_ids = get_parents_list(tree_dict=tree_dict, element_id=el_id, paternts_list=[])
path = "/".join(
    ":".join((tag_name_dict[i], str(int(width_dict[i])), str(int(height_dict[i]))))
    for i in ancestor_ids
)
path

'div:16:16/label:73:24/mat-checkbox:73:29/section:908:60/mat-card-content:908:209/mat-card:940:241/slide-toggle-configurable-example:940:398/div:1000:44678/div:1000:44678/div:1200:44678/body:1200:44802/html:1183:44785'

In [13]:
# Ancestors of `el_id` (nearest first), each rendered as "tag:y".
ancestor_ids = get_parents_list(tree_dict=tree_dict, element_id=el_id, paternts_list=[])
path = "/".join(
    ":".join((tag_name_dict[i], str(int(y_dict[i]))))
    for i in ancestor_ids
)
path

'div:1720/label:1716/mat-checkbox:1716/section:1700/mat-card-content:1611/mat-card:1595/slide-toggle-configurable-example:1559/div:60/div:60/div:60/body:0/html:0'

In [None]:
# (Stray, never-executed cell cleared: it held only the undefined name `d`, which raises NameError and aborts "Run All".)

## Detect leaf nodes (nodes without children), their parents, and their grandparents

In [14]:
from collections import Counter

def get_children(df):
    """Find leaf nodes plus the nodes one and two levels above them, printing each group's size.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``element_id`` and ``parent_id`` columns.

    Returns
    -------
    dict
        ``'leafs'``   - set of ``element_id`` values that never occur as a ``parent_id``.
        ``'parents'`` - ``Counter`` mapping each node that has at least one leaf child
        to its number of leaf children.
        ``'grandpa'`` - ``Counter`` mapping each node that has at least one child in
        ``'parents'`` to the number of such children.

    Rows whose ``parent_id`` is missing (``None``/NaN) contribute nothing to the
    counters: a missing parent is not a node and must not inflate the sizes.
    """
    # select all leafs (nodes which are not parents)
    leafs_set = set(df.element_id.values) - set(df.parent_id.values)
    print(f'Leafs set size: {len(leafs_set)} (nodes which have no children)')

    # count, per node, how many of its direct children are leafs
    leaf_rows = df.element_id.isin(leafs_set)
    parents_dict = Counter(df.loc[leaf_rows, 'parent_id'].dropna().values)
    print(f'Parents set size: {len(parents_dict)} (nodes which have only children)')

    # count, per node, how many of its direct children are themselves parents of leafs
    parent_rows = df.element_id.isin(list(parents_dict))
    grandpa_dict = Counter(df.loc[parent_rows, 'parent_id'].dropna().values)
    print(f'Grandpa set size: {len(grandpa_dict)} (nodes which have children and grandchildren)')

    return  {'leafs':leafs_set, 'parents':parents_dict, 'grandpa':grandpa_dict}

# Print the leaf / parent / grandparent counts; the trailing ';' hides the returned dict.
get_children(df=angular_df);

Leafs set size: 3249 (nodes which have no children)
Parents set size: 1749 (nodes which have only children)
Grandpa set size: 1256 (nodes which have children and grandchildren)
