In [1]:
import sys
import re
import os
from datetime import datetime
from time import sleep
from IPython.display import display, clear_output, HTML
import logging

# set working dir
WORKING_DIR = re.sub('jdi-qasp-ml.*$','',os.path.normpath(os.getcwd())) + 'jdi-qasp-ml'
os.chdir(WORKING_DIR)

import utils
from utils import *
import torch
import pandas as pd
import numpy as np
import torch
from glob import glob
import selenium
from tqdm.auto import tqdm, trange

START_TS = datetime.now()
display(HTML(f'{START_TS} - Current working directory: <b>{WORKING_DIR}</b>'))

import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline

import numba


2021-06-12 01:06:43,672 -INFO - config:config.py:16 - Module utils.config was loaded
2021-06-12 01:06:43,674 -INFO - common:common.py:618 - Module utils.common is loaded...
2021-06-12 01:06:43,676 -INFO - hidden:hidden.py:121 - hidden module is loaded
2021-06-12 01:06:43,678 -INFO - dataset_builder:dataset_builder.py:207 - dataset package is loaded...
2021-06-12 01:06:44,287 -INFO - dataset:dataset.py:653 - dataset module is loaded...


In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer



In [3]:
datasets = JDIDataset.gen_dataset_names()
# datasets  

In [4]:
logger.setLevel(logging.WARNING)

In [5]:
ds_list = []
with trange(len(datasets)) as bar:
    for ds in datasets:
        bar.set_postfix_str(f'Processing dataset: "{ds}"')
        df_file_path = f'dataset/df/{ds}.parquet'
        ann_file_path = f'dataset/annotations/{ds}.txt'
        
        if not os.path.exists(ann_file_path):
            print('skip:', (ann_file_path, df_file_path))
        else:
            df = pd.read_parquet(df_file_path)
            df = build_features(df)
            df = assign_labels(df=df, annotations_file_path=ann_file_path)
            ds_list.append(df)
            
        bar.update(1)

df = pd.concat(ds_list).copy()

  0%|          | 0/42 [00:00<?, ?it/s]

skip: ('dataset/annotations/ac-avto161.txt', 'dataset/df/ac-avto161.parquet')


In [6]:
df[df.label_text != 'n/a'].shape[0]

4773

In [7]:
df.label_text.value_counts()

n/a                 43193
link                 2410
button               1411
checkbox              246
textfield             213
radiobutton           130
selector               82
datetimeselector       34
table                  32
slider-toggle          30
slider                 29
dropdown               27
tab                    27
textarea               23
progressbar            21
range                  18
tree-node              11
fileinput               8
steper                  7
colorpicker             6
iframe                  5
numberselector          3
Name: label_text, dtype: int64

In [8]:
df.tag_name.value_counts()

DIV                             12932
SPAN                             9201
A                                3908
TD                               2775
LI                               1973
                                ...  
MAT-TABLE                           1
DIALOG-OVERVIEW-EXAMPLE             1
MAT-CARD-HEADER                     1
DATEPICKER-LOCALE-EXAMPLE           1
TREE-NESTED-OVERVIEW-EXAMPLE        1
Name: tag_name, Length: 308, dtype: int64

# tag_name

In [9]:
tag_name_series = df[df.label_text != 'n/a'].tag_name.value_counts()
display(tag_name_series)
tag_name_set = set(tag_name_series.index)

A                    2305
DIV                   971
BUTTON                515
INPUT                 493
LABEL                  86
SPAN                   77
LI                     54
SELECT                 52
P                      44
MAT-RADIO-BUTTON       28
TABLE                  24
MAT-CHECKBOX           23
TEXTAREA               22
MAT-SELECT             21
SVG                    13
IMG                     8
MAT-TREE-NODE           8
IFRAME                  5
MAT-SLIDER              5
MAT-BUTTON-TOGGLE       5
MAT-SLIDE-TOGGLE        3
RECT                    2
PROGRESS                2
CIRCLE                  2
G                       1
I                       1
H2                      1
MAT-TABLE               1
H5                      1
Name: tag_name, dtype: int64

In [10]:
tag_name_ohe = OneHotEncoder(handle_unknown='ignore', categories=[list(tag_name_set)])
tag_name_ohe

OneHotEncoder(categories=[['IFRAME', 'P', 'SPAN', 'PROGRESS', 'A', 'SVG',
                           'MAT-SLIDER', 'SELECT', 'MAT-CHECKBOX', 'H2', 'I',
                           'BUTTON', 'LI', 'MAT-SELECT', 'TABLE', 'MAT-TABLE',
                           'H5', 'MAT-RADIO-BUTTON', 'DIV', 'MAT-TREE-NODE',
                           'TEXTAREA', 'MAT-SLIDE-TOGGLE', 'RECT', 'LABEL', 'G',
                           'IMG', 'INPUT', 'MAT-BUTTON-TOGGLE', 'CIRCLE']],
              handle_unknown='ignore')

In [11]:
np.expand_dims(df.tag_name.values, -1)

array([['HTML'],
       ['HEAD'],
       ['META'],
       ...,
       ['SCRIPT'],
       ['SCRIPT'],
       ['SCRIPT']], dtype=object)

In [12]:
tag_name_ohe.fit(np.expand_dims(df[df.label_text != 'n/a'].tag_name.values, -1))

OneHotEncoder(categories=[['IFRAME', 'P', 'SPAN', 'PROGRESS', 'A', 'SVG',
                           'MAT-SLIDER', 'SELECT', 'MAT-CHECKBOX', 'H2', 'I',
                           'BUTTON', 'LI', 'MAT-SELECT', 'TABLE', 'MAT-TABLE',
                           'H5', 'MAT-RADIO-BUTTON', 'DIV', 'MAT-TREE-NODE',
                           'TEXTAREA', 'MAT-SLIDE-TOGGLE', 'RECT', 'LABEL', 'G',
                           'IMG', 'INPUT', 'MAT-BUTTON-TOGGLE', 'CIRCLE']],
              handle_unknown='ignore')

In [13]:
tag_name_sm = tag_name_ohe.transform(np.expand_dims(df.tag_name.values, -1))
tag_name_sm

<47966x29 sparse matrix of type '<class 'numpy.float64'>'
	with 34389 stored elements in Compressed Sparse Row format>

# attributes.role

In [14]:
attr_role_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('role')).fillna('')
attr_role_series.value_counts()

               3843
button          820
tab              34
listbox          21
combobox         15
grid             12
switch            9
group             8
slider            5
progressbar       3
option            3
Name: attributes, dtype: int64

In [15]:
attr_role_ohe = OneHotEncoder(handle_unknown='ignore').fit(np.expand_dims(attr_role_series.values, -1))
attr_role_ohe

OneHotEncoder(handle_unknown='ignore')

In [16]:
attr_role_sm = attr_role_ohe.transform(
    np.expand_dims(
        df.attributes.apply(lambda x: None if x is None else x.get('role')).fillna(''),
        -1
    )
)
attr_role_sm

<47966x11 sparse matrix of type '<class 'numpy.float64'>'
	with 46773 stored elements in Compressed Sparse Row format>

# attributes.type

In [17]:
attr_type_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('type')).fillna('')
attr_type_series.value_counts()

                  3964
button             345
checkbox           151
text               102
submit              41
number              34
radio               31
range               27
search              15
email               13
password            13
file                 9
reset                5
color                5
month                3
datetime-local       3
week                 3
time                 3
date                 3
tel                  2
url                  1
Name: attributes, dtype: int64

In [18]:
attr_type_ohe = OneHotEncoder(handle_unknown='ignore').fit(np.expand_dims(attr_type_series.values, -1))
attr_type_ohe

OneHotEncoder(handle_unknown='ignore')

In [19]:
attr_type_sm = attr_type_ohe.transform(
    np.expand_dims(
        df.attributes.apply(lambda x: None if x is None else x.get('type')).fillna(''),
        -1
    )
)
attr_type_sm

<47966x21 sparse matrix of type '<class 'numpy.float64'>'
	with 47703 stored elements in Compressed Sparse Row format>

# attributes.ui

In [20]:
attr_ui_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('ui')).fillna('')
attr_ui_series.value_counts()

               4714
label            40
furniture         5
table             3
d-table           3
dropdown          2
github-link       2
link              1
textarea          1
products          1
products-2        1
Name: attributes, dtype: int64

In [21]:
attr_ui_ohe = OneHotEncoder(handle_unknown='ignore').fit(np.expand_dims(attr_ui_series.values, -1))
attr_ui_ohe

OneHotEncoder(handle_unknown='ignore')

In [22]:
attr_ui_sm = attr_ui_ohe.transform(
    np.expand_dims(
        df.attributes.apply(lambda x: None if x is None else x.get('ui')).fillna(''),
        -1
    )
)
attr_ui_sm

<47966x11 sparse matrix of type '<class 'numpy.float64'>'
	with 47957 stored elements in Compressed Sparse Row format>

# attributes.class

In [23]:
attr_class_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('class')).fillna('')
attr_class_series.value_counts()

                                                                                                                                       1410
MuiButtonBase-root MuiListItem-root MuiListItem-gutters MuiListItem-button                                                              796
news-link                                                                                                                               165
_1-6r                                                                                                                                   130
_1V0q                                                                                                                                   103
                                                                                                                                       ... 
mat-input-element mat-form-field-autofill-control ng-tns-c95-92 cdk-text-field-autofill-monitored ng-untouched ng-pristine ng-valid       1
mat-select ng-tns-c1

In [24]:
class_cv = CountVectorizer()

In [25]:
class_cv.fit(attr_class_series.values)

CountVectorizer()

In [26]:
vocabulary = [v for v in class_cv.vocabulary_.keys() if re.match(r'^[a-z]+$',v ) and len(v) > 2]
print(len(vocabulary))
print(vocabulary)

363
['navbar', 'brand', 'nav', 'link', 'active', 'btn', 'download', 'inline', 'block', 'form', 'control', 'input', 'light', 'dropdown', 'toggle', 'flex', 'align', 'items', 'center', 'rounded', 'collapsed', 'carbon', 'img', 'text', 'poweredby', 'primary', 'clipboard', 'card', 'item', 'disabled', 'header', 'button', 'account', 'breaking', 'accent', 'news', 'selection', 'muted', 'outline', 'secondary', 'low', 'footer', 'company', 'info', 'border', 'bold', 'document', 'follow', 'profile', 'photo', 'icon', 'search', 'danger', 'check', 'label', 'custom', 'range', 'select', 'radio', 'cnb', 'container', 'html', 'logo', 'innovations', 'other', 'ntl', 'content', 'tizer', 'body', 'black', 'scroll', 'down', 'image', 'owl', 'dot', 'mni', 'overlay', 'column', 'cnm', 'inputs', 'slider', 'corner', 'all', 'horizontal', 'widget', 'filter', 'char', 'min', 'arrow', 'max', 'filters', 'more', 'buy', 'links', 'page', 'ani', 'postname', 'grow', 'technoblog', 'testic', 'eid', 'name', 'userpic', 'lazyloaded', '

In [27]:
attr_class_ohe = OneHotEncoder(handle_unknown='ignore', categories=[vocabulary]).fit(np.expand_dims(attr_class_series.values, -1))
attr_class_ohe

OneHotEncoder(categories=[['navbar', 'brand', 'nav', 'link', 'active', 'btn',
                           'download', 'inline', 'block', 'form', 'control',
                           'input', 'light', 'dropdown', 'toggle', 'flex',
                           'align', 'items', 'center', 'rounded', 'collapsed',
                           'carbon', 'img', 'text', 'poweredby', 'primary',
                           'clipboard', 'card', 'item', 'disabled', ...]],
              handle_unknown='ignore')

In [28]:
attr_class_ohe.transform(np.expand_dims(df.attributes.apply(lambda x: None if x is None else x.get('class')).fillna(''), -1))

<47966x363 sparse matrix of type '<class 'numpy.float64'>'
	with 413 stored elements in Compressed Sparse Row format>

# Explore attributes

In [29]:
from collections import Counter

stats_na = Counter()
stats_ctl = Counter()

print('Number of labels:', df[df.label_text != 'n/a'].shape[0])

# Will Only attributes from labeled classes 

for _, r in tqdm(df[['attributes', 'label_text', 'displayed']].iterrows()):
    if r.attributes is not None:
        lst = [k for k in r.attributes.keys() if (r.attributes.get(k) is not None) and (str(r.attributes.get(k)).strip() != "")]
        if r.label_text == 'n/a':
            stats_na.update(lst)
        else:
            stats_ctl.update(lst)

attributes_na_df = pd.DataFrame({'keys': stats_na.keys(), 'counts': stats_na.values()}).sort_values(by='counts', ascending=False)
attributes_ctl_df = pd.DataFrame({'keys': stats_ctl.keys(), 'counts': stats_ctl.values()}).sort_values(by='counts', ascending=False)

Number of labels: 4773


0it [00:00, ?it/s]

In [30]:
attr_stats_df = attributes_na_df.merge(attributes_ctl_df, on='keys').sort_values(by='counts_x', ascending=False)

counts_x_sum, counts_y_sum = attr_stats_df.agg({ 'counts_x': 'sum', 'counts_y': 'sum'}).values
attr_stats_df['p_x'] = attr_stats_df.counts_x / counts_x_sum
attr_stats_df['p_y'] = attr_stats_df.counts_y / counts_y_sum

attr_stats_df['importance'] = attr_stats_df['p_y'] / attr_stats_df['p_x']

attr_stats_df[(attr_stats_df.p_x < attr_stats_df.p_y) & (attr_stats_df.counts_y > 30)].sort_values(by='counts_y', ascending=False).head(50)

Unnamed: 0,keys,counts_x,counts_y,p_x,p_y,importance
1,href,2958,2192,0.059948,0.167341,2.79145
8,tabindex,701,1163,0.014207,0.088785,6.249555
5,role,1542,930,0.031251,0.070998,2.271883
19,aria-disabled,324,885,0.006566,0.067562,10.289296
7,type,789,809,0.01599,0.06176,3.862415
4,id,2065,609,0.04185,0.046492,1.110925
20,target,283,595,0.005735,0.045423,7.919868
23,aria-label,192,229,0.003891,0.017482,4.492847
51,placeholder,24,145,0.000486,0.01107,22.758528
68,aria-invalid,15,134,0.000304,0.01023,33.65123
