In [1]:
import sys
import re
import os
from datetime import datetime
from time import sleep
from IPython.display import display, clear_output, HTML
import logging

# set working dir
WORKING_DIR = re.sub('jdi-qasp-ml.*$','',os.path.normpath(os.getcwd())) + 'jdi-qasp-ml'
os.chdir(WORKING_DIR)

import utils
from utils import *
import torch
import pandas as pd
import numpy as np
import torch
from glob import glob
import selenium
from tqdm.auto import tqdm, trange

START_TS = datetime.now()
display(HTML(f'{START_TS} - Current working directory: <b>{WORKING_DIR}</b>'))

import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline
from scipy.sparse import csc_matrix, csr_matrix

import numba


2021-06-18 00:25:46,678 -INFO - config:config.py:16 - Module utils.config was loaded
2021-06-18 00:25:46,681 -INFO - common:common.py:618 - Module utils.common is loaded...
2021-06-18 00:25:46,682 -INFO - hidden:hidden.py:121 - hidden module is loaded
2021-06-18 00:25:47,016 -INFO - features_builder:features_builder.py:305 - feature_bilder module is loaded...
2021-06-18 00:25:47,018 -INFO - dataset_builder:dataset_builder.py:207 - dataset package is loaded...
2021-06-18 00:25:47,421 -INFO - dataset:dataset.py:634 - dataset module is loaded...


In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [3]:
datasets = JDIDataset.gen_dataset_names()
# datasets  

In [4]:
logger.setLevel(logging.WARNING)

In [5]:
ds_list = []
with trange(len(datasets)) as bar:
    for ds in datasets:
        bar.set_postfix_str(f'Processing dataset: "{ds}"')
        df_file_path = f'dataset/df/{ds}.parquet'
        ann_file_path = f'dataset/annotations/{ds}.txt'
        
        if not os.path.exists(ann_file_path):
            print('skip:', (ann_file_path, df_file_path))
        else:
            df = pd.read_parquet(df_file_path)
            df = build_features(df)
            df = assign_labels(df=df, annotations_file_path=ann_file_path)
            ds_list.append(df)
            
        bar.update(1)

df = pd.concat(ds_list).copy()

  0%|          | 0/42 [00:00<?, ?it/s]

skip: ('dataset/annotations/ac-avto161.txt', 'dataset/df/ac-avto161.parquet')


In [6]:
df.element_id.shape, df.element_id.nunique()

((47750,), 47750)

In [7]:
df[df.label_text != 'n/a'].shape[0]

4558

In [8]:
df.label_text.value_counts()

n/a                 43192
link                 2404
button               1210
checkbox              246
textfield             229
radiobutton           130
selector               85
datetimeselector       34
table                  32
slider                 30
switch                 29
tab                    27
textarea               23
progressbar            21
range                  12
tree-node              10
fileinput               8
steper                  7
dropdown                7
colorpicker             6
iframe                  5
numberselector          3
Name: label_text, dtype: int64

In [9]:
df.tag_name.value_counts()

DIV                           12727
SPAN                           9201
A                              3908
TD                             2775
LI                             1973
                              ...  
SIDENAV-OVERVIEW-EXAMPLE          1
LIST-OVERVIEW-EXAMPLE             1
TREE-LOADMORE-EXAMPLE             1
PROGRESS-BAR-QUERY-EXAMPLE        1
SLIDER-OVERVIEW-EXAMPLE           1
Name: tag_name, Length: 308, dtype: int64

# tag_name

In [10]:
tag_name_series = df[df.label_text != 'n/a'].tag_name.value_counts()
display(tag_name_series)
tag_name_set = set(tag_name_series.index)

A                    2306
DIV                   772
BUTTON                510
INPUT                 481
LABEL                  86
SPAN                   77
LI                     53
SELECT                 52
P                      44
MAT-RADIO-BUTTON       28
TABLE                  24
MAT-CHECKBOX           23
TEXTAREA               22
MAT-SELECT             22
SVG                    13
MAT-TREE-NODE           8
IMG                     8
MAT-BUTTON-TOGGLE       5
MAT-SLIDER              5
IFRAME                  5
MAT-SLIDE-TOGGLE        3
CIRCLE                  2
PROGRESS                2
RECT                    2
MAT-TABLE               1
G                       1
I                       1
H5                      1
H2                      1
Name: tag_name, dtype: int64

In [11]:
tag_name_ohe = OneHotEncoder(handle_unknown='ignore', categories=[list(tag_name_set)])
tag_name_ohe

OneHotEncoder(categories=[['MAT-SELECT', 'MAT-TABLE', 'MAT-CHECKBOX', 'G', 'H5',
                           'SELECT', 'I', 'H2', 'TABLE', 'TEXTAREA', 'DIV', 'A',
                           'INPUT', 'LI', 'SVG', 'CIRCLE', 'SPAN',
                           'MAT-BUTTON-TOGGLE', 'PROGRESS', 'LABEL', 'IMG',
                           'MAT-SLIDE-TOGGLE', 'BUTTON', 'MAT-SLIDER', 'P',
                           'MAT-TREE-NODE', 'IFRAME', 'MAT-RADIO-BUTTON',
                           'RECT']],
              handle_unknown='ignore')

In [12]:
np.expand_dims(df.tag_name.values, -1)

array([['DIV'],
       ['DIV'],
       ['BR'],
       ...,
       ['A'],
       ['IMG'],
       ['DIV']], dtype=object)

In [13]:
tag_name_ohe.fit(np.expand_dims(df[df.label_text != 'n/a'].tag_name.values, -1))

OneHotEncoder(categories=[['MAT-SELECT', 'MAT-TABLE', 'MAT-CHECKBOX', 'G', 'H5',
                           'SELECT', 'I', 'H2', 'TABLE', 'TEXTAREA', 'DIV', 'A',
                           'INPUT', 'LI', 'SVG', 'CIRCLE', 'SPAN',
                           'MAT-BUTTON-TOGGLE', 'PROGRESS', 'LABEL', 'IMG',
                           'MAT-SLIDE-TOGGLE', 'BUTTON', 'MAT-SLIDER', 'P',
                           'MAT-TREE-NODE', 'IFRAME', 'MAT-RADIO-BUTTON',
                           'RECT']],
              handle_unknown='ignore')

In [14]:
tag_name_sm = tag_name_ohe.transform(np.expand_dims(df.tag_name.values, -1))
tag_name_sm

<47750x29 sparse matrix of type '<class 'numpy.float64'>'
	with 34173 stored elements in Compressed Sparse Row format>

In [15]:
logger.setLevel(logging.DEBUG)
tag_name_sm = build_tag_name_feature(df)
tag_name_sm

2021-06-18 00:26:20,172 -INFO - features_builder:features_builder.py:228 - used column: tag_name
2021-06-18 00:26:20,173 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl


<47750x29 sparse matrix of type '<class 'numpy.float64'>'
	with 34173 stored elements in Compressed Sparse Row format>

# attributes.role

In [16]:
attr_role_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('role')).fillna('')
attr_role_series.value_counts()

               3838
button          616
tab              34
listbox          22
grid             12
switch            9
group             8
combobox          8
slider            5
option            3
progressbar       3
Name: attributes, dtype: int64

In [17]:
attr_role_ohe = OneHotEncoder(handle_unknown='ignore').fit(np.expand_dims(attr_role_series.values, -1))
attr_role_ohe

OneHotEncoder(handle_unknown='ignore')

In [18]:
attr_role_sm = attr_role_ohe.transform(
    np.expand_dims(
        df.attributes.apply(lambda x: None if x is None else x.get('role')).fillna(''),
        -1
    )
)
attr_role_sm

<47750x11 sparse matrix of type '<class 'numpy.float64'>'
	with 46557 stored elements in Compressed Sparse Row format>

In [19]:
logger.setLevel(logging.DEBUG)
role_sm = build_role_feature(df)
role_sm

2021-06-18 00:26:20,281 -INFO - features_builder:features_builder.py:251 - used column: attributes
2021-06-18 00:26:20,281 -INFO - features_builder:features_builder.py:254 - loading model/ohe_role.pkl


<47750x37 sparse matrix of type '<class 'numpy.float64'>'
	with 47731 stored elements in Compressed Sparse Row format>

# attributes.type

In [20]:
attr_type_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('type')).fillna('')
attr_type_series.value_counts()

                               3760
button                          340
checkbox                        151
text                             95
submit                           41
number                           34
radio                            31
range                            22
search                           15
password                         13
email                            13
file                              9
addToCartButtonWithQuantity       6
reset                             5
color                             5
time                              3
datetime-local                    3
month                             3
week                              3
date                              3
tel                               2
url                               1
Name: attributes, dtype: int64

In [21]:
attr_type_ohe = OneHotEncoder(handle_unknown='ignore').fit(np.expand_dims(attr_type_series.values, -1))
attr_type_ohe

OneHotEncoder(handle_unknown='ignore')

In [22]:
attr_type_sm = attr_type_ohe.transform(
    np.expand_dims(
        df.attributes.apply(lambda x: None if x is None else x.get('type')).fillna(''),
        -1
    )
)
attr_type_sm

<47750x22 sparse matrix of type '<class 'numpy.float64'>'
	with 47493 stored elements in Compressed Sparse Row format>

In [23]:
logger.setLevel(logging.DEBUG)
type_sm = build_type_feature(df)
type_sm

2021-06-18 00:26:20,408 -INFO - features_builder:features_builder.py:279 - used column: attributes
2021-06-18 00:26:20,410 -INFO - features_builder:features_builder.py:282 - loading model/ohe_type.pkl


<47750x22 sparse matrix of type '<class 'numpy.float64'>'
	with 47493 stored elements in Compressed Sparse Row format>

# attributes.ui

In [24]:
attr_ui_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('ui')).fillna('')
attr_ui_series.value_counts()

               4499
label            40
furniture         5
d-table           3
table             3
github-link       2
dropdown          2
link              1
products          1
textarea          1
products-2        1
Name: attributes, dtype: int64

In [25]:
attr_ui_ohe = OneHotEncoder(handle_unknown='ignore').fit(np.expand_dims(attr_ui_series.values, -1))
attr_ui_ohe

OneHotEncoder(handle_unknown='ignore')

In [26]:
attr_ui_sm = attr_ui_ohe.transform(
    np.expand_dims(
        df.attributes.apply(lambda x: None if x is None else x.get('ui')).fillna(''),
        -1
    )
)
attr_ui_sm

<47750x11 sparse matrix of type '<class 'numpy.float64'>'
	with 47741 stored elements in Compressed Sparse Row format>

- I find that column UI is useless

# attributes.class

In [27]:
attr_class_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('class')).fillna('')
attr_class_series.value_counts()

                                                                                                                                                                                                                                                                                                              1410
MuiButtonBase-root MuiListItem-root MuiListItem-gutters MuiListItem-button                                                                                                                                                                                                                                     592
news-link                                                                                                                                                                                                                                                                                                      165
_1-6r                                                                          

In [28]:
class_cv = CountVectorizer()

In [29]:
class_cv.fit(attr_class_series.values)

CountVectorizer()

In [30]:
vocabulary = sorted([v for v in class_cv.vocabulary_.keys() if re.match(r'^[a-z]+$',v ) and len(v) > 2])
print(len(vocabulary))
print(vocabulary)

363
['above', 'accent', 'account', 'action', 'active', 'add', 'alert', 'align', 'all', 'anchorjs', 'ani', 'ant', 'appearance', 'arrow', 'autocomplete', 'autofill', 'autosize', 'badge', 'bannerclose', 'bar', 'base', 'before', 'benefits', 'black', 'block', 'blue', 'body', 'bold', 'border', 'bottom', 'brand', 'breaking', 'btn', 'btnprimary', 'buffer', 'button', 'buttonlink', 'buttonnew', 'buttontext', 'buy', 'carbon', 'card', 'carousel', 'cascader', 'cat', 'cdk', 'center', 'change', 'char', 'check', 'checkbox', 'checked', 'checkmark', 'children', 'chip', 'circle', 'circledisableshrink', 'circleindeterminate', 'click', 'clipboard', 'close', 'cnb', 'cnm', 'collapsed', 'colorprimary', 'colorsecondary', 'column', 'company', 'comparisons', 'contained', 'containedprimary', 'containedsecondary', 'containedsizelarge', 'containedsizesmall', 'container', 'content', 'control', 'corner', 'cta', 'current', 'custom', 'danger', 'dark', 'dashed', 'dashedcolorprimary', 'datatable', 'default', 'delete', 'd

In [31]:
attr_class_cv = CountVectorizer(vocabulary=vocabulary) #.fit(attr_class_series.values)
attr_class_cv

CountVectorizer(vocabulary=['above', 'accent', 'account', 'action', 'active',
                            'add', 'alert', 'align', 'all', 'anchorjs', 'ani',
                            'ant', 'appearance', 'arrow', 'autocomplete',
                            'autofill', 'autosize', 'badge', 'bannerclose',
                            'bar', 'base', 'before', 'benefits', 'black',
                            'block', 'blue', 'body', 'bold', 'border', 'bottom', ...])

In [32]:
attr_class_sm = attr_class_cv.transform(attr_class_series.values)
attr_class_mx = attr_class_sm.todense()
attr_class_sm

<4558x363 sparse matrix of type '<class 'numpy.int64'>'
	with 10711 stored elements in Compressed Sparse Row format>

In [33]:
attr_class_mx[10]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [34]:
attr_class_tfidf = TfidfVectorizer(vocabulary=vocabulary).fit(attr_class_series.values)

In [35]:
attr_class_sm = attr_class_tfidf.transform(attr_class_series.values)
attr_class_mx = attr_class_sm.todense()
attr_class_sm, type(attr_class_sm)

(<4558x363 sparse matrix of type '<class 'numpy.float64'>'
 	with 10711 stored elements in Compressed Sparse Row format>,
 scipy.sparse.csr.csr_matrix)

In [36]:
logger.setLevel(logging.DEBUG)
attr_class_sm = build_class_feature(df)
attr_class_sm

2021-06-18 00:26:20,741 -INFO - features_builder:features_builder.py:188 - used column: attributes
2021-06-18 00:26:20,741 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...


<47750x389 sparse matrix of type '<class 'numpy.float64'>'
	with 64025 stored elements in Compressed Sparse Row format>

# Explore attributes

In [37]:
from collections import Counter

stats_na = Counter()
stats_ctl = Counter()
stats_all = Counter()

print('Number of labels:', df[df.label_text != 'n/a'].shape[0])

# Will use Only attributes from labeled classes 

for _, r in tqdm(df[['attributes', 'label_text', 'displayed']].iterrows()):
    if r.attributes is not None:
        lst = [k for k in r.attributes.keys() if (r.attributes.get(k) is not None) and (str(r.attributes.get(k)).strip() != "")]
        if r.label_text == 'n/a':
            stats_na.update(lst)
        else:
            stats_ctl.update(lst)
        stats_all.update(lst)
attributes_na_df = pd.DataFrame({'keys': stats_na.keys(), 'counts': stats_na.values()}).sort_values(by='counts', ascending=False)

attributes_ctl_df = pd.DataFrame({'keys': stats_ctl.keys(), 'counts': stats_ctl.values()}).sort_values(by='counts', ascending=False)
attributes_all_df = pd.DataFrame({'keys': stats_all.keys(), 'counts': stats_all.values()}).sort_values(by='counts', ascending=False)


Number of labels: 4558


0it [00:00, ?it/s]

In [38]:
attributes_all_df['p_all']=attributes_all_df.counts/df.shape[0]
attributes_all_df

Unnamed: 0,keys,counts,p_all
0,class,30793,0.644880
2,href,5150,0.107853
3,id,2669,0.055895
19,role,2262,0.047372
8,style,2227,0.046639
...,...,...,...
255,svgicon,1,0.000021
254,tickinterval,1,0.000021
132,data-fl-category-id,1,0.000021
234,data-provides,1,0.000021


In [39]:
attr_stats_df = attributes_na_df.merge(attributes_ctl_df, on='keys', how='left')
attr_stats_df = attr_stats_df.merge(attributes_all_df[['keys', 'p_all']], on = 'keys', how='left')

counts_x_sum, counts_y_sum = attr_stats_df.agg({ 'counts_x': 'sum', 'counts_y': 'sum'}).values
attr_stats_df['p_x'] = attr_stats_df.counts_x / counts_x_sum
attr_stats_df['p_y'] = attr_stats_df.counts_y / counts_y_sum


attr_stats_df['importance'] = attr_stats_df['p_y'] / attr_stats_df['p_x']

attr_stats_df[(attr_stats_df.p_x < attr_stats_df.p_y) & (attr_stats_df.counts_y > 30)].sort_values(by='p_y', ascending=False).head(50)



Unnamed: 0,keys,counts_x,counts_y,p_all,p_x,p_y,importance
1,href,2957,2193.0,0.107853,0.052902,0.179828,3.399274
10,tabindex,707,952.0,0.034743,0.012648,0.078065,6.171866
9,type,789,798.0,0.033236,0.014116,0.065437,4.635801
5,role,1542,720.0,0.047372,0.027587,0.059041,2.140164
23,aria-disabled,323,682.0,0.021047,0.005779,0.055925,9.677892
4,id,2064,605.0,0.055895,0.036926,0.04961,1.343521
29,target,282,596.0,0.018387,0.005045,0.048872,9.687151
34,aria-label,191,227.0,0.008754,0.003417,0.018614,5.447427
87,placeholder,24,141.0,0.003455,0.000429,0.011562,26.928167
116,aria-invalid,14,130.0,0.003016,0.00025,0.01066,42.561237


In [40]:
_keys = set([
    'aria-invalid', 
    'aria-haspopup', 
    'aria-expanded',      
    'aria-required',
    'aria-disabled',
    'aria-selected',
    'aria-describedby',
    'aria-controls',
    'aria-invalid',
    'placeholder',
    'value',
    'for',
    'onclick',
    'target',
    'role',
    'type',
    'id',
    'name',
    'href',
    'min',
    'max'
])

report = []
with trange(df.shape[0]) as bar:
    for _, r in df.iterrows():
        attr = r.attributes
        if type(attr) is dict:
            d = {}
            d['label'] = 1 if r.label_text != 'n/a' else 0
            for k in _keys:
                v = attr.get(k)
                if v is not None and v.strip() != "":
                    d[k]=1
                else:
                    d[k]=0
        else:
            d = {k:0 for k in _keys}
                
        report.append(d)
        bar.update(1)

report_df = pd.DataFrame(report)

  0%|          | 0/47750 [00:00<?, ?it/s]

In [41]:
report_df.sum(axis=1).value_counts()

0.0    35797
1.0     6326
2.0     3537
3.0     1573
4.0      299
5.0      103
6.0      100
7.0        9
9.0        4
8.0        2
dtype: int64

In [42]:
df.attributes.apply(lambda x: x.get('target') if x is not None else None).value_counts()

_blank    762
_self     115
_top        1
Name: attributes, dtype: int64

In [43]:
df.columns

Index(['attributes', 'displayed', 'element_id', 'height', 'onmouseenter',
       'onmouseover', 'parent_id', 'style', 'tag_name', 'text', 'width', 'x',
       'y', 'is_hidden', 'upper_sibling', 'lower_sibling', 'siblings', 'idx',
       'label', 'annotation_line_no', 'iou', 'tag', 'label_text', 'dataset'],
      dtype='object')

In [44]:
report_df.corr()

Unnamed: 0,label,href,for,aria-required,value,aria-controls,target,aria-haspopup,aria-invalid,placeholder,...,min,max,aria-disabled,onclick,aria-expanded,name,type,id,aria-describedby,role
label,1.0,0.386107,0.013142,0.130391,0.051951,0.066226,0.269947,0.090801,0.150572,0.151494,...,0.082169,0.082972,0.289132,0.05908,0.069229,0.009278,0.254101,0.103249,0.084114,0.164666
href,0.386107,1.0,-0.037186,-0.017084,-0.034889,0.00117,0.393633,-0.001288,-0.019123,-0.020474,...,-0.011914,-0.01159,-0.039693,0.009583,-0.002342,-0.034926,-0.045635,-0.030524,-0.011699,-0.071814
for,0.013142,-0.037186,1.0,-0.005255,-0.010732,-0.004298,-0.014638,-0.003856,-0.005882,-0.006298,...,-0.003665,-0.003565,-0.015682,-0.004298,-0.005092,-0.010743,-0.01983,0.073126,-0.003599,-0.023849
aria-required,0.130391,-0.017084,-0.005255,1.0,0.020874,-0.001975,-0.006725,0.271119,0.893378,0.244657,...,0.04825,0.011194,0.070185,-0.001975,0.051625,0.003657,0.055246,0.201933,0.176319,0.053403
value,0.051951,-0.034889,-0.010732,0.020874,1.0,-0.004033,-0.013734,-0.003618,0.040616,0.094678,...,0.187501,0.20558,-0.010307,0.043252,-0.004778,0.318488,0.296574,0.186649,0.021713,-0.018407
aria-controls,0.066226,0.00117,-0.004298,-0.001975,-0.004033,1.0,-0.0055,0.027539,-0.00221,0.006528,...,-0.001377,-0.00134,0.095919,-0.001615,0.283752,0.00646,0.059519,0.101552,-0.001352,0.108972
target,0.269947,0.393633,-0.014638,-0.006725,-0.013734,-0.0055,1.0,-0.000606,-0.007527,-0.008059,...,-0.00469,-0.004562,-0.003779,0.017809,-0.003235,-0.013748,-0.025377,-0.028552,-0.004605,-0.029053
aria-haspopup,0.090801,-0.001288,-0.003856,0.271119,-0.003618,0.027539,-0.000606,1.0,0.24196,0.06725,...,0.015757,0.016264,-0.001235,-0.001449,0.230889,-0.003622,0.07767,0.079834,-0.001213,0.043975
aria-invalid,0.150572,-0.019123,-0.005882,0.893378,0.040616,-0.00221,-0.007527,0.24196,1.0,0.224561,...,0.042752,0.009637,0.061116,-0.00221,0.045621,0.044403,0.075031,0.222708,0.202698,0.045268
placeholder,0.151494,-0.020474,-0.006298,0.244657,0.094678,0.006528,-0.008059,0.06725,0.224561,1.0,...,0.1961,0.190961,-0.008634,-0.002367,0.057297,0.044326,0.237951,0.161223,0.199768,0.000309


In [45]:
logger.setLevel(logging.DEBUG)
attributes_df = build_attributes_feature(df)
attributes_df

2021-06-18 00:26:59,573 -INFO - features_builder:features_builder.py:152 - used column: attributes


  0%|          | 0/47750 [00:00<?, ?it/s]

Unnamed: 0,aria-invalid,aria-haspopup,aria-expanded,aria-required,aria-disabled,aria-selected,aria-describedby,aria-controls,placeholder,value,for,onclick,target,id,name,min,max
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47745,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47746,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
47747,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47748,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [46]:
csr_matrix(attributes_df.values), attributes_df.values.sum()

(<47750x17 sparse matrix of type '<class 'numpy.int64'>'
 	with 7002 stored elements in Compressed Sparse Row format>,
 7002)

In [47]:
df.columns

Index(['attributes', 'displayed', 'element_id', 'height', 'onmouseenter',
       'onmouseover', 'parent_id', 'style', 'tag_name', 'text', 'width', 'x',
       'y', 'is_hidden', 'upper_sibling', 'lower_sibling', 'siblings', 'idx',
       'label', 'annotation_line_no', 'iou', 'tag', 'label_text', 'dataset'],
      dtype='object')

# Check for duplicates

In [48]:
assert df.element_id.nunique() == df.shape[0], "There are duplicates in the dataset"
display(HTML("<h1><b>The dataset is OK, there are no duplicates</b></h1>"))

<hr style="height: 5px">

# DatasetCollector

In [49]:
COLS = ['element_id', 'tag_name', 'attributes', 'displayed', 'is_hidden']
TARGET_PARENT_COLUMNS = ['parent_id', 'tag_name_parent', 'attributes_parent', 'displayed_parent', 'is_hidden_parent']
df_parent = df[COLS].copy()
df_parent.columns = TARGET_PARENT_COLUMNS
df.shape

TARGET_UP_SIBLING_COLUMNS = ['upper_sibling', 'tag_name_upsib', 'attributes_upsib', 'displayed_upsib', 'is_hidden_upsib']
upsib_df = df[COLS].copy()
upsib_df.columns = TARGET_UP_SIBLING_COLUMNS

TARGET_DN_SIBLING_COLUMNS = ['lower_sibling', 'tag_name_dnsib', 'attributes_dnsib', 'displayed_dnsib', 'is_hidden_dnsib']
dnsib_df = df[COLS].copy()
dnsib_df.columns = TARGET_DN_SIBLING_COLUMNS



In [50]:
train_df = df.merge(df_parent, on='parent_id', how='left')
train_df = train_df.merge(upsib_df, on='upper_sibling', how='left')
train_df = train_df.merge(dnsib_df, on='lower_sibling', how='left')


In [51]:
train_df.shape, train_df.columns

((47750, 36),
 Index(['attributes', 'displayed', 'element_id', 'height', 'onmouseenter',
        'onmouseover', 'parent_id', 'style', 'tag_name', 'text', 'width', 'x',
        'y', 'is_hidden', 'upper_sibling', 'lower_sibling', 'siblings', 'idx',
        'label', 'annotation_line_no', 'iou', 'tag', 'label_text', 'dataset',
        'tag_name_parent', 'attributes_parent', 'displayed_parent',
        'is_hidden_parent', 'tag_name_upsib', 'attributes_upsib',
        'displayed_upsib', 'is_hidden_upsib', 'tag_name_dnsib',
        'attributes_dnsib', 'displayed_dnsib', 'is_hidden_dnsib'],
       dtype='object'))

In [52]:
parent_attributes_sm = csr_matrix(build_attributes_feature(df = train_df, colname = 'attributes_parent').values)
parent_attributes_sm

2021-06-18 00:27:03,474 -INFO - features_builder:features_builder.py:152 - used column: attributes_parent


  0%|          | 0/47750 [00:00<?, ?it/s]

<47750x17 sparse matrix of type '<class 'numpy.int64'>'
	with 6456 stored elements in Compressed Sparse Row format>

In [53]:
upsib_attributes_sm = csr_matrix(build_attributes_feature(df = train_df, colname = 'attributes_upsib').values)
upsib_attributes_sm

2021-06-18 00:27:07,149 -INFO - features_builder:features_builder.py:152 - used column: attributes_upsib


  0%|          | 0/47750 [00:00<?, ?it/s]

<47750x17 sparse matrix of type '<class 'numpy.int64'>'
	with 3630 stored elements in Compressed Sparse Row format>

In [54]:
dnsib_attributes_sm = csr_matrix(build_attributes_feature(df = train_df, colname = 'attributes_dnsib').values)
dnsib_attributes_sm

2021-06-18 00:27:10,699 -INFO - features_builder:features_builder.py:152 - used column: attributes_dnsib


  0%|          | 0/47750 [00:00<?, ?it/s]

<47750x17 sparse matrix of type '<class 'numpy.int64'>'
	with 2877 stored elements in Compressed Sparse Row format>

<hr>

- # class

In [55]:
build_class_feature(train_df, colname='attributes')

2021-06-18 00:27:14,268 -INFO - features_builder:features_builder.py:188 - used column: attributes
2021-06-18 00:27:14,269 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...


<47750x389 sparse matrix of type '<class 'numpy.float64'>'
	with 64025 stored elements in Compressed Sparse Row format>

In [56]:
build_class_feature(train_df, colname='attributes_parent')

2021-06-18 00:27:14,476 -INFO - features_builder:features_builder.py:188 - used column: attributes_parent
2021-06-18 00:27:14,476 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...


<47750x389 sparse matrix of type '<class 'numpy.float64'>'
	with 77982 stored elements in Compressed Sparse Row format>

In [57]:
build_class_feature(train_df, colname='attributes_upsib')

2021-06-18 00:27:14,683 -INFO - features_builder:features_builder.py:188 - used column: attributes_upsib
2021-06-18 00:27:14,684 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...


<47750x389 sparse matrix of type '<class 'numpy.float64'>'
	with 28251 stored elements in Compressed Sparse Row format>

In [58]:
build_class_feature(train_df, colname='attributes_dnsib')

2021-06-18 00:27:14,840 -INFO - features_builder:features_builder.py:188 - used column: attributes_dnsib
2021-06-18 00:27:14,842 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...


<47750x389 sparse matrix of type '<class 'numpy.float64'>'
	with 27468 stored elements in Compressed Sparse Row format>

<hr>

- # tag_name

In [59]:
build_tag_name_feature(train_df, colname='tag_name')

2021-06-18 00:27:14,999 -INFO - features_builder:features_builder.py:228 - used column: tag_name
2021-06-18 00:27:15,001 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl


<47750x29 sparse matrix of type '<class 'numpy.float64'>'
	with 34173 stored elements in Compressed Sparse Row format>

In [60]:
build_tag_name_feature(train_df, colname='tag_name_parent')

2021-06-18 00:27:15,031 -INFO - features_builder:features_builder.py:228 - used column: tag_name_parent
2021-06-18 00:27:15,032 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl


<47750x29 sparse matrix of type '<class 'numpy.float64'>'
	with 30194 stored elements in Compressed Sparse Row format>

In [61]:
build_tag_name_feature(train_df, colname='tag_name_upsib')

2021-06-18 00:27:15,064 -INFO - features_builder:features_builder.py:228 - used column: tag_name_upsib
2021-06-18 00:27:15,064 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl


<47750x29 sparse matrix of type '<class 'numpy.float64'>'
	with 17026 stored elements in Compressed Sparse Row format>

In [62]:
build_tag_name_feature(train_df, colname='tag_name_dnsib')

2021-06-18 00:27:15,095 -INFO - features_builder:features_builder.py:228 - used column: tag_name_dnsib
2021-06-18 00:27:15,096 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl


<47750x29 sparse matrix of type '<class 'numpy.float64'>'
	with 17262 stored elements in Compressed Sparse Row format>

<hr>

- # type

In [63]:
build_type_feature(train_df, colname='attributes')

2021-06-18 00:27:15,127 -INFO - features_builder:features_builder.py:279 - used column: attributes
2021-06-18 00:27:15,128 -INFO - features_builder:features_builder.py:282 - loading model/ohe_type.pkl


<47750x22 sparse matrix of type '<class 'numpy.float64'>'
	with 47493 stored elements in Compressed Sparse Row format>

In [64]:
build_type_feature(train_df, colname='attributes_parent')

2021-06-18 00:27:15,175 -INFO - features_builder:features_builder.py:279 - used column: attributes_parent
2021-06-18 00:27:15,176 -INFO - features_builder:features_builder.py:282 - loading model/ohe_type.pkl


<47750x22 sparse matrix of type '<class 'numpy.float64'>'
	with 47750 stored elements in Compressed Sparse Row format>

In [65]:
build_type_feature(train_df, colname='attributes_upsib')

2021-06-18 00:27:15,222 -INFO - features_builder:features_builder.py:279 - used column: attributes_upsib
2021-06-18 00:27:15,223 -INFO - features_builder:features_builder.py:282 - loading model/ohe_type.pkl


<47750x22 sparse matrix of type '<class 'numpy.float64'>'
	with 47530 stored elements in Compressed Sparse Row format>

In [66]:
build_type_feature(train_df, colname='attributes_dnsib')

2021-06-18 00:27:15,271 -INFO - features_builder:features_builder.py:279 - used column: attributes_dnsib
2021-06-18 00:27:15,272 -INFO - features_builder:features_builder.py:282 - loading model/ohe_type.pkl


<47750x22 sparse matrix of type '<class 'numpy.float64'>'
	with 47521 stored elements in Compressed Sparse Row format>

<hr>

- # role

In [67]:
build_role_feature(train_df, colname='attributes')

2021-06-18 00:27:15,317 -INFO - features_builder:features_builder.py:251 - used column: attributes
2021-06-18 00:27:15,318 -INFO - features_builder:features_builder.py:254 - loading model/ohe_role.pkl


<47750x37 sparse matrix of type '<class 'numpy.float64'>'
	with 47731 stored elements in Compressed Sparse Row format>

In [68]:
build_role_feature(train_df, colname='attributes_parent')

2021-06-18 00:27:15,369 -INFO - features_builder:features_builder.py:251 - used column: attributes_parent
2021-06-18 00:27:15,370 -INFO - features_builder:features_builder.py:254 - loading model/ohe_role.pkl


<47750x37 sparse matrix of type '<class 'numpy.float64'>'
	with 47718 stored elements in Compressed Sparse Row format>

In [69]:
build_role_feature(train_df, colname='attributes_upsib')

2021-06-18 00:27:15,412 -INFO - features_builder:features_builder.py:251 - used column: attributes_upsib
2021-06-18 00:27:15,413 -INFO - features_builder:features_builder.py:254 - loading model/ohe_role.pkl


<47750x37 sparse matrix of type '<class 'numpy.float64'>'
	with 47735 stored elements in Compressed Sparse Row format>

In [70]:
build_role_feature(train_df, colname='attributes_dnsib')

2021-06-18 00:27:15,460 -INFO - features_builder:features_builder.py:251 - used column: attributes_dnsib
2021-06-18 00:27:15,461 -INFO - features_builder:features_builder.py:254 - loading model/ohe_role.pkl


<47750x37 sparse matrix of type '<class 'numpy.float64'>'
	with 47735 stored elements in Compressed Sparse Row format>

In [71]:
df.columns

Index(['attributes', 'displayed', 'element_id', 'height', 'onmouseenter',
       'onmouseover', 'parent_id', 'style', 'tag_name', 'text', 'width', 'x',
       'y', 'is_hidden', 'upper_sibling', 'lower_sibling', 'siblings', 'idx',
       'label', 'annotation_line_no', 'iou', 'tag', 'label_text', 'dataset'],
      dtype='object')

In [72]:
X, y = collect_dataset(df)

2021-06-18 00:27:15,680 -INFO - features_builder:features_builder.py:228 - used column: tag_name
2021-06-18 00:27:15,680 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl
2021-06-18 00:27:15,699 -INFO - features_builder:features_builder.py:228 - used column: tag_name_parent
2021-06-18 00:27:15,700 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl
2021-06-18 00:27:15,721 -INFO - features_builder:features_builder.py:228 - used column: tag_name_upsib
2021-06-18 00:27:15,721 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl
2021-06-18 00:27:15,745 -INFO - features_builder:features_builder.py:228 - used column: tag_name_dnsib
2021-06-18 00:27:15,746 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl
2021-06-18 00:27:15,770 -INFO - dataset_collector:dataset_collector.py:45 - tag_name: (47750, 29)
2021-06-18 00:27:15,771 -INFO - features_builder:features_builder.py:15

  0%|          | 0/47750 [00:00<?, ?it/s]

2021-06-18 00:27:19,435 -INFO - features_builder:features_builder.py:152 - used column: attributes_parent


  0%|          | 0/47750 [00:00<?, ?it/s]

2021-06-18 00:27:23,037 -INFO - features_builder:features_builder.py:152 - used column: attributes_upsib


  0%|          | 0/47750 [00:00<?, ?it/s]

2021-06-18 00:27:26,535 -INFO - features_builder:features_builder.py:152 - used column: attributes_dnsib


  0%|          | 0/47750 [00:00<?, ?it/s]

2021-06-18 00:27:30,013 -INFO - dataset_collector:dataset_collector.py:52 - attributes_sm: (47750, 17)
2021-06-18 00:27:30,014 -INFO - features_builder:features_builder.py:188 - used column: attributes
2021-06-18 00:27:30,015 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...
2021-06-18 00:27:30,194 -INFO - features_builder:features_builder.py:188 - used column: attributes_parent
2021-06-18 00:27:30,194 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...
2021-06-18 00:27:30,379 -INFO - features_builder:features_builder.py:188 - used column: attributes_upsib
2021-06-18 00:27:30,380 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...
2021-06-18 00:27:30,518 -INFO - features_builder:features_builder.py:188 - used column: attributes_dnsib
2021-06-18 00:27:30,518 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer f

In [73]:
with open('dataset/classes.txt', 'r') as f:
    decoder_dict = { i:v.strip() for i, v in enumerate(f.readlines())}
results_df = pd.DataFrame(pd.Series(y).value_counts(), columns=['cnt'])
results_df['class'] = results_df.index.map(decoder_dict)
results_df

Unnamed: 0,cnt,class
21,43192,
1,2404,link
0,1210,button
4,246,checkbox
2,229,textfield
5,130,radiobutton
14,85,selector
11,34,datetimeselector
15,32,table
17,30,slider


In [74]:
df[df.label_text == 'dropdown'][['dataset', 'tag_name', 'x', 'y', 'width', 'height']]

Unnamed: 0,dataset,tag_name,x,y,width,height
1417,ozon,DIV,479.5,257.0,206.0,18.0
907,bootstrap,BUTTON,240.0,6560.140625,114.0,38.0
414,bootstrap,A,309.46875,2032.0,107.640625,36.0
915,bootstrap,BUTTON,407.21875,6603.140625,29.59375,38.0
435,bootstrap,A,307.46875,2124.0,105.640625,34.0
480,bootstrap,A,516.265625,2310.5,72.859375,34.0
503,bootstrap,A,529.15625,2472.0,89.640625,34.0


In [75]:
# def rebalance(y: np.ndarray):
#     with open('dataset/classes.txt', 'r') as f:
#         decoder_dict = { i: v.strip()  for i, v in enumerate(f.readlines())}

#     proportion_df = pd.DataFrame([{'label': i, 'label_text': decoder_dict[i], 'cnt': np.where(y==i)[0].shape[0] } for i in range(0, len(decoder_dict))])
#     labels_cnt = proportion_df[proportion_df.label_text != 'n/a'][['cnt']].sum().values[0]
#     na_label_cnt = proportion_df[proportion_df.label_text == 'n/a'].cnt.values[0]
#     print(na_label_cnt, labels_cnt)
#     proportion_df['ratio'] = proportion_df.apply(lambda r: na_label_cnt//r.cnt//7 if r.label_text != 'n/a' else 1, axis=1)
#     proportion_df['cnt2'] = proportion_df.ratio * proportion_df.cnt
#     display(proportion_df)
    
#     indices = []
#     for i, r in proportion_df.iterrows():
#         lst = np.where(y == r.label)[0].tolist()
#         for _ in range(r.ratio):
#             indices.extend(lst)
            
#     print(len(indices))
#     np.random.shuffle(indices)
#     return indices


In [78]:
indices = rebalance(y)

2021-06-18 00:28:33,235 -INFO - dataset:dataset.py:602 - Rebalance dataset
2021-06-18 00:28:33,240 -INFO - dataset:dataset.py:614 - "n/a" count: 43192, labels count: 4558


Unnamed: 0,label,label_text,cnt,ratio,cnt_rebalanced
0,0,button,1210,5,6050
1,1,link,2404,2,4808
2,2,textfield,229,26,5954
3,3,dropdown,7,881,6167
4,4,checkbox,246,25,6150
5,5,radiobutton,130,47,6110
6,6,textarea,23,268,6164
7,7,fileinput,8,771,6168
8,8,iframe,5,1234,6170
9,9,range,12,514,6168


2021-06-18 00:28:33,261 -INFO - dataset:dataset.py:629 - Rebalanced and shuffled indices: 170799
