In [1]:
import sys
import re
import os
from datetime import datetime
from time import sleep
from IPython.display import display, clear_output, HTML
import logging

# set working dir
WORKING_DIR = re.sub('jdi-qasp-ml.*$','',os.path.normpath(os.getcwd())) + 'jdi-qasp-ml'
os.chdir(WORKING_DIR)

import utils
from utils import *
import torch
import pandas as pd
import numpy as np
import torch
from glob import glob
import selenium
from tqdm.auto import tqdm, trange

START_TS = datetime.now()
display(HTML(f'{START_TS} - Current working directory: <b>{WORKING_DIR}</b>'))

import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline
from scipy.sparse import csc_matrix, csr_matrix

import numba


2021-06-16 20:50:22,949 -INFO - config:config.py:16 - Module utils.config was loaded
2021-06-16 20:50:22,951 -INFO - common:common.py:618 - Module utils.common is loaded...
2021-06-16 20:50:22,952 -INFO - hidden:hidden.py:121 - hidden module is loaded
2021-06-16 20:50:23,248 -INFO - dataset_builder:dataset_builder.py:207 - dataset package is loaded...
2021-06-16 20:50:23,551 -INFO - dataset:dataset.py:653 - dataset module is loaded...


In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [3]:
datasets = JDIDataset.gen_dataset_names()
# datasets  

In [4]:
logger.setLevel(logging.WARNING)

In [5]:
ds_list = []
with trange(len(datasets)) as bar:
    for ds in datasets:
        bar.set_postfix_str(f'Processing dataset: "{ds}"')
        df_file_path = f'dataset/df/{ds}.parquet'
        ann_file_path = f'dataset/annotations/{ds}.txt'
        
        if not os.path.exists(ann_file_path):
            print('skip:', (ann_file_path, df_file_path))
        else:
            df = pd.read_parquet(df_file_path)
            df = build_features(df)
            df = assign_labels(df=df, annotations_file_path=ann_file_path)
            ds_list.append(df)
            
        bar.update(1)

df = pd.concat(ds_list).copy()

  0%|          | 0/42 [00:00<?, ?it/s]

skip: ('dataset/annotations/ac-avto161.txt', 'dataset/df/ac-avto161.parquet')


In [6]:
df.element_id.shape, df.element_id.nunique()

((47750,), 47750)

In [7]:
df[df.label_text != 'n/a'].shape[0]

4557

In [8]:
df.label_text.value_counts()

n/a                 43193
link                 2408
button               1207
checkbox              246
textfield             212
radiobutton           130
selector               83
datetimeselector       34
table                  32
slider-toggle          30
slider                 29
tab                    27
textarea               23
dropdown               22
progressbar            21
range                  13
tree-node              11
fileinput               8
steper                  7
colorpicker             6
iframe                  5
numberselector          3
Name: label_text, dtype: int64

In [9]:
df.tag_name.value_counts()

DIV                           12727
SPAN                           9201
A                              3908
TD                             2775
LI                             1973
                              ...  
SORT-OVERVIEW-EXAMPLE             1
TAB-GROUP-BASIC-EXAMPLE           1
TOOLTIP-OVERVIEW-EXAMPLE          1
DATEPICKER-MIN-MAX-EXAMPLE        1
TREE-LOADMORE-EXAMPLE             1
Name: tag_name, Length: 308, dtype: int64

# tag_name

In [10]:
tag_name_series = df[df.label_text != 'n/a'].tag_name.value_counts()
display(tag_name_series)
tag_name_set = set(tag_name_series.index)

A                    2305
DIV                   772
BUTTON                509
INPUT                 481
LABEL                  86
SPAN                   77
LI                     54
SELECT                 52
P                      44
MAT-RADIO-BUTTON       28
TABLE                  24
MAT-CHECKBOX           23
TEXTAREA               22
MAT-SELECT             22
SVG                    13
IMG                     8
MAT-TREE-NODE           8
MAT-SLIDER              5
MAT-BUTTON-TOGGLE       5
IFRAME                  5
MAT-SLIDE-TOGGLE        3
CIRCLE                  2
PROGRESS                2
RECT                    2
MAT-TABLE               1
G                       1
H5                      1
H2                      1
I                       1
Name: tag_name, dtype: int64

In [11]:
tag_name_ohe = OneHotEncoder(handle_unknown='ignore', categories=[list(tag_name_set)])
tag_name_ohe

OneHotEncoder(categories=[['MAT-CHECKBOX', 'BUTTON', 'P', 'DIV', 'H5', 'G',
                           'MAT-BUTTON-TOGGLE', 'TABLE', 'SVG', 'MAT-SLIDER',
                           'IFRAME', 'MAT-TREE-NODE', 'CIRCLE', 'PROGRESS',
                           'TEXTAREA', 'A', 'IMG', 'MAT-RADIO-BUTTON',
                           'MAT-SELECT', 'MAT-SLIDE-TOGGLE', 'RECT', 'LABEL',
                           'LI', 'H2', 'INPUT', 'SELECT', 'I', 'MAT-TABLE',
                           'SPAN']],
              handle_unknown='ignore')

In [12]:
np.expand_dims(df.tag_name.values, -1)

array([['HTML'],
       ['HEAD'],
       ['META'],
       ...,
       ['LI'],
       ['A'],
       ['SCRIPT']], dtype=object)

In [13]:
tag_name_ohe.fit(np.expand_dims(df[df.label_text != 'n/a'].tag_name.values, -1))

OneHotEncoder(categories=[['MAT-CHECKBOX', 'BUTTON', 'P', 'DIV', 'H5', 'G',
                           'MAT-BUTTON-TOGGLE', 'TABLE', 'SVG', 'MAT-SLIDER',
                           'IFRAME', 'MAT-TREE-NODE', 'CIRCLE', 'PROGRESS',
                           'TEXTAREA', 'A', 'IMG', 'MAT-RADIO-BUTTON',
                           'MAT-SELECT', 'MAT-SLIDE-TOGGLE', 'RECT', 'LABEL',
                           'LI', 'H2', 'INPUT', 'SELECT', 'I', 'MAT-TABLE',
                           'SPAN']],
              handle_unknown='ignore')

In [14]:
tag_name_sm = tag_name_ohe.transform(np.expand_dims(df.tag_name.values, -1))
tag_name_sm

<47750x29 sparse matrix of type '<class 'numpy.float64'>'
	with 34173 stored elements in Compressed Sparse Row format>

In [15]:
logger.setLevel(logging.DEBUG)
tag_name_sm = build_tag_name_feature(df)
tag_name_sm

2021-06-16 20:50:56,296 -INFO - features_builder:features_builder.py:228 - used column: tag_name
2021-06-16 20:50:56,297 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl


<47750x29 sparse matrix of type '<class 'numpy.float64'>'
	with 34173 stored elements in Compressed Sparse Row format>

# attributes.role

In [16]:
attr_role_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('role')).fillna('')
attr_role_series.value_counts()

               3837
button          616
tab              34
listbox          22
grid             12
switch            9
combobox          8
group             8
slider            5
option            3
progressbar       3
Name: attributes, dtype: int64

In [17]:
attr_role_ohe = OneHotEncoder(handle_unknown='ignore').fit(np.expand_dims(attr_role_series.values, -1))
attr_role_ohe

OneHotEncoder(handle_unknown='ignore')

In [18]:
attr_role_sm = attr_role_ohe.transform(
    np.expand_dims(
        df.attributes.apply(lambda x: None if x is None else x.get('role')).fillna(''),
        -1
    )
)
attr_role_sm

<47750x11 sparse matrix of type '<class 'numpy.float64'>'
	with 46557 stored elements in Compressed Sparse Row format>

In [19]:
logger.setLevel(logging.DEBUG)
role_sm = build_role_feature(df)
role_sm

2021-06-16 20:50:56,427 -INFO - features_builder:features_builder.py:251 - used column: attributes
2021-06-16 20:50:56,428 -INFO - features_builder:features_builder.py:254 - loading model/ohe_role.pkl


<47750x37 sparse matrix of type '<class 'numpy.float64'>'
	with 47731 stored elements in Compressed Sparse Row format>

# attributes.type

In [20]:
attr_type_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('type')).fillna('')
attr_type_series.value_counts()

                               3760
button                          339
checkbox                        151
text                             95
submit                           41
number                           34
radio                            31
range                            22
search                           15
email                            13
password                         13
file                              9
addToCartButtonWithQuantity       6
color                             5
reset                             5
datetime-local                    3
date                              3
time                              3
week                              3
month                             3
tel                               2
url                               1
Name: attributes, dtype: int64

In [21]:
attr_type_ohe = OneHotEncoder(handle_unknown='ignore').fit(np.expand_dims(attr_type_series.values, -1))
attr_type_ohe

OneHotEncoder(handle_unknown='ignore')

In [22]:
attr_type_sm = attr_type_ohe.transform(
    np.expand_dims(
        df.attributes.apply(lambda x: None if x is None else x.get('type')).fillna(''),
        -1
    )
)
attr_type_sm

<47750x22 sparse matrix of type '<class 'numpy.float64'>'
	with 47493 stored elements in Compressed Sparse Row format>

In [23]:
logger.setLevel(logging.DEBUG)
type_sm = build_type_feature(df)
type_sm

2021-06-16 20:50:56,577 -INFO - features_builder:features_builder.py:279 - used column: attributes
2021-06-16 20:50:56,579 -INFO - features_builder:features_builder.py:282 - loading model/ohe_type.pkl


<47750x1548 sparse matrix of type '<class 'numpy.float64'>'
	with 47723 stored elements in Compressed Sparse Row format>

# attributes.ui

In [24]:
attr_ui_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('ui')).fillna('')
attr_ui_series.value_counts()

               4498
label            40
furniture         5
d-table           3
table             3
dropdown          2
github-link       2
textarea          1
link              1
products-2        1
products          1
Name: attributes, dtype: int64

In [25]:
attr_ui_ohe = OneHotEncoder(handle_unknown='ignore').fit(np.expand_dims(attr_ui_series.values, -1))
attr_ui_ohe

OneHotEncoder(handle_unknown='ignore')

In [26]:
attr_ui_sm = attr_ui_ohe.transform(
    np.expand_dims(
        df.attributes.apply(lambda x: None if x is None else x.get('ui')).fillna(''),
        -1
    )
)
attr_ui_sm

<47750x11 sparse matrix of type '<class 'numpy.float64'>'
	with 47741 stored elements in Compressed Sparse Row format>

- I found that column UI is useless

# attributes.class

In [27]:
attr_class_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('class')).fillna('')
attr_class_series.value_counts()

                                                                                              1410
MuiButtonBase-root MuiListItem-root MuiListItem-gutters MuiListItem-button                     592
news-link                                                                                      165
_1-6r                                                                                          124
_1V0q                                                                                          103
                                                                                              ... 
MuiButtonBase-root MuiButton-root MuiButton-contained Mui-disabled Mui-disabled                  1
MuiInputBase-input MuiInput-input MuiInputBase-inputMarginDense MuiInput-inputMarginDense        1
MuiFormControlLabel-root MuiFormControlLabel-labelPlacementStart                                 1
mat-select ng-tns-c171-37 ng-tns-c95-36 ng-star-inserted ng-untouched ng-pristine ng-valid       1
mat-sort-h

In [28]:
class_cv = CountVectorizer()

In [29]:
class_cv.fit(attr_class_series.values)

CountVectorizer()

In [30]:
vocabulary = sorted([v for v in class_cv.vocabulary_.keys() if re.match(r'^[a-z]+$',v ) and len(v) > 2])
print(len(vocabulary))
print(vocabulary)

362
['above', 'accent', 'account', 'action', 'active', 'add', 'alert', 'align', 'all', 'anchorjs', 'ani', 'ant', 'appearance', 'arrow', 'autocomplete', 'autofill', 'autosize', 'badge', 'bannerclose', 'bar', 'base', 'before', 'benefits', 'black', 'block', 'blue', 'body', 'bold', 'border', 'bottom', 'brand', 'breaking', 'btn', 'btnprimary', 'buffer', 'button', 'buttonlink', 'buttonnew', 'buttontext', 'buy', 'carbon', 'card', 'carousel', 'cascader', 'cat', 'cdk', 'center', 'change', 'char', 'check', 'checkbox', 'checked', 'checkmark', 'children', 'chip', 'circle', 'circledisableshrink', 'circleindeterminate', 'click', 'clipboard', 'close', 'cnb', 'cnm', 'collapsed', 'colorprimary', 'colorsecondary', 'column', 'company', 'comparisons', 'contained', 'containedprimary', 'containedsecondary', 'containedsizelarge', 'containedsizesmall', 'container', 'content', 'control', 'corner', 'cta', 'current', 'custom', 'danger', 'dark', 'dashed', 'dashedcolorprimary', 'datatable', 'default', 'delete', 'd

In [31]:
attr_class_cv = CountVectorizer(vocabulary=vocabulary) #.fit(attr_class_series.values)
attr_class_cv

CountVectorizer(vocabulary=['above', 'accent', 'account', 'action', 'active',
                            'add', 'alert', 'align', 'all', 'anchorjs', 'ani',
                            'ant', 'appearance', 'arrow', 'autocomplete',
                            'autofill', 'autosize', 'badge', 'bannerclose',
                            'bar', 'base', 'before', 'benefits', 'black',
                            'block', 'blue', 'body', 'bold', 'border', 'bottom', ...])

In [32]:
attr_class_sm = attr_class_cv.transform(attr_class_series.values)
attr_class_mx = attr_class_sm.todense()
attr_class_sm

<4557x362 sparse matrix of type '<class 'numpy.int64'>'
	with 10706 stored elements in Compressed Sparse Row format>

In [33]:
attr_class_mx[10]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [34]:
attr_class_tfidf = TfidfVectorizer(vocabulary=vocabulary).fit(attr_class_series.values)

In [35]:
attr_class_sm = attr_class_tfidf.transform(attr_class_series.values)
attr_class_mx = attr_class_sm.todense()
attr_class_sm, type(attr_class_sm)

(<4557x362 sparse matrix of type '<class 'numpy.float64'>'
 	with 10706 stored elements in Compressed Sparse Row format>,
 scipy.sparse.csr.csr_matrix)

In [36]:
logger.setLevel(logging.DEBUG)
attr_class_sm = build_class_feature(df)
attr_class_sm

2021-06-16 20:50:56,926 -INFO - features_builder:features_builder.py:188 - used column: attributes
2021-06-16 20:50:56,927 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...


<47750x389 sparse matrix of type '<class 'numpy.float64'>'
	with 64025 stored elements in Compressed Sparse Row format>

# Explore attributes

In [37]:
from collections import Counter

stats_na = Counter()
stats_ctl = Counter()
stats_all = Counter()

print('Number of labels:', df[df.label_text != 'n/a'].shape[0])

# Will use Only attributes from labeled classes 

for _, r in tqdm(df[['attributes', 'label_text', 'displayed']].iterrows()):
    if r.attributes is not None:
        lst = [k for k in r.attributes.keys() if (r.attributes.get(k) is not None) and (str(r.attributes.get(k)).strip() != "")]
        if r.label_text == 'n/a':
            stats_na.update(lst)
        else:
            stats_ctl.update(lst)
        stats_all.update(lst)
attributes_na_df = pd.DataFrame({'keys': stats_na.keys(), 'counts': stats_na.values()}).sort_values(by='counts', ascending=False)

attributes_ctl_df = pd.DataFrame({'keys': stats_ctl.keys(), 'counts': stats_ctl.values()}).sort_values(by='counts', ascending=False)
attributes_all_df = pd.DataFrame({'keys': stats_all.keys(), 'counts': stats_all.values()}).sort_values(by='counts', ascending=False)


Number of labels: 4557


0it [00:00, ?it/s]

In [38]:
attributes_all_df['p_all']=attributes_all_df.counts/df.shape[0]
attributes_all_df

Unnamed: 0,keys,counts,p_all
9,class,30793,0.644880
5,href,5150,0.107853
16,id,2669,0.055895
10,role,2262,0.047372
11,style,2227,0.046639
...,...,...,...
211,data-videotype,1,0.000021
210,data-videotitle,1,0.000021
209,data-videoid,1,0.000021
208,data-player,1,0.000021


In [39]:
attr_stats_df = attributes_na_df.merge(attributes_ctl_df, on='keys', how='left')
attr_stats_df = attr_stats_df.merge(attributes_all_df[['keys', 'p_all']], on = 'keys', how='left')

counts_x_sum, counts_y_sum = attr_stats_df.agg({ 'counts_x': 'sum', 'counts_y': 'sum'}).values
attr_stats_df['p_x'] = attr_stats_df.counts_x / counts_x_sum
attr_stats_df['p_y'] = attr_stats_df.counts_y / counts_y_sum


attr_stats_df['importance'] = attr_stats_df['p_y'] / attr_stats_df['p_x']

attr_stats_df[(attr_stats_df.p_x < attr_stats_df.p_y) & (attr_stats_df.counts_y > 30)].sort_values(by='p_y', ascending=False).head(50)



Unnamed: 0,keys,counts_x,counts_y,p_all,p_x,p_y,importance
1,href,2958,2192.0,0.107853,0.052912,0.179864,3.399292
10,tabindex,707,952.0,0.034743,0.012647,0.078116,6.176801
9,type,790,797.0,0.033236,0.014131,0.065398,4.627829
5,role,1542,720.0,0.047372,0.027583,0.059079,2.141875
23,aria-disabled,323,682.0,0.021047,0.005778,0.055961,9.685631
4,id,2064,605.0,0.055895,0.03692,0.049643,1.344596
29,target,283,595.0,0.018387,0.005062,0.048823,9.644431
34,aria-label,191,227.0,0.008754,0.003417,0.018626,5.451783
86,placeholder,24,141.0,0.003455,0.000429,0.01157,26.949701
117,aria-invalid,14,130.0,0.003016,0.00025,0.010667,42.595271


In [40]:
_keys = set([
    'aria-invalid', 
    'aria-haspopup', 
    'aria-expanded',      
    'aria-required',
    'aria-disabled',
    'aria-selected',
    'aria-describedby',
    'aria-controls',
    'aria-invalid',
    'placeholder',
    'value',
    'for',
    'onclick',
    'target',
    'role',
    'type',
    'id',
    'name',
    'href',
    'min',
    'max'
])

report = []
with trange(df.shape[0]) as bar:
    for _, r in df.iterrows():
        attr = r.attributes
        if type(attr) is dict:
            d = {}
            d['label'] = 1 if r.label_text != 'n/a' else 0
            for k in _keys:
                v = attr.get(k)
                if v is not None and v.strip() != "":
                    d[k]=1
                else:
                    d[k]=0
        else:
            d = {k:0 for k in _keys}
                
        report.append(d)
        bar.update(1)

report_df = pd.DataFrame(report)

  0%|          | 0/47750 [00:00<?, ?it/s]

In [41]:
report_df.sum(axis=1).value_counts()

0.0    35796
1.0     6327
2.0     3538
3.0     1573
4.0      298
5.0      103
6.0      100
7.0        9
9.0        4
8.0        2
dtype: int64

In [42]:
df.attributes.apply(lambda x: x.get('target') if x is not None else None).value_counts()

_blank    762
_self     115
_top        1
Name: attributes, dtype: int64

In [43]:
df.columns

Index(['attributes', 'displayed', 'element_id', 'height', 'onmouseenter',
       'onmouseover', 'parent_id', 'style', 'tag_name', 'text', 'width', 'x',
       'y', 'is_hidden', 'upper_sibling', 'lower_sibling', 'siblings', 'idx',
       'label', 'annotation_line_no', 'iou', 'tag', 'label_text', 'dataset'],
      dtype='object')

In [44]:
report_df.corr()

Unnamed: 0,label,aria-disabled,max,role,aria-expanded,aria-controls,name,placeholder,aria-describedby,aria-haspopup,...,id,aria-invalid,value,onclick,target,type,href,aria-selected,for,min
label,1.0,0.289171,0.082982,0.164699,0.06773,0.066235,0.009287,0.151513,0.084125,0.088822,...,0.103278,0.150591,0.051964,0.059088,0.26945,0.25374,0.385939,0.067623,0.013152,0.082179
aria-disabled,0.289171,1.0,-0.000506,0.592287,0.008375,0.095919,-0.011795,-0.008634,0.008089,-0.001235,...,0.008781,0.061116,-0.010307,-0.005893,-0.003779,-0.027187,-0.039693,0.131831,-0.015682,-0.000762
max,0.082982,-0.000506,1.0,-0.004473,-0.001587,-0.00134,-0.003348,0.190961,-0.001122,0.016264,...,0.063075,0.009637,0.20558,-0.00134,-0.004562,0.165749,-0.01159,-0.001035,-0.003565,0.936062
role,0.164699,0.592287,-0.004473,1.0,0.08692,0.108972,-0.016452,0.000309,0.039428,0.043975,...,0.035433,0.045268,-0.018407,-0.008962,-0.029053,-0.027597,-0.071814,0.139253,-0.023849,-0.004761
aria-expanded,0.06773,0.008375,-0.001587,0.08692,1.0,0.283752,-0.00035,0.057297,-0.001602,0.230889,...,0.04215,0.045621,-0.004778,-0.001913,-0.003235,0.089549,-0.002342,-0.001478,-0.005092,-0.001631
aria-controls,0.066235,0.095919,-0.00134,0.108972,0.283752,1.0,0.00646,0.006528,-0.001352,0.027539,...,0.101552,-0.00221,-0.004033,-0.001615,-0.0055,0.059519,0.00117,0.671722,-0.004298,-0.001377
name,0.009287,-0.011795,-0.003348,-0.016452,-0.00035,0.00646,1.0,0.044326,0.009151,-0.003622,...,0.169903,0.044403,0.318488,-0.004037,-0.013748,0.286829,-0.034926,-0.003119,-0.010743,-0.003442
placeholder,0.151513,-0.008634,0.190961,0.000309,0.057297,0.006528,0.044326,1.0,0.199768,0.06725,...,0.161223,0.224561,0.094678,-0.002367,-0.008059,0.237951,-0.020474,-0.001829,-0.006298,0.1961
aria-describedby,0.084125,0.008089,-0.001122,0.039428,-0.001602,-0.001352,0.009151,0.199768,1.0,-0.001213,...,0.070474,0.202698,0.021713,-0.001352,-0.004605,0.056332,-0.011699,-0.001045,-0.003599,-0.001153
aria-haspopup,0.088822,-0.001235,0.016264,0.043975,0.230889,0.027539,-0.003622,0.06725,-0.001213,1.0,...,0.079834,0.24196,-0.003618,-0.001449,-0.000606,0.07767,-0.001288,-0.00112,-0.003856,0.015757


In [45]:
logger.setLevel(logging.DEBUG)
attributes_df = build_attributes_feature(df)
attributes_df

2021-06-16 20:51:34,325 -INFO - features_builder:features_builder.py:152 - used column: attributes


  0%|          | 0/47750 [00:00<?, ?it/s]

Unnamed: 0,aria-invalid,aria-haspopup,aria-expanded,aria-required,aria-disabled,aria-selected,aria-describedby,aria-controls,placeholder,value,for,onclick,target,id,name,min,max
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47745,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47746,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47747,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47748,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [46]:
csr_matrix(attributes_df.values), attributes_df.values.sum()

(<47750x17 sparse matrix of type '<class 'numpy.int64'>'
 	with 7002 stored elements in Compressed Sparse Row format>,
 7002)

In [47]:
df.columns

Index(['attributes', 'displayed', 'element_id', 'height', 'onmouseenter',
       'onmouseover', 'parent_id', 'style', 'tag_name', 'text', 'width', 'x',
       'y', 'is_hidden', 'upper_sibling', 'lower_sibling', 'siblings', 'idx',
       'label', 'annotation_line_no', 'iou', 'tag', 'label_text', 'dataset'],
      dtype='object')

# Check for duplicates

In [48]:
assert df.element_id.nunique() == df.shape[0], "There are duplicates in the dataset"
display(HTML("<h1><b>The dataset is OK, there are no duplicates</b></h1>"))

<hr style="height: 5px">

# DatasetCollector

In [49]:
COLS = ['element_id', 'tag_name', 'attributes', 'displayed', 'is_hidden']
TARGET_PARENT_COLUMNS = ['parent_id', 'tag_name_parent', 'attributes_parent', 'displayed_parent', 'is_hidden_parent']
df_parent = df[COLS].copy()
df_parent.columns = TARGET_PARENT_COLUMNS
df.shape

TARGET_UP_SIBLING_COLUMNS = ['upper_sibling', 'tag_name_upsib', 'attributes_upsib', 'displayed_upsib', 'is_hidden_upsib']
upsib_df = df[COLS].copy()
upsib_df.columns = TARGET_UP_SIBLING_COLUMNS

TARGET_DN_SIBLING_COLUMNS = ['lower_sibling', 'tag_name_dnsib', 'attributes_dnsib', 'displayed_dnsib', 'is_hidden_dnsib']
dnsib_df = df[COLS].copy()
dnsib_df.columns = TARGET_DN_SIBLING_COLUMNS



In [50]:
train_df = df.merge(df_parent, on='parent_id', how='left')
train_df = train_df.merge(upsib_df, on='upper_sibling', how='left')
train_df = train_df.merge(dnsib_df, on='lower_sibling', how='left')


In [51]:
train_df.shape, train_df.columns

((47750, 36),
 Index(['attributes', 'displayed', 'element_id', 'height', 'onmouseenter',
        'onmouseover', 'parent_id', 'style', 'tag_name', 'text', 'width', 'x',
        'y', 'is_hidden', 'upper_sibling', 'lower_sibling', 'siblings', 'idx',
        'label', 'annotation_line_no', 'iou', 'tag', 'label_text', 'dataset',
        'tag_name_parent', 'attributes_parent', 'displayed_parent',
        'is_hidden_parent', 'tag_name_upsib', 'attributes_upsib',
        'displayed_upsib', 'is_hidden_upsib', 'tag_name_dnsib',
        'attributes_dnsib', 'displayed_dnsib', 'is_hidden_dnsib'],
       dtype='object'))

In [52]:
parent_attributes_sm = csr_matrix(build_attributes_feature(df = train_df, colname = 'attributes_parent').values)
parent_attributes_sm

2021-06-16 20:51:38,229 -INFO - features_builder:features_builder.py:152 - used column: attributes_parent


  0%|          | 0/47750 [00:00<?, ?it/s]

<47750x17 sparse matrix of type '<class 'numpy.int64'>'
	with 6456 stored elements in Compressed Sparse Row format>

In [53]:
upsib_attributes_sm = csr_matrix(build_attributes_feature(df = train_df, colname = 'attributes_upsib').values)
upsib_attributes_sm

2021-06-16 20:51:41,890 -INFO - features_builder:features_builder.py:152 - used column: attributes_upsib


  0%|          | 0/47750 [00:00<?, ?it/s]

<47750x17 sparse matrix of type '<class 'numpy.int64'>'
	with 3630 stored elements in Compressed Sparse Row format>

In [54]:
dnsib_attributes_sm = csr_matrix(build_attributes_feature(df = train_df, colname = 'attributes_dnsib').values)
dnsib_attributes_sm

2021-06-16 20:51:45,526 -INFO - features_builder:features_builder.py:152 - used column: attributes_dnsib


  0%|          | 0/47750 [00:00<?, ?it/s]

<47750x17 sparse matrix of type '<class 'numpy.int64'>'
	with 2877 stored elements in Compressed Sparse Row format>

<hr>

- # class

In [55]:
build_class_feature(train_df, colname='attributes')

2021-06-16 20:51:49,167 -INFO - features_builder:features_builder.py:188 - used column: attributes
2021-06-16 20:51:49,168 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...


<47750x389 sparse matrix of type '<class 'numpy.float64'>'
	with 64025 stored elements in Compressed Sparse Row format>

In [56]:
build_class_feature(train_df, colname='attributes_parent')

2021-06-16 20:51:49,366 -INFO - features_builder:features_builder.py:188 - used column: attributes_parent
2021-06-16 20:51:49,367 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...


<47750x389 sparse matrix of type '<class 'numpy.float64'>'
	with 77982 stored elements in Compressed Sparse Row format>

In [57]:
build_class_feature(train_df, colname='attributes_upsib')

2021-06-16 20:51:49,591 -INFO - features_builder:features_builder.py:188 - used column: attributes_upsib
2021-06-16 20:51:49,592 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...


<47750x389 sparse matrix of type '<class 'numpy.float64'>'
	with 28251 stored elements in Compressed Sparse Row format>

In [58]:
build_class_feature(train_df, colname='attributes_dnsib')

2021-06-16 20:51:49,746 -INFO - features_builder:features_builder.py:188 - used column: attributes_dnsib
2021-06-16 20:51:49,748 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...


<47750x389 sparse matrix of type '<class 'numpy.float64'>'
	with 27468 stored elements in Compressed Sparse Row format>

<hr>

- # tag_name

In [59]:
build_tag_name_feature(train_df, colname='tag_name')

2021-06-16 20:51:49,897 -INFO - features_builder:features_builder.py:228 - used column: tag_name
2021-06-16 20:51:49,898 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl


<47750x29 sparse matrix of type '<class 'numpy.float64'>'
	with 34173 stored elements in Compressed Sparse Row format>

In [60]:
build_tag_name_feature(train_df, colname='tag_name_parent')

2021-06-16 20:51:49,930 -INFO - features_builder:features_builder.py:228 - used column: tag_name_parent
2021-06-16 20:51:49,931 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl


<47750x29 sparse matrix of type '<class 'numpy.float64'>'
	with 30194 stored elements in Compressed Sparse Row format>

In [61]:
build_tag_name_feature(train_df, colname='tag_name_upsib')

2021-06-16 20:51:49,961 -INFO - features_builder:features_builder.py:228 - used column: tag_name_upsib
2021-06-16 20:51:49,962 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl


<47750x29 sparse matrix of type '<class 'numpy.float64'>'
	with 17026 stored elements in Compressed Sparse Row format>

In [62]:
build_tag_name_feature(train_df, colname='tag_name_dnsib')

2021-06-16 20:51:49,992 -INFO - features_builder:features_builder.py:228 - used column: tag_name_dnsib
2021-06-16 20:51:49,993 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl


<47750x29 sparse matrix of type '<class 'numpy.float64'>'
	with 17262 stored elements in Compressed Sparse Row format>

<hr>

- # type

In [63]:
build_type_feature(train_df, colname='attributes')

2021-06-16 20:51:50,023 -INFO - features_builder:features_builder.py:279 - used column: attributes
2021-06-16 20:51:50,024 -INFO - features_builder:features_builder.py:282 - loading model/ohe_type.pkl


<47750x1548 sparse matrix of type '<class 'numpy.float64'>'
	with 47723 stored elements in Compressed Sparse Row format>

In [64]:
build_type_feature(train_df, colname='attributes_parent')

2021-06-16 20:51:50,070 -INFO - features_builder:features_builder.py:279 - used column: attributes_parent
2021-06-16 20:51:50,071 -INFO - features_builder:features_builder.py:282 - loading model/ohe_type.pkl


<47750x1548 sparse matrix of type '<class 'numpy.float64'>'
	with 47744 stored elements in Compressed Sparse Row format>

In [65]:
build_type_feature(train_df, colname='attributes_upsib')

2021-06-16 20:51:50,116 -INFO - features_builder:features_builder.py:279 - used column: attributes_upsib
2021-06-16 20:51:50,116 -INFO - features_builder:features_builder.py:282 - loading model/ohe_type.pkl


<47750x1548 sparse matrix of type '<class 'numpy.float64'>'
	with 47723 stored elements in Compressed Sparse Row format>

In [66]:
build_type_feature(train_df, colname='attributes_dnsib')

2021-06-16 20:51:50,158 -INFO - features_builder:features_builder.py:279 - used column: attributes_dnsib
2021-06-16 20:51:50,158 -INFO - features_builder:features_builder.py:282 - loading model/ohe_type.pkl


<47750x1548 sparse matrix of type '<class 'numpy.float64'>'
	with 47729 stored elements in Compressed Sparse Row format>

<hr>

- # role

In [67]:
build_role_feature(train_df, colname='attributes')

2021-06-16 20:51:50,199 -INFO - features_builder:features_builder.py:251 - used column: attributes
2021-06-16 20:51:50,200 -INFO - features_builder:features_builder.py:254 - loading model/ohe_role.pkl


<47750x37 sparse matrix of type '<class 'numpy.float64'>'
	with 47731 stored elements in Compressed Sparse Row format>

In [68]:
build_role_feature(train_df, colname='attributes_parent')

2021-06-16 20:51:50,250 -INFO - features_builder:features_builder.py:251 - used column: attributes_parent
2021-06-16 20:51:50,251 -INFO - features_builder:features_builder.py:254 - loading model/ohe_role.pkl


<47750x37 sparse matrix of type '<class 'numpy.float64'>'
	with 47718 stored elements in Compressed Sparse Row format>

In [69]:
build_role_feature(train_df, colname='attributes_upsib')

2021-06-16 20:51:50,293 -INFO - features_builder:features_builder.py:251 - used column: attributes_upsib
2021-06-16 20:51:50,294 -INFO - features_builder:features_builder.py:254 - loading model/ohe_role.pkl


<47750x37 sparse matrix of type '<class 'numpy.float64'>'
	with 47735 stored elements in Compressed Sparse Row format>

In [70]:
build_role_feature(train_df, colname='attributes_dnsib')

2021-06-16 20:51:50,332 -INFO - features_builder:features_builder.py:251 - used column: attributes_dnsib
2021-06-16 20:51:50,333 -INFO - features_builder:features_builder.py:254 - loading model/ohe_role.pkl


<47750x37 sparse matrix of type '<class 'numpy.float64'>'
	with 47735 stored elements in Compressed Sparse Row format>