In [1]:
import sys
import re
import os
from datetime import datetime
from time import sleep
from IPython.display import display, clear_output, HTML
import logging

# set working dir
WORKING_DIR = re.sub('jdi-qasp-ml.*$','',os.path.normpath(os.getcwd())) + 'jdi-qasp-ml'
os.chdir(WORKING_DIR)

import utils
from utils import *
import torch
import pandas as pd
import numpy as np
import torch
from glob import glob
import selenium
from tqdm.auto import tqdm, trange

START_TS = datetime.now()
display(HTML(f'{START_TS} - Current working directory: <b>{WORKING_DIR}</b>'))

import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline
from scipy.sparse import csc_matrix, csr_matrix

import numba


2021-06-15 20:49:06,419 -INFO - config:config.py:16 - Module utils.config was loaded
2021-06-15 20:49:06,419 -INFO - common:common.py:618 - Module utils.common is loaded...
2021-06-15 20:49:06,419 -INFO - hidden:hidden.py:121 - hidden module is loaded
2021-06-15 20:49:06,722 -INFO - dataset_builder:dataset_builder.py:207 - dataset package is loaded...
2021-06-15 20:49:07,087 -INFO - dataset:dataset.py:653 - dataset module is loaded...


In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [3]:
datasets = JDIDataset.gen_dataset_names()
# datasets  

In [4]:
logger.setLevel(logging.WARNING)

In [5]:
ds_list = []
with trange(len(datasets)) as bar:
    for ds in datasets:
        bar.set_postfix_str(f'Processing dataset: "{ds}"')
        df_file_path = f'dataset/df/{ds}.parquet'
        ann_file_path = f'dataset/annotations/{ds}.txt'
        
        if not os.path.exists(ann_file_path):
            print('skip:', (ann_file_path, df_file_path))
        else:
            df = pd.read_parquet(df_file_path)
            df = build_features(df)
            df = assign_labels(df=df, annotations_file_path=ann_file_path)
            ds_list.append(df)
            
        bar.update(1)

df = pd.concat(ds_list).copy()

  0%|          | 0/42 [00:00<?, ?it/s]

skip: ('dataset/annotations/ac-avto161.txt', 'dataset/df/ac-avto161.parquet')


In [6]:
df[df.label_text != 'n/a'].shape[0]

4773

In [7]:
df.label_text.value_counts()

n/a                 43193
link                 2410
button               1411
checkbox              246
textfield             213
radiobutton           130
selector               82
datetimeselector       34
table                  32
slider-toggle          30
slider                 29
tab                    27
dropdown               27
textarea               23
progressbar            21
range                  18
tree-node              11
fileinput               8
steper                  7
colorpicker             6
iframe                  5
numberselector          3
Name: label_text, dtype: int64

In [8]:
df.tag_name.value_counts()

DIV                               12932
SPAN                               9201
A                                  3908
TD                                 2775
LI                                 1973
                                  ...  
FORM-FIELD-ERROR-EXAMPLE              1
ICON-OVERVIEW-EXAMPLE                 1
MAT-CARD-TITLE                        1
SIDENAV-DISABLE-CLOSE-EXAMPLE         1
BUTTON-TOGGLE-OVERVIEW-EXAMPLE        1
Name: tag_name, Length: 308, dtype: int64

# tag_name

In [9]:
tag_name_series = df[df.label_text != 'n/a'].tag_name.value_counts()
display(tag_name_series)
tag_name_set = set(tag_name_series.index)

A                    2305
DIV                   971
BUTTON                515
INPUT                 493
LABEL                  86
SPAN                   77
LI                     54
SELECT                 52
P                      44
MAT-RADIO-BUTTON       28
TABLE                  24
MAT-CHECKBOX           23
TEXTAREA               22
MAT-SELECT             21
SVG                    13
MAT-TREE-NODE           8
IMG                     8
IFRAME                  5
MAT-BUTTON-TOGGLE       5
MAT-SLIDER              5
MAT-SLIDE-TOGGLE        3
CIRCLE                  2
PROGRESS                2
RECT                    2
H5                      1
I                       1
G                       1
H2                      1
MAT-TABLE               1
Name: tag_name, dtype: int64

In [10]:
tag_name_ohe = OneHotEncoder(handle_unknown='ignore', categories=[list(tag_name_set)])
tag_name_ohe

OneHotEncoder(categories=[['H5', 'INPUT', 'SELECT', 'MAT-RADIO-BUTTON', 'SVG',
                           'CIRCLE', 'I', 'MAT-TREE-NODE', 'G', 'RECT',
                           'TEXTAREA', 'H2', 'MAT-TABLE', 'A', 'TABLE',
                           'IFRAME', 'MAT-SELECT', 'BUTTON', 'LI',
                           'MAT-SLIDE-TOGGLE', 'PROGRESS', 'MAT-CHECKBOX',
                           'MAT-BUTTON-TOGGLE', 'MAT-SLIDER', 'DIV', 'SPAN',
                           'LABEL', 'IMG', 'P']],
              handle_unknown='ignore')

In [11]:
np.expand_dims(df.tag_name.values, -1)

array([['HTML'],
       ['HEAD'],
       ['TITLE'],
       ...,
       ['SCRIPT'],
       ['SCRIPT'],
       ['SCRIPT']], dtype=object)

In [12]:
tag_name_ohe.fit(np.expand_dims(df[df.label_text != 'n/a'].tag_name.values, -1))

OneHotEncoder(categories=[['H5', 'INPUT', 'SELECT', 'MAT-RADIO-BUTTON', 'SVG',
                           'CIRCLE', 'I', 'MAT-TREE-NODE', 'G', 'RECT',
                           'TEXTAREA', 'H2', 'MAT-TABLE', 'A', 'TABLE',
                           'IFRAME', 'MAT-SELECT', 'BUTTON', 'LI',
                           'MAT-SLIDE-TOGGLE', 'PROGRESS', 'MAT-CHECKBOX',
                           'MAT-BUTTON-TOGGLE', 'MAT-SLIDER', 'DIV', 'SPAN',
                           'LABEL', 'IMG', 'P']],
              handle_unknown='ignore')

In [13]:
tag_name_sm = tag_name_ohe.transform(np.expand_dims(df.tag_name.values, -1))
tag_name_sm

<47966x29 sparse matrix of type '<class 'numpy.float64'>'
	with 34389 stored elements in Compressed Sparse Row format>

In [14]:
logger.setLevel(logging.DEBUG)
tag_name_sm = build_tag_name_feature(df)
tag_name_sm

2021-06-15 20:49:39,231 -INFO - features_builder:features_builder.py:216 - loading model/ohe_tag_name.pkl


<47966x29 sparse matrix of type '<class 'numpy.float64'>'
	with 34389 stored elements in Compressed Sparse Row format>

# attributes.role

In [15]:
attr_role_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('role')).fillna('')
attr_role_series.value_counts()

               3843
button          820
tab              34
listbox          21
combobox         15
grid             12
switch            9
group             8
slider            5
option            3
progressbar       3
Name: attributes, dtype: int64

In [16]:
attr_role_ohe = OneHotEncoder(handle_unknown='ignore').fit(np.expand_dims(attr_role_series.values, -1))
attr_role_ohe

OneHotEncoder(handle_unknown='ignore')

In [17]:
attr_role_sm = attr_role_ohe.transform(
    np.expand_dims(
        df.attributes.apply(lambda x: None if x is None else x.get('role')).fillna(''),
        -1
    )
)
attr_role_sm

<47966x11 sparse matrix of type '<class 'numpy.float64'>'
	with 46773 stored elements in Compressed Sparse Row format>

In [18]:
logger.setLevel(logging.DEBUG)
role_sm = build_role_feature(df)
role_sm

2021-06-15 20:49:39,359 -INFO - features_builder:features_builder.py:238 - loading model/ohe_role.pkl


<47966x11 sparse matrix of type '<class 'numpy.float64'>'
	with 46773 stored elements in Compressed Sparse Row format>

# attributes.type

In [19]:
attr_type_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('type')).fillna('')
attr_type_series.value_counts()

                  3964
button             345
checkbox           151
text               102
submit              41
number              34
radio               31
range               27
search              15
email               13
password            13
file                 9
color                5
reset                5
month                3
week                 3
date                 3
time                 3
datetime-local       3
tel                  2
url                  1
Name: attributes, dtype: int64

In [20]:
attr_type_ohe = OneHotEncoder(handle_unknown='ignore').fit(np.expand_dims(attr_type_series.values, -1))
attr_type_ohe

OneHotEncoder(handle_unknown='ignore')

In [21]:
attr_type_sm = attr_type_ohe.transform(
    np.expand_dims(
        df.attributes.apply(lambda x: None if x is None else x.get('type')).fillna(''),
        -1
    )
)
attr_type_sm

<47966x21 sparse matrix of type '<class 'numpy.float64'>'
	with 47703 stored elements in Compressed Sparse Row format>

In [22]:
logger.setLevel(logging.DEBUG)
type_sm = build_type_feature(df)
type_sm

2021-06-15 20:49:39,508 -INFO - features_builder:features_builder.py:269 - Building OHE for "type"
2021-06-15 20:49:39,518 -INFO - features_builder:features_builder.py:275 - OHE "type" categories: auto
2021-06-15 20:49:39,519 -INFO - features_builder:features_builder.py:277 - Saving model/ohe_type.pkl


<47966x21 sparse matrix of type '<class 'numpy.float64'>'
	with 47703 stored elements in Compressed Sparse Row format>

# attributes.ui

In [20]:
attr_ui_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('ui')).fillna('')
attr_ui_series.value_counts()

               4714
label            40
furniture         5
d-table           3
table             3
github-link       2
dropdown          2
products          1
textarea          1
products-2        1
link              1
Name: attributes, dtype: int64

In [21]:
attr_ui_ohe = OneHotEncoder(handle_unknown='ignore').fit(np.expand_dims(attr_ui_series.values, -1))
attr_ui_ohe

OneHotEncoder(handle_unknown='ignore')

In [22]:
attr_ui_sm = attr_ui_ohe.transform(
    np.expand_dims(
        df.attributes.apply(lambda x: None if x is None else x.get('ui')).fillna(''),
        -1
    )
)
attr_ui_sm

<47966x11 sparse matrix of type '<class 'numpy.float64'>'
	with 47957 stored elements in Compressed Sparse Row format>

# attributes.class

In [23]:
attr_class_series = df[df.label_text != 'n/a'].attributes.apply(lambda x: None if x is None else x.get('class')).fillna('')
attr_class_series.value_counts()

                                                                                                                                         1410
MuiButtonBase-root MuiListItem-root MuiListItem-gutters MuiListItem-button                                                                796
news-link                                                                                                                                 165
_1-6r                                                                                                                                     130
_1V0q                                                                                                                                     103
                                                                                                                                         ... 
btn btn-primary btn-lg                                                                                                                      1
mat-in

In [24]:
class_cv = CountVectorizer()

In [25]:
class_cv.fit(attr_class_series.values)

CountVectorizer()

In [26]:
vocabulary = sorted([v for v in class_cv.vocabulary_.keys() if re.match(r'^[a-z]+$',v ) and len(v) > 2])
print(len(vocabulary))
print(vocabulary)

363
['above', 'accent', 'account', 'action', 'active', 'add', 'alert', 'align', 'all', 'anchorjs', 'ani', 'ant', 'appearance', 'arrow', 'autocomplete', 'autofill', 'autosize', 'badge', 'bannerclose', 'bar', 'base', 'before', 'benefits', 'black', 'block', 'blue', 'body', 'bold', 'border', 'bottom', 'brand', 'breaking', 'btn', 'btnprimary', 'buffer', 'button', 'buttonlink', 'buttonnew', 'buttontext', 'buy', 'carbon', 'card', 'carousel', 'cascader', 'cat', 'cdk', 'center', 'change', 'char', 'check', 'checkbox', 'checked', 'checkmark', 'children', 'chip', 'circle', 'circledisableshrink', 'circleindeterminate', 'click', 'clipboard', 'close', 'cnb', 'cnm', 'collapsed', 'colorprimary', 'colorsecondary', 'column', 'company', 'comparisons', 'contained', 'containedprimary', 'containedsecondary', 'containedsizelarge', 'containedsizesmall', 'container', 'content', 'control', 'corner', 'cta', 'current', 'custom', 'danger', 'dark', 'dashed', 'dashedcolorprimary', 'datatable', 'default', 'delete', 'd

In [27]:
attr_class_cv = CountVectorizer(vocabulary=vocabulary) #.fit(attr_class_series.values)
attr_class_cv

CountVectorizer(vocabulary=['above', 'accent', 'account', 'action', 'active',
                            'add', 'alert', 'align', 'all', 'anchorjs', 'ani',
                            'ant', 'appearance', 'arrow', 'autocomplete',
                            'autofill', 'autosize', 'badge', 'bannerclose',
                            'bar', 'base', 'before', 'benefits', 'black',
                            'block', 'blue', 'body', 'bold', 'border', 'bottom', ...])

In [28]:
attr_class_sm = attr_class_cv.transform(attr_class_series.values)
attr_class_mx = attr_class_sm.todense()
attr_class_sm

<4773x363 sparse matrix of type '<class 'numpy.int64'>'
	with 11807 stored elements in Compressed Sparse Row format>

In [29]:
attr_class_mx[10]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [30]:
attr_class_tfidf = TfidfVectorizer(vocabulary=vocabulary).fit(attr_class_series.values)

In [31]:
attr_class_sm = attr_class_tfidf.transform(attr_class_series.values)
attr_class_mx = attr_class_sm.todense()
attr_class_sm, type(attr_class_sm)

(<4773x363 sparse matrix of type '<class 'numpy.float64'>'
 	with 11807 stored elements in Compressed Sparse Row format>,
 scipy.sparse.csr.csr_matrix)

In [32]:
logger.setLevel(logging.DEBUG)
attr_class_sm = build_class_feature(df)
attr_class_sm

2021-06-15 14:42:35,888 -INFO - features_builder:features_builder.py:184 - TfIdfVectorizer for class attribute does not exist. Build the one.
2021-06-15 14:42:35,888 -INFO - features_builder:features_builder.py:188 - Extract useful class features
2021-06-15 14:42:36,086 -INFO - features_builder:features_builder.py:201 - Saving model/attr_class_tfidfd.pkl, vocabulary length: 389


<47966x389 sparse matrix of type '<class 'numpy.float64'>'
	with 65133 stored elements in Compressed Sparse Row format>

# Explore attributes

In [33]:
from collections import Counter

stats_na = Counter()
stats_ctl = Counter()
stats_all = Counter()

print('Number of labels:', df[df.label_text != 'n/a'].shape[0])

# Will Only attributes from labeled classes 

for _, r in tqdm(df[['attributes', 'label_text', 'displayed']].iterrows()):
    if r.attributes is not None:
        lst = [k for k in r.attributes.keys() if (r.attributes.get(k) is not None) and (str(r.attributes.get(k)).strip() != "")]
        if r.label_text == 'n/a':
            stats_na.update(lst)
        else:
            stats_ctl.update(lst)
        stats_all.update(lst)
attributes_na_df = pd.DataFrame({'keys': stats_na.keys(), 'counts': stats_na.values()}).sort_values(by='counts', ascending=False)

attributes_ctl_df = pd.DataFrame({'keys': stats_ctl.keys(), 'counts': stats_ctl.values()}).sort_values(by='counts', ascending=False)
attributes_all_df = pd.DataFrame({'keys': stats_all.keys(), 'counts': stats_all.values()}).sort_values(by='counts', ascending=False)


Number of labels: 4773


0it [00:00, ?it/s]

In [34]:
attributes_all_df['p_all']=attributes_all_df.counts/df.shape[0]
attributes_all_df

Unnamed: 0,keys,counts,p_all
9,class,31009,0.646479
5,href,5150,0.107368
16,id,2674,0.055748
10,role,2472,0.051537
11,style,2227,0.046429
...,...,...,...
291,data-pause,1,0.000021
290,x-placement,1,0.000021
166,__bizdiag,1,0.000021
167,data-st-active-query-class,1,0.000021


In [35]:
attr_stats_df = attributes_na_df.merge(attributes_ctl_df, on='keys', how='left')
attr_stats_df = attr_stats_df.merge(attributes_all_df[['keys', 'p_all']], on = 'keys', how='left')

counts_x_sum, counts_y_sum = attr_stats_df.agg({ 'counts_x': 'sum', 'counts_y': 'sum'}).values
attr_stats_df['p_x'] = attr_stats_df.counts_x / counts_x_sum
attr_stats_df['p_y'] = attr_stats_df.counts_y / counts_y_sum


attr_stats_df['importance'] = attr_stats_df['p_y'] / attr_stats_df['p_x']

attr_stats_df[(attr_stats_df.p_x < attr_stats_df.p_y) & (attr_stats_df.counts_y > 30)].sort_values(by='p_y', ascending=False).head(50)



Unnamed: 0,keys,counts_x,counts_y,p_all,p_x,p_y,importance
1,href,2958,2192.0,0.107368,0.052916,0.167341,3.162394
10,tabindex,701,1163.0,0.038861,0.01254,0.088785,7.080034
5,role,1542,930.0,0.051537,0.027585,0.070998,2.573785
23,aria-disabled,324,885.0,0.025205,0.005796,0.067562,11.656601
9,type,789,809.0,0.033315,0.014114,0.06176,4.375676
4,id,2065,609.0,0.055748,0.036941,0.046492,1.258551
29,target,283,595.0,0.018305,0.005063,0.045423,8.972308
34,aria-label,192,229.0,0.008777,0.003435,0.017482,5.089884
82,placeholder,24,145.0,0.003523,0.000429,0.01107,25.782821
112,aria-invalid,15,134.0,0.003106,0.000268,0.01023,38.123012


In [36]:
_keys = set([
    'aria-invalid', 
    'aria-haspopup', 
    'aria-expanded',      
    'aria-required',
    'aria-disabled',
    'aria-selected',
    'aria-describedby',
    'aria-controls',
    'aria-invalid',
    'placeholder',
    'value',
    'for',
    'onclick',
    'target',
    'role',
    'type',
    'id',
    'name',
    'href',
    'min',
    'max'
])

report = []
with trange(df.shape[0]) as bar:
    for _, r in df.iterrows():
        attr = r.attributes
        if attr is not None:
            d = {}
            d['label'] = 1 if r.label_text != 'n/a' else 0
            for k in _keys:
                v = attr.get(k)
                if v is not None and v.strip() != "":
                    d[k]=1
                else:
                    d[k]=0
        else:
            d = {k:0 for k in _keys}
                
        report.append(d)
        bar.update(1)

report_df = pd.DataFrame(report)

  0%|          | 0/47966 [00:00<?, ?it/s]

In [37]:
report_df.sum(axis=1).value_counts()

0.0    35796
1.0     6328
2.0     3538
3.0     1776
4.0      304
5.0      105
6.0       99
7.0        9
9.0        8
8.0        3
dtype: int64

In [38]:
df.attributes.apply(lambda x: x.get('target') if x is not None else None).value_counts()

_blank    762
_self     115
_top        1
Name: attributes, dtype: int64

In [39]:
df.columns

Index(['attributes', 'displayed', 'element_id', 'height', 'onmouseenter',
       'onmouseover', 'parent_id', 'style', 'tag_name', 'text', 'width', 'x',
       'y', 'is_hidden', 'upper_sibling', 'lower_sibling', 'siblings', 'idx',
       'label', 'annotation_line_no', 'iou', 'tag', 'label_text'],
      dtype='object')

In [40]:
report_df.corr()

Unnamed: 0,label,for,target,aria-invalid,aria-expanded,aria-disabled,onclick,aria-haspopup,id,placeholder,...,aria-controls,aria-describedby,max,type,href,aria-selected,aria-required,role,name,value
label,1.0,0.011313,0.261888,0.148573,0.070624,0.33788,0.057284,0.091554,0.098539,0.150005,...,0.064284,0.081901,0.086228,0.249192,0.372761,0.065777,0.129194,0.211064,0.008951,0.050002
for,0.011313,1.0,-0.014571,-0.005956,-0.005185,-0.017158,-0.004279,-0.003991,0.073118,-0.006345,...,-0.004279,-0.003582,-0.003713,-0.019809,-0.037007,-0.003306,-0.005344,-0.024873,-0.010706,-0.010694
target,0.261888,-0.014571,1.0,-0.007622,-0.003428,-0.007075,0.017833,-0.000943,-0.028434,-0.00812,...,-0.005475,-0.004584,-0.004751,-0.02535,0.393724,-0.004231,-0.006838,-0.030423,-0.0137,-0.013685
aria-invalid,0.148573,-0.005956,-0.007622,1.0,0.082293,0.053166,-0.002238,0.278783,0.226472,0.243268,...,-0.002238,0.199222,0.008838,0.08358,-0.01936,-0.00173,0.897152,0.049685,0.043432,0.039713
aria-expanded,0.070624,-0.005185,-0.003428,0.082293,1.0,0.005903,-0.001949,0.274527,0.050046,0.084202,...,0.277338,-0.001631,-0.001691,0.098813,-0.002962,-0.001506,0.09226,0.089817,-0.000546,-0.00487
aria-disabled,0.33788,-0.017158,-0.007075,0.053166,0.005903,1.0,-0.006448,-0.002453,0.001508,-0.009562,...,0.086575,0.0065,-0.001768,-0.029852,-0.045458,0.119629,0.061171,0.632681,-0.013455,-0.012095
onclick,0.057284,-0.004279,0.017833,-0.002238,-0.001949,-0.006448,1.0,-0.0015,0.033382,-0.002384,...,-0.001608,-0.001346,-0.001395,0.053495,0.009643,-0.001242,-0.002008,-0.009347,-0.004023,0.043216
aria-haspopup,0.091554,-0.003991,-0.000943,0.278783,0.274527,-0.002453,-0.0015,1.0,0.088232,0.101405,...,0.026387,-0.001256,0.014761,0.089484,-0.002152,-0.001159,0.311007,0.051878,-0.003752,-0.003748
id,0.098539,0.073118,-0.028434,0.226472,0.050046,0.001508,0.033382,0.088232,1.0,0.164977,...,0.101475,0.070424,0.059528,0.265787,-0.030262,0.109898,0.206109,0.031315,0.169564,0.186275
placeholder,0.150005,-0.006345,-0.00812,0.243268,0.084202,-0.009562,-0.002384,0.101405,0.164977,1.0,...,0.006404,0.197357,0.180171,0.24188,-0.020623,-0.001842,0.264668,0.005236,0.043626,0.093328


In [41]:
attributes_df = build_attributes_feature(df)
attributes_df

  0%|          | 0/47966 [00:00<?, ?it/s]

Unnamed: 0,aria-invalid,aria-haspopup,aria-expanded,aria-required,aria-disabled,aria-selected,aria-describedby,aria-controls,placeholder,value,for,onclick,target,id,name,min,max
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47961,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47962,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47963,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47964,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [42]:
csr_matrix(attributes_df.values), attributes_df.values.sum()

(<47966x17 sparse matrix of type '<class 'numpy.int64'>'
 	with 7247 stored elements in Compressed Sparse Row format>,
 7247)