In [1]:
import os, sys, re, gc

import numpy as np
import pandas as pd
from IPython.display import display, HTML, clear_output

import glob

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load every parquet dump under dataset/df/ and stack them into a single frame,
# tagging each row with the name of the file it came from.
datasets = []

# glob.glob() yields paths in arbitrary, filesystem-dependent order; sorting
# makes the load order (and therefore the row order of `df`) reproducible.
for fn in sorted(glob.glob('dataset/df/*.parquet')):
    _df = pd.read_parquet(fn)
    # Drop everything up to the last '/' or '\' separator, then keep the text
    # before the first dot. Raw string: the pattern is unchanged, but it no
    # longer depends on the invalid '\]' escape of a normal string literal.
    _src_name = re.sub(r'^.*[/\\]', '', fn).split('.')[0]
    _df['src_name'] = _src_name
    print(_src_name)
    datasets.append(_df)

# Row labels are kept as loaded, so index values repeat across source files.
df = pd.concat(datasets)
gc.collect()
   

ac-avto161
angular
bootstrap-1
bootstrap-form-control
bootstrap-form
bootstrap-forms
bootstrap-reboot
bootstrap
complex-table
contact-form
dates
different-elemants
dummy
gitlab
google-voice
html-5
login
metals-and-colors
mobile-and-html-5
ms-office
performance
react-ant
search
support
table-with-pages
user-table


0

In [3]:
# List the columns available in the combined frame.
df.columns

Index(['attributes', 'displayed', 'element_id', 'height', 'onmouseenter',
       'onmouseover', 'parent_id', 'style', 'tag_name', 'text', 'width', 'x',
       'y', 'src_name'],
      dtype='object')

In [4]:
# Preview the first rows whose `displayed` flag is set.
display(df.loc[df['displayed']].head())

Unnamed: 0,attributes,displayed,element_id,height,onmouseenter,onmouseover,parent_id,style,tag_name,text,width,x,y,src_name
0,"{'action': None, 'align': None, 'allow': None,...",True,6771639980839753024256585837,6221.0,,,,"[align-content, align-items, align-self, align...",HTML,Актуальные работы в соц сетях\n\nРостов-на-Дон...,1903.0,0.0,0.0,ac-avto161
63,"{'action': None, 'align': None, 'allow': None,...",True,1886445613839753021900865341,6221.0,,,6771639980839753024256585837,"[align-content, align-items, align-self, align...",BODY,Актуальные работы в соц сетях\n\nРостов-на-Дон...,1903.0,0.0,0.0,ac-avto161
64,"{'action': None, 'align': None, 'allow': None,...",True,2437095933839753025299304171,23.0,,,1886445613839753021900865341,"[align-content, align-items, align-self, align...",DIV,,16.0,0.0,0.0,ac-avto161
65,"{'action': None, 'align': None, 'allow': None,...",True,1145479253839753021188922545,16.0,,,2437095933839753025299304171,"[align-content, align-items, align-self, align...",IMG,,16.0,0.0,5.25,ac-avto161
66,"{'action': None, 'align': None, 'allow': None,...",True,1323977871839753026643765669,5998.0,,,1886445613839753021900865341,"[align-content, align-items, align-self, align...",DIV,Актуальные работы в соц сетях\n\nРостов-на-Дон...,1903.0,0.0,0.0,ac-avto161


In [5]:
# Render the (rows, columns) shape of the combined frame as an HTML heading.
display(HTML(f"<h2>Dataset shape: {df.shape}</h2>"))

In [6]:
# Row count per tag_name, most frequent first, limited to the top 50 tags.
(
    pd.DataFrame(df.groupby('tag_name').size())
    .sort_values(by=0, ascending=False)
    .head(50)
)

Unnamed: 0_level_0,0
tag_name,Unnamed: 1_level_1
SPAN,6744
DIV,6133
A,3054
TD,2580
LI,1922
P,1063
OPTION,675
TR,637
INPUT,625
LABEL,583


In [7]:
# Row count per tag_name, restricted to rows whose `displayed` flag is set.
(
    df.loc[df['displayed']]
    .groupby('tag_name')
    .size()
)

tag_name
A          2016
ABBR          2
ADDRESS       3
ARTICLE       4
ASIDE         3
           ... 
polygon       2
rect         35
svg         138
switch        1
text         28
Length: 279, dtype: int64

In [8]:
# (rows, columns) of the combined frame.
df.shape

(30025, 14)

In [9]:
# How many rows have a non-None 'checked' entry in their attributes mapping.
(
    df['attributes']
    .apply(lambda attrs: attrs.get('checked') is not None)
    .value_counts()
)

False    30001
True        24
Name: attributes, dtype: int64

In [10]:
# How many rows have a non-None 'style' entry in their attributes mapping.
(
    df['attributes']
    .apply(lambda attrs: attrs.get('style') is not None)
    .value_counts()
)

False    28560
True      1465
Name: attributes, dtype: int64

In [11]:
# How many rows have a non-None 'ui' entry in their attributes mapping.
(
    df['attributes']
    .apply(lambda attrs: attrs.get('ui') is not None)
    .value_counts()
)

False    29590
True       435
Name: attributes, dtype: int64

In [12]:
# How many rows have a non-None 'class' entry in their attributes mapping.
(
    df['attributes']
    .apply(lambda attrs: attrs.get('class') is not None)
    .value_counts()
)

True     19180
False    10845
Name: attributes, dtype: int64

In [13]:
def _child_tags_by_parent(df, children_set):
    """Map each parent_id to the comma-joined tag names of its children that are in children_set."""
    level_rows = df[df.element_id.isin(children_set)]
    return level_rows.groupby('parent_id')['tag_name'].agg(','.join).to_dict()


def children_tags_tree(df:pd.DataFrame=df, children_set:set=None, level=0):
    """Annotate ``df`` in place with ``max_depth`` and ``children_tags`` columns.

    The tree is walked bottom-up, one level per pass. Each pass takes the
    elements in ``children_set``, groups them by ``parent_id`` and then, for
    every element that is a parent of at least one of them:

    * adds 1 to ``max_depth``;
    * appends the tag names of those children to ``children_tags``.

    The parents found in a pass become the ``children_set`` of the next pass;
    the walk stops when a pass finds no parents. Finally ``children_tags`` is
    normalised: commas become spaces, whitespace runs collapse to a single
    space, and the text is lower-cased and stripped.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``element_id``, ``parent_id`` and ``tag_name`` columns.
        It is modified in place. The default is the module-level ``df`` as it
        was bound when this function was defined.
    children_set : set, optional
        Element ids to process in the first pass. ``None`` starts from the
        leaves (element ids that never occur as a ``parent_id``) and
        (re)creates both output columns. When a set is passed, both output
        columns must already exist.
    level : int
        Number printed for the first pass; ignored (reset to 0) when
        ``children_set`` is ``None``.

    Returns
    -------
    None
        Progress is printed as ``level: <n>``, one line per pass.

    Notes
    -----
    ``max_depth`` is incremented once per pass in which the element has a
    child in the current set, so it equals the number of distinct leaf
    distances below the element. That matches the deepest leaf distance only
    when those distances are contiguous. Likewise, a child's tag is appended
    once for every pass in which that child is processed, so tags can repeat.
    Both behaviours are kept unchanged here.
    """
    if children_set is None:
        level = 0
        # Leaves: ids that no other row references as its parent.
        children_set = set(df.element_id.values) - set(df.parent_id.values)
        df['max_depth'] = 0
        df['children_tags'] = ''
        # The leaf pass always runs, even if no leaves were found.
        run_pass = True
    else:
        run_pass = len(children_set) > 0

    while run_pass:
        print(f'level: {level}')
        tags_by_parent = _child_tags_by_parent(df, children_set)
        parents = set(tags_by_parent)

        df['max_depth'] = df['max_depth'] + df.element_id.isin(parents).astype(int)
        # The separator is added unconditionally; surplus commas are removed
        # by the normalisation step below.
        df['children_tags'] = (
            df['children_tags'] + ',' + df.element_id.map(tags_by_parent).fillna('')
        )

        children_set = parents
        level += 1
        run_pass = len(children_set) > 0

    # Normalise once, after all levels have been appended (the transformation
    # is idempotent, so a single pass yields the same final text).
    whitespace_run = re.compile(r'\s+')
    df['children_tags'] = df['children_tags'].apply(
        lambda tags: whitespace_run.sub(' ', tags.replace(',', ' ')).lower().strip()
    )

# Annotate `df` in place; children_set=None makes the walk start from the leaf elements.
children_tags_tree(df=df, children_set=None)


level: 0
level: 1
level: 2
level: 3
level: 4
level: 5
level: 6
level: 7
level: 8
level: 9
level: 10
level: 11
level: 12
level: 13
level: 14
level: 15
level: 16
level: 17
level: 18
level: 19
level: 20
level: 21
level: 22
level: 23


In [14]:
# Show the combined frame, which now also carries the max_depth and children_tags columns.
df

Unnamed: 0,attributes,displayed,element_id,height,onmouseenter,onmouseover,parent_id,style,tag_name,text,width,x,y,src_name,max_depth,children_tags
0,"{'action': None, 'align': None, 'allow': None,...",True,6771639980839753024256585837,6221.0,,,,"[align-content, align-items, align-self, align...",HTML,Актуальные работы в соц сетях\n\nРостов-на-Дон...,1903.000000,0.000000,0.0,ac-avto161,16,head body body body body body body body body b...
1,"{'action': None, 'align': None, 'allow': None,...",False,6153346729839753029034506186,0.0,,,6771639980839753024256585837,"[align-content, align-items, align-self, align...",HEAD,".gm-style .gm-style-mtc label,.gm-style .gm-st...",0.000000,0.000000,0.0,ac-avto161,1,style style style style style style style styl...
2,"{'action': None, 'align': None, 'allow': None,...",False,0393988419839753028942502782,0.0,,,6153346729839753029034506186,"[align-content, align-items, align-self, align...",STYLE,".gm-style .gm-style-mtc label,.gm-style .gm-st...",0.000000,0.000000,0.0,ac-avto161,0,
3,"{'action': None, 'align': None, 'allow': None,...",False,2334363261839753022756264925,0.0,,,6153346729839753029034506186,"[align-content, align-items, align-self, align...",STYLE,.gm-control-active>img{box-sizing:content-box;...,0.000000,0.000000,0.0,ac-avto161,0,
4,"{'action': None, 'align': None, 'allow': None,...",False,4479087569839753024781875144,0.0,,,6153346729839753029034506186,"[align-content, align-items, align-self, align...",STYLE,".gm-style .gm-style-cc span,.gm-style .gm-styl...",0.000000,0.000000,0.0,ac-avto161,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,"{'alt': None, 'charset': None, 'checked': None...",True,3975481612670198446951036861,15.0,,,5386239870670198446523565866,"[align-content, align-items, align-self, align...",LI,Report a bug,64.109375,1115.890625,1137.0,user-table,1,a
365,"{'alt': None, 'charset': None, 'checked': None...",True,6702341435670198443497132378,15.0,,,3975481612670198446951036861,"[align-content, align-items, align-self, align...",A,Report a bug,64.109375,1115.890625,1137.0,user-table,0,
366,"{'alt': None, 'charset': None, 'checked': None...",False,1335277334670198441108886486,0.0,,,0847353183670198440749665772,"[align-content, align-items, align-self, align...",SCRIPT,"\n UUI.Vertical_Menu.init({""open"":true});\n",0.000000,0.000000,0.0,user-table,0,
367,"{'alt': None, 'charset': None, 'checked': None...",False,5543293597670198440200627542,0.0,,,0847353183670198440749665772,"[align-content, align-items, align-self, align...",SCRIPT,\n $('.selectpicker').selectpicker();\n,0.000000,0.000000,0.0,user-table,0,
