In [1]:
import os, sys, re, gc, glob, numba
import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from utils import logger, assign_labels, build_children_features
from IPython.display import display, clear_output, HTML
import datetime as dt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
import pickle
from collections import defaultdict

from tqdm.auto import trange
from collections import Counter

from scipy.sparse import vstack, csr_matrix

START_DT = dt.datetime.now()


2021-04-07 12:26:13,463 - INFO - dataset:dataset.py:29 - dataset package is loaded...


In [2]:
@numba.jit(forceobj=True)
def get_parents_list(tree_dict:dict, element_id:str, paternts_list:list=None) -> list:
    """
        returns ordered list of parent for a element
        starting from root which is the <html/> tag
    """
    if paternts_list is None:
        paternts_list = []
    
    parent_id = tree_dict.get(element_id)
    if parent_id is None:
        return paternts_list
    else:
        paternts_list.append(parent_id)
        return get_parents_list(tree_dict, element_id=parent_id, paternts_list=paternts_list)


In [3]:
def build_tree_dict(df:pd.DataFrame) -> dict:
    """
        Builds tree dict for
        get_parents_list 
    """
    tree_dict = dict(zip(df.element_id.values, df.parent_id.values))
    return tree_dict


In [4]:
def build_tree_features(elements_df:pd.DataFrame) -> pd.DataFrame:
    
    def empty_string():
        return ''

    tree_dict = build_tree_dict(elements_df)
    tag_name_dict = dict(zip(elements_df.element_id.values, elements_df.tag_name.values))
    width_dict = dict(zip(elements_df.element_id.values, elements_df.width.values))
    height_dict = dict(zip(elements_df.element_id.values, elements_df.height.values))

    # Build paths
    followers_counter = Counter()
    level_dict = defaultdict(int)
    children_tags_dict = defaultdict(empty_string)
    
    with trange(elements_df.shape[0]) as tbar:
        tbar.set_description('Build tree features')
        for i, r in elements_df.iterrows(): 
            list_of_parents = get_parents_list(tree_dict=tree_dict, element_id= r.element_id)
            children_tags_dict[r.parent_id] += r.tag_name.lower()+' '
            #print(list_of_parents)
            followers_counter.update(list_of_parents)  # calculate number of followers
            tbar.update(1)
            
    elements_df['children_tags'] = elements_df.element_id.map(children_tags_dict).fillna('')
    elements_df['num_followers'] = elements_df.element_id.map(followers_counter)
    return elements_df


In [5]:
elements_df = pd.read_parquet('dataset/df/bootstrap-reboot.parquet')

In [6]:
len(build_tree_dict(elements_df))

1043

In [7]:
build_tree_features(elements_df=elements_df);

  0%|          | 0/1043 [00:00<?, ?it/s]

In [8]:
# elements_df[['parent_id', 'element_id', 'tag_name', 'children_tags', 'num_followers']].tail(30)

In [9]:
class JDIDataset(Dataset):
    
    def __init__(self, dataset_names:list=None, rebalance=False):
        super(JDIDataset, self).__init__()       
        self.rebalanced = rebalance
               
        with open('dataset/classes.txt', 'r') as f:
            self.classes_dict = { class_name.strip():i for i, class_name in enumerate(f.readlines()) }
            self.classes_reverse_dict = { v:k for k, v in self.classes_dict.items()}
        self.dummy_class_value = self.classes_dict['n/a']
            
        if dataset_names is None:
            logger.warning('Using all available data to generate dataset')
            dataset_names = self._gen_dataset_names()
            
        logger.info(f"List of dataset_names:{dataset_names}")
        
        ds_list=[] # list of datasets to join
        
        for ds_name in dataset_names:
            logger.info(f'Dataset for {ds_name}')
            df = pd.read_parquet(f'dataset/df/{ds_name}.parquet')
            logger.info(f"Dataset shape: {df.shape}")

            logger.info('cleaning tag_name from dummy/auxiliary words')
            df.tag_name = df.tag_name.apply(lambda x: x.lower().replace('-example', '')) ### tag_name LOWER()
            df = build_children_features(df=df)
            df = build_tree_features(df)

            #----------------------------------------------------------------------------------------------
            # Merge children with parents
            # WARNING: There is a tag <HTML> without parent. Let's fix this issue
            df.parent_id = df.apply(lambda r: r.element_id if r.parent_id is None else r.parent_id, axis=1)            
            df = df.merge(df, left_on='parent_id', right_on='element_id', suffixes=('', '_parent'))
            logger.info(f"Dataset shape after merging with parents: {df.shape}")
            #----------------------------------------------------------------------------------------------
            
            # If annotation file exists, lets load it and assign labels
            if os.path.exists(f'dataset/annotations/{ds_name}.txt'):
                logger.warning(f'Load LABELS from dataset/annotations/{ds_name}.txt')
                img = plt.imread(f'dataset/images/{ds_name}.png')
                df = assign_labels(df, annotations_file_path=f'dataset/annotations/{ds_name}.txt', 
                                       img=img,
                                       dummy_value = self.dummy_class_value
                                  )
                del img
            else:
                logger.warning(f'assign dummy values [n/a] for labels if there is no annotations')
                df['label'] = self.dummy_class_value
            
            df['ds_name'] = ds_name
            ds_list.append(df)
            gc.collect()
                
        logger.info('Concatenate datasets')
        self.dataset = pd.concat(ds_list)
        logger.info(f"Dataset shape after reading: {self.dataset.shape}")
        
        if rebalance:
            self._oversample()
        
        self._count_vectorizer_class()

        ### add ohe_ columns to one hot encoding several attributes
        for attr in ['role', 'type', 'ui']:
            logger.info(f'Build OHE column for attribute {attr}')
            self.dataset['ohe_' + attr] = self.dataset['attributes'].apply(lambda x: x.get(attr)).fillna("").str.lower()
            
        for attr in ['role', 'type', 'ui']:
            logger.info(f'Build OHE column for attribute {attr}_parent')
            self.dataset['ohe_' + attr+'_parent'] = self.dataset['attributes_parent'].apply(lambda x: x.get(attr)).fillna("").str.lower()
            
        logger.info('OHE tag_name')
        self.tag_name_sm = self._ohe_column('tag_name')
        logger.info(f'tag_name_sm.shape: {self.tag_name_sm.shape}')

        logger.info('OHE tag_name_parent')
        self.tag_name_parent_sm = self._ohe_column(colname='tag_name_parent', ohe_file_path='model/tag_name.pkl')
        logger.info(f'tag_name_parent_sm.shape: {self.tag_name_parent_sm.shape}')

        logger.info('OHE ohe_role')
        self.ohe_role_sm = self._ohe_column('ohe_role')
        logger.info(f'ohe_role_sm.shape: {self.ohe_role_sm.shape}')

        logger.info('OHE ohe_role_parent')
        self.ohe_role_parent_sm = self._ohe_column(colname='ohe_role_parent', ohe_file_path='model/ohe_role.pkl')
        logger.info(f'ohe_role_parent_sm.shape: {self.ohe_role_parent_sm.shape}')
        
        logger.info('OHE ohe_type')
        self.ohe_type_sm = self._ohe_column('ohe_type')
        logger.info(f'ohe_type_sm.shape: {self.ohe_type_sm.shape}')

        logger.info('OHE ohe_type_parent')
        self.ohe_type_parent_sm = self._ohe_column(colname='ohe_type_parent', ohe_file_path='model/ohe_type.pkl')
        logger.info(f'ohe_type_parent_sm.shape: {self.ohe_type_parent_sm.shape}')
        
        logger.info('OHE ohe_ui')
        self.ohe_ui_sm = self._ohe_column('ohe_ui')
        logger.info(f'ohe_ui_sm.shape: {self.ohe_ui_sm.shape}')

        logger.info('OHE ohe_ui_parent')
        self.ohe_ui_parent_sm = self._ohe_column('ohe_ui_parent', ohe_file_path='model/ohe_ui.pkl')
        logger.info(f'ohe_ui_parent_sm.shape: {self.ohe_ui_parent_sm.shape}')       
        
        ## extract all non null attributes names
        self.dataset['attributes_text'] = self.dataset.attributes.apply(lambda x: " ".join([k for k in x.keys() if x[k] is not None ]))
        logger.info('Fit CountVectorizer for column "attributes"')
        self.attributes_sm = self._count_vectorizer_column('attributes_text')
        logger.info(f'attributes_sm.shape: {self.attributes_sm.shape}')    

        ## children_tags
        file_path='model/count_vectorizer_children_tags.pkl'
        if os.path.exists(file_path):
            with open(file_path, 'rb') as f:
                logger.warning(f'Load CountVectorizer for column "chidren_tags": {file_path}')
                self.count_vectorizer_chilgren_tags = pickle.load(f)
        else:
            logger.warning(f'Saving CountVectorizer for "children_tags": {file_path}')
            self.count_vectorizer_chilgren_tags = CountVectorizer().fit(self.dataset.children_tags.values)
            with open(file_path, 'wb') as f:
                pickle.dump(self.count_vectorizer_chilgren_tags, f)
        logger.info(f'CountVectorizer "chilren_tags" size: {len(self.count_vectorizer_chilgren_tags.vocabulary_)}')
        self.children_tags_sm = self.count_vectorizer_chilgren_tags.transform(self.dataset.children_tags.values)
        logger.info(f"chidren_tags_sm: {self.children_tags_sm.shape}")
        

        ## extract all non null attributes names
        self.dataset['attributes_parent_text'] = self.dataset.attributes_parent.apply(lambda x: " ".join([k for k in x.keys() if x[k] is not None ]))
        logger.info('Fit CountVectorizer for column "attributes_parent"')
        self.attributes_parent_sm = self._count_vectorizer_column(colname='attributes_parent_text', 
                                                                  file_path='model/count_vectorizer_attributes_text.pkl')
        logger.info(f'attributes_parent_sm.shape: {self.attributes_parent_sm.shape}')
        
        self._extract_features()
        
        self.labels = self.dataset.label.astype(int).map(self.classes_reverse_dict)
        self.dataset.label = self.dataset.label.astype(int)
        
        self.data = hstack([
                    self.attributes_sm, 
                    self.tag_name_sm, 
                    self.ohe_role_sm, 
                    self.ohe_type_sm, 
                    self.ohe_ui_sm,
                    self.tag_name_parent_sm,
                    self.ohe_type_parent_sm,
                    self.ohe_role_parent_sm,
                    self.ohe_ui_parent_sm,
                    self.attributes_parent_sm,
                    self.features_df.values,
                    self.children_tags_sm,
                    self.class_sm
                  ]).astype(np.float32)
        
        self.data = csr_matrix(self.data)
        logger.info(f'OHE columns sparse matrix: {self.data.shape}')
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        return idx
        #return np.array(self.data.getrow(idx).todense()[0]).squeeze(), self.dataset.iloc[idx]['label']
    
    def collate_fn(self, batch):
        return  torch.tensor(vstack([self.data.getrow(idx) for idx in batch]).todense().astype(np.float32)), \
                torch.tensor(self.dataset.iloc[batch]['label'].values.astype(np.int64))

    def _oversample(self):
        logger.warning('Oversample data to balance the dataset, this will create duplicated rows in dataset')
        
        class_counts = [ r for r in self.dataset.label.value_counts().sort_values(ascending=False).items()]
        max_count = class_counts[0][1]

        dfs = [self.dataset]
        for cc in class_counts[1:]:
            ratio = max_count//cc[1]
            ratio = 30 if ratio >= 30 else ratio
            for _ in range(ratio):
                dfs.append(self.dataset[self.dataset.label == cc[0]].copy())

        self.dataset = pd.concat(dfs)
        logger.warning(f'Rebalanced dataset size: {self.dataset.shape[0]}')
        
                
    def _ohe_column(self, colname, ohe_file_path=None):
        """
            load attr_ohe if exists model/attr_ohe.pkl
            otherwise build the one
        """
        if ohe_file_path is None:
            file_path = f'model/{colname}.pkl'
        else:
            file_path = ohe_file_path
            
        if os.path.exists(file_path):
            logger.warning(f'loading existing OHE for column "{colname}" from {file_path}')
            with open(file_path,'rb') as f: 
                ohe = pickle.load(f)
        else:
            logger.warning(f'Create and fit OHE for column "{colname}"')
            ohe = OneHotEncoder(handle_unknown='ignore')
            ohe.fit(np.expand_dims(self.dataset[colname].values, axis=1))            
            with open(file_path, 'wb') as f:
                pickle.dump(ohe, f)
                
        sm = ohe.transform(np.expand_dims(self.dataset[colname].values, axis=1))        
        return sm
        
    def _count_vectorizer_column(self, colname, file_path=None):
        """
            load count_vercorizer for a column if a pkl file exists
            otherwise create the one
        """
        if file_path is None:
            file_path = f'model/count_vectorizer_{colname}.pkl'
            
        if os.path.exists(file_path):
            logger.warning(f'loading existing count vectorizer for column "{colname}" from {file_path}')
            with open(file_path,'rb') as f: 
                vectorizer = pickle.load(f)
            self.vocabulary = vectorizer.vocabulary_
        else:
            logger.warning(f'Create and fit count vectorizer for column "{colname}"')
            vectorizer = CountVectorizer(vocabulary=self._build_vocabulary())
            vectorizer.fit(self.dataset[colname].values)            
            with open(file_path, 'wb') as f:
                pickle.dump(vectorizer, f)
                
        sm = vectorizer.transform(self.dataset[colname].values)
        return sm
    
    def _build_vocabulary(self):
        
        """
            Attempt to reduce number of features by removing rarely used attributes
        """
        
        attributes_usage = Counter()
        for attr_list in self.dataset.attributes.apply(lambda x: [field for field in x if x[field] is not None]).values:
            attributes_usage.update(attr_list)
            
        attributes_usasge_df = pd.DataFrame( 
            [[attribute, cnt] for attribute, cnt in attributes_usage.items()], 
            columns=['attribute', 'cnt']
        ).sort_values(by='cnt', ascending=False)

        ## Lets cut off attributes which are rarely used
        attributes_list_df = attributes_usasge_df[attributes_usasge_df.cnt>1].copy()
        
        attr_unique_values_map = { 
                                    attr:self.dataset['attributes'].apply(lambda x: x.get(attr)).nunique() 
                                         for attr in (attributes_list_df.attribute.values)
                                 }
        attributes_list_df['num_unique_values'] = attributes_list_df.attribute.map(attr_unique_values_map)
        attributes_list_df['k'] = attributes_list_df.cnt/attributes_list_df.num_unique_values
        
        attributes_list_df = attributes_list_df[(attributes_list_df.k>3.0) & (attributes_list_df.num_unique_values>1)]\
                                    .sort_values(by='cnt', ascending=False).copy()
        
        self.vocabulary = { w:i for i,w in enumerate(sorted(attributes_list_df.attribute.values))}
        return self.vocabulary
        
        
    def _extract_features(self):
        self.features_df = self.dataset[[
            #'tag_name_parent',
            #'tag_name',
            'width', 
            'height', 
            'width_parent', 
            'height_parent', 
            'x', 
            'x_parent', 
            'y', 
            'y_parent',
            'is_leaf',
            'is_leaf_parent',
            'num_followers',
            'num_leafs',
            #'num_leafs_parent',
            'num_children',
            'num_children_parent',
            'sum_children_widths',
            'sum_children_widths_parent',
            'sum_children_hights',
            'sum_children_hights_parent',
            'displayed',
            #'onmouseenter'
        ]].copy()
        
        self.features_df.sum_children_hights = (self.features_df.sum_children_hights/self.features_df.num_children).fillna(-1)
        self.features_df.sum_children_hights_parent = (self.features_df.sum_children_hights_parent/self.features_df.num_children_parent).fillna(-1)
        self.features_df.sum_children_widths = (self.features_df.sum_children_widths/self.features_df.num_children).fillna(-1)
        self.features_df.sum_children_widths_parent = (self.features_df.sum_children_widths_parent/self.features_df.num_children_parent).fillna(-1)

        self.features_df.x = (self.features_df.x < 0).astype(int)
        self.features_df.y = (self.features_df.y < 0).astype(int)
        self.features_df.x_parent = (self.features_df.x_parent < 0).astype(int)
        self.features_df.y_parent = (self.features_df.y_parent < 0).astype(int)
        self.features_df['w'] = (self.features_df.width <= 2).astype(int)
        self.features_df['w_parent'] = (self.features_df.width_parent <= 2).astype(int)
        self.features_df['h'] = (self.features_df.height <= 2).astype(int)
        self.features_df['h_parent'] = (self.features_df.height_parent <= 2).astype(int)
        self.features_df.displayed = self.features_df.displayed.astype(int)
        
        
    def _find_dataset_names(self, path_mask='dataset/df/*.parquet'):
        return  set([re.sub( '.*[/\\\]', '', re.sub('\\..*$', '', os.path.normpath(fn)))
                        for fn in glob.glob(path_mask)])


    def _gen_dataset_names(self):
        dfs = self._find_dataset_names('dataset/df/*.parquet')
        imgs = self._find_dataset_names('dataset/images/*.png')
        anns = self._find_dataset_names('dataset/annotations/*.txt')

        return (dfs.intersection(imgs)).intersection(anns)
        
    def _count_vectorizer_class(self):

        self.dataset['cv_class'] = self.dataset.attributes.apply(lambda x: x.get('class')).fillna('')
        file_name = 'model/count_vectorizer_class.pkl'
        if os.path.exists(file_name):
            logger.warning(f'Loading count vectorizer for column "cv_class": {file_name}')
            with open(file_name, 'rb') as f:
                self.count_vectorizer_class = pickle.load(f)
        else:
            logger.warning(f'Build count vectorizer for column "cv_class": {file_name}')
            class_dict = Counter()
            for s in self.dataset['cv_class'].values:
                class_dict.update(s.lower().replace('-', ' ').split())
            vocabulary = [cls for cls in class_dict if re.match('^[a-z]*$', cls)]
            self.count_vectorizer_class = CountVectorizer(vocabulary=vocabulary).fit(self.dataset['cv_class'].values)
            with open(file_name, 'wb') as f:
                pickle.dump(self.count_vectorizer_class, f)
        self.class_sm = self.count_vectorizer_class.transform(self.dataset['cv_class'].values)
        logger.info(f'class_sm: {self.class_sm.shape}')

    

    

In [10]:
train_dataset = JDIDataset(rebalance=True)

2021-04-07 12:26:15,106 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:16 - List of dataset_names:{'performance', 'support', 'bootstrap-reboot', 'mobile-and-html-5', 'gitlab', 'ms-office', 'google-voice', 'metals-and-colors', 'user-table', 'table-with-pages', 'angular', 'bootstrap-1', 'dates', 'bootstrap', 'react-ant', 'different-elemants', 'bootstrap-form-control', 'login', 'bootstrap-form', 'search', 'html-5', 'complex-table', 'bootstrap-forms', 'contact-form'}
2021-04-07 12:26:15,107 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for performance
2021-04-07 12:26:15,194 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (2675, 13)
2021-04-07 12:26:15,194 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:15,194 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-0

  0%|          | 0/2675 [00:00<?, ?it/s]

2021-04-07 12:26:18,933 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (2675, 40)


Assigning labels:   0%|          | 0/42 [00:00<?, ?it/s]

2021-04-07 12:26:19,857 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for support
2021-04-07 12:26:19,871 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (287, 13)
2021-04-07 12:26:19,872 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:19,874 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:19,874 - INFO - dataset:dataset.py:63 - Leafs set size: 128 (nodes which have no children)
2021-04-07 12:26:19,875 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:19,877 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 90 (nodes which have leafs as children)
2021-04-07 12:26:19,878 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:19,879 - INFO - dataset:dataset.py:73 - Nodes with children: 1

  0%|          | 0/287 [00:00<?, ?it/s]

2021-04-07 12:26:20,253 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (287, 40)


Assigning labels:   0%|          | 0/40 [00:00<?, ?it/s]

2021-04-07 12:26:20,343 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for bootstrap-reboot
2021-04-07 12:26:20,393 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (1043, 13)
2021-04-07 12:26:20,393 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:20,393 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:20,403 - INFO - dataset:dataset.py:63 - Leafs set size: 640 (nodes which have no children)
2021-04-07 12:26:20,404 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:20,406 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 309 (nodes which have leafs as children)
2021-04-07 12:26:20,408 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:20,408 - INFO - dataset:dataset.py:73 - Nodes with 

  0%|          | 0/1043 [00:00<?, ?it/s]

2021-04-07 12:26:21,540 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (1043, 40)


Assigning labels:   0%|          | 0/96 [00:00<?, ?it/s]

2021-04-07 12:26:22,046 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for mobile-and-html-5
2021-04-07 12:26:22,074 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (650, 13)
2021-04-07 12:26:22,074 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:22,074 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:22,074 - INFO - dataset:dataset.py:63 - Leafs set size: 398 (nodes which have no children)
2021-04-07 12:26:22,080 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:22,081 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 165 (nodes which have leafs as children)
2021-04-07 12:26:22,082 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:22,083 - INFO - dataset:dataset.py:73 - Nodes with 

  0%|          | 0/650 [00:00<?, ?it/s]

2021-04-07 12:26:23,055 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (650, 40)


Assigning labels:   0%|          | 0/114 [00:00<?, ?it/s]

2021-04-07 12:26:23,393 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for gitlab
2021-04-07 12:26:23,468 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (1803, 13)
2021-04-07 12:26:23,483 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:23,483 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:23,486 - INFO - dataset:dataset.py:63 - Leafs set size: 1018 (nodes which have no children)
2021-04-07 12:26:23,487 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:23,489 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 587 (nodes which have leafs as children)
2021-04-07 12:26:23,491 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:23,492 - INFO - dataset:dataset.py:73 - Nodes with children:

  0%|          | 0/1803 [00:00<?, ?it/s]

2021-04-07 12:26:25,793 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (1803, 40)


Assigning labels:   0%|          | 0/175 [00:00<?, ?it/s]

2021-04-07 12:26:26,521 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for ms-office
2021-04-07 12:26:26,576 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (739, 13)
2021-04-07 12:26:26,576 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:26,576 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:26,576 - INFO - dataset:dataset.py:63 - Leafs set size: 361 (nodes which have no children)
2021-04-07 12:26:26,582 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:26,582 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 255 (nodes which have leafs as children)
2021-04-07 12:26:26,582 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:26,582 - INFO - dataset:dataset.py:73 - Nodes with children

  0%|          | 0/739 [00:00<?, ?it/s]

2021-04-07 12:26:27,680 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (739, 40)


Assigning labels:   0%|          | 0/121 [00:00<?, ?it/s]

2021-04-07 12:26:27,989 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for google-voice
2021-04-07 12:26:27,999 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (98, 13)
2021-04-07 12:26:27,999 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:28,000 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:28,001 - INFO - dataset:dataset.py:63 - Leafs set size: 55 (nodes which have no children)
2021-04-07 12:26:28,002 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:28,003 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 25 (nodes which have leafs as children)
2021-04-07 12:26:28,004 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:28,005 - INFO - dataset:dataset.py:73 - Nodes with children

  0%|          | 0/98 [00:00<?, ?it/s]

2021-04-07 12:26:28,115 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (98, 40)


Assigning labels:   0%|          | 0/11 [00:00<?, ?it/s]

2021-04-07 12:26:28,307 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for metals-and-colors
2021-04-07 12:26:28,330 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (376, 13)
2021-04-07 12:26:28,331 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:28,332 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:28,332 - INFO - dataset:dataset.py:63 - Leafs set size: 169 (nodes which have no children)
2021-04-07 12:26:28,333 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:28,334 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 108 (nodes which have leafs as children)
2021-04-07 12:26:28,335 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:28,336 - INFO - dataset:dataset.py:73 - Nodes with 

  0%|          | 0/376 [00:00<?, ?it/s]

2021-04-07 12:26:28,907 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (376, 40)


Assigning labels:   0%|          | 0/34 [00:00<?, ?it/s]

2021-04-07 12:26:29,005 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for user-table
2021-04-07 12:26:29,011 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (369, 13)
2021-04-07 12:26:29,011 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:29,011 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:29,011 - INFO - dataset:dataset.py:63 - Leafs set size: 178 (nodes which have no children)
2021-04-07 12:26:29,026 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:29,028 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 117 (nodes which have leafs as children)
2021-04-07 12:26:29,029 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:29,029 - INFO - dataset:dataset.py:73 - Nodes with childre

  0%|          | 0/369 [00:00<?, ?it/s]

2021-04-07 12:26:29,543 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (369, 40)


Assigning labels:   0%|          | 0/58 [00:00<?, ?it/s]

2021-04-07 12:26:29,682 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for table-with-pages
2021-04-07 12:26:29,702 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (312, 13)
2021-04-07 12:26:29,702 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:29,703 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:29,704 - INFO - dataset:dataset.py:63 - Leafs set size: 146 (nodes which have no children)
2021-04-07 12:26:29,705 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:29,706 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 94 (nodes which have leafs as children)
2021-04-07 12:26:29,707 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:29,708 - INFO - dataset:dataset.py:73 - Nodes with ch

  0%|          | 0/312 [00:00<?, ?it/s]

2021-04-07 12:26:30,142 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (312, 40)


Assigning labels:   0%|          | 0/46 [00:00<?, ?it/s]

2021-04-07 12:26:30,241 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for angular
2021-04-07 12:26:31,140 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (6450, 13)
2021-04-07 12:26:31,140 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:31,147 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:31,149 - INFO - dataset:dataset.py:63 - Leafs set size: 3250 (nodes which have no children)
2021-04-07 12:26:31,149 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:31,156 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 1750 (nodes which have leafs as children)
2021-04-07 12:26:31,160 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:31,161 - INFO - dataset:dataset.py:73 - Nodes with childre

  0%|          | 0/6450 [00:00<?, ?it/s]

2021-04-07 12:26:39,470 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (6450, 40)


Assigning labels:   0%|          | 0/526 [00:00<?, ?it/s]

2021-04-07 12:26:46,244 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for bootstrap-1
2021-04-07 12:26:46,433 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (5229, 13)
2021-04-07 12:26:46,433 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:46,449 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:46,451 - INFO - dataset:dataset.py:63 - Leafs set size: 4581 (nodes which have no children)
2021-04-07 12:26:46,453 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:46,457 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 430 (nodes which have leafs as children)
2021-04-07 12:26:46,461 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:46,462 - INFO - dataset:dataset.py:73 - Nodes with chil

  0%|          | 0/5229 [00:00<?, ?it/s]

2021-04-07 12:26:52,253 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (5229, 40)


Assigning labels:   0%|          | 0/147 [00:00<?, ?it/s]

2021-04-07 12:26:54,754 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for dates
2021-04-07 12:26:54,776 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (316, 13)
2021-04-07 12:26:54,777 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:54,778 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:54,778 - INFO - dataset:dataset.py:63 - Leafs set size: 138 (nodes which have no children)
2021-04-07 12:26:54,779 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:54,781 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 101 (nodes which have leafs as children)
2021-04-07 12:26:54,782 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:54,783 - INFO - dataset:dataset.py:73 - Nodes with children: 17

  0%|          | 0/316 [00:00<?, ?it/s]

2021-04-07 12:26:55,187 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (316, 40)


Assigning labels:   0%|          | 0/48 [00:00<?, ?it/s]

2021-04-07 12:26:55,288 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for bootstrap
2021-04-07 12:26:55,334 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (1098, 13)
2021-04-07 12:26:55,334 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:55,334 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:55,334 - INFO - dataset:dataset.py:63 - Leafs set size: 536 (nodes which have no children)
2021-04-07 12:26:55,349 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:55,350 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 373 (nodes which have leafs as children)
2021-04-07 12:26:55,352 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:55,352 - INFO - dataset:dataset.py:73 - Nodes with childre

  0%|          | 0/1098 [00:00<?, ?it/s]

2021-04-07 12:26:56,839 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (1098, 40)


Assigning labels:   0%|          | 0/264 [00:00<?, ?it/s]

2021-04-07 12:26:57,553 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for react-ant
2021-04-07 12:26:57,577 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (703, 13)
2021-04-07 12:26:57,577 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:57,577 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:57,577 - INFO - dataset:dataset.py:63 - Leafs set size: 261 (nodes which have no children)
2021-04-07 12:26:57,591 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:57,592 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 199 (nodes which have leafs as children)
2021-04-07 12:26:57,594 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:57,594 - INFO - dataset:dataset.py:73 - Nodes with children

  0%|          | 0/703 [00:00<?, ?it/s]

2021-04-07 12:26:58,780 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (703, 40)


Assigning labels:   0%|          | 0/92 [00:00<?, ?it/s]

2021-04-07 12:26:58,984 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for different-elemants
2021-04-07 12:26:59,013 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (292, 13)
2021-04-07 12:26:59,013 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:59,014 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:59,015 - INFO - dataset:dataset.py:63 - Leafs set size: 132 (nodes which have no children)
2021-04-07 12:26:59,016 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:59,018 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 93 (nodes which have leafs as children)
2021-04-07 12:26:59,019 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:59,019 - INFO - dataset:dataset.py:73 - Nodes with 

  0%|          | 0/292 [00:00<?, ?it/s]

2021-04-07 12:26:59,376 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (292, 40)


Assigning labels:   0%|          | 0/50 [00:00<?, ?it/s]

2021-04-07 12:26:59,474 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for bootstrap-form-control
2021-04-07 12:26:59,532 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (1481, 13)
2021-04-07 12:26:59,532 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:26:59,532 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:26:59,547 - INFO - dataset:dataset.py:63 - Leafs set size: 1181 (nodes which have no children)
2021-04-07 12:26:59,548 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:26:59,550 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 221 (nodes which have leafs as children)
2021-04-07 12:26:59,551 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:26:59,552 - INFO - dataset:dataset.py:73 - Node

  0%|          | 0/1481 [00:00<?, ?it/s]

2021-04-07 12:27:01,226 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (1481, 40)


Assigning labels:   0%|          | 0/78 [00:00<?, ?it/s]

2021-04-07 12:27:01,723 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for login
2021-04-07 12:27:01,741 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (231, 13)
2021-04-07 12:27:01,742 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:27:01,743 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:27:01,744 - INFO - dataset:dataset.py:63 - Leafs set size: 100 (nodes which have no children)
2021-04-07 12:27:01,745 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:27:01,746 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 72 (nodes which have leafs as children)
2021-04-07 12:27:01,747 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:27:01,748 - INFO - dataset:dataset.py:73 - Nodes with children: 132

  0%|          | 0/231 [00:00<?, ?it/s]

2021-04-07 12:27:02,055 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (231, 40)


Assigning labels:   0%|          | 0/23 [00:00<?, ?it/s]

2021-04-07 12:27:02,155 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for bootstrap-form
2021-04-07 12:27:02,194 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (814, 13)
2021-04-07 12:27:02,194 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:27:02,194 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:27:02,194 - INFO - dataset:dataset.py:63 - Leafs set size: 475 (nodes which have no children)
2021-04-07 12:27:02,198 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:27:02,198 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 231 (nodes which have leafs as children)
2021-04-07 12:27:02,201 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:27:02,201 - INFO - dataset:dataset.py:73 - Nodes with chi

  0%|          | 0/814 [00:00<?, ?it/s]

2021-04-07 12:27:03,222 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (814, 40)


Assigning labels:   0%|          | 0/160 [00:00<?, ?it/s]

2021-04-07 12:27:03,665 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for search
2021-04-07 12:27:03,683 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (297, 13)
2021-04-07 12:27:03,685 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:27:03,686 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:27:03,687 - INFO - dataset:dataset.py:63 - Leafs set size: 139 (nodes which have no children)
2021-04-07 12:27:03,689 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:27:03,690 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 91 (nodes which have leafs as children)
2021-04-07 12:27:03,692 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:27:03,693 - INFO - dataset:dataset.py:73 - Nodes with children: 15

  0%|          | 0/297 [00:00<?, ?it/s]

2021-04-07 12:27:04,110 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (297, 40)


Assigning labels:   0%|          | 0/45 [00:00<?, ?it/s]

2021-04-07 12:27:04,226 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for html-5
2021-04-07 12:27:04,258 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (665, 13)
2021-04-07 12:27:04,258 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:27:04,259 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:27:04,260 - INFO - dataset:dataset.py:63 - Leafs set size: 413 (nodes which have no children)
2021-04-07 12:27:04,261 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:27:04,262 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 165 (nodes which have leafs as children)
2021-04-07 12:27:04,264 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:27:04,264 - INFO - dataset:dataset.py:73 - Nodes with children: 2

  0%|          | 0/665 [00:00<?, ?it/s]

2021-04-07 12:27:05,142 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (665, 40)


Assigning labels:   0%|          | 0/118 [00:00<?, ?it/s]

2021-04-07 12:27:05,456 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for complex-table
2021-04-07 12:27:05,481 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (426, 13)
2021-04-07 12:27:05,482 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:27:05,483 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:27:05,484 - INFO - dataset:dataset.py:63 - Leafs set size: 206 (nodes which have no children)
2021-04-07 12:27:05,485 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:27:05,487 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 131 (nodes which have leafs as children)
2021-04-07 12:27:05,488 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:27:05,489 - INFO - dataset:dataset.py:73 - Nodes with chil

  0%|          | 0/426 [00:00<?, ?it/s]

2021-04-07 12:27:06,125 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (426, 40)


Assigning labels:   0%|          | 0/61 [00:00<?, ?it/s]

2021-04-07 12:27:06,257 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for bootstrap-forms
2021-04-07 12:27:06,273 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (315, 13)
2021-04-07 12:27:06,274 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:27:06,275 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:27:06,275 - INFO - dataset:dataset.py:63 - Leafs set size: 152 (nodes which have no children)
2021-04-07 12:27:06,276 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:27:06,277 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 93 (nodes which have leafs as children)
2021-04-07 12:27:06,278 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:27:06,279 - INFO - dataset:dataset.py:73 - Nodes with chi

  0%|          | 0/315 [00:00<?, ?it/s]

2021-04-07 12:27:06,695 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (315, 40)


Assigning labels:   0%|          | 0/47 [00:00<?, ?it/s]

2021-04-07 12:27:06,813 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for contact-form
2021-04-07 12:27:06,834 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (339, 13)
2021-04-07 12:27:06,835 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:27:06,836 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:27:06,837 - INFO - dataset:dataset.py:63 - Leafs set size: 154 (nodes which have no children)
2021-04-07 12:27:06,839 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:27:06,840 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 99 (nodes which have leafs as children)
2021-04-07 12:27:06,841 - INFO - dataset:dataset.py:71 - count num children for each node
2021-04-07 12:27:06,842 - INFO - dataset:dataset.py:73 - Nodes with childr

  0%|          | 0/339 [00:00<?, ?it/s]

2021-04-07 12:27:07,294 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (339, 40)


Assigning labels:   0%|          | 0/38 [00:00<?, ?it/s]

2021-04-07 12:27:07,392 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:55 - Concatenate datasets
2021-04-07 12:27:07,456 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:57 - Dataset shape after reading: (27008, 42)
2021-04-07 12:27:08,791 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:338 - class_sm: (82062, 779)
2021-04-07 12:27:08,792 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:66 - Build OHE column for attribute role
2021-04-07 12:27:08,825 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:66 - Build OHE column for attribute type
2021-04-07 12:27:08,855 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:66 - Build OHE column for attribute ui
2021-04-07 12:27:08,891 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:70 - Build OHE column for attribute role_parent
2021-04-07 12:27:08,920 - INFO - <ipython-input-9-8e1de717026a>:<

In [11]:
train_dataset.dataset.columns

Index(['attributes', 'displayed', 'element_id', 'height', 'onmouseenter',
       'onmouseover', 'parent_id', 'style', 'tag_name', 'text', 'width', 'x',
       'y', 'is_leaf', 'num_leafs', 'num_children', 'sum_children_widths',
       'sum_children_hights', 'children_tags', 'num_followers',
       'attributes_parent', 'displayed_parent', 'element_id_parent',
       'height_parent', 'onmouseenter_parent', 'onmouseover_parent',
       'parent_id_parent', 'style_parent', 'tag_name_parent', 'text_parent',
       'width_parent', 'x_parent', 'y_parent', 'is_leaf_parent',
       'num_leafs_parent', 'num_children_parent', 'sum_children_widths_parent',
       'sum_children_hights_parent', 'children_tags_parent',
       'num_followers_parent', 'label', 'ds_name', 'cv_class', 'ohe_role',
       'ohe_type', 'ohe_ui', 'ohe_role_parent', 'ohe_type_parent',
       'ohe_ui_parent', 'attributes_text', 'attributes_parent_text'],
      dtype='object')

In [12]:
assert (train_dataset.dataset.element_id.nunique() == train_dataset.dataset.shape[0]) or train_dataset.rebalanced, 'There are nonunique values in the dataset, probably the dataset was rebalanced'
display(HTML("<h3>Check dataset.element_id for uniqueness: OK</h3>"))

In [13]:
display(HTML(f'<h3>Length of dataset: {train_dataset.dataset.shape[0]}<h3>'))

In [14]:
train_dataset.data.shape

(82062, 2002)

## Test Dataloader

In [15]:
BATCH_SIZE = 256
dataloader = DataLoader(train_dataset, shuffle=True, batch_size = BATCH_SIZE, collate_fn=train_dataset.collate_fn, pin_memory=True)

In [16]:
%%time
x, y = next(iter(dataloader))
x.shape, y.shape

Wall time: 1.45 s


(torch.Size([256, 2002]), torch.Size([256]))

In [17]:
print('Elapsed time:', dt.datetime.now() - START_DT)

Elapsed time: 0:00:59.209001


In [18]:
%%time

NUM_BATCHES = len(dataloader)

with trange(NUM_BATCHES) as bar:
    for x, y in dataloader:
        # print(x.shape, y.shape)
        bar.update(1)

  0%|          | 0/321 [00:00<?, ?it/s]

Wall time: 5.56 s


In [19]:
print('Elapsed time:', dt.datetime.now() - START_DT)

Elapsed time: 0:01:04.819124


In [20]:
test_dataset = JDIDataset(dataset_names=['angular'], rebalance=False)

2021-04-07 12:27:18,301 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:16 - List of dataset_names:['angular']
2021-04-07 12:27:18,302 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:21 - Dataset for angular
2021-04-07 12:27:19,235 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:23 - Dataset shape: (6450, 13)
2021-04-07 12:27:19,235 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:25 - cleaning tag_name from dummy/auxiliary words
2021-04-07 12:27:19,239 - INFO - dataset:dataset.py:61 - select all leafs (nodes which are not parents)
2021-04-07 12:27:19,241 - INFO - dataset:dataset.py:63 - Leafs set size: 3250 (nodes which have no children)
2021-04-07 12:27:19,243 - INFO - dataset:dataset.py:66 - count number of references to leafs
2021-04-07 12:27:19,246 - INFO - dataset:dataset.py:68 - Nodes with leafs as children set size: 1750 (nodes which have leafs as children)
2021-04-07 12:27:19,250 - INFO 

  0%|          | 0/6450 [00:00<?, ?it/s]

2021-04-07 12:27:27,662 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:35 - Dataset shape after merging with parents: (6450, 40)


Assigning labels:   0%|          | 0/526 [00:00<?, ?it/s]

2021-04-07 12:27:34,499 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:55 - Concatenate datasets
2021-04-07 12:27:34,512 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:57 - Dataset shape after reading: (6450, 42)
2021-04-07 12:27:34,544 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:338 - class_sm: (6450, 779)
2021-04-07 12:27:34,544 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:66 - Build OHE column for attribute role
2021-04-07 12:27:34,569 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:66 - Build OHE column for attribute type
2021-04-07 12:27:34,581 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:66 - Build OHE column for attribute ui
2021-04-07 12:27:34,591 - INFO - <ipython-input-9-8e1de717026a>:<ipython-input-9-8e1de717026a>:70 - Build OHE column for attribute role_parent
2021-04-07 12:27:34,597 - INFO - <ipython-input-9-8e1de717026a>:<ip

In [21]:
BATCH_SIZE = 1
dataloader = DataLoader(test_dataset, shuffle=False, batch_size = BATCH_SIZE, collate_fn=train_dataset.collate_fn, pin_memory=True)

In [22]:
%%time

NUM_BATCHES = len(dataloader)

with trange(NUM_BATCHES) as bar:
    for x, y in dataloader:
        #print(x.shape, y)
        bar.update(1)

  0%|          | 0/6450 [00:00<?, ?it/s]

Wall time: 3.85 s


In [23]:
print('Elapsed time:', dt.datetime.now() - START_DT)

Elapsed time: 0:01:25.395616
