In [1]:
import os, sys, re, gc, glob
import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from utils import logger, assign_labels, build_children_features
from IPython.display import display, clear_output, HTML
import datetime as dt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
import pickle

from tqdm.auto import trange
from collections import Counter

START_DT = dt.datetime.now()


2021-03-22 08:15:42,371 - INFO - dataset:dataset.py:27 - dataset package is loaded...


In [2]:
class JDIDataset(Dataset):
    
    def __init__(self, dataset_names:list=None, rebalance=False):
        super(JDIDataset, self).__init__()
        self.rebalanced = rebalance
               
        with open('dataset/classes.txt', 'r') as f:
            self.classes_dict = { class_name.strip():i for i, class_name in enumerate(f.readlines()) }
            self.classes_reverse_dict = { v:k for k, v in self.classes_dict.items()}
        self.dummy_class_value = self.classes_dict['n/a']
            
        if dataset_names is None:
            logger.warning('Using all available data to generate dataset')
            dataset_names = self._gen_dataset_names()
            
        logger.info(f"List of dataset_names:{dataset_names}")
        
        ds_list=[] # list of datasets to join
        
        for ds_name in dataset_names:
            logger.info(f'Dataset for {ds_name}')
            df = pd.read_parquet(f'dataset/df/{ds_name}.parquet')
            logger.info(f"Dataset shape: {df.shape}")

            logger.info('cleaning tag_name from dummy/auxiliary words')
            df.tag_name = df.tag_name.apply(lambda x: x.lower().replace('-example', ''))
            df = build_children_features(df=df)

            #----------------------------------------------------------------------------------------------
            # Merge children with parents
            # WARNING: There is a tag <HTML> without parent. Let's fix this issue
            df.parent_id = df.apply(lambda r: r.element_id if r.parent_id is None else r.parent_id, axis=1)            
            df = df.merge(df, left_on='parent_id', right_on='element_id', suffixes=('', '_parent'))
            logger.info(f"Dataset shape after merging with parents: {df.shape}")
            #----------------------------------------------------------------------------------------------
            
            # If annotation file exists, lets load it and assign labels
            if os.path.exists(f'dataset/annotations/{ds_name}.txt'):
                logger.warning(f'Load LABELS from dataset/annotations/{ds_name}.txt')
                df = assign_labels(df, annotations_file_path=f'dataset/annotations/{ds_name}.txt', 
                                       img=plt.imread(f'dataset/images/{ds_name}.png'),
                                       dummy_value = self.dummy_class_value
                                  )
            else:
                logger.warning(f'assign dummy values [n/a] for labels if there is no annotations')
                df['label'] = self.dummy_class_value
            
            df['ds_name'] = ds_name
            ds_list.append(df)            
                
        logger.info('Concatenate datasets')
        self.dataset = pd.concat(ds_list)
        logger.info(f"Dataset shape after reading: {self.dataset.shape}")
        
        if rebalance:
            self._oversample()
        
        ### add ohe_ columns to one hot encoding several attributes
        for attr in ['role', 'type', 'ui']:
            logger.info(f'Build OHE column for attribute {attr}')
            self.dataset['ohe_' + attr] = self.dataset['attributes'].apply(lambda x: x.get(attr)).fillna("").str.lower()
            
        for attr in ['role', 'type', 'ui']:
            logger.info(f'Build OHE column for attribute {attr}_parent')
            self.dataset['ohe_' + attr+'_parent'] = self.dataset['attributes_parent'].apply(lambda x: x.get(attr)).fillna("").str.lower()
            
        logger.info('OHE tag_name')
        self.tag_name_sm = self._ohe_column('tag_name')
        logger.info(f'tag_name_sm.shape: {self.tag_name_sm.shape}')

        logger.info('OHE tag_name_parent')
        self.tag_name_parent_sm = self._ohe_column(colname='tag_name_parent', ohe_file_path='model/tag_name.pkl')
        logger.info(f'tag_name_parent_sm.shape: {self.tag_name_parent_sm.shape}')

        logger.info('OHE ohe_role')
        self.ohe_role_sm = self._ohe_column('ohe_role')
        logger.info(f'ohe_role_sm.shape: {self.ohe_role_sm.shape}')

        logger.info('OHE ohe_role_parent')
        self.ohe_role_parent_sm = self._ohe_column(colname='ohe_role_parent', ohe_file_path='model/ohe_role.pkl')
        logger.info(f'ohe_role_parent_sm.shape: {self.ohe_role_parent_sm.shape}')
        
        logger.info('OHE ohe_type')
        self.ohe_type_sm = self._ohe_column('ohe_type')
        logger.info(f'ohe_type_sm.shape: {self.ohe_type_sm.shape}')

        logger.info('OHE ohe_type_parent')
        self.ohe_type_parent_sm = self._ohe_column(colname='ohe_type_parent', ohe_file_path='model/ohe_type.pkl')
        logger.info(f'ohe_type_parent_sm.shape: {self.ohe_type_parent_sm.shape}')
        
        logger.info('OHE ohe_ui')
        self.ohe_ui_sm = self._ohe_column('ohe_ui')
        logger.info(f'ohe_ui_sm.shape: {self.ohe_ui_sm.shape}')

        logger.info('OHE ohe_ui_parent')
        self.ohe_ui_parent_sm = self._ohe_column('ohe_ui_parent', ohe_file_path='model/ohe_ui.pkl')
        logger.info(f'ohe_ui_parent_sm.shape: {self.ohe_ui_parent_sm.shape}')
        
        ## extract all non null attributes names
        self.dataset['attributes_text'] = self.dataset.attributes.apply(lambda x: " ".join([k for k in x.keys() if x[k] is not None ]))
        logger.info('Fit CountVectorizer for column "attributes"')
        self.attributes_sm = self._count_vectorizer_column('attributes_text')
        logger.info(f'attributes_sm.shape: {self.attributes_sm.shape}')

        ## extract all non null attributes names
        self.dataset['attributes_parent_text'] = self.dataset.attributes_parent.apply(lambda x: " ".join([k for k in x.keys() if x[k] is not None ]))
        logger.info('Fit CountVectorizer for column "attributes_parent"')
        self.attributes_parent_sm = self._count_vectorizer_column(colname='attributes_parent_text', 
                                                                  file_path='model/count_vectorizer_attributes_text.pkl')
        logger.info(f'attributes_parent_sm.shape: {self.attributes_parent_sm.shape}')
        
        self._extract_features()
        
        self.labels = self.dataset.label.astype(int).map(self.classes_reverse_dict)
        self.dataset.label = self.dataset.label.astype(int)
        
        self.data = hstack([
                    self.attributes_sm, 
                    self.tag_name_sm, 
                    self.ohe_role_sm, 
                    self.ohe_type_sm, 
                    self.ohe_ui_sm,
                    self.tag_name_parent_sm,
                    self.ohe_type_parent_sm,
                    self.ohe_role_parent_sm,
                    self.ohe_ui_parent_sm,
                    self.attributes_parent_sm,
                    self.features_df.values
                  ]).astype(np.float32)
        
        logger.info(f'OHE columns sparse matrix: {self.data.shape}')
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        return np.array(self.data.getrow(idx).todense()[0]).squeeze(), self.dataset.iloc[idx]['label']
    

    def _oversample(self):
        logger.warning('Oversample data to balance the dataset, this will create duplicated rows in dataset')
        
        class_counts = [ r for r in self.dataset.label.value_counts().sort_values(ascending=False).items()]
        max_count = class_counts[0][1]

        dfs = [self.dataset]
        for cc in class_counts[1:]:
            ratio = max_count//cc[1]
            ratio = 20 if ratio >= 20 else ratio
            for _ in range(ratio):
                dfs.append(self.dataset[self.dataset.label == cc[0]].copy())

        self.dataset = pd.concat(dfs)
        logger.warning(f'Rebalanced dataset size: {self.dataset.shape[0]}')
        
                
    def _ohe_column(self, colname, ohe_file_path=None):
        """
            load attr_ohe if exists model/attr_ohe.pkl
            otherwise build the one
        """
        if ohe_file_path is None:
            file_path = f'model/{colname}.pkl'
        else:
            file_path = ohe_file_path
            
        if os.path.exists(file_path):
            logger.warning(f'loading existing OHE for column "{colname}" from {file_path}')
            with open(file_path,'rb') as f: 
                ohe = pickle.load(f)
        else:
            logger.warning(f'Create and fit OHE for column "{colname}"')
            ohe = OneHotEncoder(handle_unknown='ignore')
            ohe.fit(np.expand_dims(self.dataset[colname].values, axis=1))            
            with open(file_path, 'wb') as f:
                pickle.dump(ohe, f)
                
        sm = ohe.transform(np.expand_dims(self.dataset[colname].values, axis=1))        
        return sm
        
    def _count_vectorizer_column(self, colname, file_path=None):
        """
            load count_vercorizer for a column if a pkl file exists
            otherwise create the one
        """
        if file_path is None:
            file_path = f'model/count_vectorizer_{colname}.pkl'
            
        if os.path.exists(file_path):
            logger.warning(f'loading existing count vectorizer for column "{colname}" from {file_path}')
            with open(file_path,'rb') as f: 
                vectorizer = pickle.load(f)
            self.vocabulary = vectorizer.vocabulary_
        else:
            logger.warning(f'Create and fit count vectorizer for column "{colname}"')
            vectorizer = CountVectorizer(vocabulary=self._build_vocabulary())
            vectorizer.fit(self.dataset[colname].values)            
            with open(file_path, 'wb') as f:
                pickle.dump(vectorizer, f)
                
        sm = vectorizer.transform(self.dataset[colname].values)
        return sm
    
    def _build_vocabulary(self):
        
        """
            Attempt to reduce number of features by removing rarely used attributes
        """
        
        attributes_usage = Counter()
        for attr_list in self.dataset.attributes.apply(lambda x: [field for field in x if x[field] is not None]).values:
            attributes_usage.update(attr_list)
            
        attributes_usasge_df = pd.DataFrame( 
            [[attribute, cnt] for attribute, cnt in attributes_usage.items()], 
            columns=['attribute', 'cnt']
        ).sort_values(by='cnt', ascending=False)

        ## Lets cut off attributes which are rarely used
        attributes_list_df = attributes_usasge_df[attributes_usasge_df.cnt>1].copy()
        
        attr_unique_values_map = { 
                                    attr:self.dataset['attributes'].apply(lambda x: x.get(attr)).nunique() 
                                         for attr in (attributes_list_df.attribute.values)
                                 }
        attributes_list_df['num_unique_values'] = attributes_list_df.attribute.map(attr_unique_values_map)
        attributes_list_df['k'] = attributes_list_df.cnt/attributes_list_df.num_unique_values
        
        attributes_list_df = attributes_list_df[(attributes_list_df.k>2.0) & (attributes_list_df.num_unique_values>1)]\
                                    .sort_values(by='cnt', ascending=False).copy()
        
        self.vocabulary = { w:i for i,w in enumerate(sorted(attributes_list_df.attribute.values))}
        return self.vocabulary
        
        
    def _extract_features(self):
        self.features_df = self.dataset[[
            #'tag_name_parent',
            #'tag_name',
            'width', 
            'height', 
            'width_parent', 
            'height_parent', 
            'x', 
            'x_parent', 
            'y', 
            'y_parent',
            'is_leaf',
            'is_leaf_parent',
            'num_leafs',
            #'num_leafs_parent',
            'num_children',
            'num_children_parent',
            'sum_children_widths',
            'sum_children_widths_parent',
            'sum_children_hights',
            'sum_children_hights_parent',
            'displayed',
            #'onmouseenter'
        ]].copy()
        
        self.features_df.sum_children_hights = (self.features_df.sum_children_hights/self.features_df.num_children).fillna(-1)
        self.features_df.sum_children_hights_parent = (self.features_df.sum_children_hights_parent/self.features_df.num_children_parent).fillna(-1)
        self.features_df.sum_children_widths = (self.features_df.sum_children_widths/self.features_df.num_children).fillna(-1)
        self.features_df.sum_children_widths_parent = (self.features_df.sum_children_widths_parent/self.features_df.num_children_parent).fillna(-1)

        self.features_df.x = (self.features_df.x < 0).astype(int)
        self.features_df.y = (self.features_df.y < 0).astype(int)
        self.features_df.x_parent = (self.features_df.x_parent < 0).astype(int)
        self.features_df.y_parent = (self.features_df.y_parent < 0).astype(int)
        self.features_df['w'] = (self.features_df.width <= 2).astype(int)
        self.features_df['w_parent'] = (self.features_df.width_parent <= 2).astype(int)
        self.features_df['h'] = (self.features_df.height <= 2).astype(int)
        self.features_df['h_parent'] = (self.features_df.height_parent <= 2).astype(int)
        self.features_df.displayed = self.features_df.displayed.astype(int)
        
        
        
        
    def _find_dataset_names(self, path_mask='dataset/df/*.parquet'):
        return  set([re.sub( '.*[/\\\]', '', re.sub('\\..*$', '', os.path.normpath(fn)))
                    for fn in glob.glob(path_mask)])


    def _gen_dataset_names(self):
            dfs = self._find_dataset_names('dataset/df/*.parquet')
            imgs = self._find_dataset_names('dataset/images/*.png')
            anns = self._find_dataset_names('dataset/annotations/*.txt')

            return (dfs.intersection(imgs)).intersection(anns)
        
    

    

In [3]:
train_dataset = JDIDataset(rebalance=True)

2021-03-22 08:15:42,471 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:16 - List of dataset_names:{'table-with-pages', 'angular', 'bootstrap-form', 'user-table', 'dates', 'support', 'bootstrap', 'contact-form', 'react-ant', 'login', 'bootstrap-forms', 'search', 'complex-table', 'metals-and-colors', 'mobile-and-html-5', 'html-5', 'different-elemants', 'performance'}
2021-03-22 08:15:42,471 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for table-with-pages
2021-03-22 08:15:42,526 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (312, 13)
2021-03-22 08:15:42,527 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:42,528 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:42,529 - INFO - dataset:dataset.py:58 - Leafs set size: 146 (nodes which have no childre

Assigning labels:   0%|          | 0/46 [00:00<?, ?it/s]

2021-03-22 08:15:42,837 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for angular
2021-03-22 08:15:43,760 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (6450, 13)
2021-03-22 08:15:43,761 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:43,765 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:43,767 - INFO - dataset:dataset.py:58 - Leafs set size: 3250 (nodes which have no children)
2021-03-22 08:15:43,771 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:43,775 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 1750 (nodes which have leafs as children)
2021-03-22 08:15:43,780 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:43,782 - INFO - dataset:dataset.py:68 - Nodes with childre

Assigning labels:   0%|          | 0/526 [00:00<?, ?it/s]

2021-03-22 08:15:50,956 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for bootstrap-form
2021-03-22 08:15:50,995 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (814, 13)
2021-03-22 08:15:50,995 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:50,997 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:50,997 - INFO - dataset:dataset.py:58 - Leafs set size: 475 (nodes which have no children)
2021-03-22 08:15:50,998 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:51,000 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 231 (nodes which have leafs as children)
2021-03-22 08:15:51,001 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:51,002 - INFO - dataset:dataset.py:68 - Nodes with chi

Assigning labels:   0%|          | 0/160 [00:00<?, ?it/s]

2021-03-22 08:15:51,434 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for user-table
2021-03-22 08:15:51,451 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (369, 13)
2021-03-22 08:15:51,452 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:51,453 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:51,454 - INFO - dataset:dataset.py:58 - Leafs set size: 178 (nodes which have no children)
2021-03-22 08:15:51,455 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:51,456 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 117 (nodes which have leafs as children)
2021-03-22 08:15:51,457 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:51,457 - INFO - dataset:dataset.py:68 - Nodes with childre

Assigning labels:   0%|          | 0/58 [00:00<?, ?it/s]

2021-03-22 08:15:51,559 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for dates
2021-03-22 08:15:51,574 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (316, 13)
2021-03-22 08:15:51,574 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:51,576 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:51,578 - INFO - dataset:dataset.py:58 - Leafs set size: 138 (nodes which have no children)
2021-03-22 08:15:51,578 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:51,579 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 101 (nodes which have leafs as children)
2021-03-22 08:15:51,580 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:51,581 - INFO - dataset:dataset.py:68 - Nodes with children: 17

Assigning labels:   0%|          | 0/48 [00:00<?, ?it/s]

2021-03-22 08:15:51,669 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for support
2021-03-22 08:15:51,684 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (287, 13)
2021-03-22 08:15:51,685 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:51,686 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:51,687 - INFO - dataset:dataset.py:58 - Leafs set size: 128 (nodes which have no children)
2021-03-22 08:15:51,688 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:51,689 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 90 (nodes which have leafs as children)
2021-03-22 08:15:51,690 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:51,690 - INFO - dataset:dataset.py:68 - Nodes with children: 1

Assigning labels:   0%|          | 0/40 [00:00<?, ?it/s]

2021-03-22 08:15:51,770 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for bootstrap
2021-03-22 08:15:51,822 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (1098, 13)
2021-03-22 08:15:51,823 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:51,824 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:51,825 - INFO - dataset:dataset.py:58 - Leafs set size: 536 (nodes which have no children)
2021-03-22 08:15:51,826 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:51,829 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 373 (nodes which have leafs as children)
2021-03-22 08:15:51,830 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:51,831 - INFO - dataset:dataset.py:68 - Nodes with childre

Assigning labels:   0%|          | 0/264 [00:00<?, ?it/s]

2021-03-22 08:15:52,556 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for contact-form
2021-03-22 08:15:52,573 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (339, 13)
2021-03-22 08:15:52,573 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:52,574 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:52,575 - INFO - dataset:dataset.py:58 - Leafs set size: 154 (nodes which have no children)
2021-03-22 08:15:52,577 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:52,578 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 99 (nodes which have leafs as children)
2021-03-22 08:15:52,580 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:52,581 - INFO - dataset:dataset.py:68 - Nodes with childr

Assigning labels:   0%|          | 0/38 [00:00<?, ?it/s]

2021-03-22 08:15:52,663 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for react-ant
2021-03-22 08:15:52,695 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (703, 13)
2021-03-22 08:15:52,696 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:52,697 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:52,698 - INFO - dataset:dataset.py:58 - Leafs set size: 261 (nodes which have no children)
2021-03-22 08:15:52,699 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:52,700 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 199 (nodes which have leafs as children)
2021-03-22 08:15:52,701 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:52,702 - INFO - dataset:dataset.py:68 - Nodes with children

Assigning labels:   0%|          | 0/91 [00:00<?, ?it/s]

2021-03-22 08:15:52,911 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for login
2021-03-22 08:15:52,926 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (231, 13)
2021-03-22 08:15:52,927 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:52,928 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:52,929 - INFO - dataset:dataset.py:58 - Leafs set size: 100 (nodes which have no children)
2021-03-22 08:15:52,930 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:52,931 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 72 (nodes which have leafs as children)
2021-03-22 08:15:52,932 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:52,932 - INFO - dataset:dataset.py:68 - Nodes with children: 132

Assigning labels:   0%|          | 0/23 [00:00<?, ?it/s]

2021-03-22 08:15:53,015 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for bootstrap-forms
2021-03-22 08:15:53,031 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (315, 13)
2021-03-22 08:15:53,032 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:53,033 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:53,034 - INFO - dataset:dataset.py:58 - Leafs set size: 152 (nodes which have no children)
2021-03-22 08:15:53,035 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:53,037 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 93 (nodes which have leafs as children)
2021-03-22 08:15:53,038 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:53,038 - INFO - dataset:dataset.py:68 - Nodes with chi

Assigning labels:   0%|          | 0/46 [00:00<?, ?it/s]

2021-03-22 08:15:53,132 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for search
2021-03-22 08:15:53,147 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (297, 13)
2021-03-22 08:15:53,148 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:53,149 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:53,150 - INFO - dataset:dataset.py:58 - Leafs set size: 139 (nodes which have no children)
2021-03-22 08:15:53,151 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:53,152 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 91 (nodes which have leafs as children)
2021-03-22 08:15:53,153 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:53,154 - INFO - dataset:dataset.py:68 - Nodes with children: 15

Assigning labels:   0%|          | 0/47 [00:00<?, ?it/s]

2021-03-22 08:15:53,238 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for complex-table
2021-03-22 08:15:53,259 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (426, 13)
2021-03-22 08:15:53,260 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:53,261 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:53,262 - INFO - dataset:dataset.py:58 - Leafs set size: 206 (nodes which have no children)
2021-03-22 08:15:53,263 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:53,264 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 131 (nodes which have leafs as children)
2021-03-22 08:15:53,265 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:53,265 - INFO - dataset:dataset.py:68 - Nodes with chil

Assigning labels:   0%|          | 0/61 [00:00<?, ?it/s]

2021-03-22 08:15:53,366 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for metals-and-colors
2021-03-22 08:15:53,383 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (376, 13)
2021-03-22 08:15:53,384 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:53,385 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:53,386 - INFO - dataset:dataset.py:58 - Leafs set size: 169 (nodes which have no children)
2021-03-22 08:15:53,388 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:53,389 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 108 (nodes which have leafs as children)
2021-03-22 08:15:53,390 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:53,390 - INFO - dataset:dataset.py:68 - Nodes with 

Assigning labels:   0%|          | 0/34 [00:00<?, ?it/s]

2021-03-22 08:15:53,468 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for mobile-and-html-5
2021-03-22 08:15:53,496 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (650, 13)
2021-03-22 08:15:53,497 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:53,499 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:53,499 - INFO - dataset:dataset.py:58 - Leafs set size: 398 (nodes which have no children)
2021-03-22 08:15:53,500 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:53,501 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 165 (nodes which have leafs as children)
2021-03-22 08:15:53,502 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:53,503 - INFO - dataset:dataset.py:68 - Nodes with 

Assigning labels:   0%|          | 0/113 [00:00<?, ?it/s]

2021-03-22 08:15:53,796 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for html-5
2021-03-22 08:15:53,825 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (665, 13)
2021-03-22 08:15:53,825 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:53,827 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:53,828 - INFO - dataset:dataset.py:58 - Leafs set size: 413 (nodes which have no children)
2021-03-22 08:15:53,830 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:53,831 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 165 (nodes which have leafs as children)
2021-03-22 08:15:53,832 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:53,833 - INFO - dataset:dataset.py:68 - Nodes with children: 2

Assigning labels:   0%|          | 0/118 [00:00<?, ?it/s]

2021-03-22 08:15:54,140 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for different-elemants
2021-03-22 08:15:54,155 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (292, 13)
2021-03-22 08:15:54,156 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:54,157 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:54,157 - INFO - dataset:dataset.py:58 - Leafs set size: 132 (nodes which have no children)
2021-03-22 08:15:54,159 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:54,160 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 93 (nodes which have leafs as children)
2021-03-22 08:15:54,162 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:54,162 - INFO - dataset:dataset.py:68 - Nodes with 

Assigning labels:   0%|          | 0/50 [00:00<?, ?it/s]

2021-03-22 08:15:54,240 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for performance
2021-03-22 08:15:54,326 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (2675, 13)
2021-03-22 08:15:54,327 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:15:54,329 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:15:54,330 - INFO - dataset:dataset.py:58 - Leafs set size: 2121 (nodes which have no children)
2021-03-22 08:15:54,331 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:15:54,333 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 487 (nodes which have leafs as children)
2021-03-22 08:15:54,335 - INFO - dataset:dataset.py:66 - count num children for each node
2021-03-22 08:15:54,336 - INFO - dataset:dataset.py:68 - Nodes with chil

Assigning labels:   0%|          | 0/42 [00:00<?, ?it/s]

2021-03-22 08:15:55,164 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:51 - Concatenate datasets
2021-03-22 08:15:55,243 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:53 - Dataset shape after reading: (16615, 38)
2021-03-22 08:15:55,785 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:60 - Build OHE column for attribute role
2021-03-22 08:15:55,804 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:60 - Build OHE column for attribute type
2021-03-22 08:15:55,823 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:60 - Build OHE column for attribute ui
2021-03-22 08:15:55,841 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:64 - Build OHE column for attribute role_parent
2021-03-22 08:15:55,858 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:64 - Build OHE column for attribute type_parent
2021-03-22 08:15:55,875 - INFO - <ipython-inpu

In [4]:
train_dataset[120]

(array([  0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   1. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   1. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
      

In [5]:
train_dataset.labels.value_counts()

link                14864
n/a                 14810
button               6447
textfield            3591
checkbox             2121
radiobutton          1869
dropdown             1785
range                 630
table                 525
datetimeselector      483
textarea              336
fileinput             147
numberselector        105
dropdownselector      105
iframe                 84
colorpicker            84
checklist              42
image                  42
Name: label, dtype: int64

In [6]:
pd.DataFrame(train_dataset.dataset.label.value_counts()).reset_index().sort_values(by='index')

Unnamed: 0,index,label
2,0,6447
0,1,14864
14,2,84
3,3,3591
6,4,1785
4,5,2121
10,6,336
11,9,147
16,10,42
15,11,84


In [7]:
train_dataset.dataset.columns


Index(['attributes', 'displayed', 'element_id', 'height', 'onmouseenter',
       'onmouseover', 'parent_id', 'style', 'tag_name', 'text', 'width', 'x',
       'y', 'is_leaf', 'num_leafs', 'num_children', 'sum_children_widths',
       'sum_children_hights', 'attributes_parent', 'displayed_parent',
       'element_id_parent', 'height_parent', 'onmouseenter_parent',
       'onmouseover_parent', 'parent_id_parent', 'style_parent',
       'tag_name_parent', 'text_parent', 'width_parent', 'x_parent',
       'y_parent', 'is_leaf_parent', 'num_leafs_parent', 'num_children_parent',
       'sum_children_widths_parent', 'sum_children_hights_parent', 'label',
       'ds_name', 'ohe_role', 'ohe_type', 'ohe_ui', 'ohe_role_parent',
       'ohe_type_parent', 'ohe_ui_parent', 'attributes_text',
       'attributes_parent_text'],
      dtype='object')

In [8]:
train_dataset.vocabulary

{'alt': 0,
 'appearance': 1,
 'aria-checked': 2,
 'aria-controls': 3,
 'aria-describedby': 4,
 'aria-disabled': 5,
 'aria-expanded': 6,
 'aria-haspopup': 7,
 'aria-hidden': 8,
 'aria-label': 9,
 'aria-multiselectable': 10,
 'aria-posinset': 11,
 'aria-pressed': 12,
 'aria-required': 13,
 'aria-selected': 14,
 'aria-setsize': 15,
 'aria-valuemax': 16,
 'aria-valuemin': 17,
 'aria-valuenow': 18,
 'checked': 19,
 'class': 20,
 'color': 21,
 'cols': 22,
 'colspan': 23,
 'content': 24,
 'cx': 25,
 'cy': 26,
 'd': 27,
 'data-dt-idx': 28,
 'data-placement': 29,
 'data-slide-to': 30,
 'data-target': 31,
 'data-toggle': 32,
 'disabled': 33,
 'fill': 34,
 'for': 35,
 'height': 36,
 'href': 37,
 'id': 38,
 'index': 39,
 'list': 40,
 'mat-sort-header': 41,
 'matbadge': 42,
 'matbadgecolor': 43,
 'mattooltip': 44,
 'max': 45,
 'maxlength': 46,
 'min': 47,
 'mode': 48,
 'multiple': 49,
 'name': 50,
 'onchange': 51,
 'onclick': 52,
 'oncontextmenu': 53,
 'oninput': 54,
 'placeholder': 55,
 'preserveA

In [9]:
assert (train_dataset.dataset.element_id.nunique() == train_dataset.dataset.shape[0]) or train_dataset.rebalanced, 'There are nonunique values in the dataset, probably the dataset was rebalanced'
display(HTML("<h3>Check dataset.element_id for uniqueness: OK</h3>"))

In [10]:
display(HTML(f'<h3>Length of dataset: {train_dataset.dataset.shape[0]}<h3>'))

In [11]:
train_dataset.data.shape

(48070, 842)

## Test Dataloader

In [12]:
BATCH_SIZE = 256
dataloader = DataLoader(train_dataset, shuffle=True, batch_size = BATCH_SIZE)

In [13]:
next(iter(dataloader))

[tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([18,  3,  1,  1,  0,  1, 20,  4,  1, 20,  0, 20,  1, 20,  1, 20,  1, 18,
          1,  0,  1,  1, 20,  0,  1,  1, 12,  3,  0,  4,  1,  0, 20, 20, 20,  3,
         20,  0,  5,  1,  0,  1, 20,  0,  0,  0, 20, 20,  0,  1, 20, 14,  1, 20,
          0, 20,  5, 18,  1,  1, 20, 20,  1, 20, 20, 20,  0, 20,  0, 20,  1,  1,
          0,  1,  0,  1, 20,  1, 12,  1,  5,  1,  1,  1,  5, 20,  0,  3,  0,  1,
          0,  4, 20,  3, 18,  1,  1, 20, 20,  3, 20,  1, 20,  0,  1,  1,  1, 20,
          1,  1,  1, 20, 20,  1, 20,  4, 18,  1, 20, 20,  1,  1,  0,  1,  1, 14,
          1, 20, 20,  1,  0, 20, 20, 18,  3, 20,  1,  0,  0,  3,  0, 20,  1,  4,
          3,  0,  1, 20,  0,  1,  1, 20,  1,  4,  0,  3,  1,  1,  3,  0,  1,  1,
         

In [14]:
print('Elapsed time:', dt.datetime.now() - START_DT)

Elapsed time: 0:00:21.153855


In [15]:
%%time

NUM_BATCHES = len(dataloader)

with trange(NUM_BATCHES) as bar:
    for x, y in dataloader:
        print(x.shape, y.shape)
        bar.update(1)

  0%|          | 0/188 [00:00<?, ?it/s]

torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256, 842]) torch.Size([256])
torch.Size([256,

In [16]:
print('Elapsed time:', dt.datetime.now() - START_DT)

Elapsed time: 0:08:03.681160


In [17]:
test_dataset = JDIDataset(dataset_names=['angular'], rebalance=False)

2021-03-22 08:24:03,056 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:16 - List of dataset_names:['angular']
2021-03-22 08:24:03,057 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:21 - Dataset for angular
2021-03-22 08:24:03,990 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:23 - Dataset shape: (6450, 13)
2021-03-22 08:24:03,991 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:25 - cleaning tag_name from dummy/auxiliary words
2021-03-22 08:24:03,994 - INFO - dataset:dataset.py:56 - select all leafs (nodes which are not parents)
2021-03-22 08:24:03,996 - INFO - dataset:dataset.py:58 - Leafs set size: 3250 (nodes which have no children)
2021-03-22 08:24:04,000 - INFO - dataset:dataset.py:61 - count number of references to leafs
2021-03-22 08:24:04,003 - INFO - dataset:dataset.py:63 - Nodes with leafs as children set size: 1750 (nodes which have leafs as children)
2021-03-22 08:24:04,008 - INFO 

Assigning labels:   0%|          | 0/526 [00:00<?, ?it/s]

2021-03-22 08:24:11,215 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:51 - Concatenate datasets
2021-03-22 08:24:11,222 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:53 - Dataset shape after reading: (6450, 38)
2021-03-22 08:24:11,223 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:60 - Build OHE column for attribute role
2021-03-22 08:24:11,229 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:60 - Build OHE column for attribute type
2021-03-22 08:24:11,235 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:60 - Build OHE column for attribute ui
2021-03-22 08:24:11,243 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:64 - Build OHE column for attribute role_parent
2021-03-22 08:24:11,249 - INFO - <ipython-input-2-e1d8b3799ba9>:<ipython-input-2-e1d8b3799ba9>:64 - Build OHE column for attribute type_parent
2021-03-22 08:24:11,255 - INFO - <ipython-input

In [21]:
BATCH_SIZE = 1
dataloader = DataLoader(test_dataset, shuffle=False, batch_size = BATCH_SIZE)

In [24]:
%%time

NUM_BATCHES = len(dataloader)

with trange(NUM_BATCHES) as bar:
    for x, y in dataloader:
        #print(x.shape, y)
        bar.update(1)

  0%|          | 0/6450 [00:00<?, ?it/s]

Wall time: 8.88 s
