In [1]:
import sys
import re
import os
from datetime import datetime
from time import sleep
from IPython.display import display, clear_output, HTML
import logging

# set working dir
WORKING_DIR = re.sub('jdi-qasp-ml.*$','',os.path.normpath(os.getcwd())) + 'jdi-qasp-ml'
os.chdir(WORKING_DIR)

import utils
from utils import *
import torch
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader

from glob import glob
import selenium
from tqdm.auto import tqdm, trange

START_TS = datetime.now()
display(HTML(f'{START_TS} - Current working directory: <b>{WORKING_DIR}</b>'))

import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline
from scipy.sparse import csc_matrix, csr_matrix, vstack

import numba


2021-06-18 18:29:32,338 -INFO - config:config.py:16 - Module utils.config was loaded
2021-06-18 18:29:32,341 -INFO - common:common.py:618 - Module utils.common is loaded...
2021-06-18 18:29:32,343 -INFO - hidden:hidden.py:121 - hidden module is loaded
2021-06-18 18:29:32,670 -INFO - features_builder:features_builder.py:305 - feature_bilder module is loaded...
2021-06-18 18:29:32,672 -INFO - dataset_builder:dataset_builder.py:207 - dataset package is loaded...
2021-06-18 18:29:33,015 -INFO - dataset:dataset.py:273 - dataset module is loaded...


In [2]:
TRAIN_DATASETS = [
    'angular',
    'bootstrap-1',
    # 'bootstrap-form-control',
    'bootstrap-form',
    'bootstrap-forms',
    'bootstrap-reboot',
    'bootstrap',
    'complex-table',
    'contact-form',
    # 'dates',
    "different-elements",
    'gitlab',
    'google-voice',
    'html-5',
    'login',
    'metals-and-colors',
    # 'mobile-and-html-5',
    'ms-office',
    'ozon',
    'performance',
    'react-ant',
    # 'search',
    # 'support',
    # 'table-with-pages',
    'user-table',
    'wildberries',
    'material-ui-Button Groups',
    'material-ui-Switch',
    'material-ui-Textarea Autosize',
    'material-ui-Progress',
    'material-ui-Radio',
    'material-ui-Buttons',
    'material-ui-Checkbox',
    'material-ui-List',
    'material-ui-Text Field',
    'material-ui-Floating Action Button',
    'material-ui-Slider',
    'material-ui-Select'
]

TEST_DATASETS = [
    'dates',
    'search',
    'support',
    'table-with-pages',
    'mobile-and-html-5',
    'phys-org'
]

In [3]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
logger.info(f'device: {DEVICE}')

2021-06-18 18:29:33,877 -INFO - <ipython-input-3-43c6dd9f1c24>:<ipython-input-3-43c6dd9f1c24>:2 - device: cuda


In [4]:
class JDNDataset(Dataset):
    
    def __init__(self, datasets_list: list=None, rebalance_and_suffle: bool=True):
        
        super(JDNDataset, self).__init__()
        self.rebalance_and_suffle = rebalance_and_suffle
        
        if datasets_list is None:
            logger.info('Will use all available datasets')
            ds_files = glob('dataset/df/*.parquet')
            ds_files = [(fn, 'dataset/annotations/' + re.split(r'[/\\]', re.sub(r'\.parquet$', '', fn))[-1] + '.txt') for fn in ds_files]
        else:
            ds_files = [(f'dataset/df/{fn}.parquet', f'dataset/annotations/{fn}.txt') for fn in datasets_list]
        
        # display(ds_files)
        
        df_list = []
        
        logger.setLevel(logging.ERROR)
        with trange(len(ds_files)) as bar:
            for df_file_path, ann_file_path in ds_files:
                if not os.path.exists(df_file_path):
                    logger.error(f'File: {df_file_path} does not extst')
                else:
                    bar.set_postfix_str(f'{df_file_path}, {ann_file_path}')
                    df = pd.read_parquet(df_file_path)
                    df = build_features(df)
                    df = assign_labels(df=df, annotations_file_path=ann_file_path)
                    df_list.append(df)
                bar.update(1)
        
        logger.setLevel(logging.DEBUG)
        
        self.df = pd.concat(df_list)
        logger.info(f'self.df.shape: {self.df.shape}')
        
        logger.info('Check for duplicates...')
        if self.df.element_id.nunique() != self.df.shape[0]:
            logger.fatal('There are duplicates in the dataset')
            raise Exception('There are duplicates in the dataset')
        logger.info('Check for duplicates is OK')
            
        self.X, self.y = collect_dataset(self.df)
        
        if self.rebalance_and_suffle:
            self.indices = np.array(rebalance(self.y))
        else:
            self.indices = np.array([i for i in range(len(self.y))])
        
    def __getitem__(self, idx):
        return self.X[self.indices[idx]], self.y[self.indices[idx]]
    
    def __len__(self):
        return self.indices.shape[0]


In [5]:
jdn_dataset = JDNDataset(datasets_list = TRAIN_DATASETS)


  0%|          | 0/32 [00:00<?, ?it/s]

2021-06-18 18:30:01,555 -INFO - <ipython-input-4-1ee8047141fa>:<ipython-input-4-1ee8047141fa>:35 - self.df.shape: (39826, 24)
2021-06-18 18:30:01,555 -INFO - <ipython-input-4-1ee8047141fa>:<ipython-input-4-1ee8047141fa>:37 - Check for duplicates...
2021-06-18 18:30:01,565 -INFO - <ipython-input-4-1ee8047141fa>:<ipython-input-4-1ee8047141fa>:41 - Check for duplicates is OK
2021-06-18 18:30:01,707 -INFO - features_builder:features_builder.py:228 - used column: tag_name
2021-06-18 18:30:01,708 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl
2021-06-18 18:30:01,725 -INFO - features_builder:features_builder.py:228 - used column: tag_name_parent
2021-06-18 18:30:01,725 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl
2021-06-18 18:30:01,742 -INFO - features_builder:features_builder.py:228 - used column: tag_name_upsib
2021-06-18 18:30:01,743 -INFO - features_builder:features_builder.py:231 - loading model/ohe_tag_name.pkl
202

  0%|          | 0/39826 [00:00<?, ?it/s]

2021-06-18 18:30:04,791 -INFO - features_builder:features_builder.py:152 - used column: attributes_parent


  0%|          | 0/39826 [00:00<?, ?it/s]

2021-06-18 18:30:07,828 -INFO - features_builder:features_builder.py:152 - used column: attributes_upsib


  0%|          | 0/39826 [00:00<?, ?it/s]

2021-06-18 18:30:10,798 -INFO - features_builder:features_builder.py:152 - used column: attributes_dnsib


  0%|          | 0/39826 [00:00<?, ?it/s]

2021-06-18 18:30:13,830 -INFO - dataset_collector:dataset_collector.py:53 - attributes_sm: (39826, 17)
2021-06-18 18:30:13,830 -INFO - features_builder:features_builder.py:188 - used column: attributes
2021-06-18 18:30:13,831 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...
2021-06-18 18:30:13,990 -INFO - features_builder:features_builder.py:188 - used column: attributes_parent
2021-06-18 18:30:13,991 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...
2021-06-18 18:30:14,164 -INFO - features_builder:features_builder.py:188 - used column: attributes_upsib
2021-06-18 18:30:14,165 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer for class attribute exists. Loaging...
2021-06-18 18:30:14,285 -INFO - features_builder:features_builder.py:188 - used column: attributes_dnsib
2021-06-18 18:30:14,286 -INFO - features_builder:features_builder.py:191 - TfIdfVectorizer f

Unnamed: 0,label,label_text,cnt,ratio,cnt_rebalanced
0,0,button,1158,4,4632
1,1,link,1636,3,4908
2,2,textfield,207,24,4968
3,3,dropdown,7,738,5166
4,4,checkbox,239,21,5019
5,5,radiobutton,105,49,5145
6,6,textarea,20,258,5160
7,7,fileinput,7,738,5166
8,8,iframe,5,1033,5165
9,9,range,6,861,5166


2021-06-18 18:30:14,971 -INFO - dataset:dataset.py:209 - Rebalanced and shuffled indices: 143405


In [6]:
assert jdn_dataset.df.element_id.nunique() == jdn_dataset.df.shape[0], "There are duplicates in the dataset"
display(HTML(f"<h1><b>The dataset is OK, there are no duplicates: {jdn_dataset.df.element_id.nunique()}, {jdn_dataset.df.shape[0]}</b></h1>"))

In [7]:
jdn_dataset.indices

array([32630, 14541, 10190, ..., 11789, 27250, 16857])

In [8]:
jdn_dataset[jdn_dataset.indices[[0,1,2,-1]]]

(array([[0., 0., 0., ..., 0., 1., 1.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 1., 1.]]),
 array([18,  9, 21, 16]))

 # Dataloader

In [9]:
# jdn_dataset = JDNDataset(rebalance_and_suffle=False)

In [10]:
# len(jdn_dataset)

In [11]:
BATCH_SIZE = 256

In [12]:
jdn_dataloader = DataLoader(jdn_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            pin_memory=True,
                            drop_last=False,
                            num_workers=0)

In [13]:
len(jdn_dataloader)

561

In [14]:
r_cnt = 0
with trange(len(jdn_dataloader)) as bar:
    for X, y in jdn_dataloader:
#         display(X)
#         display(X.shape)
#         display(y)
#         display(y.shape)
        bar.update(1)
        r_cnt += X.shape[0]
        #break
r_cnt, len(jdn_dataloader), len(jdn_dataset)

  0%|          | 0/561 [00:00<?, ?it/s]

(143405, 561, 143405)

In [17]:
with open('dataset/classes.txt', 'r') as f:
    classes_dict = { i:v.strip() for i, v in enumerate(f.readlines())}
classes_dict

{0: 'button',
 1: 'link',
 2: 'textfield',
 3: 'dropdown',
 4: 'checkbox',
 5: 'radiobutton',
 6: 'textarea',
 7: 'fileinput',
 8: 'iframe',
 9: 'range',
 10: 'progressbar',
 11: 'datetimeselector',
 12: 'colorpicker',
 13: 'numberselector',
 14: 'selector',
 15: 'table',
 16: 'switch',
 17: 'slider',
 18: 'tree-node',
 19: 'steper',
 20: 'tab',
 21: 'n/a'}