In [1]:
import sys
import os
import re
import gc
import json
import logging
import requests
import itertools
import numba

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse.csr import csr_matrix
from time import sleep


from tqdm.auto import trange, tqdm
from collections import Counter, defaultdict
from scipy.sparse import hstack
import pickle
from torch.utils.data import Dataset
from glob import glob
from IPython.display import display as ipython_displpay
from IPython.display import HTML

In [2]:
WORKING_DIR = re.sub('jdi-qasp-ml.*$','',os.path.normpath(os.getcwd())) + 'jdi-qasp-ml'
os.chdir(WORKING_DIR)

In [3]:
WORKING_DIR

'C:\\WORK\\jdi-qasp-ml'

In [4]:

from utils.dataset_builder import DatasetBuilder
from utils.common import maximize_window

2021-09-06 15:17:50,672 -INFO - config.py:25 - Module utils.config was loaded
2021-09-06 15:17:50,677 -INFO - common.py:618 - Module utils.common is loaded...
2021-09-06 15:17:50,682 -INFO - hidden.py:121 - hidden module is loaded
2021-09-06 15:17:50,687 -INFO - features_builder.py:305 - feature_bilder module is loaded...
2021-09-06 15:17:50,693 -INFO - dataset_builder.py:213 - dataset package is loaded...
2021-09-06 15:17:50,699 -INFO - dataset.py:277 - dataset module is loaded...


In [5]:
UTILS_LOGGER = 'utils.dataset'
LOG_FILE_PATH = 'tmp/log.txt'

os.makedirs(name='tmp', exist_ok=True)

logger = logging.getLogger(UTILS_LOGGER)
logger.setLevel(logging.DEBUG)


formatter = logging.Formatter('%(asctime)s -%(levelname)s - %(module)s:%(filename)s:%(lineno)d - %(message)s')

console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(formatter)

file_handler = logging.FileHandler(filename=LOG_FILE_PATH, mode='w')
file_handler.setFormatter(formatter)

logger.addHandler(console_handler)
logger.addHandler(file_handler)

logger.info('Module utils.config was loaded')

2021-09-06 15:18:00,820 -INFO - 632196539.py:21 - Module utils.config was loaded
2021-09-06 15:18:00,820 -INFO - 632196539:632196539.py:21 - Module utils.config was loaded


In [6]:
WAIT_TIME_SECONDS = 3

In [7]:
os.getcwd()

'C:\\WORK\\jdi-qasp-ml'

Below we regenerate datasets we used, after applying changes to DatasetBuilder (.pkl storage and dataset_root_path parameter) 

In [8]:
SITE_URLS = [ p.replace("\\", "/") +"/index.html" for p in glob(f'{os.getcwd()}/MUI_model/dataset/build/*')]
DATASET_NAMES = [ re.search('site[0-9]+', nm)[0] for nm in SITE_URLS]
DATASET_NAMES = ['mui-' + name for name in DATASET_NAMES]

In [9]:
class JDIDatasetBuilder(DatasetBuilder):

    def setUp(self, driver):
        self.logger.info(f'getting page')
        driver.get(self.url)
        #driver.set_window_size(width=1500, height=8000)
        maximize_window(driver)
        sleep(WAIT_TIME_SECONDS)
        
        
with JDIDatasetBuilder(url=SITE_URLS[0], 
                       dataset_name=DATASET_NAMES[0], 
                       headless=True, 
                       dataset_root_path='./MUI_model/dataset/') as builder:
    
    elements_df = builder.dataset
    dataset_json = builder.dataset_json
    plt.imshow(plt.imread(f'./MUI_model/dataset/images/{DATASET_NAMES[0]}.png'))
    
    
# builder = JDIDatasetBuilder(url=SITE_URL, dataset_name=DATASET_NAME, headless=True)
# elements_df = builder.dataset
# plt.imshow(plt.imread(f'dataset/images/{DATASET_NAME}.png'))

2021-09-06 13:07:30,295 -INFO - dataset_builder.py:78 - Dataset name: mui-site1
2021-09-06 13:07:30,295 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site1
2021-09-06 13:07:30,298 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-09-06 13:07:30,298 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-09-06 13:07:30,301 -INFO - dataset_builder.py:126 - Creating driver
2021-09-06 13:07:30,301 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver


SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 91
Current browser version is 93.0.4577.63 with binary path C:\Program Files\Google\Chrome\Application\chrome.exe


In [None]:
def collect_many_ds(site_urls, dataset_names):
    
    all_dfs = []
    
    with trange(len(site_urls)) as bar:

        for site, ds_name in zip(site_urls, dataset_names):
            
            class JDIDatasetBuilder(DatasetBuilder):
                def setUp(self, driver):
                    self.logger.info(f'getting page')
                    driver.get(self.url)
                    maximize_window(driver=driver)
                    sleep(WAIT_TIME_SECONDS)

            with JDIDatasetBuilder(url=site, 
                                   dataset_name=ds_name, 
                                   headless=True,
                                   dataset_root_path='./MUI_model/dataset/'
                                  ) as builder:
                elements_df = builder.dataset
                dataset_json = builder.dataset_json
                elements_df['idx'] = elements_df.index
                elements_df['ds_name'] = ds_name
                # plt.imshow(plt.imread(f'dataset/images/{ds_name}.png'))
                all_dfs.append(elements_df)
                
            bar.update(1)
            
    df_sites = pd.concat(all_dfs, axis=0)      
    return df_sites

In [31]:
elements_df = collect_many_ds(SITE_URLS, DATASET_NAMES)
elements_df.to_pickle('MUI_model/dataset/elements_df.pkl')

  0%|          | 0/122 [00:00<?, ?it/s]

2021-08-30 14:03:44,333 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site1
2021-08-30 14:03:44,333 -INFO - dataset_builder.py:78 - Dataset name: mui-site1
2021-08-30 14:03:44,339 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:03:44,339 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:03:44,345 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:03:44,345 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:03:45,466 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:03:45,466 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:03:45,469 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:03:45,469 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:03:46,009 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:03:46,009 -INFO - common.py:75 - Window maxi

2021-08-30 14:04:12,878 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site101.html
2021-08-30 14:04:12,878 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site101.html
2021-08-30 14:04:12,921 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site101.pkl
2021-08-30 14:04:12,921 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site101.pkl
2021-08-30 14:04:12,928 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (20, 13)
2021-08-30 14:04:12,928 -INFO - dataset_builder.py:204 - No attributes: (20, 13)
2021-08-30 14:04:13,163 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:04:13,163 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:04:13,247 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site102
2021-08-30 14:04:13,247 -INFO - dataset_builder.py:78 - Dataset name: mui-site10

2021-08-30 14:04:36,113 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:04:36,113 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:04:36,116 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:04:36,116 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:04:36,602 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:04:36,602 -INFO - common.py:75 - Window maximized
2021-08-30 14:04:39,615 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site105.png
2021-08-30 14:04:39,615 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site105.png
2021-08-30 14:04:40,241 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:04:40,241 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:04:41,491 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_mo

2021-08-30 14:05:03,941 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (21, 13)
2021-08-30 14:05:03,941 -INFO - dataset_builder.py:204 - No attributes: (21, 13)
2021-08-30 14:05:04,162 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:05:04,162 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:05:04,258 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site109
2021-08-30 14:05:04,258 -INFO - dataset_builder.py:78 - Dataset name: mui-site109
2021-08-30 14:05:04,262 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:05:04,262 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:05:04,267 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:05:04,267 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:05:05,387 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-

2021-08-30 14:05:51,755 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site111.png
2021-08-30 14:05:52,238 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:05:52,238 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:05:54,198 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site111.html
2021-08-30 14:05:54,198 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site111.html
2021-08-30 14:05:54,241 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site111.pkl
2021-08-30 14:05:54,241 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site111.pkl
2021-08-30 14:05:54,247 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (17, 13)
2021-08-30 14:05:54,247 -INFO - dataset_builder.py:204 - No attributes: (17, 13)
2021-08-30 14:05:54,517 -INFO 

2021-08-30 14:06:14,939 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:06:14,945 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:06:14,945 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:06:15,972 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:06:15,972 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:06:15,975 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:06:15,975 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:06:16,623 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:06:16,623 -INFO - common.py:75 - Window maximized
2021-08-30 14:06:19,635 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site115.png
2021-08-30 14:06:19,635 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site115.png
2021-08-30 14:06:20,174 -INFO - d

2021-08-30 14:06:48,068 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site118.pkl
2021-08-30 14:06:48,068 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site118.pkl
2021-08-30 14:06:48,073 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (30, 13)
2021-08-30 14:06:48,073 -INFO - dataset_builder.py:204 - No attributes: (30, 13)
2021-08-30 14:06:48,294 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:06:48,294 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:06:48,376 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site119
2021-08-30 14:06:48,376 -INFO - dataset_builder.py:78 - Dataset name: mui-site119
2021-08-30 14:06:48,380 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:06:48,380 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:06:48,387 -INFO - 

2021-08-30 14:07:13,363 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:07:13,900 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:07:13,900 -INFO - common.py:75 - Window maximized
2021-08-30 14:07:16,912 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site121.png
2021-08-30 14:07:16,912 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site121.png
2021-08-30 14:07:17,474 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:07:17,474 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:07:19,009 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site121.html
2021-08-30 14:07:19,009 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site121.html
2021-08-30 14:07:19,044 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-s

2021-08-30 14:07:40,325 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:07:40,410 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site15
2021-08-30 14:07:40,410 -INFO - dataset_builder.py:78 - Dataset name: mui-site15
2021-08-30 14:07:40,413 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:07:40,413 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:07:40,420 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:07:40,420 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:07:41,510 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:07:41,510 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:07:41,514 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:07:41,514 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:07:42,067 -INFO - common:common.py:75 -

2021-08-30 14:08:06,924 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:08:08,414 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site18.html
2021-08-30 14:08:08,414 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site18.html
2021-08-30 14:08:08,455 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site18.pkl
2021-08-30 14:08:08,455 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site18.pkl
2021-08-30 14:08:08,460 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (19, 13)
2021-08-30 14:08:08,460 -INFO - dataset_builder.py:204 - No attributes: (19, 13)
2021-08-30 14:08:08,686 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:08:08,686 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:08:08,780 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-si

2021-08-30 14:09:12,501 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:09:12,501 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:09:12,505 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:09:12,505 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:09:13,058 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:09:13,058 -INFO - common.py:75 - Window maximized
2021-08-30 14:09:16,067 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site21.png
2021-08-30 14:09:16,067 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site21.png
2021-08-30 14:09:16,658 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:09:16,658 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:09:18,388 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_mode

2021-08-30 14:09:40,590 -INFO - dataset_builder.py:204 - No attributes: (33, 13)
2021-08-30 14:09:40,818 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:09:40,818 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:09:40,902 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site25
2021-08-30 14:09:40,902 -INFO - dataset_builder.py:78 - Dataset name: mui-site25
2021-08-30 14:09:40,906 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:09:40,906 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:09:40,910 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:09:40,910 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:09:42,046 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:09:42,046 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:09:42,0

2021-08-30 14:10:06,515 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:10:06,515 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:10:10,396 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site28.html
2021-08-30 14:10:10,396 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site28.html
2021-08-30 14:10:10,452 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site28.pkl
2021-08-30 14:10:10,452 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site28.pkl
2021-08-30 14:10:10,458 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (29, 13)
2021-08-30 14:10:10,458 -INFO - dataset_builder.py:204 - No attributes: (29, 13)
2021-08-30 14:10:10,700 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:10:10,700 -INFO - dataset_builder.py:103 - Clo

2021-08-30 14:10:33,107 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:10:33,107 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:10:34,225 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:10:34,225 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:10:34,227 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:10:34,227 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:10:34,783 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:10:34,783 -INFO - common.py:75 - Window maximized
2021-08-30 14:10:37,796 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site31.png
2021-08-30 14:10:37,796 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site31.png
2021-08-30 14:10:38,175 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:10:38,175 

2021-08-30 14:11:00,691 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site34.pkl
2021-08-30 14:11:00,697 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (21, 13)
2021-08-30 14:11:00,697 -INFO - dataset_builder.py:204 - No attributes: (21, 13)
2021-08-30 14:11:00,936 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:11:00,936 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:11:01,029 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site35
2021-08-30 14:11:01,029 -INFO - dataset_builder.py:78 - Dataset name: mui-site35
2021-08-30 14:11:01,033 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:11:01,033 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:11:01,040 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:11:01,040 -INFO - dataset_builder.py:126 - Creating driver
2

2021-08-30 14:11:27,433 -INFO - common.py:75 - Window maximized
2021-08-30 14:11:30,449 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site38.png
2021-08-30 14:11:30,449 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site38.png
2021-08-30 14:11:30,914 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:11:30,914 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:11:32,973 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site38.html
2021-08-30 14:11:32,973 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site38.html
2021-08-30 14:11:33,015 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site38.pkl
2021-08-30 14:11:33,015 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site38.pkl
2021-08-30 14:11:3

2021-08-30 14:12:37,734 -INFO - dataset_builder.py:78 - Dataset name: mui-site41
2021-08-30 14:12:37,737 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:12:37,737 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:12:37,743 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:12:37,743 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:12:38,789 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:12:38,789 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:12:38,793 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:12:38,793 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:12:39,334 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:12:39,334 -INFO - common.py:75 - Window maximized
2021-08-30 14:12:42,338 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: 

2021-08-30 14:13:07,890 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site44.html
2021-08-30 14:13:07,930 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site44.pkl
2021-08-30 14:13:07,930 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site44.pkl
2021-08-30 14:13:07,935 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (30, 13)
2021-08-30 14:13:07,935 -INFO - dataset_builder.py:204 - No attributes: (30, 13)
2021-08-30 14:13:08,145 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:13:08,145 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:13:08,228 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site45
2021-08-30 14:13:08,228 -INFO - dataset_builder.py:78 - Dataset name: mui-site45
2021-08-30 14:13:08,231 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:13:08,

2021-08-30 14:13:30,689 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:13:30,689 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:13:31,231 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:13:31,231 -INFO - common.py:75 - Window maximized
2021-08-30 14:13:34,249 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site48.png
2021-08-30 14:13:34,249 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site48.png
2021-08-30 14:13:34,645 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:13:34,645 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:13:38,339 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site48.html
2021-08-30 14:13:38,339 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site48.html
2021-08-30 14:13:38,389 -INFO - dataset_builde

2021-08-30 14:14:01,172 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:14:01,263 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site51
2021-08-30 14:14:01,263 -INFO - dataset_builder.py:78 - Dataset name: mui-site51
2021-08-30 14:14:01,266 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:14:01,266 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:14:01,273 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:14:01,273 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:14:02,372 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:14:02,372 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:14:02,375 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:14:02,375 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:14:03,003 -INFO - common:common.py:75 -

2021-08-30 14:14:28,741 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:14:30,376 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site54.html
2021-08-30 14:14:30,376 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site54.html
2021-08-30 14:14:30,414 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site54.pkl
2021-08-30 14:14:30,414 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site54.pkl
2021-08-30 14:14:30,419 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (22, 13)
2021-08-30 14:14:30,419 -INFO - dataset_builder.py:204 - No attributes: (22, 13)
2021-08-30 14:14:30,637 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:14:30,637 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:14:30,731 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-si

2021-08-30 14:14:52,947 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:14:52,947 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:14:52,950 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:14:52,950 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:14:53,521 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:14:53,521 -INFO - common.py:75 - Window maximized
2021-08-30 14:14:56,536 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site58.png
2021-08-30 14:14:56,536 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site58.png
2021-08-30 14:14:56,853 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:14:56,853 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:14:58,415 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_mode

2021-08-30 14:15:19,989 -INFO - dataset_builder.py:204 - No attributes: (28, 13)
2021-08-30 14:15:20,238 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:15:20,238 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:15:20,325 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site61
2021-08-30 14:15:20,325 -INFO - dataset_builder.py:78 - Dataset name: mui-site61
2021-08-30 14:15:20,329 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:15:20,329 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:15:20,334 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:15:20,334 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:15:21,466 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:15:21,466 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:15:21,4

2021-08-30 14:16:37,507 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:16:37,507 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:16:39,407 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site64.html
2021-08-30 14:16:39,407 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site64.html
2021-08-30 14:16:39,447 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site64.pkl
2021-08-30 14:16:39,447 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site64.pkl
2021-08-30 14:16:39,454 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (18, 13)
2021-08-30 14:16:39,454 -INFO - dataset_builder.py:204 - No attributes: (18, 13)
2021-08-30 14:16:39,712 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:16:39,712 -INFO - dataset_builder.py:103 - Clo

2021-08-30 14:17:02,118 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:17:02,118 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:17:03,230 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:17:03,230 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:17:03,233 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:17:03,233 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:17:03,718 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:17:03,718 -INFO - common.py:75 - Window maximized
2021-08-30 14:17:06,732 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site68.png
2021-08-30 14:17:06,732 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site68.png
2021-08-30 14:17:07,097 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:17:07,097 

2021-08-30 14:17:30,266 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site70.pkl
2021-08-30 14:17:30,279 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (22, 13)
2021-08-30 14:17:30,279 -INFO - dataset_builder.py:204 - No attributes: (22, 13)
2021-08-30 14:17:30,493 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:17:30,493 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:17:30,586 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site71
2021-08-30 14:17:30,586 -INFO - dataset_builder.py:78 - Dataset name: mui-site71
2021-08-30 14:17:30,589 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:17:30,589 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:17:30,595 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:17:30,595 -INFO - dataset_builder.py:126 - Creating driver
2

2021-08-30 14:17:54,059 -INFO - common.py:75 - Window maximized
2021-08-30 14:17:57,063 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site74.png
2021-08-30 14:17:57,063 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site74.png
2021-08-30 14:17:57,563 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:17:57,563 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:18:00,705 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site74.html
2021-08-30 14:18:00,705 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site74.html
2021-08-30 14:18:00,768 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site74.pkl
2021-08-30 14:18:00,768 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site74.pkl
2021-08-30 14:18:0

2021-08-30 14:18:22,445 -INFO - dataset_builder.py:78 - Dataset name: mui-site78
2021-08-30 14:18:22,448 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:18:22,448 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:18:22,455 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:18:22,455 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:18:23,566 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:18:23,566 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:18:23,571 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:18:23,571 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:18:24,120 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:18:24,120 -INFO - common.py:75 - Window maximized
2021-08-30 14:18:27,138 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: 

2021-08-30 14:18:50,005 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site80.html
2021-08-30 14:18:50,039 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site80.pkl
2021-08-30 14:18:50,039 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site80.pkl
2021-08-30 14:18:50,045 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (17, 13)
2021-08-30 14:18:50,045 -INFO - dataset_builder.py:204 - No attributes: (17, 13)
2021-08-30 14:18:50,233 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:18:50,233 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:18:50,328 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site81
2021-08-30 14:18:50,328 -INFO - dataset_builder.py:78 - Dataset name: mui-site81
2021-08-30 14:18:50,332 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:18:50,

2021-08-30 14:19:13,927 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:19:13,927 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:19:14,454 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:19:14,454 -INFO - common.py:75 - Window maximized
2021-08-30 14:19:17,468 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site84.png
2021-08-30 14:19:17,468 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site84.png
2021-08-30 14:19:18,072 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:19:18,072 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:19:19,633 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site84.html
2021-08-30 14:19:19,633 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site84.html
2021-08-30 14:19:19,671 -INFO - dataset_builde

2021-08-30 14:19:41,474 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:19:41,565 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site88
2021-08-30 14:19:41,565 -INFO - dataset_builder.py:78 - Dataset name: mui-site88
2021-08-30 14:19:41,568 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:19:41,568 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:19:41,575 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:19:41,575 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:19:42,693 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:19:42,693 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:19:42,697 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:19:42,697 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:19:43,251 -INFO - common:common.py:75 -

2021-08-30 14:21:08,541 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:21:09,552 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site90.html
2021-08-30 14:21:09,552 -INFO - dataset_builder.py:192 - Save html to ./MUI_model/dataset/html/mui-site90.html
2021-08-30 14:21:09,590 -INFO - dataset_builder:dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site90.pkl
2021-08-30 14:21:09,590 -INFO - dataset_builder.py:202 - Save parquet to ./MUI_model/dataset/df/mui-site90.pkl
2021-08-30 14:21:09,594 -INFO - dataset_builder:dataset_builder.py:204 - No attributes: (20, 13)
2021-08-30 14:21:09,594 -INFO - dataset_builder.py:204 - No attributes: (20, 13)
2021-08-30 14:21:09,716 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:21:09,716 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:21:09,811 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-si

2021-08-30 14:21:33,321 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:21:33,321 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:21:33,323 -INFO - 2387718061:2387718061.py:11 - getting page
2021-08-30 14:21:33,323 -INFO - 2387718061.py:11 - getting page
2021-08-30 14:21:33,969 -INFO - common:common.py:75 - Window maximized
2021-08-30 14:21:33,969 -INFO - common.py:75 - Window maximized
2021-08-30 14:21:36,985 -INFO - dataset_builder:dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site94.png
2021-08-30 14:21:36,985 -INFO - dataset_builder.py:140 - save scrinshot: ./MUI_model/dataset/images/mui-site94.png
2021-08-30 14:21:37,416 -INFO - dataset_builder:dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:21:37,416 -INFO - dataset_builder.py:184 - Collect features using JS script
2021-08-30 14:21:39,791 -INFO - dataset_builder:dataset_builder.py:192 - Save html to ./MUI_mode

2021-08-30 14:22:02,201 -INFO - dataset_builder.py:204 - No attributes: (28, 13)
2021-08-30 14:22:02,363 -INFO - dataset_builder:dataset_builder.py:103 - Close web driver
2021-08-30 14:22:02,363 -INFO - dataset_builder.py:103 - Close web driver
2021-08-30 14:22:02,455 -INFO - dataset_builder:dataset_builder.py:78 - Dataset name: mui-site98
2021-08-30 14:22:02,455 -INFO - dataset_builder.py:78 - Dataset name: mui-site98
2021-08-30 14:22:02,458 -INFO - dataset_builder:dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:22:02,458 -INFO - dataset_builder.py:88 - Create directories to save the dataset
2021-08-30 14:22:02,465 -INFO - dataset_builder:dataset_builder.py:126 - Creating driver
2021-08-30 14:22:02,465 -INFO - dataset_builder.py:126 - Creating driver
2021-08-30 14:22:03,596 -INFO - dataset_builder:dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:22:03,596 -INFO - dataset_builder.py:137 - Chrome web driver is created
2021-08-30 14:22:03,5

In [32]:
display(elements_df.shape)

(182727, 15)

#### Changing functions: using TfIdfTransformer instead of TfIdfVectorizer

In [192]:
def build_class_feature(df: pd.DataFrame, colname='attributes') -> csr_matrix:
    """
        Extract TfIdf features for "class" attribute from a column
        containing attributes ("attributes", "attributes_parent", "attributes_up_sibling"...)
        default columns: "attributes"
    """
    model_count_file_path = 'MUI_model/model/count_attr_class.pkl'
    model_tf_file_path = 'MUI_model/model/tfidf_attr_class.pkl'
    logger.info(f'used column: {colname}')

    if os.path.exists(model_count_file_path) and os.path.exists(model_tf_file_path):
        logger.info("CountVectorizer and TfidfTransformer for class attribute exist. Loading...")
        with open(model_count_file_path, 'rb') as c:
            model_cv = pickle.load(file=c)
        with open(model_tf_file_path, 'rb') as f:
            model_tf = pickle.load(file=f)
#             print(f'{model_tf!r}')
#         attr_class_series = df[colname].apply(lambda x: x.get('class') if x is not None else '').fillna('')
        attr_class_series = df[colname].apply(lambda x: x.get('class') if x is not None else '').fillna('')
        attr_class_series = attr_class_series.apply(lambda x: x.replace("-", " ").lower())
        class_sm = model_cv.transform(attr_class_series.values)
        class_sm = model_tf.transform(class_sm)
    else:
        logger.info("TfIdfVectorizer for class attribute does not exist. Build the one.")
#         if len(set(df.columns).intersection(set(['label', 'label_text']))) != 2:
#             raise Exception('Cannot prepare CountVectorizer for attribute "class": need labels')

        logger.info("Extract useful attr_class features, 'attributes' column will be used")
        attr_class_series = df[df.label_text != 'n/a']\
            .attributes.apply(lambda x: x.get('class') if x is not None else '')\
            .fillna('')
        attr_class_series = attr_class_series.apply(lambda x: x.replace("-", " ").lower())
        class_vocab = list(itertools.chain.from_iterable(attr_class_series.apply(lambda x: x.split(sep=' '))))
        class_vocab = list(set(class_vocab))
        class_vocab = sorted([v for v in class_vocab if re.match(r'^[a-z][a-z]+$', v)])
        class_cv = CountVectorizer(vocabulary = class_vocab)
        class_sm = class_cv.fit_transform(attr_class_series.values)
        model = TfidfTransformer()
        class_sm = model.fit_transform(class_sm)
        # filter out class names:  length is at least 2 characters and only letters
#         vocabulary = sorted([v for v in class_cv.vocabulary_.keys() if re.match(r'^[a-z][a-z]+$', v)])

        logger.info(f'Column ["{colname}"] used for tfidf')
#         attr_class_series = df[colname].apply(lambda x: None if type(x) is not dict else x.get('class')).fillna('')
#         model = TfidfVectorizer(vocabulary=vocabulary)  # CountVectorizer?
#         model.fit(attr_class_series.values)
        logger.info(f'Saving {model_count_file_path}, vocabulary length: {len(class_cv.vocabulary_)}')
        with open(model_count_file_path, 'wb') as c:
            pickle.dump(class_cv, c)
            c.flush()
        logger.info(f'Saving {model_tf_file_path}')
        with open(model_tf_file_path, 'wb') as f:
            pickle.dump(model, f)
            f.flush()

    return class_sm

In [199]:
def build_tree_dict(df: pd.DataFrame) -> dict:
    """
        Builds tree dict for
        get_parents_list
        df:pd.DataFrame must have columns: 'parent_id', 'element_id'

    """
    # tree_dict = dict(zip(df.element_id.values, df.parent_id.values))  # old implementation
    tree_dict = {
        r.element_id: None if r.element_id == r.parent_id else r.parent_id
        for _, r in df[['element_id', 'parent_id']].iterrows()
    }

    return tree_dict

In [204]:
@numba.jit(forceobj=True)
def get_parents_list(tree_dict: dict, element_id: str, paternts_list: list = None) -> list:
    """
        returns ordered list of parent for a element
        starting from root which is the <html/> tag
    """
    if paternts_list is None:
        paternts_list = []

    parent_id = tree_dict.get(element_id)
    if parent_id is None:
        return paternts_list
    else:
        paternts_list.append(parent_id)
        return get_parents_list(tree_dict, element_id=parent_id, paternts_list=paternts_list)

In [196]:
def build_tree_features(elements_df: pd.DataFrame) -> pd.DataFrame:
    """
        Walk on elements tree and build tree-features:
           - chilren_tags
           - folloer_counters
    """

    def empty_string():
        return ''

    tree_dict = build_tree_dict(elements_df)

    # Build paths
    followers_counter = Counter()
    # level_dict = defaultdict(int)
    children_tags_dict = defaultdict(empty_string)

    with trange(elements_df.shape[0]) as tbar:
        tbar.set_description('Build tree features')
        for i, r in elements_df.iterrows():
            list_of_parents = get_parents_list(
                tree_dict=tree_dict, element_id=r.element_id)
            children_tags_dict[r.parent_id] += r.tag_name.lower() + ' '
            # print(list_of_parents)
            # calculate number of followers
            followers_counter.update(list_of_parents)
            tbar.update(1)

    elements_df['children_tags'] = elements_df.element_id.map(
        children_tags_dict).fillna('')
    elements_df['num_followers'] = elements_df.element_id.map(
        followers_counter)
    return elements_df

In [197]:
def followers_features(df: pd.DataFrame, followers_set: set = None, level=0) -> pd.DataFrame:
    """
        Build feature: "children_tags" and max reverse depth ( depth from leafs) # noqa
        Concatenate all children tag_names into a string filed 'children_tags'
    """
    # get leafs (nodes without children)
    if followers_set is None:
        level = 0
        followers_set = set(df.element_id.values) - set(df.parent_id.values)
        followers_tags_df = \
            df[df.element_id.isin(followers_set)][['parent_id', 'tag_name']]\
            .groupby('parent_id')['tag_name']\
            .apply(lambda x: ','.join(x))\
            .reset_index()
        followers_tags_dict = dict(followers_tags_df.values)
        df['followers_tags'] = df.element_id.map(
            followers_tags_dict).fillna('')

        # create max_depth field
        df['max_depth'] = 0
        df.max_depth = df.max_depth + \
            df.element_id.isin(set(followers_tags_dict.keys())).astype(int)

        # recursive call
        followers_features(df=df, followers_set=set(
            followers_tags_dict.keys()), level=level + 1)

    elif len(followers_set) > 0:
        # print(f'level: {level}')
        followers_tags_df = \
            df[df.element_id.isin(followers_set)][['parent_id', 'tag_name']]\
            .groupby('parent_id')['tag_name']\
            .apply(lambda x: ','.join(x))\
            .reset_index()
        followers_tags_dict = dict(followers_tags_df.values)
        df['followers_tags'] = df.followers_tags + ',' + \
            df.element_id.map(followers_tags_dict).fillna('')

        # increase max_depth
        df.max_depth = df.max_depth + \
            df.element_id.isin(set(followers_tags_dict.keys())).astype(int)

        # recursive call
        followers_features(df=df, followers_set=set(
            followers_tags_dict.keys()), level=level + 1)

    df['followers_tags'] = df['followers_tags'].apply(
        lambda x: re.sub('\\s+', ' ', x.replace(',', ' ')).lower().strip())
    return df

In [275]:
def build_children_tags(df: pd.DataFrame, colname='children_tags') -> csr_matrix:
    model_count_path = 'MUI_model/model/count_children_tags.pkl'
    model_tf_path = 'MUI_model/model/tfidf_children_tags.pkl'
    logger.info(f'used column: {colname}')

    if os.path.exists(model_count_path) and os.path.exists(model_tf_path):
        logger.info("CountVectorizer and TfidfTransformer for children tags exist. Loading...")
        with open(model_count_path, 'rb') as c:
            model_cv = pickle.load(file=c)
        with open(model_tf_path, 'rb') as f:
            model_tf = pickle.load(file=f)
        
        child_tags_series = df[colname].fillna('')
        child_tags_series = child_tags_series.apply(lambda x: x.strip() if x != '' else x)
        child_sm = model_cv.transform(child_tags_series.values)
        child_sm = model_tf.transform(child_sm)

    else:
        logger.info("TfIdfVectorizer for children tags does not exist. Build the one.")
#         if len(set(df.columns).intersection(set(['label', 'label_text']))) != 2:
#             raise Exception('Cannot prepare CountVectorizer for attribute "class": need labels')

        logger.info("Extract useful child_tags features, 'children_tags' column will be used")
        child_tags_series =  df[colname].fillna('')
        child_vocab = list(itertools.chain.from_iterable(child_tags_series.apply(lambda x: x.split(sep=' '))))
        child_vocab = list(set(child_vocab))
        child_vocab = sorted([v for v in child_vocab if re.match(r'^[a-z][a-z]+$', v)])
        
        child_cv = CountVectorizer(vocabulary=child_vocab)
        child_sm = child_cv.fit_transform(child_tags_series.values)

        logger.info(f'Column ["{colname}"] used for tfidf')
        
        model = TfidfTransformer()
        child_sm = model.fit_transform(child_sm)
        
        logger.info(f'Saving {model_count_path}, vocabulary length: {len(child_cv.vocabulary_)}')
        logger.info(f'Saving {model_tf_path}')
        with open(model_count_path, 'wb') as c:
            pickle.dump(child_cv, c)
            c.flush()
        with open(model_tf_path, 'wb') as f:
            pickle.dump(model, f)
            f.flush()

    return child_sm

In [276]:
def build_followers_tags(df: pd.DataFrame, colname='followers_tags') -> csr_matrix:
    model_count_path = 'MUI_model/model/count_followers_tags.pkl'
    model_tf_path = 'MUI_model/model/tfidf_followers_tags.pkl'
    logger.info(f'used column: {colname}')

    if os.path.exists(model_count_path) and os.path.exists(model_tf_path):
        logger.info("CountVectorizer and TfidfTransformer for followers tags exist. Loading...")
        with open(model_count_path, 'rb') as c:
            model_cv = pickle.load(file=c)
        with open(model_tf_path, 'rb') as f:
            model_tf = pickle.load(file=f)
        
        followers_tags_series = df[colname].fillna('')
        followers_tags_series = followers_tags_series.apply(lambda x: x.strip() if x != '' else x)
        followers_sm = model_cv.transform(followers_tags_series.values)
        followers_sm = model_tf.transform(followers_sm)

    else:
        logger.info("CountVectorizer and TfIdfTransformer for followers tags do not exist. Build them.")
#         if len(set(df.columns).intersection(set(['label', 'label_text']))) != 2:
#             raise Exception('Cannot prepare CountVectorizer for attribute "class": need labels')

        logger.info("Extract useful child_tags features, 'followers_tags' column will be used")
        followers_tags_series =  df[colname].fillna('')
        followers_vocab = list(itertools.chain.from_iterable(followers_tags_series.apply(lambda x: x.split(sep=' '))))
        followers_vocab = list(set(followers_vocab))
        followers_vocab = sorted([v for v in followers_vocab if re.match(r'^[a-z][a-z]+$', v)])
        
        followers_cv = CountVectorizer(vocabulary=followers_vocab)
        followers_sm = followers_cv.fit_transform(followers_tags_series.values)

        logger.info(f'Column ["{colname}"] used for tfidf')
        
        model = TfidfTransformer()
        followers_sm = model.fit_transform(followers_sm)
        
        logger.info(f'Saving {model_count_path}, vocabulary length: {len(followers_cv.vocabulary_)}')
        logger.info(f'Saving {model_tf_path}')
        with open(model_count_path, 'wb') as c:
            pickle.dump(followers_cv, c)
            c.flush()
        with open(model_tf_path, 'wb') as f:
            pickle.dump(model, f)
            f.flush()

    return followers_sm

In [137]:
elements_df['label_text'] =  elements_df.attributes.apply(lambda x: x.get('data-label') if x is not None else None).fillna('n/a')

In [280]:
followers_features(elements_df)

Unnamed: 0,attributes,displayed,element_id,height,onmouseenter,onmouseover,parent_id,style,tag_name,text,width,x,y,idx,ds_name,label_text,children_tags,num_followers,followers_tags,max_depth
0,{'lang': 'en'},False,5313751051214295958780828823,24307.71875,,,,"[align-content, align-items, align-self, align...",HTML,Recents\nFavorites\nNearby\nInbox\nStarred\nSe...,877.000000,0.000000,-15615.00,0,mui-site1,,head body,1435,head body body body body body body body body b...,13
1,,False,9972128381214295958976818296,0.00000,,,5313751051214295958780828823,"[align-content, align-items, align-self, align...",HEAD,React App\n.MuiSvgIcon-root {\n fill: current...,0.000000,0.000000,0.00,1,mui-site1,,meta link meta meta meta link link title style...,118,meta link meta meta meta link link title style...,1
2,{'charset': 'utf-8'},False,1422379309214295952079575008,0.00000,,,9972128381214295958976818296,"[align-content, align-items, align-self, align...",META,,0.000000,0.000000,0.00,2,mui-site1,,,0,,0
3,"{'href': './favicon.ico', 'rel': 'icon'}",False,3794611023214295951238901967,0.00000,,,9972128381214295958976818296,"[align-content, align-items, align-self, align...",LINK,,0.000000,0.000000,0.00,3,mui-site1,,,0,,0
4,"{'content': 'width=device-width,initial-scale=...",False,7491848177214295950995160010,0.00000,,,9972128381214295958976818296,"[align-content, align-items, align-self, align...",META,,0.000000,0.000000,0.00,4,mui-site1,,,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182722,"{'data-test': 'sentinelEnd', 'tabindex': '0'}",False,9643588738225346508842760690,0.00000,,,7741191648225346505783329108,"[align-content, align-items, align-self, align...",DIV,,774.000000,0.000000,0.00,1484,mui-site99,,,0,,0
182723,"{'data-label': 'popper', 'role': 'tooltip', 's...",True,6253881103225346501377423261,38.00000,,,0201343053225346488931122608,"[align-content, align-items, align-self, align...",DIV,The content of the Popper.,186.906250,0.000000,0.00,1485,mui-site99,popper,div,1,div,1
182724,{'class': 'jss11'},True,3681047993225346505640883357,38.00000,,,6253881103225346501377423261,"[align-content, align-items, align-self, align...",DIV,The content of the Popper.,186.906250,0.000000,0.00,1486,mui-site99,,,0,,0
182725,,True,0082313773225346501113264030,16.00000,,,0201343053225346488931122608,"[align-content, align-items, align-self, align...",SPAN,But I actually render here!,164.359375,0.000000,7329.75,1487,mui-site99,,,0,,0


In [253]:
attr_class_series = elements_df['attributes'].apply(lambda x: x.get('class') if x is not None else '').fillna('')
attr_class_series = attr_class_series.apply(lambda x: x.replace("-", " ").lower())

In [258]:
attr_class_series[100:130]

100                                                     
101                                                     
102                                                     
103                                                     
104                                                     
105                                                     
106                                                     
107                                                     
108                                                     
109                                                     
110                                                     
111                                                     
112                                                     
113                                                     
114                                                     
115                                                     
116                                                     
117                            

In [None]:
child_tags_series = elements_df['children_tags'].fillna('')

In [256]:
child_tags_series

0                                                head body 
1         meta link meta meta meta link link title style...
2                                                          
3                                                          
4                                                          
                                ...                        
182722                                                     
182723                                                 div 
182724                                                     
182725                                                     
182726                                                     
Name: children_tags, Length: 182727, dtype: object

In [235]:
child_vocab = list(itertools.chain.from_iterable(child_tags_series.apply(lambda x: x.split(sep=' '))))
child_vocab = list(set(child_vocab))
child_vocab = sorted([v for v in child_vocab if re.match(r'^[a-z][a-z]+$', v)])

In [236]:
child_vocab

['body',
 'button',
 'circle',
 'div',
 'em',
 'fieldset',
 'form',
 'head',
 'header',
 'hr',
 'input',
 'label',
 'legend',
 'li',
 'link',
 'meta',
 'nav',
 'noscript',
 'ol',
 'option',
 'path',
 'script',
 'select',
 'span',
 'style',
 'svg',
 'table',
 'tbody',
 'td',
 'text',
 'textarea',
 'th',
 'thead',
 'title',
 'tr',
 'ul']

In [242]:
child_cv = CountVectorizer(vocabulary = child_vocab)

In [243]:
child_cv.fit_transform(child_tags_series.values)

<182727x36 sparse matrix of type '<class 'numpy.int64'>'
	with 96019 stored elements in Compressed Sparse Row format>

In [244]:
child_cv.transform(child_tags_series.values)

<182727x36 sparse matrix of type '<class 'numpy.int64'>'
	with 96019 stored elements in Compressed Sparse Row format>

In [213]:
vocabulary = sorted([v for v in child_cv.vocabulary_.keys() if re.match(r'^[a-z][a-z]+$', v)])

In [181]:
attr_class_series = elements_df['attributes'].apply(lambda x: x.get('class') if x is not None else '').fillna('')

In [182]:
attr_class_series = attr_class_series.apply(lambda x: x.replace("-", " ").lower())

In [183]:
attr_class_series

0              
1              
2              
3              
4              
          ...  
182722         
182723         
182724    jss11
182725         
182726         
Name: attributes, Length: 182727, dtype: object

In [138]:
elements_df.reset_index(drop=True, inplace=True)

In [139]:
elements_df.loc[0,:'attributes'].values

array([{'lang': 'en'}], dtype=object)

In [140]:
type(elements_df.loc[0,:'attributes'].values)

numpy.ndarray

In [141]:
elements_df[elements_df.label_text != 'n/a'].attributes.apply(lambda x: x.get('class') if x is not None else '')\
            .fillna('')

124                      MuiBottomNavigation-root jss1
127                                    MuiSvgIcon-root
133                                    MuiSvgIcon-root
139                                    MuiSvgIcon-root
145               MuiDrawer-root MuiDrawer-docked jss4
                              ...                     
182707       MuiList-root MuiMenu-list MuiList-padding
182715                                                
182717                                 MuiPopover-root
182721    MuiTypography-root jss58 MuiTypography-body1
182723                                                
Name: attributes, Length: 24137, dtype: object

In [142]:
attr_class_series = elements_df[elements_df.label_text != 'n/a']\
            .attributes.apply(lambda x: x.get('class') if x is not None else '')\
            .fillna('')
attr_class_series = attr_class_series.apply(lambda x: x.replace("-", " ").lower())
class_vocab = list(itertools.chain.from_iterable(attr_class_series.apply(lambda x: x.split(sep=' '))))
class_vocab = list(set(class_vocab))
class_vocab = sorted([v for v in class_vocab if re.match(r'^[a-z][a-z]+$', v)])




In [218]:
attr_class_series = attr_class_series.apply(lambda x: x.replace("-", " ").lower())

In [219]:
class_vocab = list(itertools.chain.from_iterable(attr_class_series.apply(lambda x: x.split(sep=' '))))

In [220]:
class_vocab = list(set(class_vocab))

In [221]:
class_vocab = sorted([v for v in class_vocab if re.match(r'^[a-z][a-z]+$', v)])

In [222]:
class_vocab

['action',
 'actions',
 'active',
 'align',
 'aligncenter',
 'alignleft',
 'alignright',
 'alternativelabel',
 'ampmlabel',
 'ampmselection',
 'anchororiginbottomcenter',
 'anchororigintoprightrectangle',
 'animated',
 'animatetransform',
 'around',
 'asterisk',
 'avatar',
 'avatarcolorprimary',
 'avatarcolorsecondary',
 'avatarsmall',
 'badge',
 'bar',
 'barcolorprimary',
 'barcolorsecondary',
 'body',
 'button',
 'caption',
 'cell',
 'cellcheckbox',
 'center',
 'checkboxinput',
 'checked',
 'child',
 'childpulsate',
 'circle',
 'circledeterminate',
 'circleindeterminate',
 'circular',
 'clickable',
 'clickablecolorprimary',
 'clickablecolorsecondary',
 'clock',
 'clocknumber',
 'clocknumberselected',
 'colordefault',
 'colorerror',
 'colorinherit',
 'colorprimary',
 'colorsecondary',
 'colortextprimary',
 'colortextsecondary',
 'columnheader',
 'columnheadercheckbox',
 'columnheaderdraggablecontainer',
 'columnheadertitle',
 'columnheadertitlecontainer',
 'columnheaderwrapper',
 'col

In [147]:
class_cv = CountVectorizer(vocabulary = class_vocab)

In [241]:
class_sm = class_cv.fit_transform(attr_class_series)

In [158]:
model = TfidfTransformer()

In [160]:
class_sm = model.fit_transform(class_sm.reshape(-1, 1))

AttributeError: 'CountVectorizer' object has no attribute 'reshape'

In [193]:
m = build_class_feature(elements_df)

2021-08-30 17:46:43,805 -INFO - 143991698:143991698.py:9 - used column: attributes
2021-08-30 17:46:43,805 -INFO - 143991698.py:9 - used column: attributes
2021-08-30 17:46:43,809 -INFO - 143991698:143991698.py:12 - CountVectorizer and TfidfTransformer for class attribute exist. Loading...
2021-08-30 17:46:43,809 -INFO - 143991698.py:12 - CountVectorizer and TfidfTransformer for class attribute exist. Loading...


In [268]:
with open('MUI_model/model/count_children_tags.pkl', 'rb') as c:
            model_cv = pickle.load(file=c)

In [269]:
model_cv.transform(child_tags_series)

<182727x36 sparse matrix of type '<class 'numpy.int64'>'
	with 96019 stored elements in Compressed Sparse Row format>

In [282]:
build_followers_tags(elements_df)

2021-08-30 19:15:44,569 -INFO - 2004508130:2004508130.py:4 - used column: followers_tags
2021-08-30 19:15:44,569 -INFO - 2004508130.py:4 - used column: followers_tags
2021-08-30 19:15:44,574 -INFO - 2004508130:2004508130.py:7 - CountVectorizer and TfidfTransformer for followers tags exist. Loading...
2021-08-30 19:15:44,574 -INFO - 2004508130.py:7 - CountVectorizer and TfidfTransformer for followers tags exist. Loading...


<182727x36 sparse matrix of type '<class 'numpy.float64'>'
	with 96019 stored elements in Compressed Sparse Row format>