In [1]:
import os
import regex as re
import sqlite3
from collections import defaultdict
from pathlib import Path

from transformers import PreTrainedTokenizerFast

import plotly.graph_objects as go

from typing import (
    Union,
    Optional,
    Sequence,
)

## Constants

In [2]:
# --- Crawled database -------------------------------------------------------
# SQLite file written by the crawler, plus the table/column holding the titles.
DB = Path("..", "thegirl_crawler", "thegirl_tests.db")
TABLE = "titles"
TITLE_COLUMN = "title"

# --- Clean dataset ----------------------------------------------------------
# Plain-text dump of the cleaned titles.
TITLE_DUMP = Path(".", "input", "all_titles.txt")

# --- Tokenizer --------------------------------------------------------------
# Location of the pretrained tokenizer files.
TOKENIZER = Path(".", "model", "tokenizer", "pretrained_tokenizer")

# --- Plots ------------------------------------------------------------------
# Output image path, figure size and the colour used for plots.
IMG_DUMP = Path(".", "img", "title_length_distribution.png")
HEIGHT = 1080
WIDTH = 1920
COLOR = "#EC1268"

## Data Preparation

### Fetching test titles from the database

In [3]:
def create_db_connection(db_file: Union[str, os.PathLike]) -> Optional[sqlite3.Connection]:
    """Open a SQLite connection to ``db_file``.

    Args:
        db_file: Path of the SQLite database file (string or path-like).

    Returns:
        The open connection, or ``None`` if SQLite could not open the file.
        In that case the error is printed; callers must check for ``None``
        before using the result.
    """
    connection = None
    try:
        connection = sqlite3.connect(db_file)
    except sqlite3.Error as error:
        # The exception class has to be qualified: a bare ``Error`` is not
        # defined in this notebook, so a failed connect used to surface as a
        # NameError that hid the real SQLite message.
        print(error)

    return connection

In [4]:
def fetch_entities(connection: sqlite3.Connection,
                   db_table: Optional[str] = TABLE,
                   entity_column: Optional[str] = TITLE_COLUMN) -> Sequence[str]:
    """Read one text column from a table and return its whitespace-stripped values.

    Args:
        connection: Open SQLite connection to query.
        db_table: Name of the table to read from.
        entity_column: Name of the column to select.

    Returns:
        A list with the stripped value of ``entity_column`` for every row,
        in the order SQLite returns them. Rows whose value is NULL are
        skipped, because there is nothing to strip.
    """
    cursor = connection.cursor()
    try:
        # NOTE(security): table/column names cannot be bound as SQL parameters,
        # so they are interpolated as-is. Only pass trusted identifiers (such
        # as the notebook constants), never user-supplied text.
        cursor.execute(f"SELECT {entity_column} FROM {db_table}")

        # Iterating the cursor yields the same rows as fetchall().
        return [
            row[0].strip()
            for row in cursor
            if row[0] is not None  # NULL comes back as None, which has no .strip()
        ]
    finally:
        cursor.close()

In [5]:
db_connection = create_db_connection(db_file=DB)

# Fail with a readable message instead of an opaque error further down if the
# connection could not be opened.
if db_connection is None:
    raise RuntimeError(f"Could not open the database at {DB}")

# Using a sqlite3 connection as a context manager only commits or rolls back
# the transaction; it does not close the connection. Close it explicitly so the
# file handle is released once the titles are in memory.
try:
    raw_titles = fetch_entities(connection=db_connection)
finally:
    db_connection.close()

### Cleaning out repetitive test intros

In [6]:
# Bracketed tag such as "[тест]". The pattern is unanchored and greedy:
# search() finds it anywhere in the title and it spans from the first "[" to
# the last "]".
intro_pattern = re.compile(r"\[.+\]")

# Tally how often each candidate intro occurs across all raw titles.
intro2cnt = defaultdict(int)

for title in raw_titles:
    bracket_tag = intro_pattern.search(title)
    if bracket_tag is not None:
        # A bracketed tag takes precedence over a colon-separated prefix.
        intro2cnt[bracket_tag.group(0)] += 1
        continue

    # Otherwise treat everything before the first ": " as the intro.
    head, separator, _ = title.partition(": ")
    if separator:
        intro2cnt[head] += 1

# Keep only intros seen at least 50 times, most frequent first.
intro2cnt = dict(
    (intro, count)
    for intro, count in sorted(
        intro2cnt.items(),
        key=lambda item: item[1],
        reverse=True,
    )
    if count >= 50
)

In [7]:
# The frequent intros are the ones to strip from titles; a set gives fast
# membership checks.
garbage_intros = {intro for intro in intro2cnt}

# Show each frequent intro with its number of occurrences.
for intro in intro2cnt:
    count = intro2cnt[intro]
    print(f"{intro}:\t{count}")

Тест:	4494
Quiz:	1059
[тест]:	818
ТЕСТ:	784
Тест-рулетка:	192
[quiz]:	127
Гадание онлайн:	107
Тест на глазастость:	99
Гадание:	84


In [8]:
# Write every title, minus its boilerplate intro, one title per line.

# open(..., "w") does not create missing parent directories.
TITLE_DUMP.parent.mkdir(parents=True, exist_ok=True)

with open(TITLE_DUMP, "w", encoding="utf-8") as outfile:
    for title in raw_titles:
        bracket_match = intro_pattern.search(title)
        colon_intro, colon_sep, colon_rest = title.partition(": ")

        if bracket_match and bracket_match.group(0) in garbage_intros:
            # search() can match anywhere in the title, so cut the tag out by
            # its actual position instead of assuming it starts at index 0.
            before = title[:bracket_match.start()].strip()
            # Drop whitespace and a separating colon after the tag, but never
            # a real character of the title.
            after = title[bracket_match.end():].lstrip().lstrip(":").strip()
            title = " ".join(part for part in (before, after) if part)
        elif colon_sep and colon_intro in garbage_intros:
            # Also reached when the title contains some unrelated, non-garbage
            # bracket: the colon prefix is still removed.
            title = colon_rest.strip()

        outfile.write(f"{title}\n")

### Count unique symbols

In [9]:
# Collect every distinct character that occurs in the cleaned titles.
# Each line is stripped first, so the trailing newline is not counted.
with open(TITLE_DUMP, "r", encoding="utf-8") as infile:
    symbol_set = {
        symbol
        for line in infile
        for symbol in line.strip()
    }

print(f"{len(symbol_set)} unique symbols")

524 unique symbols
