# Entity Patterns from Labels

In [None]:
import json
import os
from pathlib import Path
from typing import Any, Collection, Dict, List, Optional, Tuple, Union

import pandas as pd
import spacy
from dotenv import load_dotenv

# Pretrained English pipeline used by the cells below.
nlp = spacy.load("en_core_web_sm")

# Pull settings from a .env file (if present) into the process environment.
load_dotenv()

# Fail with an actionable message: Path(None) would otherwise raise an opaque
# TypeError when the variable is missing.
_data_path_env = os.getenv("DATA_PATH")
if _data_path_env is None:
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; "
        "define it in the environment or in a .env file."
    )
DATA_PATH = Path(_data_path_env)

# only for .ipynb because relative imports don't work:
# switch to the parent of the data directory so that `src` can be imported
root_path = DATA_PATH.parent
os.chdir(root_path)

import src.database.db_connector as db

In [None]:
# Flag telling whether `df` has been overwritten by the remapped labels.
mapped = False

# Open the connection to the database that holds the labels.
db_name = "clustering_db"
cnx = db.connect_to_database(db_name)

In [None]:
# Load the pretrained English pipeline and list the names of its components.
# NOTE(review): the same model is already bound to `nlp` in the setup cell;
# this reload looks like a way to start from a fresh pipeline before the
# entity ruler is added further down - confirm before removing it.
nlp = spacy.load("en_core_web_sm")
print(nlp.pipe_names)

## Utility functions for pattern generation

In [None]:
def subsequences(
    lst: List[Any], stop_words: Optional[Collection[Any]] = None
) -> List[List[Any]]:
    """Get all contiguous subsequences of a list after removing stop words.

    Every pair (subsequence of length two) is additionally emitted in swapped
    order. The result is ordered by subsequence length, shortest first; within
    one length the generation order is kept.

    Args:
        lst (List[Any]): base list
        stop_words (Optional[Collection[Any]]): elements to drop before the
            subsequences are built. Defaults to the stop words of the
            module-level spaCy pipeline (`nlp.Defaults.stop_words`).

    Returns:
        List[List[Any]]: list of all list subsequences
    """
    if stop_words is None:
        stop_words = nlp.Defaults.stop_words

    # filter stop words
    # NOTE(review): membership is an exact comparison, so a token that differs
    # from a stop word only in case is kept - confirm this is intended.
    items = [item for item in lst if item not in stop_words]

    sequences = [
        items[start:end]
        for start in range(len(items))
        for end in range(start + 1, len(items) + 1)
    ]

    # add elements with changed order for 2-tuples
    sequences += [[pair[1], pair[0]] for pair in sequences if len(pair) == 2]

    # list.sort is stable, so equally long subsequences keep their order
    sequences.sort(key=len)
    return sequences


def get_pattern(
    tuple: Tuple[str, List[str], str],
) -> Dict[str, Union[str, List[Dict[str, str]]]]:
    """Creates a pattern for the spaCy EntityRuler from a given label.

    Args:
        tuple (Tuple[str, List[str], str]): (label type, label name tokens, label id).
            The label type must be one of "category", "color" or "feature".

    Returns:
        Dict[str, Union[str, List[Dict[str, str]]]]: a single pattern, e.g.

            1. `{'label': 'CAT', 'pattern': [{'LOWER': 'promotional'}], 'id': 'Promotional'}` or

            2. `{'label': 'CAT', 'pattern': [{'LOWER': 'culture'}, {'LOWER': 'education'}], 'id': 'Culture & Education'}`

    Raises:
        KeyError: if the label type is not one of the known types.
    """
    labels = {"category": "CAT", "color": "COLOR", "feature": "FEAT"}

    # indexing (not unpacking) so longer tuples keep working as before
    label_type, tokens, label_id = tuple[0], tuple[1], tuple[2]

    # create lowercase pattern: one token description per label name token
    pattern = [{"LOWER": token.lower()} for token in tokens]

    return {"label": labels[label_type], "pattern": pattern, "id": label_id}


In [None]:
# Names and types of all non-technology labels that occur more than 100 times.
query = """
    select l.name, l.type
    from screenshots as s
    inner join websites as w on w.url=s.page_url
    inner join website_labels as wl on w.id=wl.website_id
    inner join labels as l on wl.label_id=l.id
    where l.type != "technology"
    group by l.name
    having count(*) > 100
    order by l.type;
"""

df = pd.read_sql(sql=query, con=cnx)

# the label type 'tag' is called 'feature' from here on
df["type"] = [
    "feature" if label_type == "tag" else label_type for label_type in df["type"]
]

print(df.to_string())

## Label remapping

In [None]:
# Restructure the labels for remapping: one {"name", "type"} entry per label,
# keyed by the label name.
base_mapping_dict = {}
for label_name, label_type in df.values:
    base_mapping_dict[label_name] = {"name": label_name, "type": label_type}

print(base_mapping_dict)

# Write the untouched mapping to disk as JSON.
base_mapping_file = DATA_PATH / "chatbot" / "mappings_base.json"
with open(base_mapping_file, "w+") as json_file:
    json.dump(base_mapping_dict, json_file)

In [None]:
# Read the manually edited mappings back in.
mappings_file = DATA_PATH / "chatbot" / "mappings.json"
with open(mappings_file, "r") as json_file:
    mappings_dict = json.load(json_file)

# One row per mapping entry; rows with any missing value are dropped.
new_df = pd.DataFrame.from_records(list(mappings_dict.values()))
new_df = new_df.dropna(axis=0, how="any")

# The remapped labels replace the queried ones from here on.
df = new_df
mapped = True

## Pattern generation

In [None]:
records = df.to_dict("records")

# Separator tokens that are dropped from the tokenized label names.
special_tokens = {"/", "&", "-"}

# tokenize label names -> (label type, label name tokens, label name)
# nlp.make_doc runs the tokenizer only; calling nlp(...) with a few components
# disabled would still run every remaining component for each label.
tuples = [
    (
        dct["type"],
        [
            t.text
            for t in nlp.make_doc(dct["name"])
            if t.text not in special_tokens  # remove special tokens
        ],
        dct["name"],
    )
    for dct in records
]

# get all subsequences from label name tokens; each subsequence keeps the
# type and the full name of the label it was derived from
more_tuples = [
    (tpl[0], token, tpl[2]) for tpl in tuples for token in subsequences(tpl[1])
]

# ner patterns from tuples
patterns = [get_pattern(tpl) for tpl in more_tuples]
for pat in patterns:
    print(pat)

## Pattern testing

In [None]:
# add entity ruler to pipeline
# A ruler left over from an earlier run of this cell is removed first:
# add_pipe refuses a component name that is already in the pipeline, and the
# old ruler would still carry the previous patterns.
if nlp.has_pipe("entity_ruler"):
    nlp.remove_pipe("entity_ruler")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

# example request
request = "Create a promotional e colorful website for Google"

# show the tokens and the recognised entities for the example
doc = nlp(request)
print([token.text for token in doc])
print([(ent.text, ent.label_) for ent in doc.ents])


## Save patterns

In [None]:
# Safety switch: nothing is written unless this is set to True.
overwrite_old_patterns = False

print(mapped)

# save patterns in file; remapped labels go to their own file
if overwrite_old_patterns:
    file_name = "auto_mapped_patterns.jsonl" if mapped else "auto_patterns.jsonl"
    ruler.to_disk(root_path / "data" / "chatbot" / file_name)

# Color Patterns

In [None]:
# Load the color table; this rebinds `df`, which held the labels before.
colors_csv = DATA_PATH / "chatbot" / "colors.csv"
df = pd.read_csv(colors_csv)

print(df)

In [None]:
import re

def split_uppercase(color_name: str) -> List[str]:
    """Split a name into parts, starting a new part at every uppercase letter.

    Text in front of the first uppercase letter is kept as its own part, so
    names without a leading capital are not silently dropped.

    Args:
        color_name (str): name to split, e.g. "AliceBlue"

    Returns:
        List[str]: the parts in order, e.g. ["Alice", "Blue"]; an empty list
            for an empty name
    """
    # first alternative: a capital plus everything up to the next capital;
    # second alternative: a leading run without any capital
    return re.findall(r"[A-Z][^A-Z]*|[^A-Z]+", color_name)

# Split every color name on its uppercase letters, join the parts with single
# spaces and lowercase the result.
df["name"] = [
    " ".join(split_uppercase(raw_name)).lower() for raw_name in df["name"]
]

In [None]:
# One dict per row of the color table, keyed by column name.
colors = df.to_dict(orient="records")

def to_pattern(color_dict: Dict[str, str]) -> Dict[str, Union[str, List[Dict[str, str]]]]:
    """Creates an EntityRuler pattern that maps a color name to its hex code.

    Each dict in a token pattern describes exactly one token, so a name made
    of several words gets one entry per word; a single entry holding the
    whole name with its spaces could never match a tokenized text.

    Args:
        color_dict (Dict[str, str]): record with the keys "name" and "hex"

    Returns:
        Dict[str, Union[str, List[Dict[str, str]]]]: pattern with label "HEX",
            e.g. `{'label': 'HEX', 'pattern': [{'LOWER': 'alice'}, {'LOWER': 'blue'}], 'id': '#F0F8FF'}`
    """
    # LOWER is compared against the lowercased token text
    name = color_dict["name"].lower()

    # a name without any word characters keeps the single-entry form
    words = name.split() or [name]

    return {
        "label": "HEX",
        "pattern": [{"LOWER": word} for word in words],
        "id": color_dict["hex"],
    }

# build one ruler pattern per color record
color_patterns = [to_pattern(color) for color in colors]

# print(color_patterns)

In [None]:
from spacy.pipeline import EntityRuler

# A pretrained pipeline is loaded only to construct a standalone ruler that
# holds the color patterns; this rebinds `ruler`.
colors_nlp = spacy.load("en_core_web_sm")
ruler = EntityRuler(colors_nlp, patterns=color_patterns)

# save patterns in file
# ruler.to_disk(root_path / "data" / "chatbot" / "color_patterns.jsonl")