# Create `hints.json`

In [None]:
import os
import pandas as pd
import json

from pathlib import Path
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Fail early with an actionable message instead of the opaque TypeError that
# Path(None) raises when DATA_PATH is missing.
_data_path_env = os.getenv("DATA_PATH")
if _data_path_env is None:
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; define it in the "
        "environment or in the .env file."
    )
DATA_PATH = Path(_data_path_env)

# only for .ipynb because relative imports don't work:
# make the parent directory of DATA_PATH the working directory before the
# project import below (presumably the repository root that contains `src`).
root_path = DATA_PATH.parent
os.chdir(str(root_path))

import src.database.db_connector as db

# database name for results
db_name = "clustering_db"
cnx = db.connect_to_database(db_name)


In [None]:
# location of the label file: <DATA_PATH>/labels/labels.csv
labels_path = DATA_PATH.joinpath("labels", "labels.csv")

# read labels into dataframe; the file is parsed without a header row, so the
# column names are supplied explicitly
df = pd.read_csv(
    labels_path,
    names=["label", "type", "id"],
    header=None,
)

df

In [None]:
# category list: the "label" values of all rows whose type is "category"
categories = df.loc[df["type"] == "category", "label"].tolist()

In [None]:
# Query the top n=5 tag labels for each cluster of the k=55 clustering.
#
# How the query is structured (MySQL-style user variables):
#   * derived table `j`: rows of `matchings` for the runs
#     "v01_all_run2_pca90" / "v01_all_run3_pca90", joined to `labels` and
#     restricted to labels of type "tag", ordered by k value, cluster id and
#     descending factor_tf_idf
#   * `ranked_clusters`: numbers the rows via @cluster_rank / @current_cluster;
#     the counter restarts at 1 whenever cluster_id differs from the previous row
#   * outer filter: keeps ranks <= 5 for ranked_k_value = 55 and returns only
#     cluster_id and label_name
#
# NOTE(review): the ranking depends on the ORDER BY inside the derived table
# being honoured and on the evaluation order of the user variables — confirm
# this holds for the database version in use.
top_n_query = """
    SELECT rj.cluster_id, rj.label_name
    FROM ( 
        SELECT * FROM (
            SELECT 
            j.ranked_k_value,
            j.cluster_id, 
            @cluster_rank := if(@current_cluster = j.cluster_id, @cluster_rank  + 1, 1) AS cluster_rank ,
            j.label_name,
            j.factor_tf_idf,
            @current_cluster := j.cluster_id
            FROM ( 
                SELECT  *
                FROM matchings AS m
                INNER JOIN ( 
                    SELECT ma.k_value as ranked_k_value, ma.run_name as ranked_run
                    FROM matchings AS ma
                    WHERE (ma.run_name = "v01_all_run2_pca90" OR ma.run_name = "v01_all_run3_pca90") 
                    group by ma.k_value
                ) as ranked_k on ranked_k.ranked_k_value = m.k_value
                INNER JOIN labels as l on m.label_name = l.name
                WHERE (m.run_name = "v01_all_run2_pca90" OR m.run_name = "v01_all_run3_pca90")  AND m.k_value = ranked_k.ranked_k_value AND l.type ="tag"
                ORDER BY ranked_k.ranked_k_value, m.cluster_id, m.factor_tf_idf desc
            ) as j
        ) as ranked_clusters
        where ranked_clusters.cluster_rank <= 5
        and ranked_clusters.ranked_k_value = 55
    ) as rj;
"""

# run the query on the open connection and load the result into a dataframe
df_topn = pd.read_sql(top_n_query, cnx)
df_topn

In [None]:
# one dict per result row, keyed by column name (cluster_id, label_name)
cluster_dict_list = df_topn.to_dict(orient="records")

cluster_dict_list

In [None]:
# load manually edited mappings; JSON text is UTF-8, so read it as such
# instead of depending on the platform's default encoding
with open(DATA_PATH / "chatbot" / "mappings.json", "r", encoding="utf-8") as json_file:
    mappings_dict = json.load(json_file)

# number of hints before mapping
print(len(cluster_dict_list))

mapped_hints = []
# map label names
for hint in cluster_dict_list:
    name = mappings_dict[hint["label_name"]]["name"]
    print(hint)

    # skip if label is mapped to None
    if name is not None:
        # NOTE: the dict is rewritten in place, so the corresponding entry of
        # cluster_dict_list carries the mapped name as well
        hint["label_name"] = name
        mapped_hints.append(hint)

# number of hints that survived the mapping
print(len(mapped_hints))

In [None]:
# apply mappings to categories: replace each category label by its mapped
# name and skip every label that is mapped to None
mapped_cats = [
    mapped_name
    for mapped_name in (mappings_dict[cat]["name"] for cat in categories)
    if mapped_name is not None
]

In [None]:
# create dicts containing list of top 5 features for each cluster
#
# Built from mapped_hints rather than cluster_dict_list: cluster_dict_list
# still contains the entries whose label is mapped to None (under their raw,
# unmapped name), and those must not end up in the hints.
#
# range(55) matches the `ranked_k_value = 55` filter of the top-n query
# (assumes cluster ids run from 0 to 54 — TODO confirm); a cluster without
# any remaining hint gets an empty list.
hint_lists = [
    {cid: [dct["label_name"] for dct in mapped_hints if dct["cluster_id"] == cid]}
    for cid in range(55)
]

hint_lists

In [None]:
# flatten the list of per-cluster dicts into a single cluster -> labels dict
# (a key seen again later overwrites the earlier value)
hint_dict = {
    cluster_id: labels
    for entry in hint_lists
    for cluster_id, labels in entry.items()
}

hint_dict

In [None]:
# payload for the json export: mapped category names plus per-cluster features
json_dict = dict(categories=mapped_cats, features=hint_dict)

json_dict

In [None]:
# export to json file
import json

# target file: <DATA_PATH>/chatbot/hints-new.json, pretty-printed with indent 4
hints_json_path = DATA_PATH / "chatbot" / "hints-new.json"
with hints_json_path.open("w") as outfile:
    json.dump(json_dict, outfile, indent=4)