In [None]:
import os
import seaborn as sns
import pandas as pd
from collections import Counter

In [None]:
# Root folder holding the downloaded emote images. Both provider folders hang
# off this single constant, so pointing the notebook at another output tree
# is a one-line change.
EMOTE_ROOT_DIR = "./../../output/Jerma985/emotes"

# One sub-folder per emote provider.
bttv_emote_dir = f"{EMOTE_ROOT_DIR}/bttv"
twitch_emote_dir = f"{EMOTE_ROOT_DIR}/twitch"

In [None]:
# Collect the path of every entry found in each provider folder, then merge
# both collections into a single set of emote files.
bttv_emotes = set(
    os.path.join(bttv_emote_dir, file_name)
    for file_name in os.listdir(bttv_emote_dir)
)
twitch_emotes = set(
    os.path.join(twitch_emote_dir, file_name)
    for file_name in os.listdir(twitch_emote_dir)
)
all_emotes = bttv_emotes | twitch_emotes

In [None]:
# Chat logs to analyse, given relative to the current working directory.
# Each entry is one tab-separated file; its base name (without extension)
# is what count_emotes reports in the "file" column.
files = [
    "./READ_DESCRIPTION_OF_VOD_Happy_Birthday_________Insert_name_of_choice_here.tsv",
    "./Frog_Detective_3_and_Grotto_Beasts_breakdown_later.tsv",
    "./Hardcore_gaming_mindset_standard_difficulty.tsv",
    "./RE4RemA4ke_or_however_Im_supposed_to_write_it_idk.tsv",
    "./Hylics.tsv",
]

In [None]:
import csv
import sys
from typing import Tuple

csv.field_size_limit(sys.maxsize)

In [None]:
# Map each emote name (file name without directory or extension) to the image
# file that depicts it.
#
# all_emotes is a set of strings, whose iteration order can change between
# interpreter runs. Iterating it in sorted order makes the outcome
# reproducible when two files share an emote name: the path that sorts last
# is the one kept.
emote_to_file = {
    os.path.basename(os.path.splitext(emote_file)[0]): emote_file
    for emote_file in sorted(all_emotes)
}

# The emote names alone, for fast membership tests while scanning chat.
emote_set = set(emote_to_file.keys())

In [None]:
def count_emotes(chat_tsv: str, emote_set: set, encoding: str = "utf-8") -> pd.DataFrame:
    """
    Count how often each known emote occurs in one chat log.

    The log is read as a header-less, tab-separated file whose columns are
    named ``timestamp``, ``user``, ``msg``, ``is_command`` and ``is_mention``.
    Only ``msg`` is inspected: it is split on single spaces and every token
    that is a member of ``emote_set`` is tallied.

    Args:
        chat_tsv: Path to the tab-separated chat log.
        emote_set: Emote names to look for; all other tokens are ignored.
        encoding: Text encoding used to read the log. Stated explicitly so
            the result does not depend on the platform's locale default.

    Returns:
        A DataFrame with the columns ``file`` (base name of ``chat_tsv``
        without its extension), ``word`` (the emote) and ``count`` (number of
        occurrences), holding one row per emote that occurred at least once.
    """
    vod_name = os.path.splitext(os.path.basename(chat_tsv))[0]
    word_count = Counter()

    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation recommends for file objects passed to a reader.
    with open(chat_tsv, "rt", encoding=encoding, newline="") as chat_file:
        reader = csv.DictReader(
            chat_file,
            fieldnames=["timestamp", "user", "msg", "is_command", "is_mention"],
            delimiter="\t"
        )

        for row in reader:
            msg = row["msg"]
            if not msg:
                # DictReader fills missing trailing fields with None, so a
                # truncated row has no message text to split.
                continue
            word_count.update(word for word in msg.split(' ') if word in emote_set)

    # Materialise the dict views so the constructor receives plain lists.
    return pd.DataFrame({
        "file": [vod_name] * len(word_count),
        "word": list(word_count.keys()),
        "count": list(word_count.values()),
    })

In [None]:
# One long-format frame (file, word, count) per chat log, stacked together.
emote_counts_long = pd.concat([count_emotes(file, emote_set) for file in files])

# Keep only the (file, emote) pairs seen more than ten times.
emote_counts_frequent = emote_counts_long.query('count > 10')

# Reshape to emotes x files; pairs dropped above (or never seen) become 0.
df_emote_counts = emote_counts_frequent.pivot(
    index=["word"],
    columns=["file"],
    values=["count"],
).fillna(0)
df_emote_counts

In [None]:
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

def add_image_annot(coord: Tuple[int, int], img_path: str, img_scale: float, bbox_pad: float, ax) -> None:
    """
    Place the image stored at ``img_path`` on ``ax``, anchored at ``coord``.

    Adapted from: https://stackoverflow.com/a/44264051

    Args:
        coord: (x, y) anchor point, interpreted in the data coordinates of ``ax``.
        img_path: Path of the image file to read.
        img_scale: Zoom factor handed to ``OffsetImage``.
        bbox_pad: Padding handed to ``AnnotationBbox``.
        ax: Axes the annotation is added to.
    """
    offset_image = OffsetImage(plt.imread(img_path), zoom=img_scale)
    offset_image.image.axes = ax
    ax.add_artist(
        AnnotationBbox(offset_image, xy=coord, xycoords="data", pad=bbox_pad)
    )

In [None]:
# Heatmap of emote counts with the VOD columns hierarchically clustered.
# Rows (emotes) keep their order because row_cluster is off.
cmap = sns.clustermap(
    df_emote_counts,
    row_cluster=False,
    figsize=(
        # (width, height) in inches: width grows with the number of columns
        # (VODs), height with the number of rows (emotes).
        len(df_emote_counts.columns) * 2,
        len(df_emote_counts.index) / 2,
    ),
    # 0 = rescale each row, i.e. each emote across the VODs.
    standard_scale=0,
    dendrogram_ratio=0.02,
    method="ward",
    cbar_pos=(-0.15, 0.5, 0.05, 0.18),
    cbar_kws={
        "shrink": 0.40,
        "aspect": 40,
        "label": "Normalized Emote Counts (Across VODs)"
    }
)
cmap.fig.suptitle("VODs Clustered by Emote", fontsize="xx-large", fontweight="bold", y=1.01)

heatmap_ax = cmap.ax_heatmap

# The pivoted frame has ("count", <file>) columns, so the tick labels arrive
# as "count-<file>". Strip only that leading prefix: a blanket replace would
# also remove "count-" from the middle of a VOD name.
count_prefix = "count-"
new_labels = []
for lbl in heatmap_ax.get_xticklabels():
    vod_lbl = lbl.get_text()
    if vod_lbl.startswith(count_prefix):
        vod_lbl = vod_lbl[len(count_prefix):]
    new_labels.append(vod_lbl)
heatmap_ax.set_xticklabels(new_labels, rotation=30, ha='right')

# Draw each emote's image at the left edge of its row (x = 0 in data
# coordinates), at the height of the row's tick label.
for lbl in heatmap_ax.get_yticklabels():
    emote_path = emote_to_file[lbl.get_text()]
    lbl_y = lbl.get_position()[1]
    add_image_annot(
        coord=(0, lbl_y),
        img_path=emote_path,
        img_scale=0.2,
        bbox_pad=0,
        ax=heatmap_ax
    )

# Trailing semicolon keeps the list returned by set() out of the cell output.
heatmap_ax.set(xlabel="VOD", ylabel="Emote");