In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import jsonlines
from collections import defaultdict
import json
from pathlib import Path
import random
import matplotlib.pyplot as plt
import matplotlib.font_manager
import matplotlib.image as mpimg
from tqdm.notebook import tqdm
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import matplotlib.font_manager
import matplotlib.image as mpimg
# List the distinct font families matplotlib can see, so a missing 'Helvetica'
# (requested below) is easy to spot.
available_fonts = sorted({f.name for f in matplotlib.font_manager.fontManager.ttflist})
print(f"available fonts: {available_fonts}")

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and newer
# releases no longer accept the old names, so use whichever spelling is installed.
for _style_name in ("seaborn-v0_8-muted", "seaborn-muted"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    print("seaborn 'muted' style not available; keeping the current matplotlib style")

# Notebook-wide defaults; later cells override a few of these per figure.
plt.rcParams.update({
    # Output resolution and file format.
    "figure.dpi": 150,
    "savefig.dpi": 300,
    "savefig.format": "pdf",
    "savefig.bbox": "tight",
    "savefig.pad_inches": 0.1,
    # Titles and base font.
    "figure.titlesize": 18,
    "axes.titlesize": 18,
    "font.family": "Helvetica",
    "font.size": 18,
    # Lines, axis labels, ticks and legend.
    "lines.linewidth": 2,
    "axes.labelsize": 16,
    "axes.labelweight": "bold",
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "legend.fontsize": 16,
    "axes.linewidth": 2,
    "axes.titlepad": 6,
    # Math text and markers.
    "mathtext.fontset": "dejavuserif",
    "mathtext.it": "serif:italic",
    "lines.marker": "",
    "legend.frameon": False,
})

In [None]:
def list2count(_list):
    """Count occurrences of each item in ``_list``.

    Returns a plain dict mapping item -> number of occurrences, with the keys
    inserted in ascending order.
    """
    tally = {}
    for item in _list:
        tally[item] = tally.get(item, 0) + 1
    # Re-insert in sorted key order so iteration over the result is ordered.
    ordered = {}
    for item in sorted(tally):
        ordered[item] = tally[item]
    return ordered

def show_statistics(data):
    """Print summary counts for ``data`` and draw its step/duration distributions.

    Args:
        data: list of sample dicts. Each sample is read for its ``"ori"`` key
            and its ``"annotation"`` list; every annotation item is read for
            ``"segment"``, indexed as ``[0]`` (start) and ``[1]`` (end).

    Returns:
        The matplotlib figure that was created, so callers can save it
        explicitly instead of relying on pyplot's current figure. The figure
        also remains pyplot's current figure, so ``plt.savefig`` still works.
    """
    print(f"Total samples: {len(data)}")

    source_count = defaultdict(int)
    for sample in data:
        source_count[sample["ori"]] += 1
    print(f"Source count: {json.dumps(source_count, indent=2)}")

    # Number of annotated steps per sample, and the summed length of those
    # annotated segments per sample.
    steps = [len(sample["annotation"]) for sample in data]
    clip_durations = [
        sum(x["segment"][1] - x["segment"][0] for x in sample["annotation"])
        for sample in data
    ]
    steps_count = list2count(steps)
    print(f"Total steps: {sum(steps)}, total states: {sum(steps) + len(steps)}")

    def plot_step_count(ax):
        """Line plot of the step histogram, overall and per source (not drawn by default)."""
        ax.plot(steps_count.keys(), steps_count.values(), "-", label="total")
        # Per-source histograms are only computed when this plot is requested.
        for source, line_style in (("cross", "--"), ("coin", "-.")):
            per_source = list2count(
                [n_step for n_step, sample in zip(steps, data) if sample["ori"] == source]
            )
            ax.plot(per_source.keys(), per_source.values(), line_style, label=source)

        ax.set_title("Step Count", fontstyle="italic")
        ax.set_xlabel("steps")
        ax.set_ylabel("number of samples")
        ax.legend(loc='best', numpoints=1, fancybox=False)

    def plot_step_dist(ax):
        """Bar chart of how many samples have each step count."""
        labels = list(steps_count.keys())
        positions = np.arange(len(labels))
        ax.bar(positions, list(steps_count.values()), 0.9)
        # Ticks and labels are set separately so this also works on matplotlib
        # versions whose set_xticks does not accept labels.
        ax.set_xticks(positions)
        ax.set_xticklabels(labels)

        ax.set_title("Step Distribution", fontstyle="italic")
        ax.set_xlabel("steps")
        ax.set_ylabel("number of samples")

    def plot_duration_dist(ax):
        """Histogram of the per-sample summed segment durations."""
        ax.hist(clip_durations, bins=100)

        ax.set_title("Duration Distribution", fontstyle="italic")
        ax.set_xlabel("duration")
        ax.set_ylabel("number of samples")

    width, height = plt.figaspect(0.3)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(width, height))
    plot_step_dist(ax1)
    plot_duration_dist(ax2)

    # Leave room above the axes for the figure title.
    fig.subplots_adjust(top=0.8)
    fig.suptitle("Statistics of VTT dataset")
    return fig

In [None]:
# Read every record of the integrated metadata file into memory.
with jsonlines.open("/data/vtt/meta/vtt_integrated.jsonl") as reader:
    data = [record for record in reader]

# Print the summary and draw the overview figure, then save that figure
# (still pyplot's current one) to disk.
show_statistics(data)
plt.savefig("statistics-all.pdf")

In [None]:
# Reload the integrated metadata and collect, per sample: the number of
# annotated steps, the sample's "duration" value, and the summed length of its
# annotated segments. The following plotting cells read these globals.
with jsonlines.open("/data/vtt/meta/vtt_integrated.jsonl") as reader:
    data = [record for record in reader]

steps, durations, clip_durations = [], [], []
for sample in data:
    steps.append(len(sample["annotation"]))
    durations.append(sample["duration"])
    clip_durations.append(
        sum(seg['segment'][1] - seg['segment'][0] for seg in sample["annotation"])
    )
steps_count = list2count(steps)

In [None]:
# Per-figure style overrides, applied in one call.
font_size = 16
plt.rcParams.update({
    "figure.dpi": 200,
    "savefig.dpi": 300,
    "axes.labelsize": font_size + 2,
    "axes.labelweight": "normal",
    "legend.fontsize": font_size,
    "xtick.labelsize": font_size,
    "ytick.labelsize": font_size,
    "axes.linewidth": 1.5,
})

# One bar per distinct step count, with a tick under every bar.
step_ticks = list(steps_count.keys())
plt.bar(steps_count.keys(), steps_count.values())
plt.xticks(step_ticks)
plt.xlabel("transformations")
plt.ylabel("#Samples")

plt.savefig("steps_dist.pdf", dpi=300)

In [None]:
# import scipy.stats
# Per-figure style overrides, applied in one call.
font_size = 16
plt.rcParams.update({
    "figure.dpi": 200,
    "savefig.dpi": 300,
    "axes.labelsize": font_size + 2,
    "axes.labelweight": "normal",
    "legend.fontsize": font_size,
    "xtick.labelsize": font_size,
    "ytick.labelsize": font_size,
    "axes.linewidth": 1.5,
})

# Histogram of the per-sample "duration" values; white edges separate the bars.
hist_options = dict(bins=100, density=False, edgecolor='w', linewidth=0.5)
_, bins, _ = plt.hist(durations, **hist_options)

# Optional normal fit over the histogram (disabled):
# mu, sigma = scipy.stats.norm.fit(durations)
# best_fit_line = scipy.stats.norm.pdf(bins, mu, sigma)
# plt.plot(bins, best_fit_line, 'k--', alpha=0.5)

plt.xlabel("duration")
plt.ylabel("#Samples")

plt.savefig("duration_dist.pdf", dpi=300)

In [None]:
# import scipy.stats
# Per-figure style overrides, applied in one call.
font_size = 16
plt.rcParams.update({
    "figure.dpi": 200,
    "savefig.dpi": 300,
    "axes.labelsize": font_size + 2,
    "axes.labelweight": "normal",
    "legend.fontsize": font_size,
    "xtick.labelsize": font_size,
    "ytick.labelsize": font_size,
    "axes.linewidth": 1.5,
})

# Histogram of the summed annotated-segment length per sample.
hist_options = dict(bins=100, density=False, edgecolor='w', linewidth=0.5)
_, bins, _ = plt.hist(clip_durations, **hist_options)

# Optional normal fit over the histogram (disabled):
# mu, sigma = scipy.stats.norm.fit(durations)
# best_fit_line = scipy.stats.norm.pdf(bins, mu, sigma)
# plt.plot(bins, best_fit_line, 'k--', alpha=0.5)

plt.xlabel("segment duration")
plt.ylabel("#Samples")

plt.savefig("segment_duration_dist.pdf", dpi=300)

## sentence statistics

In [None]:
import spacy

# The model must be installed first: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
lemmatizer = nlp.get_pipe("lemmatizer")

# sentences: source -> list of whitespace-split label lengths
# words:     source -> token text -> count (commas and full stops excluded)
# words_all: every token text in order, punctuation included
sentences = defaultdict(list)
words = defaultdict(lambda: defaultdict(int))
words_all = []
for sample in tqdm(data):
    for step in sample["annotation"]:
        source = sample["ori"]
        label = step['label']
        sentences[source].append(len(label.split()))
        for token in nlp(label):
            token_text = str(token)
            words_all.append(token_text)
            if token_text in [",", "."]:
                continue
            words[source][token_text] += 1

# Per source: sentence length -> number of sentences with that length.
sentences_count = {source: list2count(lengths) for source, lengths in sentences.items()}

In [None]:
# One line per source: sentence length on x, number of sentences on y.
plt.figure()
for source, length_counts in sentences_count.items():
    plt.plot(length_counts.keys(), length_counts.values(), label=source)

plt.title("Sentences Length Count", fontstyle="italic")
plt.xlabel("length")
plt.ylabel("Count")
plt.legend(loc='best', numpoints=1, fancybox=False)

plt.savefig("statistics-sentences.pdf", dpi=300)

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Join every collected token into one string and lowercase it, so differently
# cased forms of a word are counted together.
text = ' '.join(words_all).lower()

# Build the word cloud from that text, dropping wordcloud's built-in stop words.
wordcloud = WordCloud(
    width=1000,
    height=500,
    stopwords=STOPWORDS,
    collocations=True,
    background_color="white",
).generate(text)

# Show the rendered cloud without axes. The interpolation name is spelled in
# its canonical lowercase form; the trailing ';' keeps the cell from echoing
# the return value of plt.axis.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off');

In [None]:
# Reload the integrated metadata.
with jsonlines.open("/data/vtt/meta/vtt_integrated.jsonl") as reader:
    data = [record for record in reader]

# sentences: number of whitespace-separated words in a label -> number of labels.
# Note this rebinds `sentences`, which an earlier cell used for per-source lists.
sentences = defaultdict(int)
label_lengths = (
    len(step["label"].split())
    for sample in data
    for step in sample["annotation"]
)
for n_word in label_lengths:
    sentences[n_word] += 1

In [None]:
# Per-figure style overrides, applied in one call.
font_size = 16
plt.rcParams.update({
    "figure.dpi": 200,
    "savefig.dpi": 300,
    "axes.labelsize": font_size + 2,
    "axes.labelweight": "normal",
    "legend.fontsize": font_size,
    "xtick.labelsize": font_size,
    "ytick.labelsize": font_size,
    "axes.linewidth": 1.5,
})

# One bar per sentence length; x ticks are left to matplotlib's default locator.
word_counts, sentence_totals = sentences.keys(), sentences.values()
axis = plt.bar(word_counts, sentence_totals)
# plt.xticks(list(sentences.keys()))
plt.ylabel("#Sentences")
plt.xlabel("words")

plt.savefig("sentences_dist.pdf", dpi=300)

In [None]:
## merge two plots into one

In [None]:
# Per-figure style overrides, applied in one call.
font_size = 16
plt.rcParams.update({
    "figure.dpi": 200,
    "savefig.dpi": 300,
    "axes.labelsize": font_size + 2,
    "axes.labelweight": "normal",
    "legend.fontsize": font_size,
    "xtick.labelsize": font_size,
    "ytick.labelsize": font_size,
    "axes.linewidth": 1.5,
})

# Two stacked panels: step-count distribution on top, sentence-length
# distribution below.
width, height = plt.figaspect(0.75)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(width, height))

ax1.bar(steps_count.keys(), steps_count.values())
ax1.set_xticks(list(steps_count.keys()))
ax1.set(xlabel="transformations", ylabel="#Samples")

ax2.bar(sentences.keys(), sentences.values())
ax2.set(xlabel="words", ylabel="#Sentences")

# Extra vertical gap so the upper x label does not run into the lower panel.
fig.subplots_adjust(hspace=0.6)

fig.savefig("steps_sentences_dist.pdf", dpi=300)

## statistics of categories

In [None]:
# Reload the integrated metadata for the category section.
with jsonlines.open("/data/vtt/meta/vtt_integrated.jsonl") as reader:
    data = [record for record in reader]

In [None]:
import re
class ListData:
    """A list of records that can also be looked up by each record's ``"id"``.

    Iteration and ``len`` follow the wrapped list; indexing takes an id (not a
    position) and raises ``KeyError`` for an unknown id. If two records share
    an id, the later one is returned by id lookup.
    """

    def __init__(self, data_list):
        self._data_list = data_list
        # id -> record index, built once up front.
        self._id_map = {}
        for record in data_list:
            self._id_map[record["id"]] = record

    def __getitem__(self, _id):
        return self._id_map[_id]

    def __len__(self):
        return len(self._data_list)

    def __iter__(self):
        return iter(self._data_list)


class Taxonomy:
    """Domain -> target -> action hierarchy loaded from a taxonomy JSON file.

    The JSON document is read for three top-level lists, ``"domain"``,
    ``"target"`` and ``"action"``. Their records are linked through the keys
    used below: ``"target_list"`` on a domain, ``"action_list"`` and
    ``"domain_id"`` on a target, and ``"target_id"`` on an action.
    """

    def __init__(self, json_path="/data/coin/data/taxonomy.json"):
        """Load the taxonomy at ``json_path`` and index each level by id."""
        with open(json_path) as f:
            self._data = json.load(f)
        self.domains = ListData(self._data["domain"])
        self.targets = ListData(self._data["target"])
        self.actions = ListData(self._data["action"])

    def get_domain_targets(self, domain_id):
        """Return the target records listed in the domain's ``"target_list"``."""
        domain = self.domains[domain_id]
        targets = [self.targets[_id] for _id in domain["target_list"]]
        return targets

    def get_target_actions(self, target_id):
        """Return the action records listed in the target's ``"action_list"``."""
        target = self.targets[target_id]
        actions = [self.actions[_id] for _id in target["action_list"]]
        return actions

    def get_action_target(self, action_id):
        """Return the target record referenced by the action's ``"target_id"``."""
        return self.targets[self.actions[action_id]["target_id"]]

    def get_target_domain(self, target_id):
        """Return the domain record referenced by the target's ``"domain_id"``."""
        return self.domains[self.targets[target_id]["domain_id"]]

    def get_action_domain(self, action_id):
        """Return the domain record of the target that the action belongs to."""
        target = self.get_action_target(action_id)
        return self.domains[target["domain_id"]]

    def split_words(self, s):
        """Split a CamelCase label into space-separated words.

        A new word starts at every capital letter, except that the listed
        acronyms (CPR, RJ45, SIM, SSD, CD, TV, PC) are kept whole. Any text
        before the first capital letter is kept as its own leading word
        instead of being dropped.
        """
        # The `^[^A-Z]+` alternative only matches at the start of the string
        # and preserves a prefix that does not begin with a capital letter.
        words = re.findall(r"^[^A-Z]+|CPR|RJ45|SIM|SSD|CD|TV|PC|[A-Z][^A-Z]*", s)
        words = " ".join(words)
        return words

In [None]:
# Load the taxonomy from Taxonomy's default JSON path.
taxonomy = Taxonomy()

In [None]:
# Report how many records each taxonomy level holds.
for level_name, level in (
    ("domains", taxonomy.domains),
    ("targets", taxonomy.targets),
    ("actions", taxonomy.actions),
):
    print(f"{len(level)} {level_name}")

In [None]:
# Numbered list of domain labels. The targets are resolved for each domain but
# only the domain line is printed; the nested listing below is disabled.
for index, domain in enumerate(taxonomy.domains):
    targets = taxonomy.get_domain_targets(domain["id"])
    domain_label = domain["label"]
    print(f"{index:02d}. {domain_label}")
    # for i, target in enumerate(targets):
    #     actions = taxonomy.get_target_actions(target['id'])
    #     print(f"\t{i:03d}. {target['label']}")
        # for i, action in enumerate(actions):
        #     print(f"\t\t{i:03d}. {action['label']}")

In [None]:
# Numbered list of domains, each followed by its numbered targets with the
# CamelCase target label split into words.
for i, domain in enumerate(taxonomy.domains):
    targets = taxonomy.get_domain_targets(domain['id'])
    print(f"{i:02d}. {domain['label']}")
    # Separate index name so the inner loop does not overwrite the domain index.
    for j, target in enumerate(targets):
        print(f"\t{j:03d}. {taxonomy.split_words(target['label'])}")

In [None]:
import sys

# Make the parent directory importable so the project's `src` package resolves.
# Guarded so re-running the cell does not add the entry again.
if ".." not in sys.path:
    sys.path.append("..")
import src.utils.datatool as dtool

# Read the CrossTask task list with the project's JSON-lines helper.
tasks = dtool.read_jsonlines("/data/CrossTask/crosstask_release/tasks.jsonl")

In [None]:
# Emit one `"<task name>": "",` line per task, ready to paste into a dict
# literal and fill in by hand.
for task in tasks:
    task_name = task["task"]
    print(f'"{task_name}": "",')

In [None]:
# Numbered list of domain labels. The targets are resolved for each domain but
# only the domain line is printed; the nested listing below is disabled.
for index, domain in enumerate(taxonomy.domains):
    targets = taxonomy.get_domain_targets(domain["id"])
    domain_label = domain["label"]
    print(f"{index:02d}. {domain_label}")
    # for i, target in enumerate(targets):
    #     actions = taxonomy.get_target_actions(target['id'])
    #     print(f"\t{i:03d}. {target['label']}")
        # for i, action in enumerate(actions):
        #     print(f"\t\t{i:03d}. {action['label']}")

### Plot category distribution

In [None]:
# Load the VTT metadata (a different file from the integrated one used above);
# this rebinds `data` for the remaining cells.
with jsonlines.open("/data/vtt/meta/vtt.jsonl") as reader:
    data = [record for record in reader]

In [None]:
from collections import defaultdict

# topics:       topic -> number of samples
# categories:   category -> number of samples
# topics_split: split -> topic -> number of samples
topics, categories = defaultdict(int), defaultdict(int)
topics_split = defaultdict(lambda: defaultdict(int))
for sample in data:
    topic = sample['topic']
    topics[topic] += 1
    categories[sample['category']] += 1
    topics_split[sample['split']][topic] += 1

In [None]:
# Display the category -> sample-count mapping built in the previous cell.
categories

In [None]:
# Categories ordered from most to fewest samples.
list_sorted = sorted(categories.items(), key=lambda item: item[1], reverse=True)

# Per-figure style overrides, applied in one call.
font_size = 16
plt.rcParams.update({
    "figure.dpi": 200,
    "savefig.dpi": 300,
    "axes.labelsize": font_size + 2,
    "axes.labelweight": "normal",
    "legend.fontsize": font_size,
    "xtick.labelsize": font_size,
    "ytick.labelsize": font_size,
    "axes.linewidth": 1,
})

# Vertical tick labels so the category names do not overlap.
plt.xticks(rotation='vertical')
category_names = [name for name, _count in list_sorted]
category_counts = [_count for _name, _count in list_sorted]
axis = plt.bar(category_names, category_counts)
plt.ylabel("#Samples")

In [None]:
# Show the names of the registered colormaps, then pick "Set3".
colormap_names = plt.colormaps()
print(colormap_names)
colors = plt.get_cmap("Set3")

In [None]:
import random
# Categories ordered from most to fewest samples, and the overall sample count.
list_sorted = sorted(categories.items(), key=lambda x: x[1], reverse=True)
n_sample = sum([x[1] for x in list_sorted])

font_size = 12

# Qualitative colormap used for the wedges.
colormap = "tab20c"
print(colormap)

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and newer
# releases no longer accept the old names, so use whichever spelling is installed.
for _style_name in ("seaborn-v0_8-muted", "seaborn-muted"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    print("seaborn 'muted' style not available; keeping the current matplotlib style")

# Style overrides go in before the figure is created so that the figure picks
# all of them up.
plt.rcParams.update({
    "figure.dpi": 200,
    "savefig.dpi": 300,
    "font.size": font_size,
    "axes.labelsize": font_size + 2,
    "axes.labelweight": "normal",
    "legend.fontsize": font_size,
    "xtick.labelsize": font_size,
    "ytick.labelsize": font_size,
    "axes.linewidth": 1,
})

width, height = plt.figaspect(1)
plt.figure(figsize=(width, height))

_, _, autotexts = plt.pie(
    [x[1] / n_sample for x in list_sorted],
    labels=[f"{x[0]}" for x in list_sorted],
    startangle=15,
    explode=[0.02] * len(list_sorted),
    # The percentage is turned back into a sample count; round() rather than
    # int() so floating-point round-off cannot show a count one too low.
    autopct=lambda pct: f"{round(pct * n_sample / 100)},\n{pct:.2f}%",
    pctdistance=0.75,
    colors=plt.get_cmap(colormap).colors,
)
# Shrink the in-wedge text progressively: later wedges are smaller.
for i, autotext in enumerate(autotexts):
    autotext.set_fontsize(font_size - 3 - i*0.2)
plt.savefig("categories_dist.pdf", dpi=300)

In [None]:
# Number of distinct topics counted above.
len(topics)

In [None]:
# Style overrides go in before the figure is created, because axis label size
# and spine width are fixed when the axes are built.
font_size = 8
plt.rcParams.update({
    "figure.dpi": 200,
    "savefig.dpi": 300,
    "axes.labelsize": font_size + 2,
    "axes.labelweight": "normal",
    "legend.fontsize": font_size,
    "xtick.labelsize": font_size,
    "ytick.labelsize": font_size,
    "axes.linewidth": 1,
})

# Topics ordered by total sample count; each split keeps that same order so the
# stacked bars line up.
splits = ["train", "val", "test"]
sorted_keys = sorted(topics.items(), key=lambda x: x[1], reverse=True)
list_sorted = {
    split: [(x[0], topics_split[split][x[0]]) for x in sorted_keys]
    for split in splits
}

# The topics are spread over n_rows panels so the tick labels stay readable.
n_rows = 2

width, height = plt.figaspect(0.2)
fig, ax = plt.subplots(n_rows, 1, figsize=(width, height*n_rows))
# plt.subplots returns a bare Axes when n_rows == 1; normalise to an array.
ax = np.atleast_1d(ax)

split_pos = len(sorted_keys) // n_rows

colormap = "tab20b"
colors = plt.get_cmap(colormap).colors
print(colormap)

for row, axi in enumerate(ax):
    # The last panel also takes the remainder when the topics do not divide evenly.
    start = row * split_pos
    end = len(sorted_keys) if row == (len(ax) - 1) else (row + 1) * split_pos

    # Running top of each stacked bar in this panel.
    last_top = [0] * (end - start)
    for j, split in enumerate(splits):
        chunk = list_sorted[split][start:end]
        key = [x[0] for x in chunk]
        val = [x[1] for x in chunk]
        axi.bar(
            key,
            val,
            bottom=last_top,
            label=split,
            color=colors[j+1]
        )
        last_top = [top + v for top, v in zip(last_top, val)]
    axi.margins(x=0.005)
    axi.tick_params(axis='x', rotation=90)
    axi.set_ylabel("#Samples")

    axi.legend()
fig.subplots_adjust(hspace=1.2)
fig.savefig("topics_dist.pdf", dpi=300)

## Statistics Table

In [None]:
# statistics["total"] accumulates, over all samples: the sample count, the
# number of annotated segments, the summed "duration" values, and the summed
# length of the annotated segments.
statistics = defaultdict(lambda: defaultdict(int))
for sample in data:
    totals = statistics["total"]
    totals["samples"] += 1
    segments = sample["annotation"]
    totals["segments"] += len(segments)
    totals["duration"] += sample["duration"]
    totals["seg_duration"] += sum(
        seg['segment'][1] - seg['segment'][0] for seg in segments
    )

In [None]:
# Display the accumulated totals.
statistics

In [None]:
import pandas as pd
# One row per top-level key of `statistics`, one column per counter.
df = pd.DataFrame.from_dict(data=statistics, orient="index")
df

In [None]:
# Hard-coded before/after figures; the duration is given as whole hours plus minutes.
samples_before, samples_after = 11827, 13547
hours_before, minutes_before = 476, 38
hours_after, minutes_after = 595, 22
segments_before, segments_after = 46354, 55482

print(f"increase {(samples_after - samples_before)/samples_before*100:.2f}% samples")
print(f"increase {(hours_after - hours_before + (minutes_after - minutes_before)/60)/(hours_before+minutes_before/60)*100:.2f}% duration")
print(f"increase {(segments_after - segments_before)/segments_before*100:.2f}% segments")

In [None]:
import sys

# Make the parent directory importable so the project's `src` package resolves.
# Guarded so re-running the cell does not add the entry again.
if ".." not in sys.path:
    sys.path.append("..")
from src.utils.timetool import time2str
from functools import partial
import pandas as pd

# Formatter for the "duration" column, using hour and minute units.
time_fmt = partial(time2str, units=["h", "min"])

# Format only while the column is still numeric, so re-running this cell does
# not feed already formatted values back into time2str.
# NOTE(review): assumes time2str returns a non-numeric value (e.g. a string) - confirm.
if pd.api.types.is_numeric_dtype(df["duration"]):
    df["duration"] = df["duration"].apply(time_fmt)
df

In [None]:
# Render the statistics table as LaTeX source and print it for copy-pasting.
latex_table = df.style.to_latex(
    caption="Statistics of the VTT dataset",
    hrules=True,
    position="ht",
    position_float="centering",
)
print(latex_table)