In [None]:
import jsonlines
from collections import defaultdict
import json
from pathlib import Path
import random
import matplotlib.pyplot as plt
import matplotlib.font_manager
import matplotlib.image as mpimg
from tqdm.notebook import tqdm

In [None]:
import matplotlib.pyplot as plt
import matplotlib.font_manager
import matplotlib.image as mpimg
print(f"available fonts: {sorted(f.name for f in matplotlib.font_manager.fontManager.ttflist)}")

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# deprecated the old names (later releases dropped them), so prefer the new
# name when this installation provides it and fall back to the legacy name on
# older installations.
if "seaborn-v0_8-muted" in plt.style.available:
    plt.style.use("seaborn-v0_8-muted")
else:
    plt.style.use("seaborn-muted")

# Notebook-wide figure defaults, applied in one call after the style sheet so
# they override whatever the style sets.
plt.rcParams.update({
    # Rendering / export
    "figure.dpi": 300,
    "savefig.dpi": 300,
    "savefig.format": "pdf",
    "savefig.bbox": "tight",
    "savefig.pad_inches": 0.1,
    # Fonts and titles
    "figure.titlesize": 18,
    "axes.titlesize": 18,
    "font.family": "Helvetica",
    "font.size": 18,
    # Lines, axes, ticks and legend
    "lines.linewidth": 2,
    "axes.labelsize": 16,
    "axes.labelweight": "bold",
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "legend.fontsize": 16,
    "axes.linewidth": 2,
    "axes.titlepad": 6,
    # Math text, markers and legend frame
    "mathtext.fontset": "dejavuserif",
    "mathtext.it": "serif:italic",
    "lines.marker": "",
    "legend.frameon": False,
})

In [None]:
def list2count(_list):
    """Count how often each value occurs in ``_list``.

    Args:
        _list: Iterable of hashable, mutually comparable values.

    Returns:
        A plain ``dict`` mapping each distinct value to its number of
        occurrences, with keys inserted in ascending order.
    """
    tally = {}
    for item in _list:
        tally[item] = tally.get(item, 0) + 1
    # Keys are unique, so sorting the (key, count) pairs orders by key.
    return dict(sorted(tally.items()))

def show_statistics(data):
    """Print summary counts for ``data`` and draw a two-panel step figure.

    Printed output: the number of samples, the number of samples per
    ``sample["ori"]`` value (as indented JSON), and the total number of steps
    and states, where a sample contributes ``len(sample["annotation"])`` steps
    and one more state than steps.

    Figure: the left panel plots the step count of every sample in input
    order; the right panel plots how many samples have each step count, for
    all samples and separately for the "cross", "var" and "coin" sources.

    Args:
        data: Sequence of dict-like samples providing the keys ``"ori"`` and
            ``"annotation"``.

    Returns:
        None. The figure is left as the current matplotlib figure.
    """
    print(f"Total samples: {len(data)}")

    # Samples per source, in first-seen order.
    source_count = defaultdict(int)
    for sample in data:
        source_count[sample["ori"]] += 1
    print(f"Source count: {json.dumps(source_count, indent=2)}")

    # One entry per sample: number of annotated steps.
    steps = [len(sample["annotation"]) for sample in data]
    total_steps = sum(steps)
    print(f"Total steps: {total_steps}, total states: {total_steps + len(steps)}")

    def steps_count_for(source):
        # Step-count histogram restricted to samples of one source.
        return list2count(
            [n_step for n_step, sample in zip(steps, data) if sample["ori"] == source]
        )

    # (legend label, line format, histogram) in drawing order.
    series = [
        ("total", "-", list2count(steps)),
        ("cross", "--", steps_count_for("cross")),
        ("var", ":", steps_count_for("var")),
        ("coin", "-.", steps_count_for("coin")),
    ]

    width, height = plt.figaspect(0.3)
    fig, (overview_ax, count_ax) = plt.subplots(1, 2, figsize=(width, height))

    overview_ax.plot(range(len(steps)), steps, linewidth=1)
    overview_ax.set_title("Step Overview", fontstyle='italic')
    overview_ax.set_xlabel("samples")
    overview_ax.set_ylabel("steps")

    for label, fmt, counts in series:
        count_ax.plot(counts.keys(), counts.values(), fmt, label=label)
    count_ax.set_title("Step Count", fontstyle="italic")
    count_ax.set_xlabel("steps")
    count_ax.set_ylabel("number of samples")
    count_ax.legend(loc='best', numpoints=1, fancybox=False)

    # Leave headroom above the panels for the figure-level title.
    fig.subplots_adjust(top=0.8)
    fig.suptitle("Statistics of VTT dataset")

In [None]:
# Read every record of ../data/vtt_non_overlap.jsonl into memory, report its
# statistics and save the resulting figure.
with jsonlines.open("../data/vtt_non_overlap.jsonl") as reader:
    data = [record for record in reader]
show_statistics(data)
plt.savefig("statistics-all.pdf")

In [None]:
# Same report for ../data/vtt.jsonl; the figure is only displayed, not saved.
with jsonlines.open("../data/vtt.jsonl") as reader:
    data = [record for record in reader]
show_statistics(data)

## Statistics of healthy samples (status == "success")

In [None]:
# Load the state-extraction report; `data` keeps every record because the
# missing-sample count below reads it as well.
with jsonlines.open("/data/reason/vtt/meta/report_extract_states.jsonl") as reader:
    data = [record for record in reader]
# Restrict the report to records whose status is "success".
success = list(filter(lambda record: record["status"] == "success", data))
show_statistics(success)

## Statistics of missing samples

In [None]:
# Number of records per source whose status is "missing".
missing = defaultdict(int)
for source in (sample["ori"] for sample in data if sample["status"] == "missing"):
    missing[source] += 1
print(dict(missing))

## Sentence statistics

In [None]:
import spacy

# Load the small English pipeline; English pipelines include a rule-based
# lemmatizer, which the mode printed below should confirm.
nlp = spacy.load("en_core_web_sm")
lemmatizer = nlp.get_pipe("lemmatizer")
print(lemmatizer.mode)  # expected: 'rule'

# Sanity check on a fixed sentence.
doc = nlp("I was reading the paper.")
lemmas = [token.lemma_ for token in doc]
print(lemmas)
# expected: ['I', 'be', 'read', 'the', 'paper', '.']

In [None]:
# Reload the state-extraction report, keeping only records whose status is
# "success". Note that this rebinds the notebook-level name `data`.
with jsonlines.open("/data/reason/vtt/meta/report_extract_states.jsonl") as reader:
    data = [sample for sample in reader if sample["status"] == "success"]

# source -> list with the whitespace-split word count of every step label
sentences = defaultdict(list)
# source -> token text -> number of occurrences
words = defaultdict(lambda: defaultdict(int))
# Tokens that are not counted as words.
punctuation = {",", "."}

for sample in tqdm(data):
    source = sample["ori"]
    for step in sample["annotation"]:
        label = step["label"]
        sentences[source].append(len(label.split()))
        # Only the token texts are read, so run just the tokenizer
        # (`make_doc`) rather than the whole pipeline; the tagger, parser and
        # NER results were computed and then thrown away for every label.
        for token in nlp.make_doc(label):
            text = token.text
            if text not in punctuation:
                words[source][text] += 1

# source -> {sentence length: number of labels with that length}
sentences_count = {
    source: list2count(lengths) for source, lengths in sentences.items()
}

In [None]:
# One line per source: number of step labels (y) for each label length (x).
fig, ax = plt.subplots()
for source, length_counts in sentences_count.items():
    ax.plot(length_counts.keys(), length_counts.values(), label=source)
ax.set_title("Sentences Length Count", fontstyle="italic")
ax.set_xlabel("length")
ax.set_ylabel("Count")
ax.legend(loc='best', numpoints=1, fancybox=False)
fig.savefig("statistics-sentences.pdf")

In [None]:
# `top_words` is only assigned by the word-frequency plotting cell that comes
# after this one, so a bare reference raises NameError on Restart & Run All.
# Look it up defensively: the value is displayed when it already exists and
# nothing is shown otherwise.
globals().get("top_words")

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# deprecated the old names (later releases dropped them), so prefer the new
# name when this installation provides it and fall back to the legacy name on
# older installations.
if "seaborn-v0_8-muted" in plt.style.available:
    plt.style.use("seaborn-v0_8-muted")
else:
    plt.style.use("seaborn-muted")

# Same values as the notebook-wide configuration cell except for
# figure.titlesize, axes.titlesize, xtick.labelsize, ytick.labelsize and
# axes.linewidth, which are reduced for this stacked bar figure. These are
# global settings and stay in effect for any cell run afterwards.
plt.rcParams.update({
    # Rendering / export
    "figure.dpi": 300,
    "savefig.dpi": 300,
    "savefig.format": "pdf",
    "savefig.bbox": "tight",
    "savefig.pad_inches": 0.1,
    # Fonts and titles
    "figure.titlesize": 8,
    "axes.titlesize": 6,
    "font.family": "Helvetica",
    "font.size": 18,
    # Lines, axes, ticks and legend
    "lines.linewidth": 2,
    "axes.labelsize": 16,
    "axes.labelweight": "bold",
    "xtick.labelsize": 3,
    "ytick.labelsize": 5,
    "legend.fontsize": 16,
    "axes.linewidth": 1,
    "axes.titlepad": 6,
    # Math text, markers and legend frame
    "mathtext.fontset": "dejavuserif",
    "mathtext.it": "serif:italic",
    "lines.marker": "",
    "legend.frameon": False,
})

# Tokens left out of the ranking.
ignored_tokens = {"the", "a", "and", " ", "is", "of", "are", "A"}

width, height = plt.figaspect(1)
n_subset = len(words)
# With the default squeeze=True a single source would yield a bare Axes and
# `axs[i]` would fail; request the 2-D array and take its only column so
# `axs` is a 1-D array of Axes for any number of sources.
fig, axs = plt.subplots(n_subset, 1, figsize=(width, height), squeeze=False)
axs = axs[:, 0]
for i, (key, val) in enumerate(words.items()):
    # The 20 most frequent remaining tokens of this source, most frequent first.
    top_words = sorted(
        (item for item in val.items() if item[0] not in ignored_tokens),
        key=lambda item: item[1], reverse=True)[:20]
    # zip(*[]) cannot be unpacked into two names, so leave the panel empty
    # when nothing survives the filter.
    if top_words:
        labels, counts = zip(*top_words)
        axs[i].bar(labels, counts)
    axs[i].set_title(key, fontstyle='italic')

fig.subplots_adjust(hspace=0.5)

fig.suptitle("Word frequencies")
# The explicit .png extension takes precedence over the "pdf" savefig.format.
fig.savefig("word_freq.png")