In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt


In [None]:
# Directory containing the files loaded by the cells below
# (text.csv, ids.csv, substring_bytes.tsv, matches.tsv, clusters.tsv).
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# here because later cells reference this variable.
dir = "./outputs/en"

### Substrings

In [None]:
# On-disk size of text.csv in bytes; the denominator of the duplicated-bytes
# ratio computed further down.
total_size = os.stat(os.path.join(dir, "text.csv")).st_size
total_size

In [None]:
# Tab-separated table; its `x` and `y` columns are used below to measure
# duplicated spans.
substr = pd.read_csv(
    os.path.join(dir, "substring_bytes.tsv"),
    delimiter='\t',
)

In [None]:
# Preview the first five rows of the substring table.
substr.head(n=5)

In [None]:
# Total span covered by the rows of `substr`: each row contributes (y - x).
# The reduction is done by pandas (vectorised) instead of the Python builtin
# `sum`, which would iterate the Series element by element. Bracket access is
# used so the column lookup cannot be confused with a DataFrame attribute.
duplicated = int((substr["y"] - substr["x"]).sum())

# Absolute amount, and its share of the size of text.csv.
duplicated, duplicated / total_size

In [None]:
# Load the two tab-separated result tables in one pass.
matches, clusters = (
    pd.read_csv(os.path.join(dir, filename), sep='\t')
    for filename in ("matches.tsv", "clusters.tsv")
)

In [None]:
# Preview the first five rows of the matches table.
matches.head(n=5)

In [None]:
# Preview the first five rows of the clusters table.
clusters.head(n=5)

In [None]:
# Each file is loaded as a one-column table with an explicitly supplied
# column name.
ids, text = (
    pd.read_csv(os.path.join(dir, filename), names=[column])
    for filename, column in (("ids.csv", "id"), ("text.csv", "text"))
)

# Row counts of the two tables, printed side by side for comparison.
print(ids.shape[0], text.shape[0])

In [None]:
# Preview the first five documents.
text.head(n=5)

In [None]:
# Keep only rows whose cluster label is not -1.
# NOTE(review): -1 presumably marks documents not assigned to any cluster — confirm.
clusters = clusters.loc[clusters["cluster"].ne(-1)]

### Cluster size distribution

In [None]:
# For every cluster size that occurs, count how many clusters have that size,
# ordered by size. The result has two columns: "size" and "count".
sizes = (
    clusters.groupby("cluster")
    .size()
    .value_counts()
    .sort_index()
    .rename_axis("size")
    .reset_index(name="count")
)
sizes.plot.bar(x="size", y="count", figsize=(20, 8))

### Document length distribution

In [None]:
# Attach the length of each document (as given by `len`) to the id table.
# `text` and `ids` are combined by index, i.e. row i of text.csv is taken to
# belong to row i of ids.csv.
# The length is computed column-wise with `map(len)` rather than a row-wise
# `DataFrame.apply(axis=1)`, which constructs a Series object for every row.
ids = ids.assign(
    length=text["text"].map(len)
)

In [None]:
# Lookup table: document id -> document length.
id2length = dict(zip(ids["id"].tolist(), ids["length"].tolist()))

In [None]:

# For every matched pair, record the length of the longer of the two documents.
# A single pass over the two id columns replaces the row-wise
# `DataFrame.apply(axis=1)`; an id missing from `id2length` still raises KeyError.
matches["length"] = [
    max(id2length[first_id], id2length[second_id])
    for first_id, second_id in zip(matches["id1"], matches["id2"])
]


In [None]:
# Stacked histogram of the per-match document length, coloured by the `diff`
# column of `matches`. seaborn and pyplot are imported in the notebook's
# import cell at the top.
fig, ax = plt.subplots(1, 1, figsize=(20, 8))

# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes" state.
g = sns.histplot(data=matches, stat="count", multiple="stack",
             x="length", kde=False,
             palette="pastel", hue="diff",
             element="bars", legend=True,
             ax=ax)

# Restrict the x-axis to lengths up to 200000; the trailing semicolon
# suppresses the return-value repr in the cell output.
g.set(xlim=(0, 200000));

In [None]:
# Histogram of the longest document length found in each cluster.
# Lengths are looked up once per row and reduced with a grouped `max`, instead
# of materialising every group as a list of record dicts inside
# `groupby.apply`. An id missing from `id2length` still raises KeyError.
max_length_per_cluster = (
    clusters["id"]
    .map(lambda doc_id: id2length[doc_id])
    .groupby(clusters["cluster"])
    .max()
)
max_length_per_cluster.plot.hist(bins=2000, figsize=(20, 8), xlim=(0, 200000))