In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import defaultdict

In [None]:
import matplotlib.pyplot as plt
import matplotlib.font_manager
import matplotlib.image as mpimg
# List the font families matplotlib can see; check this output to confirm that
# the family requested below ('Helvetica') is actually installed.
print(f"available fonts: {sorted([f.name for f in matplotlib.font_manager.fontManager.ttflist])}")

# The bundled seaborn style sheets were renamed to "seaborn-v0_8-*" in
# matplotlib 3.6 and the old names were removed in later releases, so pick
# whichever spelling this installation provides (new name preferred).
for _style_name in ("seaborn-v0_8-muted", "seaborn-muted"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    print("warning: no seaborn 'muted' style available; using matplotlib defaults")

# Figure resolution and export defaults (applied after the style so they win).
plt.rcParams["figure.dpi"] = 150
plt.rcParams["savefig.dpi"] = 300
plt.rcParams["savefig.format"] = "pdf"
plt.rcParams["savefig.bbox"] = "tight"
plt.rcParams["savefig.pad_inches"] = 0.1

# Fonts and title sizes.
plt.rcParams['figure.titlesize'] = 18
plt.rcParams['axes.titlesize'] = 18
plt.rcParams['font.family'] = 'Helvetica'
plt.rcParams['font.size'] = 18

# Lines, axes, ticks and legend text.
plt.rcParams["lines.linewidth"] = 2
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 16
plt.rcParams['legend.fontsize'] = 16
plt.rcParams['axes.linewidth'] = 2
plt.rcParams['axes.titlepad'] = 6

# Math text rendering, default marker (none) and frameless legends.
plt.rcParams['mathtext.fontset'] = 'dejavuserif'
plt.rcParams['mathtext.it'] = 'serif:italic'
plt.rcParams['lines.marker'] = ""
plt.rcParams['legend.frameon'] = False

In [None]:
# Load the step corpus; every line of the file is treated as one step and is
# stripped of leading/trailing whitespace (including the newline).
# NOTE(review): the file is opened with the platform default encoding — confirm
# that matches how the corpus was written.
with open("/data/vtt/wikihow/wikihow_steps.txt") as corpus_file:
    steps = [raw_line.strip() for raw_line in corpus_file]
print(f"Total steps: {len(steps)}")

In [None]:
# Split every step on whitespace once, then derive both views from the tokens:
#   words_all - flat list of every token across all steps (in file order)
#   length    - number of tokens in each step, aligned with `steps`
tokenized_steps = [step_text.split() for step_text in steps]
words_all = [token for tokens in tokenized_steps for token in tokens]
length = [len(tokens) for tokens in tokenized_steps]


In [None]:
# Merge every token into one lowercase, space-separated string for WordCloud.
text = " ".join(words_all).lower()

# Build a 1000x500 cloud on a white background, filtering the library's
# built-in stopword list.
wordcloud = WordCloud(
    width=1000,
    height=500,
    stopwords=STOPWORDS,
    collocations=True,
    background_color="white",
).generate(text)

# Draw on an explicit figure/axes instead of the implicit pyplot "current"
# figure, so this image can never be drawn onto (or polluted by) another
# cell's plot. The interpolation name is spelled in its canonical lowercase form.
cloud_fig, cloud_ax = plt.subplots()
cloud_ax.imshow(wordcloud, interpolation="bilinear")
cloud_ax.axis("off")
# The explicit .png extension overrides the notebook-wide "pdf" savefig default.
cloud_fig.savefig("wikihow_wordcloud.png", dpi=300)


In [None]:
def list2count(_list):
    """Count how often each distinct item occurs in ``_list``.

    Args:
        _list: iterable of hashable, mutually comparable items.

    Returns:
        A plain ``dict`` mapping item -> number of occurrences, with keys
        inserted in ascending order.
    """
    tally = {}
    for item in _list:
        tally[item] = tally.get(item, 0) + 1
    # Sorting the (item, count) pairs orders by item, since items are unique.
    return dict(sorted(tally.items()))

In [None]:
# Percentiles are only meaningful on ordered data: `length` is in file order,
# so indexing it directly would report the length of an arbitrary step.
# Sort a copy (leaving `length` aligned with `steps`) and index into that.
length_sorted = sorted(length)
for ratio in [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999, 0.9999]:
    # Every ratio is < 1, so pos is always a valid index for non-empty data.
    pos = int(len(length_sorted) * ratio)
    print(f"{ratio * 100}% ({pos}) of the steps have {length_sorted[pos]} words or less")

In [None]:
# Print the full step-length histogram, one "<word count>: <number of steps>"
# line per distinct length, in ascending order of length.
length_histogram = list2count(length)
for n_words, n_steps in length_histogram.items():
    print(f"{n_words}: {n_steps}")

In [None]:
# Keep the shortest 99% of steps so the long tail does not stretch the x-axis.
# The lengths must be sorted before slicing: cutting the raw list would only
# drop the last 1% of steps in file order, regardless of how long they are.
length_trimmed = sorted(length)[:int(len(length) * 0.99)]
length_count = list2count(length_trimmed)

# Use a dedicated figure and pass concrete lists rather than dict views, which
# are not sequences (presumably not accepted by every matplotlib version).
dist_fig, dist_ax = plt.subplots()
dist_ax.plot(list(length_count.keys()), list(length_count.values()))
dist_ax.set_xlabel("length")
dist_ax.set_ylabel("Count")
dist_fig.savefig("wikihow_length_dist.pdf")