In [None]:
import os

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize

# Per-submission statistics, ordered from the most to the fewest
# occurrences of "that", with a fresh 0..n-1 index.
stats_path = "../pickled/annotations/submissions-basic_stats.pkl"
df = pd.read_pickle(stats_path)
df = df.sort_values(by='count_that', ascending=False)
df = df.reset_index(drop=True)

# Share of each text's tokens that are "that".
df['occurrence_ratio'] = df['count_that'].div(df['count_tokens'])

display(df)

In [None]:
# Gather every summary figure first, then report them in one go.
sentence_counts = df["count_sentences"]
token_counts = df["count_tokens"]

corpus_size_s = sentence_counts.sum()
corpus_size_t = token_counts.sum()

average_length_s = sentence_counts.mean()
average_length_t = token_counts.mean()

average_thats = df["count_that"].mean()
average_ratio = df['occurrence_ratio'].mean()

# Overall corpus size.
print("Total size of corpus:", corpus_size_t, "tokens in", corpus_size_s, "sentences")

# Mean text length, rounded to whole sentences / tokens.
print("Average length in sentences per text:", round(average_length_s))
print("Average length in tokens per text:", round(average_length_t))

# Mean "that" count per text (unrounded) and mean ratio as a percentage.
print("\nAverage number of 'that' occurrences per text:", average_thats)
print("Average occurrence ratio:", round(average_ratio * 100, 2), "%")


In [None]:
# Scatter of "that" count against text length; points are also coloured by
# their "that" count using the plasma colormap.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    df['count_tokens'],
    df['count_that'],
    marker='o',
    c=df['count_that'],
    cmap="plasma",
)
ax.set_title('Occurrences relative to text length')
ax.set_xlabel('Text length (in tokens)')
ax.set_ylabel('Occurrences of "that"')
ax.grid(True)
plt.show()


In [None]:
token_to_compare = "that"

# AITA submissions only, ordered by descending submission ID.
df_sorted = df.loc[df.project == "aita"].sort_values(by="id", ascending=False)
plt.figure(figsize=(10, 6))

bar_width = 0.6
bar_positions = range(len(df_sorted["id"]))

# Shade each bar by its absolute "that" count relative to the largest count
# in this subset. If that largest count is not positive, divide by 1 instead
# so the normalisation never divides by zero.
peak_count = df_sorted["count_that"].max()
colors = plt.cm.plasma(
    df_sorted["count_that"] / (peak_count if peak_count > 0 else 1)
)

# Bar height is the occurrence ratio expressed as a percentage.
plt.bar(bar_positions, df_sorted["occurrence_ratio"] * 100, width=bar_width, color=colors)
plt.xlabel("Submissions sorted by ID", fontsize=12)
plt.ylabel(f'Occurrence rate of "{token_to_compare}" (%)', fontsize=12)
plt.title(
    f'Frequency of "{token_to_compare}" across AITA submissions',
    fontsize=14,
    fontweight="bold",
)
# NOTE(review): bars above 3 % are clipped by this fixed limit -- presumably
# chosen so the per-project charts share one scale; confirm.
plt.ylim(0, 3)
plt.xticks(bar_positions, df_sorted["id"], rotation=90, ha="right", fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.tight_layout()

# savefig() does not create missing folders, so make sure the target exists.
os.makedirs("../img", exist_ok=True)
plt.savefig("../img/aita-frequency-that_plot.png", dpi=300, bbox_inches="tight")
plt.savefig("../img/aita-frequency-that_plot.svg", bbox_inches="tight")

plt.show()

In [None]:
token_to_compare = "that"

# TIFU submissions only, ordered by descending submission ID.
df_sorted = df.loc[df.project == "tifu"].sort_values(by="id", ascending=False)
plt.figure(figsize=(10, 6))

bar_width = 0.6
bar_positions = range(len(df_sorted["id"]))

# Shade each bar by its absolute "that" count relative to the largest count
# in this subset. If that largest count is not positive, divide by 1 instead
# so the normalisation never divides by zero.
peak_count = df_sorted["count_that"].max()
colors = plt.cm.plasma(
    df_sorted["count_that"] / (peak_count if peak_count > 0 else 1)
)

# Bar height is the occurrence ratio expressed as a percentage.
plt.bar(bar_positions, df_sorted["occurrence_ratio"] * 100, width=bar_width, color=colors)
plt.xlabel("Submissions sorted by ID", fontsize=12)
plt.ylabel(f'Occurrence rate of "{token_to_compare}" (%)', fontsize=12)
plt.title(
    f'Frequency of "{token_to_compare}" across TIFU submissions',
    fontsize=14,
    fontweight="bold",
)
# NOTE(review): bars above 3 % are clipped by this fixed limit -- presumably
# chosen so the per-project charts share one scale; confirm.
plt.ylim(0, 3)
plt.xticks(bar_positions, df_sorted["id"], rotation=90, ha="right", fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.tight_layout()

# savefig() does not create missing folders, so make sure the target exists.
os.makedirs("../img", exist_ok=True)
plt.savefig("../img/tifu-frequency-that_plot.png", dpi=300, bbox_inches="tight")
plt.savefig("../img/tifu-frequency-that_plot.svg", bbox_inches="tight")

plt.show()

## Thats

In [None]:
import glob

# AITA

# The part of each file name in front of this suffix is used as the post ID.
THATS_SUFFIX = "-thats.pkl"

# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined frame reproducible between runs and machines.
aita_files = sorted(glob.glob("../pickled/annotations/aita/*" + THATS_SUFFIX))

# The suffix is sliced off by length. str.rstrip() must not be used here: it
# removes a *set of characters*, not a suffix, and would also eat trailing ID
# characters such as "s", "k" or "t". basename() is used so the result does
# not depend on the folder depth or the path separator.
aita_thats_dfs = [
    pd.read_pickle(path).assign(
        post_id=os.path.basename(path)[: -len(THATS_SUFFIX)],
        project="aita",
    )
    for path in aita_files
]
aita_thats = pd.concat(aita_thats_dfs).reset_index(drop=True)
aita_thats

In [None]:
# TIFU

# The part of each file name in front of this suffix is used as the post ID.
THATS_SUFFIX = "-thats.pkl"

# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined frame reproducible between runs and machines.
tifu_files = sorted(glob.glob("../pickled/annotations/tifu/*" + THATS_SUFFIX))

# The suffix is sliced off by length. str.rstrip() must not be used here: it
# removes a *set of characters*, not a suffix, and would also eat trailing ID
# characters such as "s", "k" or "t". basename() is used so the result does
# not depend on the folder depth or the path separator.
tifu_thats_dfs = [
    pd.read_pickle(path).assign(
        post_id=os.path.basename(path)[: -len(THATS_SUFFIX)],
        project="tifu",
    )
    for path in tifu_files
]
tifu_thats = pd.concat(tifu_thats_dfs).reset_index(drop=True)
tifu_thats

In [None]:
# TOTAL

# Stack the AITA and TIFU rows into one frame with a fresh 0..n-1 index.
thats = pd.concat([aita_thats, tifu_thats], ignore_index=True)
thats

In [None]:
# Split the rows by the universal POS tag stored in the "upos" column.
upos_tags = thats["upos"]
conjunctions = thats[upos_tags.eq("SCONJ")]
pronouns = thats[upos_tags.eq("PRON")]
determiners = thats[upos_tags.eq("DET")]
adverbs = thats[upos_tags.eq("ADV")]

# Row count per part of speech.
print(f"""
    Used as a conjunction: {len(conjunctions)}
    Used as a pronoun: {len(pronouns)} 
    Used as a determiner: {len(determiners)}
    Used as an adverb: {len(adverbs)}
""")

In [None]:
# Neither to_csv() nor open() creates missing folders, so make sure both
# output locations exist before anything is written.
os.makedirs("../annotated/", exist_ok=True)
os.makedirs("../filtered/", exist_ok=True)

conjunctions.to_csv("../annotated/thats-conjunctions.csv", index=False)

# One distinct sentence per line. drop_duplicates() keeps the order of first
# appearance, so the file is identical on every run; iterating a set() of
# strings would shuffle the lines between interpreter sessions.
unique_sentences = conjunctions["sentence"].drop_duplicates()

# Explicit UTF-8 so the result does not depend on the platform's default
# encoding; write-only mode is enough because the file is never read back.
with open("../filtered/thats-conjunctions-aita-tifu-combined.txt", "w", encoding="utf-8") as file:
    for sent in unique_sentences:
        file.write(sent + "\n")


In [None]:
categories = ["SCONJ", "PRON", "DET", "ADV"]
counts = [len(conjunctions), len(pronouns), len(determiners), len(adverbs)]

plt.figure(figsize=(8, 6))

# Draw the bars first: the tick rotation and tight_layout() further down must
# act on the final categorical ticks, not on the placeholder ticks of an
# empty axes.
bars = plt.bar(categories, counts, color="darkblue")

# Print each count centred just above its bar.
for bar, count in zip(bars, counts):
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 0.5,
        str(count),
        ha="center",
        va="bottom",
    )

plt.title('"that" by function')
plt.xlabel("Part of speech")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.grid(axis="y", linestyle="--", alpha=0.7)

# Fit the layout last, once every artist that needs room is in place.
plt.tight_layout()

plt.show()

## Zeros

In [None]:
# The part of each file name in front of this suffix is used as the post ID.
ZEROS_SUFFIX = "-zeros.pkl"

# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined frame reproducible between runs and machines.
aita_files = sorted(glob.glob("../pickled/annotations/aita/*" + ZEROS_SUFFIX))

# The suffix is sliced off by length. str.rstrip() must not be used here: it
# removes a *set of characters*, not a suffix, and would also eat trailing ID
# characters such as "z", "e", "r", "o" or "s". basename() is used so the
# result does not depend on the folder depth or the path separator.
aita_zeros_dfs = [
    pd.read_pickle(path).assign(
        post_id=os.path.basename(path)[: -len(ZEROS_SUFFIX)],
        project="aita",
    )
    for path in aita_files
]
aita_zeros = pd.concat(aita_zeros_dfs).reset_index(drop=True)
aita_zeros

In [None]:
# The part of each file name in front of this suffix is used as the post ID.
ZEROS_SUFFIX = "-zeros.pkl"

# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined frame reproducible between runs and machines.
tifu_files = sorted(glob.glob("../pickled/annotations/tifu/*" + ZEROS_SUFFIX))

# The suffix is sliced off by length. str.rstrip() must not be used here: it
# removes a *set of characters*, not a suffix, and would also eat trailing ID
# characters such as "z", "e", "r", "o" or "s". basename() is used so the
# result does not depend on the folder depth or the path separator.
tifu_zeros_dfs = [
    pd.read_pickle(path).assign(
        post_id=os.path.basename(path)[: -len(ZEROS_SUFFIX)],
        project="tifu",
    )
    for path in tifu_files
]
tifu_zeros = pd.concat(tifu_zeros_dfs).reset_index(drop=True)
tifu_zeros

In [None]:
# Stack the AITA and TIFU rows into one frame with a fresh 0..n-1 index.
zeros = pd.concat([aita_zeros, tifu_zeros], ignore_index=True)
zeros

In [None]:
import os

# These two columns are left out of the exported CSV files.
dropped_cols = ["sentence_begin", "sentence_end"]

# Create the output folder if it is not there yet.
os.makedirs("../annotated/", exist_ok=True)

# Each frame is trimmed and then written straight away, without its index.
thats_export = thats.drop(dropped_cols, axis=1)
thats_export.to_csv("../annotated/thats.csv", index=False)

zeros_export = zeros.drop(dropped_cols, axis=1)
zeros_export.to_csv("../annotated/zeros.csv", index=False)