In [2]:
import json
import pandas as pd
import numpy as np

import networkx as nx
from networkx.algorithms import bipartite
from matplotlib import pyplot as plt

import altair as alt
from collections import defaultdict, Counter

# Lift Altair's default cap on the number of rows a chart's dataset may have,
# so charts can be built from frames larger than that limit.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
# Input CSV locations, relative to the notebook's working directory:
# the per-token table and the per-category summary.
TOKENS_DATA, CATEGORY_STATS = (
    "./data/tokens_data.csv",
    "./data/category_stats.csv",
)

In [4]:
# Load the token table from the CSV at TOKENS_DATA and show it.
tokens_df = pd.read_csv(filepath_or_buffer=TOKENS_DATA)
tokens_df

Unnamed: 0,Category,Token,Part-of-Speech,Occurrences,Frequency
0,History,klan,NNP,188,0.001222
1,History,klan,JJ,188,0.001222
2,History,klan,VBP,188,0.001222
3,History,klan,FW,188,0.001222
4,History,klan,NN,188,0.001222
...,...,...,...,...,...
1322301,Citizenship,mrida,NN,1,0.000016
1322302,Citizenship,naples,NNS,1,0.000016
1322303,Citizenship,marrakesh,JJ,1,0.000016
1322304,Citizenship,suzhou,JJ,1,0.000016


In [32]:
# Export a small sample of the token table (every 10th of the first 100 rows)
# as a LaTeX table.
tex_df = tokens_df[:100:10].to_latex(
    index=False,
    caption="Dane każdego z tokenów.",
    label="tab:table_label"
)

# The caption contains non-ASCII characters, so pin the encoding instead of
# relying on the platform default (which is not UTF-8 everywhere).
with open("tokens_df2.tex", "w", encoding="utf-8") as tf:
    tf.write(tex_df)

  tex_df = tokens_df[:100:10].to_latex(


In [5]:
def get_core_pos(pos):
    """Map a fine-grained part-of-speech tag to a coarse word class.

    The tag is matched by its prefix (e.g. every tag starting with ``"NN"``
    becomes ``"Noun"``). Tags that match none of the known prefixes yield
    ``pd.NA``.
    """
    prefix_to_class = (
        ("NN", "Noun"),
        ("PR", "Pronoun"),
        ("VB", "Verb"),
        ("JJ", "Adjective"),
        ("RB", "Adverb"),
        ("IN", "Preposition"),
        ("CC", "Conjunction"),
        ("UH", "Interjection"),
    )
    for prefix, word_class in prefix_to_class:
        if pos.startswith(prefix):
            return word_class
    return pd.NA

# Replace the fine-grained tags with their coarse word class, then discard
# every row that has a missing value (this includes tags with no mapping).
core_pos = tokens_df["Part-of-Speech"].apply(get_core_pos)
tokens_df = tokens_df.assign(**{"Part-of-Speech": core_pos}).dropna()

found_pos = tokens_df["Part-of-Speech"].unique()
print("Parts of Speech:", ", ".join(found_pos))
tokens_df

Parts of Speech: Noun, Adjective, Verb, Adverb, Preposition, Conjunction, Pronoun, Interjection


Unnamed: 0,Category,Token,Part-of-Speech,Occurrences,Frequency
0,History,klan,Noun,188,0.001222
1,History,klan,Adjective,188,0.001222
2,History,klan,Verb,188,0.001222
4,History,klan,Noun,188,0.001222
5,History,klan,Verb,188,0.001222
...,...,...,...,...,...
1322301,Citizenship,mrida,Noun,1,0.000016
1322302,Citizenship,naples,Noun,1,0.000016
1322303,Citizenship,marrakesh,Adjective,1,0.000016
1322304,Citizenship,suzhou,Adjective,1,0.000016


In [52]:
# Export the token table as a LaTeX table.
# NOTE(review): this writes every row of tokens_df and targets the same file
# ("tokens_df2.tex") as the sampled export earlier in the notebook, so it
# overwrites that file — confirm this is intended.
tex_df = tokens_df.to_latex(
    index=False,
    caption="Dane każdego z tokenów.",
    label="tab:table_label"
)

# The caption contains non-ASCII characters, so pin the encoding instead of
# relying on the platform default (which is not UTF-8 everywhere).
with open("tokens_df2.tex", "w", encoding="utf-8") as tf:
    tf.write(tex_df)

  tex_df = tokens_df.to_latex(


In [6]:
# Load the per-category summary and order it from the largest to the
# smallest "Tokens" value.
cat_stats_df = (
    pd.read_csv(CATEGORY_STATS)
    .sort_values(by="Tokens", ascending=False)
)
cat_stats_df

Unnamed: 0,Category,Tokens,Files
3,Geography,220024,1084
1,People,184725,689
5,Science,154367,1122
0,History,153855,545
6,Everyday_life,100361,374
2,Countries,79565,229
10,Language_and_literature,65750,196
7,Design_and_Technology,63841,254
14,Citizenship,62802,224
12,Religion,54071,134


In [8]:
# Export the per-category summary as a LaTeX table.
cat_tex_df = cat_stats_df.to_latex(
    index=False,
    caption="Ilość unikalnych tokenów oraz plików dla kategorii.",
    label="tab:table_label"
)

# The caption contains non-ASCII characters, so pin the encoding instead of
# relying on the platform default (which is not UTF-8 everywhere).
with open("categories_df.tex", "w", encoding="utf-8") as tf:
    tf.write(cat_tex_df)

  cat_tex_df = cat_stats_df.to_latex(


In [11]:
# Colours for the token bars and the file bars (also used for axis titles).
tb_color = "#0e8acc"
fb_color = "#e6990b"

# Base chart shared by both bar layers: categories on the x axis.
category_x = alt.X(
    "Category:O",
    sort="-y",
    axis=alt.Axis(title="Category", labelAngle=-45),
)
cat_stats_chart = (
    alt.Chart(cat_stats_df)
    .encode(category_x)
    .properties(width=500, height=300)
)

# One bar layer per measure, each with its own colour-matched y axis title.
tokens_y = alt.Y("Tokens:Q", axis=alt.Axis(title="Tokens", titleColor=tb_color))
files_y = alt.Y("Files:Q", axis=alt.Axis(title="Files", titleColor=fb_color))
tokens_bar = cat_stats_chart.mark_bar(color=tb_color).encode(tokens_y)
files_bar = cat_stats_chart.mark_bar(color=fb_color).encode(files_y)

# Layer the two and give each measure an independent y scale.
stats_layer = alt.layer(tokens_bar, files_bar)
stats_layer.resolve_scale(y="independent")

In [12]:
# Pie charts of how the "Tokens" and "Files" totals split across categories.
# ``Chart.display()`` renders the chart and returns None, so the charts are
# built and bound to their names first and displayed in a separate step;
# otherwise both variables would end up holding None.
tokens_pie = alt.Chart(cat_stats_df).mark_arc().encode(
    theta=alt.Theta(field="Tokens", type="quantitative"),
    color=alt.Color(field="Category", type="nominal"),
).properties(
    title="Tokens per category"
)

files_pie = alt.Chart(cat_stats_df).mark_arc().encode(
    theta=alt.Theta(field="Files", type="quantitative"),
    color=alt.Color(field="Category", type="nominal"),
).properties(
    title="Files per category"
)

tokens_pie.display()
files_pie.display()

In [13]:
# Distinct category names, in the row order of cat_stats_df.
categories = cat_stats_df.loc[:, "Category"].unique()
categories

array(['Geography', 'People', 'Science', 'History', 'Everyday_life',
       'Countries', 'Language_and_literature', 'Design_and_Technology',
       'Citizenship', 'Religion', 'Music', 'Business_Studies', 'IT',
       'Art', 'Mathematics'], dtype=object)

In [22]:
pos_tags = tokens_df["Part-of-Speech"].unique()

# Count the rows of every (category, part of speech) pair in a single pass
# instead of re-scanning the whole frame once per pair.
# NOTE(review): this counts rows, not distinct tokens, so a token that is
# listed more than once under the same word class is counted each time —
# confirm this is the intended numerator.
pos_counts = tokens_df.groupby(["Category", "Part-of-Speech"]).size()

# Column-oriented result: a "Category" column plus one ratio column per
# part of speech, each ratio being row count / the category's "Tokens" total.
pos_ratio = defaultdict(list)
for cat in categories:
    pos_ratio["Category"].append(cat)
    total = cat_stats_df.loc[cat_stats_df["Category"] == cat, "Tokens"].to_numpy()[0]
    for pos in pos_tags:
        # Pairs that never occur are absent from the grouped counts -> 0.
        occurs = pos_counts.get((cat, pos), 0)
        ratio = occurs / total
        pos_ratio[pos].append(ratio)

In [24]:
# Keep the ratios numeric: later cells plot these columns as quantitative
# values, which formatted strings would undermine. Rounding is applied only
# to the displayed copy.
pos_df = pd.DataFrame(pos_ratio)
pos_df.round(4)

Unnamed: 0,Category,Noun,Adjective,Verb,Adverb,Preposition,Conjunction,Pronoun,Interjection
0,Geography,0.5965,0.2371,0.1455,0.0411,0.0101,0.0008,0.0004,0.0
1,People,0.5716,0.2335,0.1719,0.0472,0.0098,0.0011,0.0006,0.0
2,Science,0.5803,0.2304,0.1624,0.0479,0.0095,0.001,0.0003,0.0
3,History,0.5627,0.2445,0.1741,0.0458,0.0097,0.0009,0.0004,0.0
4,Everyday_life,0.5817,0.224,0.1803,0.0464,0.0103,0.0008,0.0007,0.0
5,Countries,0.5961,0.2405,0.1484,0.0401,0.0099,0.0008,0.0003,0.0
6,Language_and_literature,0.5881,0.234,0.1673,0.0481,0.0094,0.001,0.0005,0.0
7,Design_and_Technology,0.5792,0.2217,0.1864,0.0471,0.0109,0.0008,0.0005,0.0
8,Citizenship,0.5555,0.2473,0.1774,0.0439,0.0095,0.0006,0.0003,0.0
9,Religion,0.5685,0.2439,0.1731,0.0469,0.0089,0.0011,0.0007,0.0


In [25]:
# Export the four most common word classes per category as a LaTeX table.
# ``float_format`` prints numeric ratios with four decimals; it has no effect
# on columns that already hold text.
pos_text_df = pos_df[["Category", "Noun", "Adjective", "Verb", "Adverb"]].to_latex(
    index=False,
    float_format="{:,.4f}".format,
    caption="Procentowy udział części mowy według kategorii.",
    label="tab:categories_pos_ratio"
)

# The caption contains non-ASCII characters, so pin the encoding instead of
# relying on the platform default (which is not UTF-8 everywhere).
with open("pos_df.tex", "w", encoding="utf-8") as tf:
    tf.write(pos_text_df)

  pos_text_df = pos_df[["Category", "Noun", "Adjective", "Verb", "Adverb"]].to_latex(


In [49]:
# Noun ratio for the "People" category (row filter and column in one .loc).
pos_df.loc[pos_df["Category"] == "People", "Noun"]

1    0.571644
Name: Noun, dtype: float64

In [27]:
# Pie chart of the part-of-speech split for the selected categories.
for cat in categories:
    if cat not in ["Art", "Mathematics"]:
        continue
    # The single pos_df row for this category.
    cat_row = pos_df.loc[pos_df["Category"] == cat].iloc[0]
    data = pd.DataFrame({
        "Part-of-Speech": pos_tags,
        # Use plain floats: selecting a column from the filtered frame gives a
        # one-element Series, which is not a usable value for a chart field.
        "ratio": [float(cat_row[pos_tag]) for pos_tag in pos_tags],
    })
    alt.Chart(data).mark_arc().encode(
        theta=alt.Theta(field="ratio", type="quantitative"),
        color=alt.Color(field="Part-of-Speech", type="nominal"),
    ).properties(
        title=cat
    ).display()

In [25]:
# One bar chart per part of speech: its ratio in every category.
pos_charts = []
for pos in pos_tags:
    chart = alt.Chart(pos_df).mark_bar().encode(
        alt.X("Category:O", axis=alt.Axis(title="Category", labelAngle=-45)),
        alt.Y(f"{pos}:Q", axis=alt.Axis(title="PoS ratio"))
    ).properties(width=250, height=200, title=pos)
    pos_charts.append(chart)

# Lay the charts out in rows of four. Slicing instead of fixed indices keeps
# this working for any number of part-of-speech tags.
charts_per_row = 4
chart_rows = [
    alt.hconcat(*pos_charts[start:start + charts_per_row])
    for start in range(0, len(pos_charts), charts_per_row)
]
alt.vconcat(*chart_rows)

In [28]:
# Category name -> array of the distinct tokens that appear in that category.
unique_tokens_dict = {}
for category in categories:
    in_category = tokens_df["Category"] == category
    unique_tokens_dict[category] = tokens_df.loc[in_category, "Token"].unique()

In [29]:
# For every category, find the other category it shares the most tokens with.
# The stored ratio is the shared-token count divided by the category's
# "Tokens" total from cat_stats_df.
common_tokens_dict = {}
for cat1 in categories:
    total = cat_stats_df.loc[cat_stats_df["Category"] == cat1, "Tokens"].to_numpy()[0]
    cat1_unique = unique_tokens_dict[cat1]
    best_count = 0
    for cat2 in categories:
        if cat1 != cat2:
            common_tokens = np.intersect1d(
                cat1_unique, unique_tokens_dict[cat2], assume_unique=True
            )
            shared_count = len(common_tokens)
            # Strictly greater: on a tie the earlier category is kept.
            if shared_count > best_count:
                best_count = shared_count
                common_tokens_dict[cat1] = {
                    "cat": cat2,
                    "ratio": shared_count / total,
                    "tokens": common_tokens,
                }

# Same entries, ordered from the highest to the lowest ratio.
by_ratio_desc = sorted(
    common_tokens_dict.items(),
    key=lambda item: item[1]["ratio"],
    reverse=True,
)
sorted_common_tokens_dict = dict(by_ratio_desc)

In [30]:
# Pairwise overlap matrix. Entry [i, j] is the number of tokens shared by
# categories i and j divided by the number of distinct tokens of category j;
# the diagonal stays at 1.
n_cat = len(categories)
common_tokens_matrix = np.ones(shape=(n_cat, n_cat))
for i, cat_i in enumerate(categories):
    i_unique = unique_tokens_dict[cat_i]
    for j, cat_j in enumerate(categories):
        if i != j:
            j_unique = unique_tokens_dict[cat_j]
            common_tokens = np.intersect1d(i_unique, j_unique, assume_unique=True)
            common_tokens_matrix[i, j] = len(common_tokens) / len(j_unique)
        

In [31]:
# Report, per category, the category it overlaps with the most.
for cat, data in sorted_common_tokens_dict.items():
    print(f"{cat} have {data['ratio']*100:.2f}% of tokens in common with {data['cat']}")

# NOTE(review): the heatmap is titled "Tokens in common" but plots the
# column-wise correlation of the overlap matrix, not the overlap ratios
# themselves — confirm that .corr() is intended.
ctm = pd.DataFrame(common_tokens_matrix, columns=categories).corr()

# Capture the columns to fold BEFORE adding the helper "cat" column;
# otherwise "cat" itself is folded and shows up as a bogus heatmap row.
pivot_cols = list(ctm.columns)
ctm["cat"] = ctm.index

alt.Chart(ctm).mark_rect(tooltip=True) \
    .properties(title="Tokens in common") \
    .transform_fold(pivot_cols) \
    .encode(
        x=alt.X("cat:N", title=None),
        y=alt.Y("key:N", title=None),
        color=alt.Color("value:Q", scale=alt.Scale(scheme="redyellowblue"))
    )

Countries have 99.37% of tokens in common with Geography
Art have 56.39% of tokens in common with History
Business_Studies have 49.50% of tokens in common with Geography
Mathematics have 43.67% of tokens in common with People
Music have 41.01% of tokens in common with People
Religion have 39.66% of tokens in common with History
Language_and_literature have 38.76% of tokens in common with People
Citizenship have 38.48% of tokens in common with Geography
Design_and_Technology have 37.95% of tokens in common with Geography
IT have 36.12% of tokens in common with People
Geography have 35.93% of tokens in common with Countries
History have 34.26% of tokens in common with People
Everyday_life have 29.51% of tokens in common with Science
People have 28.54% of tokens in common with History
Science have 19.93% of tokens in common with Geography


In [37]:
# Dump the "History" row of the overlap matrix as JSON, followed by the mean
# of that row.
for i, cat in enumerate(categories):
    if cat == "History":
        shares = {
            categories[j]: value
            for j, value in enumerate(common_tokens_matrix[i])
        }
        print(cat)
        print(json.dumps(shares, indent=2, ensure_ascii=True))
        print("_"*50)
        print(np.array(list(shares.values())).mean())


History
{
  "Geography": 0.17679502469829694,
  "People": 0.287003010654341,
  "Science": 0.16735549292841198,
  "History": 1.0,
  "Everyday_life": 0.24890523383403645,
  "Countries": 0.29183887561774036,
  "Language_and_literature": 0.3184709480122324,
  "Design_and_Technology": 0.3576891602669017,
  "Citizenship": 0.35250371839365396,
  "Religion": 0.3986875859761312,
  "Music": 0.369477274530023,
  "Business_Studies": 0.44337049949946494,
  "IT": 0.353517958810712,
  "Art": 0.5667281489855417,
  "Mathematics": 0.4138008205893323
}
__________________________________________________
0.3830762501864547
