In [None]:
from collections import Counter
from pathlib import Path
import xml.etree.ElementTree as ET

import spacy
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
def load_articles(corpus_dir):
    """Parse every ``*.xml`` file below ``corpus_dir`` into article records.

    Each ``document`` element directly under a file's root becomes one dict
    with the keys ``newspaper``, ``publication_type``, ``distribution``,
    ``year`` (all taken verbatim from the ``metadata`` children) and ``text``.

    Parameters
    ----------
    corpus_dir : str or pathlib.Path
        Directory that is searched recursively for XML files.

    Returns
    -------
    list of dict
        One record per ``document`` element, ordered by file path and then
        by document order inside each file.

    Raises
    ------
    AttributeError
        If a document lacks one of the expected ``metadata`` children or the
        ``text`` element.
    xml.etree.ElementTree.ParseError
        If a file is not well-formed XML.
    """
    records = []
    # Sort the paths: directory traversal order is not guaranteed, and the
    # article order determines previews and the tie order of equal counts.
    for xml_path in sorted(Path(corpus_dir).rglob("*.xml")):
        xml_tree = ET.parse(str(xml_path))

        for doc in xml_tree.findall("document"):
            # itertext() also yields the whitespace between child elements;
            # dropping empty fragments avoids double spaces in the joined
            # text (which would otherwise show up as whitespace tokens).
            fragments = (fragment.strip() for fragment in doc.find("text").itertext())
            records.append(
                {
                    "newspaper": doc.find("metadata/source").text,
                    "publication_type": doc.find("metadata/publication_type").text,
                    "distribution": doc.find("metadata/distribution").text,
                    "year": doc.find("metadata/year").text,
                    "text": " ".join(fragment for fragment in fragments if fragment),
                }
            )
    return records


articles = load_articles("nexis_corpus")

articles[:3]

In [None]:
nlp = spacy.load("en_core_web_sm")

# Annotate every article text and store the resulting Doc under "tagged".
# nlp.pipe streams the texts through the pipeline in batches instead of
# invoking the pipeline once per article; the parser and NER components are
# disabled exactly as before.
for article, tagged in zip(
    articles,
    nlp.pipe((article["text"] for article in articles), disable=["parser", "ner"]),
):
    article["tagged"] = tagged

In [None]:
# Flatten the tagged articles into one corpus-wide token list, leaving out
# tokens that spaCy flags as whitespace.
tokens = []
for article in articles:
    tokens.extend(token for token in article["tagged"] if not token.is_space)
tokens[:10]

In [None]:
# Collect the lemma string of every corpus token, in corpus order.
lemmas = []
for token in tokens:
    lemmas.append(token.lemma_)
lemmas[:10]

In [None]:
# Frequency table of the lemmas as (lemma, count) pairs, most frequent first.
lemma_frequencies = Counter(lemmas)
counter = lemma_frequencies.most_common()

In [None]:
# Preview the 20 most frequent lemmas as (lemma, count) pairs.
counter[:20]

In [None]:
# Split the ranked (lemma, count) pairs into three parallel lists for
# plotting: the lemmas, their counts, and the 1-based rank as a string.
lemmas_sorted = [lemma for lemma, _ in counter]
counts = [count for _, count in counter]
rank = [str(position) for position in range(1, len(counter) + 1)]

In [None]:
# Line chart of the most frequent lemmas (rank/frequency curve).
# x, y and the hover text are all cut to the same length so the figure does
# not carry the rank labels of the entire vocabulary.
top_n = 40

fig = go.Figure(
    data=go.Scatter(
        x=lemmas_sorted[:top_n],
        y=counts[:top_n],
        text=rank[:top_n],
        hovertemplate="<i>lemma</i>: <b>%{x}</b><br><i>count</i>: %{y}<br><i>rank</i>: %{text}<extra></extra>",
    )
)

fig.update_layout(
    title="Zipf distribution",
    xaxis_title="Lemma (lemmatized by spaCy)",
    yaxis_title="Occurrences in corpus",
    template="simple_white"
)

fig.show()

In [None]:
# Same rank/frequency data shown side by side as a scatter and a bar chart.
top_n = 30
hover = "<i>lemma</i>: <b>%{x}</b><br><i>count</i>: %{y}<br><i>rank</i>: %{text}<extra></extra>"

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Scatter chart", "Bar chart"),
    x_title="Lemma (lemmatized by spaCy)",
    y_title="Occurrences in corpus",
)

# Both traces share identical data; only the trace type differs. The hover
# text is sliced to the same length as x and y.
for column, trace_type in enumerate((go.Scatter, go.Bar), start=1):
    fig.add_trace(
        trace_type(
            x=lemmas_sorted[:top_n],
            y=counts[:top_n],
            text=rank[:top_n],
            hovertemplate=hover,
        ),
        row=1,
        col=column
    )

fig.update_layout(
    title="Zipf distribution",
    template="seaborn",
    showlegend=False
)

# Nudge the generated title annotations so they do not collide with the
# plots. NOTE(review): indices 0/1 are presumably the two subplot titles and
# index 2 the shared x-axis title - confirm against fig.layout.annotations.
fig.layout.annotations[0]["yshift"] = 10
fig.layout.annotations[1]["yshift"] = 10
fig.layout.annotations[2]["yshift"] = -50

fig.show()

In [None]:
# Number of texts and tokens per newspaper. The three expected newspapers
# are pre-seeded so they appear (with zero counts) even without articles,
# and always in the same order.
text_token_distribution = {
    name: {"texts": 0, "tokens": 0}
    for name in ("Sunday Mail", "The Guardian", "The Times")
}

for article in articles:
    # setdefault keeps the cell from failing with a KeyError when the corpus
    # contains a newspaper that is not in the pre-seeded list.
    stats = text_token_distribution.setdefault(
        article["newspaper"], {"texts": 0, "tokens": 0}
    )
    stats["texts"] += 1
    # Whitespace tokens are excluded, matching the corpus-wide token list
    # built earlier in the notebook.
    stats["tokens"] += sum(1 for token in article["tagged"] if not token.is_space)

# Parallel tuples used by the plotting cells further down.
# NOTE(review): `tokens` rebinds the name of the corpus-wide token list
# defined earlier; the name is kept because later cells read it.
labels = tuple(text_token_distribution)
texts = tuple(values["texts"] for values in text_token_distribution.values())
tokens = tuple(values["tokens"] for values in text_token_distribution.values())

text_token_distribution

In [None]:
# Two donut charts side by side: share of texts (left) and share of tokens
# (right) per newspaper.
colors = ["lightsalmon", "lightsteelblue", "gold"]

fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{"type": "domain"}, {"type": "domain"}]]
)

# The two pies differ only in their values and target column.
for column, pie_values in enumerate((texts, tokens), start=1):
    fig.add_trace(
        go.Pie(
            labels=labels,
            values=pie_values,
            direction="counterclockwise",
            rotation=0,
            sort=True,
            hole=.3
        ),
        row=1,
        col=column
    )

# Shared look for both pies: label + percentage inside the sectors, the raw
# value on hover, fixed sector colors with a black outline.
fig.update_traces(
    textinfo="label+percent",
    textfont_size=15,
    hovertemplate="<b>%{value}</b><extra></extra>",
    marker=dict(colors=colors, line=dict(color='#000000', width=2.3))
)

# Captions placed in the donut holes.
hole_captions = [
    dict(text="Texts", x=0.195, y=0.5, font_size=17, showarrow=False),
    dict(text="Tokens", x=0.815, y=0.5, font_size=17, showarrow=False)
]

fig.update_layout(
    title="Text and token distribution - Pie chart",
    width=1000,
    height=500,
    showlegend=False,
    annotations=hole_captions
)

fig.show()

In [None]:
# Grouped bar chart: each newspaper's share of all texts and of all tokens.
# The totals are computed once; an empty corpus yields zero shares instead
# of a ZeroDivisionError.
text_total = sum(texts)
token_total = sum(tokens)
text_shares = [count / text_total if text_total else 0 for count in texts]
token_shares = [count / token_total if token_total else 0 for count in tokens]

fig = go.Figure(
    data=[
        go.Bar(
            name="Texts",
            x=labels,
            y=text_shares,
            marker_color="steelblue"
        ),
        go.Bar(
            name="Tokens",
            x=labels,
            y=token_shares,
            marker_color="salmon"
        )
    ]
)

fig.update_traces(
    marker_line_color="rgb(8,48,107)",
    marker_line_width=1.5, opacity=0.6
)

fig.update_layout(
    barmode="group",
    bargroupgap=0.05,
    title="Text and token distribution - Bar chart",
    xaxis_title="Newspaper",
    yaxis_title="Percentage",
    # The y values are fractions in [0, 1]; format the ticks as percentages
    # so they match the axis title.
    yaxis_tickformat=".0%",
    template="ggplot2"
)

fig.show()

In [None]:
# Per-newspaper statistics for one query word:
#   hits     - total number of tokens equal to the query
#   in_texts - number of articles with at least one hit
#   per_text - hit count of each article that has at least one hit
query_term = "London"

query_distribution = {
    name: {"hits": 0, "in_texts": 0, "per_text": []}
    for name in ("Sunday Mail", "The Guardian", "The Times")
}

for article in articles:
    # Every newspaper that occurs in the corpus gets an entry, so later
    # lookups by newspaper name cannot fail for sources outside the
    # pre-seeded list.
    stats = query_distribution.setdefault(
        article["newspaper"], {"hits": 0, "in_texts": 0, "per_text": []}
    )
    # Count in a single pass and without rebinding the notebook-level name
    # `tokens`, which the plotting cells read as the per-newspaper totals.
    hits_in_text = sum(1 for token in article["tagged"] if token.text == query_term)
    if hits_in_text:
        stats["hits"] += hits_in_text
        stats["in_texts"] += 1
        stats["per_text"].append(hits_in_text)

query_distribution

In [None]:
# Three pies comparing the newspapers for the query "London": raw hit
# counts, hits per million tokens, and number of texts containing a hit.
colors = ["lightsalmon", "lightsteelblue", "gold"]

fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{"type": "domain"}, {"type": "domain"}, {"type": "domain"}]],
    subplot_titles=(
        "Absolute frequency",
        "Relative frequency (instances per million words)",
        "Dispersion (texts with at least one hit)"
    ),
)

absolute_hits = [query_distribution[label]["hits"] for label in labels]
# A newspaper without any tokens gets a relative frequency of 0 instead of
# raising ZeroDivisionError.
relative_hits = [
    (query_distribution[label]["hits"] / text_token_distribution[label]["tokens"]) * 1000000
    if text_token_distribution[label]["tokens"]
    else 0
    for label in labels
]
texts_with_hit = [query_distribution[label]["in_texts"] for label in labels]

# The three pies differ only in their values and target column.
for column, pie_values in enumerate((absolute_hits, relative_hits, texts_with_hit), start=1):
    fig.add_trace(
        go.Pie(
            labels=labels,
            values=pie_values,
            direction="counterclockwise",
            rotation=0,
            sort=True
        ),
        row=1,
        col=column
    )

fig.update_traces(
    textinfo="label+percent",
    textfont_size=15,
    hovertemplate="<b>%{value}</b><extra></extra>",
    marker=dict(colors=colors, line=dict(color='#000000', width=2.3))
)

fig.update_layout(
    title="Results for query \"London\" - absolute vs. relative frequency vs. dispersion",
    showlegend=False
)

# NOTE(review): annotation 2 is presumably the third subplot title; it is
# shifted down by 30 px - confirm against fig.layout.annotations.
fig.layout.annotations[2]["yshift"] = -30

fig.show()

In [None]:
# One violin per newspaper showing how many "London" hits the individual
# texts contain (only texts with at least one hit were recorded).
# The previously assigned `colors` list was never used by this figure and
# has been dropped; the violins keep the template's default colors.
fig = go.Figure(
    data=[
        go.Violin(
            y=query_distribution[label]["per_text"],
            name=label,
            box_visible=True,
            meanline_visible=True
        )
        for label in labels
    ]
)

fig.update_layout(
    title="Results for query \"London\" - Dispersion (hits per text)",
    showlegend=False,
    yaxis_title="No. hits per text",
    template="ggplot2"
)

fig.show()