# LAPA: Language Pattern Analyser

A Digital Tool for the Analysis of Patterns in Spelled Language Sounds in Historical Dutch Theatre Plays.

LAPA allows for converting digitised early modern Dutch theatre plays into (presumed) phonetic script (SAMPA). To achieve this, a ruleset has been created that codifies the transliteration to SAMPA. This codebase contains parsers for the rule sets (xls format), parsers for the digitised texts (naf xml) and logic to perform counts and correlations.

This notebook is a placeholder showing how to run a quick analysis of a text. More examples will be added shortly.

In [5]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from lapa_ng.factory import create_matcher
from lapa_ng.naf import parse_naf
from lapa_ng.text_clean import clean_words, default_cleaners
from lapa_ng.translator import CachedTranslator, MatchingTranslator

In [None]:

# Build the matcher from the rule-set spec string (presumably the "RULES"
# sheet of the .xls fixture -- confirm against create_matcher), then stack
# the two translator wrappers in a single expression: the matching
# translator on the inside, the cached translator around it.
matcher = create_matcher("ng:../fixtures/RULES_A_V1.5.xls#RULES")
translator = CachedTranslator(MatchingTranslator(matcher))

print(f"Loaded {matcher} matcher.")

In [None]:
# Parse the NAF fixture, run the default cleaners over it, and translate the
# cleaned words one result per word.
from collections import Counter

# NOTE: `input` shadows the builtin, but later cells read this name.
input = list(
    clean_words(parse_naf("../fixtures/vond001gysb04_01.xml"), default_cleaners)
)
translations = list(translator.translate(input, emit="word"))

print(f"Created {len(translations)} translations.")

# Tally each distinct (lower-cased spelling, space-joined SAMPA string) pair.
unique_translations = Counter(
    (item.word.text.lower(), " ".join(ph.sampa for ph in item.phonemes))
    for item in translations
)
print(f"Found {len(unique_translations)} unique word translations in the text.")

# Full ranking, most frequent first; the head and tail of this list are the
# most and least common entries respectively.
most_common_words = unique_translations.most_common()
print(f"The most common words are: {most_common_words[:10]}")
print(f"The least common words are: {most_common_words[-10:]}")


Above we looked at some basic statistics of the text. One slightly odd thing about the translate function is that it emits a result for each rule match in the document. But a rule can emit zero or more phonemes.

I don't think we really want to work with rule matches directly, except for tracing purposes, so we have the functions `coalesce_translations` and `explode_translations` to convert the list of rule matches into a list of words and a list of phonemes, respectively. Below we see how this can be used to extract all phonemes across the entire text.

In [None]:
# Translate the same cleaned input again, this time asking for one result
# per phoneme instead of one per word.
phoneme_translations = list(translator.translate(input, emit="phoneme"))

# Frequency of every phoneme string in the text.
phoneme_counts = Counter(item.phoneme_str() for item in phoneme_translations)

# Two-column table (phoneme, count), ordered from most to least frequent.
df = pd.DataFrame(
    list(phoneme_counts.items()),
    columns=['Phoneme', 'Count'],
).sort_values('Count', ascending=False)

# Pie chart of the phoneme distribution.
pie_options = {
    'values': 'Count',
    'names': 'Phoneme',
    'title': 'Distribution of Phonemes in the Text',
    'hover_data': ['Count'],
    'labels': {'Count': 'Frequency'},
}
fig = px.pie(df, **pie_options)

# Vertical legend placed just to the right of the plot area.
legend_settings = {
    'orientation': 'v',
    'yanchor': 'auto',
    'y': 1,
    'xanchor': 'left',
    'x': 1.05,
}
fig.update_layout(showlegend=True, legend=legend_settings)

fig.show()

It may be a bit easier to read as a bar chart:

In [None]:
# Share of each phoneme in the overall phoneme total, as a percentage.
total_count = df['Count'].sum()
share_pct = df['Count'] / total_count * 100

# Horizontal bars: one bar per phoneme, bar length = count.
fig = px.bar(
    df,
    x='Count',
    y='Phoneme',
    orientation='h',
    title='Distribution of Phonemes in the Text',
    labels={'Count': 'Frequency'},
)

# Tall figure so every phoneme label fits; wider left margin for the labels.
fig.update_layout(
    showlegend=False,
    height=800,
    margin={'l': 100, 'r': 20, 't': 50, 'b': 50},
)

# Order the bars by their total value.
fig.update_yaxes(categoryorder='total ascending')

# Hover text shows the phoneme, its count and its percentage share; the
# percentage is supplied per bar through customdata.
hover_parts = [
    "<b>%{y}</b><br>",
    "Count: %{x}<br>",
    "Percentage: %{customdata:.1f}%<br>",
    "<extra></extra>",
]
fig.update_traces(hovertemplate="".join(hover_parts), customdata=share_pct)

fig.show()

We can now combine the word and phoneme counts to see how different words contribute to the total phoneme counts.

In [None]:
# First we create an object that counts phonemes in each word by looping over each word and splitting the phonemes into a list
# Build a word x phoneme matrix: one row per word, one column per phoneme,
# each cell holding how often that phoneme occurs in the word's SAMPA string.
#
# The SAMPA strings are split with the no-argument form of str.split():
# unlike split(" "), it returns an empty list for an empty string and never
# yields empty tokens for doubled spaces, so a word without phonemes does not
# create a spurious '' phoneme column. Such a word contributes no counts and
# therefore gets no row in the matrix.
all_words = {
    word: Counter(phonemes.split())
    for word, phonemes in unique_translations
}

# Phonemes absent from a word come out as NaN; turn them into integer zeros.
df = pd.DataFrame.from_dict(all_words, orient='index').fillna(0).astype(int)

# Display copy with row totals (phonemes per word) and a final "Total" row
# (occurrences per phoneme across all distinct words).
df_phoneme_counts = df.copy()
df_phoneme_counts["Total"] = df_phoneme_counts.sum(axis=1)
df_phoneme_counts.loc["Total"] = df_phoneme_counts.sum(axis=0)
df_phoneme_counts

But the real contribution obviously comes from the words that are used most often, so let's multiply the whole matrix by the word frequencies.

In [None]:
# Occurrences per word. The keys of unique_translations are
# (word, sampa) pairs, so the same word can appear under more than one key;
# the counts are summed per word rather than letting the last pair overwrite
# the earlier ones.
word_frequencies = {}
for (word, _sampa), count in unique_translations.items():
    word_frequencies[word] = word_frequencies.get(word, 0) + count

# One row per word with a single "Count" column.
df_word_frequencies = pd.DataFrame.from_dict(word_frequencies, orient='index', columns=["Count"])
df_word_frequencies

In [None]:
# Scale every word's phoneme counts (rows of df) by how often the word occurs.
# Rows are aligned on the word index; anything without a match becomes NaN
# and is reset to an integer zero.
word_weights = df_word_frequencies['Count']
df_weighted = df.multiply(word_weights, axis='index').fillna(0).astype(int)

# Display copy with a final "Total" row: weighted occurrences per phoneme.
df_weighted_sums = df_weighted.copy()
df_weighted_sums.loc["Total"] = df_weighted.sum(axis=0)
df_weighted_sums


The following chart is a bit insane, so don't worry if it takes a long time to load.

In [None]:
# Stacked horizontal bar chart: for every phoneme, show which words contribute
# most to its weighted count.
top_n = 20
# Label of the bucket that collects every word outside a phoneme's top_n.
# It is used consistently below (instead of a separate hard-coded literal);
# the capitalised form cannot clash with the lower-cased word keys.
other_label = "Other"

# 1. For each phoneme (column) keep its top_n contributing words and fold the
#    remaining contributions into the single other_label bucket.
top_words_per_phoneme = {}
for phoneme in df_weighted.columns:
    word_contributions = df_weighted[phoneme]
    n_largest = word_contributions.nlargest(top_n).to_dict()
    n_largest[other_label] = word_contributions.sum() - sum(n_largest.values())
    top_words_per_phoneme[phoneme] = n_largest

# 2. Rows = phonemes, columns = words (plus the bucket); words that are not in
#    a phoneme's top list get 0.
df_stacked = pd.DataFrame(top_words_per_phoneme).fillna(0).astype(int).T

# 3. Draw the bucket first, followed by the words in alphabetical order.
#    The bucket is only missing when there are no phonemes at all.
word_columns = sorted(c for c in df_stacked.columns if c != other_label)
columns = ([other_label] if other_label in df_stacked.columns else []) + word_columns

# 4. One bar trace per word. The per-phoneme totals do not depend on the word,
#    so they are computed once outside the loop.
phoneme_totals = df_stacked.sum(axis=1)

traces = []
for word in columns:
    # Percentage contribution of this word to each phoneme's total
    percentages = df_stacked[word] / phoneme_totals * 100

    traces.append(go.Bar(
        y=df_stacked.index,  # phonemes
        x=df_stacked[word],  # word's contribution
        name=word,
        orientation='h',
        customdata=percentages,
        hovertemplate=(
            "<b>Phoneme: %{y}</b><br>" +
            "Word: " + word + "<br>" +
            "Count: %{x}<br>" +
            "Contribution: %{customdata:.1f}%<br>" +
            "<extra></extra>"
        )
    ))

fig = go.Figure(data=traces)

fig.update_layout(
    barmode='stack',
    title='Stacked Contribution of Words to Phonemes',
    xaxis_title='Total Weighted Phoneme Count',
    yaxis_title='Phoneme',
    height=800,
    margin=dict(l=120, r=40, t=50, b=50),
    legend_title_text='Word'
)

fig.show()


Another interesting diagnostic may be to group words that sound the same.

In [None]:
from collections import defaultdict

# Group spellings by their SAMPA string: sounds[sampa] -> list of words.
sounds = defaultdict(list)
for spelling, sampa in unique_translations:
    sounds[sampa].append(spelling)

# Keep only the SAMPA strings shared by at least two words, largest groups
# first.
multisounds = sorted(
    (
        {"phonemes": sampa, "words": spellings}
        for sampa, spellings in sounds.items()
        if len(spellings) >= 2
    ),
    key=lambda entry: len(entry["words"]),
    reverse=True,
)

multisounds
