In [None]:
import polars as pl

eng_dict = pl.read_csv(
    "eng_dict.txt",
    separator="\t",
    has_header=False,
    skip_lines=18,
    infer_schema_length=10000000,
).drop("column_2")
eng_dict = eng_dict.with_columns()

column_1,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11
str,str,str,str,str,str,str,str,str,str
"""enwraps ""","""enwrap""","""V 3sg PRES""",,,,,,,
"""prolapse's ""","""prolapse""","""N 3sg GEN""",,,,,,,
"""antibiotic's ""","""antibiotic""","""N 3sg GEN""",,,,,,,
"""backdrop's ""","""backcloth""","""N 3sg GEN""",,,,,,,
"""bubals' ""","""bubal""","""N 3pl GEN""",,,,,,,
…,…,…,…,…,…,…,…,…,…
"""collapses ""","""collapse""","""N 3pl#collapse""","""V 3sg PRES""",,,,,,
"""militarises ""","""militarize""","""V 3sg PRES""",,,,,,,
"""open-eyed ""","""open-eyed""","""A""",,,,,,,
"""bellbottomses' ""","""bell-bottoms""","""N 3pl GEN""",,,,,,,


In [None]:
eng

In [None]:
sentence = "The quick brown fox jumps over the lazy dog"
# Split the sentence into words
words = sentence.split()
words = [word.lower() for word in words]
words

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [None]:
from collections import defaultdict, namedtuple

Rule = namedtuple("Rule", ["lhs", "rhs"])


class EarleyParser:
    def __init__(self, grammar, start_symbol):
        self.grammar = grammar
        self.start_symbol = start_symbol
        self.rules = self._build_rules(grammar)

    def _build_rules(self, grammar):
        rules = defaultdict(list)
        for lhs, productions in grammar.items():
            for prod in productions:
                rules[lhs].append(prod)
        return rules

    def parse(self, tokens):
        n = len(tokens)
        chart = [set() for _ in range(n + 1)]
        chart[0].add(("S'", ["•", self.start_symbol], 0))  # Start with S' → • S

        def closure(i):
            added = True
            while added:
                added = False
                for state in list(chart[i]):
                    dot_pos = state[1].index("•")
                    if dot_pos < len(state[1]) - 1:
                        next_sym = state[1][dot_pos + 1]
                        if next_sym in self.rules:
                            for prod in self.rules[next_sym]:
                                new_item = (next_sym, ["•"] + prod, i)
                                if new_item not in chart[i]:
                                    chart[i].add(new_item)
                                    added = True

        def scanner(i):
            for state in list(chart[i]):
                dot_pos = state[1].index("•")
                if dot_pos < len(state[1]) - 1:
                    next_sym = state[1][dot_pos + 1]
                    if i < n and next_sym == tokens[i]:
                        new_state = (
                            state[0],
                            state[1][:dot_pos]
                            + [next_sym, "•"]
                            + state[1][dot_pos + 2 :],
                            state[2],
                        )
                        chart[i + 1].add(new_state)

        def completer(i):
            for state in list(chart[i]):
                if state[1][-1] == "•":
                    for s in chart[state[2]]:
                        dot_pos = s[1].index("•")
                        if dot_pos < len(s[1]) - 1 and s[1][dot_pos + 1] == state[0]:
                            new_state = (
                                s[0],
                                s[1][:dot_pos] + [state[0], "•"] + s[1][dot_pos + 2 :],
                                s[2],
                            )
                            chart[i].add(new_state)

        for i in range(n + 1):
            closure(i)
            if i < n:
                scanner(i)
            completer(i)

        # Check acceptance
        final_state = ("S'", [self.start_symbol, "•"], 0)
        return final_state in chart[n], chart

In [12]:
# Example usage
grammar = {
    "S": [["NP", "VP"]],
    "NP": [["Det", "N"]],
    "VP": [["V", "NP"]],
    "Det": [["the"]],
    "N": [["dog"], ["cat"]],
    "V": [["chased"], ["saw"]],
}

parser = EarleyParser(grammar, "S")
words = ["the", "dog", "chased", "the", "cat"]
accepted, parse_chart = parser.parse(words)

print("Accepted:", accepted)
for i, state_set in enumerate(parse_chart):
    print(f"I{i}:")
    for item in state_set:
        print(" ", item)

TypeError: unhashable type: 'list'

In [None]:
class State(object):
    def __init__(
        self, label, rules, dot_idx, start_idx, end_idx, idx, made_from, producer
    ):
        self.label = label
        self.rules = rules
        self.dot_idx = dot_idx
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.idx = idx
        self.made_from = made_from
        self.producer = producer

    def next(self):
        """Returns the tag after the dot"""
        return self.rules[self.dot_idx]

    def complete(self):
        return len(self.rules) == self.dot_idx

    def __eq__(self, other):
        return (
            self.label == other.label
            and self.rules == other.rules
            and self.dot_idx == other.dot_idx
            and self.start_idx == other.start_idx
            and self.end_idx == other.end_idx
        )

    def __str__(self):
        rule_string = ""
        for i, rule in enumerate(self.rules):
            if i == self.dot_idx:
                rule_string += "\\bullet "
            rule_string += rule + " "
        if self.dot_idx == len(self.rules):
            rule_string += "\\bullet"
        return "S%d %s -> %s [%d, %d] %s %s" % (
            self.idx,
            self.label,
            rule_string,
            self.start_idx,
            self.end_idx,
            self.made_from,
            self.producer,
        )


class Earley:
    def __init__(self, words, grammar, terminals):
        self.chart = [[] for _ in range(len(words) + 1)]
        self.current_id = 0
        self.words = words
        self.grammar = grammar
        self.terminals = terminals

    def get_new_id(self):
        self.current_id += 1
        return self.current_id - 1

    def is_terminal(self, tag):
        return tag in self.terminals

    def is_complete(self, state):
        return len(state.rules) == state.dot_idx

    def enqueue(self, state, chart_entry):
        if state not in self.chart[chart_entry]:
            self.chart[chart_entry].append(state)
        else:
            self.current_id -= 1

    def predictor(self, state):
        for production in self.grammar[state.next()]:
            self.enqueue(
                State(
                    state.next(),
                    production,
                    0,
                    state.end_idx,
                    state.end_idx,
                    self.get_new_id(),
                    [],
                    "predictor",
                ),
                state.end_idx,
            )

    def scanner(self, state):
        if self.words[state.end_idx] in self.grammar[state.next()]:
            self.enqueue(
                State(
                    state.next(),
                    [self.words[state.end_idx]],
                    1,
                    state.end_idx,
                    state.end_idx + 1,
                    self.get_new_id(),
                    [],
                    "scanner",
                ),
                state.end_idx + 1,
            )

    def completer(self, state):
        for s in self.chart[state.start_idx]:
            if (
                not s.complete()
                and s.next() == state.label
                and s.end_idx == state.start_idx
                and s.label != "gamma"
            ):
                self.enqueue(
                    State(
                        s.label,
                        s.rules,
                        s.dot_idx + 1,
                        s.start_idx,
                        state.end_idx,
                        self.get_new_id(),
                        s.made_from + [state.idx],
                        "completer",
                    ),
                    state.end_idx,
                )

    def parse(self):
        self.enqueue(
            State("gamma", ["S"], 0, 0, 0, self.get_new_id(), [], "dummy start state"),
            0,
        )

        for i in range(len(self.words) + 1):
            for state in self.chart[i]:
                if not state.complete() and not self.is_terminal(state.next()):
                    self.predictor(state)
                elif (
                    i != len(self.words)
                    and not state.complete()
                    and self.is_terminal(state.next())
                ):
                    self.scanner(state)
                else:
                    self.completer(state)

    def __str__(self):
        res = ""

        for i, chart in enumerate(self.chart):
            res += "\nChart[%d]\n" % i
            for state in chart:
                res += str(state) + "\n"

        return res


def test():
    grammar = {
        "S": [["NP", "VP"], ["Aux", "NP", "VP"], ["VP"]],
        "NP": [["Det", "Nominal"], ["Proper-Noun"]],
        "Nominal": [["Noun"], ["Noun", "Nominal"]],
        "VP": [["Verb"], ["Verb", "NP"]],
        "Det": ["that", "this", "a"],
        "Noun": ["book", "flight", "meal", "money"],
        "Verb": ["book", "include", "prever"],
        "Aux": ["does"],
        "Prep": ["from", "to", "on"],
        "Proper-Noun": ["Houston", "TWA"],
    }
    terminals = ["Det", "Noun", "Verb", "Aux", "Prep", "Proper-Noun"]

    earley = Earley(["book", "that", "flight"], grammar, terminals)
    earley.parse()


test()