In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import xml.etree.ElementTree as ET
import spacy
from spacy.lang.zh.examples import sentences

import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

import torch
from torch.nn.utils.rnn import pad_sequence # not from scratch
from torch.utils.data import TensorDataset, DataLoader

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split, Subset
import random
import os

from tqdm import tqdm


import matplotlib.pyplot as plt
from scipy.stats import mode

In [37]:
# RULE 1: Subject Verb Agreement
def check_subject_verb_agreement(tagged_tokens):
    """
    Checks for subject-verb agreement errors in a sentence.
    This function assumes that the input is a list of (word, POS) tuples.

    Rules:
    - Singular subjects (e.g., he, she, it, singular nouns) must pair with third-person singular verbs (VBZ).
    - Plural subjects (e.g., they, plural nouns) must pair with plural verb forms (VBP).
    - First-person singular (I) and second-person singular/plural (you) have specific verb forms.

    Parameters:
        tagged_tokens (list of tuples): List of (word, POS) tuples.

    Returns:
        int: Number of subject-verb agreement errors detected.
    """
    errors = 0
    singular_pronouns = {"he", "she", "it"}  # Singular pronouns
    plural_pronouns = {"we", "they"}         # Plural pronouns
    second_person_pronouns = {"you"}         # Second person (singular/plural)
    first_person_singular = {"i"}            # First person singular
    verb_dict = {"VB", "VBP", "VBZ", "VBD", "VBG", "VBN"}

    subject_found = None  # Keep track of the last subject found

    for i, (word, pos) in enumerate(tagged_tokens):
        # Identify subjects (pronouns or nouns)
        if pos == "PRP" or pos == "NN" or pos == "NNS":
            subject_found = (word.lower(), pos)

        # Check verbs only if a subject has been found
        if subject_found and pos in verb_dict:
            subj_word, subj_pos = subject_found

            # Handle singular pronouns
            if subj_pos == "PRP" and subj_word in singular_pronouns:
                if pos != "VBZ":
                    errors += 1

            # Handle plural pronouns
            elif subj_pos == "PRP" and subj_word in plural_pronouns:
                if pos != "VBP":
                    errors += 1

            # Handle second-person pronouns
            elif subj_pos == "PRP" and subj_word in second_person_pronouns:
                if pos not in {"VB", "VBP"}:
                    errors += 1

            # Handle first-person singular pronouns
            elif subj_pos == "PRP" and subj_word in first_person_singular:
                if pos != "VBP":
                    errors += 1

            # Handle singular nouns
            elif subj_pos == "NN":
                if pos != "VBZ":
                    errors += 1

            # Handle plural nouns
            elif subj_pos == "NNS":
                if pos != "VBP":
                    errors += 1

            # Reset subject_found for compound predicates
            subject_found = None

    return errors

In [4]:
# helper function to enforce RULE 2
# not all-encompassing
word_to_num = {
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
    "five": 5,
    "six": 6,
    "seven": 7,
    "eight": 8,
    "nine": 9,
    "ten": 10,
    "hundred": 100,
    "hundreds": 100,
    "thousand": 1000,
    "thousands": 1000,
    "million": 10e6,
    "millions": 10e6
}

def word_to_number(word):
    """
    Converts a word or digit string into an integer.

    Parameters:
        word (str): The input word or string.

    Returns:
        int: The corresponding integer value, or None if not valid.
    """
    if word.isdigit():  # Check if it's a numeric string
        return int(word)
    elif word.lower() in word_to_num:  # Check if it's a number word
        return word_to_num[word.lower()]
    return None

In [5]:
# RULE 2: Singular/Plural Noun Confusion
def check_singular_plural_confusion(tagged_tokens):
    """
    Checks for singular/plural noun confusion errors based on POS tags and context.
    Uses Penn Treebank POS tags to detect errors in singular/plural noun usage.

    Parameters:
        tagged_tokens (list of tuples): List of (word, POS) tuples.

    Returns:
        int: Number of singular/plural noun confusion errors detected.
    """
    errors = 0
    for i, (word, pos) in enumerate(tagged_tokens):
        number = word_to_number(word)
        # Check for singular determiners or numbers
        if pos in {"DT", "CD"} and (word.lower() in {"a", "an", "one", "each", "every", "this", "that"} or (number == 1)):
            # Look for the next noun
            for j in range(i + 1, len(tagged_tokens)):
                if tagged_tokens[j][1] in {"NN", "NNS"}:
                    if tagged_tokens[j][1] == "NNS":  # Plural noun found where singular expected
                        errors += 1
                    break

        # Check for plural determiners or numbers
        elif pos in {"DT", "CD", "JJ", "PDT"} and (word.lower() in {"many", "several", "these", "those", "all", "both"} or (number and number > 1)):
            # Look for the next noun
            for j in range(i + 1, len(tagged_tokens)):
                if tagged_tokens[j][1] in {"NN", "NNS"}:
                    if tagged_tokens[j][1] == "NN":  # Singular noun found where plural expected
                        errors += 1
                    break

    return errors

In [6]:
# RULE 3: Verb Tense Confusion
def check_verb_tense_confusion(tagged_tokens):
    """
    Checks for verb tense confusion errors based on temporal expressions and verb tenses.
    Uses Penn Treebank POS tags to detect mismatches between temporal context and verb forms.

    Parameters:
        tagged_tokens (list of tuples): List of (word, POS) tuples.

    Returns:
        int: Number of verb tense confusion errors detected.
    """
    errors = 0

    # Temporal keywords categorized by tense
    past_markers = {"yesterday", "last", "ago", "earlier"}
    present_markers = {"today", "now", "currently"}
    future_markers = {"tomorrow", "next", "later", "soon", "will"}
    verb_dict = {"VB", "VBD", "VBZ", "VBP", "MD", "VBG", "VBN"}

    skip_next = False # skip checking next token if it's part of an auxiliary phrase

    for i, (word, pos) in enumerate(tagged_tokens):
        if skip_next:
          skip_next = False
          continue

         # Check for temporal markers
        if word.lower() in past_markers:
            # Look for the nearest verb after the temporal marker
            for j in range(i + 1, len(tagged_tokens)):
                if tagged_tokens[j][1] in {"VB", "VBD", "VBZ", "VBP", "MD", "VBG", "VBN"}:
                    if tagged_tokens[j][1] in {"VBD", "VBN"}:  # Past tense is correct
                        if tagged_tokens[j][1] == "VBD" or tagged_tokens[j - 1][1] in {"was", "had"}:
                            skip_next = True  # Skip compound tense verbs
                        break
                    else:
                        errors += 1  # Past tense mismatch
                    break

        elif word.lower() in present_markers:
            # Look for the nearest verb after the temporal marker
            for j in range(i + 1, len(tagged_tokens)):
                if tagged_tokens[j][1] in {"VB", "VBD", "VBZ", "VBP", "MD", "VBG", "VBN"}:
                    if tagged_tokens[j][1] in {"VBZ", "VBP", "VBG"}:  # Present tense is correct
                        if tagged_tokens[j][1] == "VBG" or tagged_tokens[j - 1][1] in {"is", "has"}:
                            skip_next = True  # Skip compound tense verbs
                        break
                    else:
                        errors += 1  # Present tense mismatch
                    break

        elif word.lower() in future_markers:
            # Look for the nearest verb after the temporal marker
            for j in range(i + 1, len(tagged_tokens)):
                if tagged_tokens[j][1] in {"VB", "VBD", "VBZ", "VBP", "MD", "VBG", "VBN"}:
                    if tagged_tokens[j][1] == "MD":  # Future tense auxiliary found
                        skip_next = True  # Skip the next token (part of auxiliary/modal phrase)
                        break
                    elif tagged_tokens[j][1] != "MD":  # Future tense mismatch
                        errors += 1
                    break
    return errors

In [65]:
# RULE 4: Omitting/Inserting Articles
def check_articles(tagged_tokens):
  """
  Checks for errors related to omitting or inserting articles in sentences.
  Uses Penn Treebank POS tags to detect errors in article usage.

  Parameters:
      tagged_tokens (list of tuples): List of (word, POS) tuples.

  Returns:
      int: Number of article-related errors detected.
  """
  errors = 0
  articles = {"a", "an", "the"}
  # uncountable nouns are not comprehensive but common ones are included
  uncountable_nouns = {"homework", "air", "furniture", "information", "advice",
        "rice", "fear", "safety", "water", "beauty", "knowledge", "love",
        "research", "advice", "work", "bread", "traffic", "travel", "weather", "news",
        "soccer", "tennis", "basketball", "swimming", "baseball", "dinner"}
  # Add more as needed

  for i, (word, pos) in enumerate(tagged_tokens):
      # Check for missing articles before singular countable nouns (NN)
      if pos == "NN":
          if i > 0 and tagged_tokens[i - 1][1] == "DT":  # If an article exists
              # Check for redundant articles with uncountable nouns
              if word.lower() in uncountable_nouns and tagged_tokens[i - 1][0].lower() in {"a", "an"}:
                  errors += 1  # Redundant article for uncountable noun
              continue
          elif word.lower() not in uncountable_nouns:  # Singular countable nouns need articles
              # Check if the noun is preceded by a non-article determiner (e.g., "my", "this")
              if i == 0 or tagged_tokens[i - 1][1] not in {"DT", "PRP$", "JJ", "CD"}:
                  errors += 1  # Missing article

      # Check for redundant articles before uncountable nouns or plural nouns (NNS)
      if pos == "NNS" and i > 0 and tagged_tokens[i - 1][1] == "DT":
          prev_word = tagged_tokens[i - 1][0].lower()
          if prev_word in {"a", "an"}:  # Indefinite articles are invalid before plural nouns
              errors += 1  # Redundant or incorrect article
      # Check for conjunctions before proceeding to the next word
      if pos in {"NNS", "NN"} and i > 0 and tagged_tokens[i - 1][1] == "CC":
          continue  # Skip validation for conjunctions like "and"

      # Check for improper articles with proper nouns (NNP, NNPS)
      if pos in {"NNP", "NNPS"} and i > 0 and tagged_tokens[i - 1][1] == "DT":
          if word.lower() not in {"earth", "moon", "sun"}:  # Exceptions where articles are valid
              errors += 1  # Redundant article

  return errors

In [8]:
nlp_en = spacy.load("en_core_web_sm")

def get_ptb_tags(sentence):
  doc = nlp_en(sentence)
  return [(token.text, token.tag_) for token in doc]

In [118]:
def test_rule_fct(rules_fct, sentences):
  results = {}
  for sentence in sentences:
    tagged_tokens = get_ptb_tags(sentence)
    # print(f"Sentence: {sentence}")
    # print(f"Tagged Tokens: {tagged_tokens}")

    errors = rules_fct(tagged_tokens)
    results[sentence] = errors
    # print(f"{rules_fct.__name__} Errors detected: {errors}\n")

  return results

In [64]:
sentences_1 = [
  "He like cheese.",
  "The dog bark loudly.",
  "I am really happy that you were able to attend our function this evening.",
  "The bark of the tree is rough.",
  "I am making dinner in the kitchen. I am cooking the chicken and vegetables."
  ]

In [96]:
results = test_rule_fct(check_subject_verb_agreement, sentences_1)

Sentence: He like cheese.
Tagged Tokens: [('He', 'PRP'), ('like', 'VBP'), ('cheese', 'NN'), ('.', '.')]
check_subject_verb_agreement Errors detected: 1

Sentence: The dog bark loudly.
Tagged Tokens: [('The', 'DT'), ('dog', 'NN'), ('bark', 'NN'), ('loudly', 'RB'), ('.', '.')]
check_subject_verb_agreement Errors detected: 0

Sentence: I am really happy that you were able to attend our function this evening.
Tagged Tokens: [('I', 'PRP'), ('am', 'VBP'), ('really', 'RB'), ('happy', 'JJ'), ('that', 'IN'), ('you', 'PRP'), ('were', 'VBD'), ('able', 'JJ'), ('to', 'TO'), ('attend', 'VB'), ('our', 'PRP$'), ('function', 'NN'), ('this', 'DT'), ('evening', 'NN'), ('.', '.')]
check_subject_verb_agreement Errors detected: 1

Sentence: The bark of the tree is rough.
Tagged Tokens: [('The', 'DT'), ('bark', 'NN'), ('of', 'IN'), ('the', 'DT'), ('tree', 'NN'), ('is', 'VBZ'), ('rough', 'JJ'), ('.', '.')]
check_subject_verb_agreement Errors detected: 0

Sentence: I am making dinner in the kitchen. I am cooki

In [85]:
print(type(results))

<class 'dict'>


In [30]:
sentences_2 = [
    "We have three dog.",
    "One cats is on the roof.",
    "Many cat are sleeping.",
    "These tree are tall.",
    "Each dogs is friendly."
]

In [31]:
results_2 = test_rule_fct(check_singular_plural_confusion, sentences_2)

Sentence: We have three dog.
Tagged Tokens: [('We', 'PRP'), ('have', 'VBP'), ('three', 'CD'), ('dog', 'NN'), ('.', '.')]
Errors detected: 1

Sentence: One cats is on the roof.
Tagged Tokens: [('One', 'CD'), ('cats', 'NNS'), ('is', 'VBZ'), ('on', 'IN'), ('the', 'DT'), ('roof', 'NN'), ('.', '.')]
Errors detected: 1

Sentence: Many cat are sleeping.
Tagged Tokens: [('Many', 'JJ'), ('cat', 'NN'), ('are', 'VBP'), ('sleeping', 'VBG'), ('.', '.')]
Errors detected: 1

Sentence: These tree are tall.
Tagged Tokens: [('These', 'DT'), ('tree', 'NN'), ('are', 'VBP'), ('tall', 'JJ'), ('.', '.')]
Errors detected: 1

Sentence: Each dogs is friendly.
Tagged Tokens: [('Each', 'DT'), ('dogs', 'NNS'), ('is', 'VBZ'), ('friendly', 'JJ'), ('.', '.')]
Errors detected: 1



In [32]:
sentences_3 = [
    "Yesterday she is running to the park.",     # incorrect 1
    "Today he ran to the store.", # incorrect   1
    "Tomorrow she shall run a marathon.",# correct   0
    "Last week he runs every day.",     # incorrect 1
    "Next year they build a new house.",  # incorrect 1
    "Now she is writing a letter.", # correct 0
    "Now she was writing a letter." # incorrect 1
]

In [33]:
results_3 = test_rule_fct(check_verb_tense_confusion, sentences_3)

Sentence: Yesterday she is running to the park.
Tagged Tokens: [('Yesterday', 'NN'), ('she', 'PRP'), ('is', 'VBZ'), ('running', 'VBG'), ('to', 'IN'), ('the', 'DT'), ('park', 'NN'), ('.', '.')]
Errors detected: 1

Sentence: Today he ran to the store.
Tagged Tokens: [('Today', 'NN'), ('he', 'PRP'), ('ran', 'VBD'), ('to', 'IN'), ('the', 'DT'), ('store', 'NN'), ('.', '.')]
Errors detected: 1

Sentence: Tomorrow she shall run a marathon.
Tagged Tokens: [('Tomorrow', 'NN'), ('she', 'PRP'), ('shall', 'MD'), ('run', 'VB'), ('a', 'DT'), ('marathon', 'NN'), ('.', '.')]
Errors detected: 0

Sentence: Last week he runs every day.
Tagged Tokens: [('Last', 'JJ'), ('week', 'NN'), ('he', 'PRP'), ('runs', 'VBZ'), ('every', 'DT'), ('day', 'NN'), ('.', '.')]
Errors detected: 1

Sentence: Next year they build a new house.
Tagged Tokens: [('Next', 'JJ'), ('year', 'NN'), ('they', 'PRP'), ('build', 'VBP'), ('a', 'DT'), ('new', 'JJ'), ('house', 'NN'), ('.', '.')]
Errors detected: 1

Sentence: Now she is writin

In [91]:
sentences_4 = [
    "I went to store.",                       # Error: Missing "the".
    "He likes movie.",                        # Error: Missing "a".
    "The God blessed America.",               # Error: Redundant "The".
    "You gained the weight last month.",      # Error: Redundant "the".
    "She has a homework to finish.",          # Error: Redundant "a".
    "The Earth orbits the Sun.",              # Correct (exception case for proper nouns).
    "I want an information from you.",        # Error: Uncountable noun
    "I like to play soccer with my friends on the weekends.", # Correct
    "I am making dinner in the kitchen. I am cooking chicken and vegetables."
]

In [94]:
results_4 = test_rule_fct(check_articles, sentences_4)

Sentence: I went to store.
Tagged Tokens: [('I', 'PRP'), ('went', 'VBD'), ('to', 'IN'), ('store', 'NN'), ('.', '.')]
check_articles Errors detected: 1
Sentence: He likes movie.
Tagged Tokens: [('He', 'PRP'), ('likes', 'VBZ'), ('movie', 'NN'), ('.', '.')]
check_articles Errors detected: 1
Sentence: The God blessed America.
Tagged Tokens: [('The', 'DT'), ('God', 'NNP'), ('blessed', 'VBD'), ('America', 'NNP'), ('.', '.')]
check_articles Errors detected: 1
Sentence: You gained the weight last month.
Tagged Tokens: [('You', 'PRP'), ('gained', 'VBD'), ('the', 'DT'), ('weight', 'NN'), ('last', 'JJ'), ('month', 'NN'), ('.', '.')]
check_articles Errors detected: 0
Sentence: She has a homework to finish.
Tagged Tokens: [('She', 'PRP'), ('has', 'VBZ'), ('a', 'DT'), ('homework', 'NN'), ('to', 'TO'), ('finish', 'VB'), ('.', '.')]
check_articles Errors detected: 1
Sentence: The Earth orbits the Sun.
Tagged Tokens: [('The', 'DT'), ('Earth', 'NNP'), ('orbits', 'VBZ'), ('the', 'DT'), ('Sun', 'NNP'), ('

In [19]:
global_voices_sentences_df = pd.read_csv('/content/drive/My Drive/IW_Codebase/rule_based_flow/global_voices_en_sentences.csv')
tle_sentences_df = pd.read_csv('/content/drive/My Drive/IW_Codebase/rule_based_flow/tle_sentences.csv')
mac_sentences_df = pd.read_csv('/content/drive/My Drive/IW_Codebase/rule_based_flow/mac_en_sentences.csv')

# SYNTHETIC DATASET
sse_sentences_df = pd.read_csv('/content/drive/My Drive/IW_Codebase/rule_based_flow/sse_sentences.csv')
sle_sentences_df = pd.read_csv('/content/drive/My Drive/IW_Codebase/rule_based_flow/sle_sentences.csv')

sse_sentences_df = sse_sentences_df.drop(columns=["Sentence"])
sse_sentences_df = sse_sentences_df.rename(columns={"Sentence.1": "Sentence"})
sle_sentences_df = sle_sentences_df.rename(columns={"0": "Sentence"})

In [20]:
sse_sentences = sse_sentences_df['Sentence'].tolist()
sse_test = sse_sentences[:10]
sse_test

['I am eating a sandwich for lunch today.',
 'I went to the mall with my mom and bought a new toy.',
 'My friends and I like to hang out at the park on weekends.',
 'I eat my sandwich and fruit for lunch every day',
 'I love spending time with my friends at the park.',
 'I play soccer with my friends every Saturday.',
 'I am making dinner in the kitchen. I am cooking chicken and vegetables.',
 'I like to play soccer with my friends on the weekends.',
 'I like hanging out with my friends at the park on weekends.',
 'I went to the mall with my family and bought a new toy.']

In [88]:
rule_functions = [
    check_subject_verb_agreement,
    check_singular_plural_confusion,
    check_verb_tense_confusion,
    check_articles,
]

def cumulate(results):
    """
    Sums the errors across all rules for each sentence.

    Parameters:
        results (dict): A dictionary where keys are sentences and values are
                       dictionaries of rule names mapping to error counts.

    Returns:
        dict: A dictionary where each sentence maps to its total error count.
    """
    cumulative_errors = {}
    for sentence, rule_errors in results.items():
        cumulative_errors[sentence] = sum(rule_errors.values())
    return cumulative_errors

def check_all_rules(sentence):
    """
    Checks all rules for a list of sentences and sums error counts in parallel.

    Parameters:
        sentences (list of str): List of sentences to be checked.

    Returns:
        dict: A dictionary where each sentence maps to its total error count.
    """
    # Pass all sentences and all rule functions to test_rule_fct
    all_results = test_rule_fct(rule_functions, sentences)

    # Sum errors across all rules for each sentence
    cumulative_results = cumulate(all_results)

    return cumulative_results

In [22]:
nlp_en = spacy.load("en_core_web_sm")

def get_ptb_tags(sentence):
  doc = nlp_en(sentence)
  return [(token.text, token.tag_) for token in doc]

In [89]:
def check_sentences(sentences):
    """
    Checks multiple sentences for grammar errors using all rules.

    Parameters:
        sentences (list of str): List of sentences to be checked.

    Returns:
        dict: A dictionary where each sentence maps to its total error count.
    """
    print(f"Processing sentences: {sentences}")
    total_errors = check_all_rules(sentences)
    return total_errors

In [112]:
def sum_errors_for_sentence(sentence, dict_collection):
  total_errors = 0
  for item in dict_collection:
    if sentence in item:
      total_errors += item[sentence]
  return total_errors

In [120]:
def get_error_rate(sentences):

  dict_collection = []

  for rule in rule_functions:
    dict_collection.append(test_rule_fct(rule, sentences))

  total_errors = 0
  total_words = 0

  for sentence in sentences:
    errors_for_sentence = sum_errors_for_sentence(sentence, dict_collection)
    total_errors += errors_for_sentence

    # Count the number of words in the sentence
    num_words = len(sentence.split())
    total_words += num_words

  average_error_rate = total_errors / total_words if total_words > 0 else 0

  print(f"\nTotal Errors Across All Sentences: {total_errors}")
  print(f"Total Words Across All Sentences: {total_words}")
  print(f"Average Error Rate (errors per word): {average_error_rate:.4f}")

In [123]:
# GET ALL SENTENCES

global_voices_en_df = pd.read_csv('/content/drive/My Drive/IW_Codebase/rule_based_flow/global_voices_en_sentences.csv')
mac_en_sentences_df = pd.read_csv('/content/drive/My Drive/IW_Codebase/rule_based_flow/mac_en_sentences.csv')
tle_sentences_df = pd.read_csv('/content/drive/My Drive/IW_Codebase/rule_based_flow/tle_sentences.csv')

sse_df = pd.read_csv('/content/drive/My Drive/IW_Codebase/rule_based_flow/sse_sentences.csv')
sle_df = pd.read_csv('/content/drive/My Drive/IW_Codebase/rule_based_flow/sle_sentences.csv')

sse_df = sse_df.drop(columns=["Sentence"])
sse_df = sse_df.rename(columns={"Sentence.1": "Sentence"})
sle_df = sle_df.rename(columns={"0": "Sentence"})

In [137]:
global_voices_en_sentences = global_voices_en_df['sentence'].tolist()
mac_en_sentences = mac_en_sentences_df['sentence'].tolist()
tle_sentences = tle_sentences_df['sentence'].tolist()

sse_sentences = sse_df['Sentence'].tolist()
sle_sentences = sle_df['Sentence'].tolist()
# sle_sentences

In [139]:
sentence_list = [global_voices_en_sentences, mac_en_sentences, tle_sentences, sse_sentences, sle_sentences]

for item in sentence_list:
  get_error_rate(item)


Total Errors Across All Sentences: 5147
Total Words Across All Sentences: 35235
Average Error Rate (errors per word): 0.1461

Total Errors Across All Sentences: 5229
Total Words Across All Sentences: 32314
Average Error Rate (errors per word): 0.1618

Total Errors Across All Sentences: 3813
Total Words Across All Sentences: 28327
Average Error Rate (errors per word): 0.1346

Total Errors Across All Sentences: 2864
Total Words Across All Sentences: 23492
Average Error Rate (errors per word): 0.1219

Total Errors Across All Sentences: 4319
Total Words Across All Sentences: 19119
Average Error Rate (errors per word): 0.2259
