<a href="https://colab.research.google.com/github/iverinaivanova/complexity-factors/blob/main/%20complexity-factors/retrieval-compl-factors/supplements.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the en_core_web_trf model package so that spacy.load("en_core_web_trf") can find it.
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-curated-transformers<0.3.0,>=0.2.0 (from en-core-web-trf==3.7.3)
  Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_tokenizers-0.0.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.12.0->spacy-curated-transformers<0.3.0

In [None]:
import spacy

# Load the transformer-based English pipeline once and keep it in the
# module-level name `nlp`; the functions defined later in this notebook
# call `nlp(...)` directly rather than receiving the pipeline as an argument.
nlp = spacy.load("en_core_web_trf")

In [None]:
from spacy.tokens import Span
import re

# Matching (opener, closer) token pairs that can set off a parenthetical.
_DELIMITER_PAIRS = frozenset({
    (',', ','),
    ('—', '—'),    # em dash
    ('–', '–'),    # en dash
    ('--', '--'),  # ASCII double hyphen, common in pre-tokenized corpora
    ('(', ')'),
})

# Tokens whose presence makes it worth running the detectors at all.
# '_' is carried over from the original gate (it looks like a typo for a dash)
# so that no input that used to reach the fallback is now rejected early.
_SUPPLEMENT_PUNCTUATION = frozenset(
    {mark for pair in _DELIMITER_PAIRS for mark in pair} | {'_'}
)

# Pronouns accepted as the opener of a supplementary relative clause.
# "that" is deliberately not included: only who/whom/which are checked.
_RELATIVE_PRONOUNS = frozenset({"who", "whom", "which"})

# Upper bound on the word count of a comma-delimited span in the fallback.
_MAX_FALLBACK_WORDS = 5

# A comma, then 1-20 letters/whitespace, then a comma. The closing comma is a
# lookahead so it is NOT consumed: re.finditer yields non-overlapping matches,
# and consuming it would hide a second candidate that starts at that comma.
_SHORT_COMMA_SPAN_RE = re.compile(r',\s*[A-Za-z\s]{1,20}\s*(?=,)')


def _is_delimited(doc, left_idx, right_idx):
    """Return True if tokens left_idx..right_idx sit between a matching delimiter pair."""
    # A token must exist on both sides of the span to act as a delimiter.
    if left_idx <= 0 or right_idx >= len(doc) - 1:
        return False
    opener = doc[left_idx - 1].text.strip()
    closer = doc[right_idx + 1].text.strip()
    return (opener, closer) in _DELIMITER_PAIRS


def _opens_with_relative_pronoun(subtree):
    """Return True if the subtree starts with who/whom/which, optionally after an adposition."""
    if not subtree:
        return False
    if subtree[0].text.lower() in _RELATIVE_PRONOUNS:
        return True
    # Preposition + relative pronoun, e.g. "of which".
    return (
        len(subtree) > 1
        and subtree[0].pos_ == "ADP"
        and subtree[1].text.lower() in _RELATIVE_PRONOUNS
    )


def has_clausal_supplement(doc) -> int:
    """
    Check whether a parsed clause contains a clausal supplement (parenthetical).

    Detection runs in three stages:

    1. Gate: if the doc has no token that could delimit a parenthetical,
       return 0 immediately.
    2. Dependency-based: a token labelled ``relcl`` or ``parataxis`` whose
       subtree is enclosed by a matching delimiter pair (two commas, two
       dashes of the same kind, or a pair of parentheses). A ``relcl`` must
       additionally open with who/whom/which, optionally preceded by an
       adposition.
    3. Fallback: each short comma-delimited span of the raw text (at most
       20 letter/whitespace characters and 5 words) is re-parsed on its own
       with the module-level ``nlp`` pipeline and counts as a supplement if
       it contains a token tagged ``VERB``.

    Args:
        doc: A spaCy ``Doc`` for one clause (anything exposing the same token
            attributes: ``text``, ``dep_``, ``pos_``, ``left_edge``,
            ``right_edge``, ``subtree``).

    Returns:
        1 if a supplement is found, 0 otherwise.
    """
    if not any(token.text in _SUPPLEMENT_PUNCTUATION for token in doc):
        return 0

    # Dependency-based detection
    for token in doc:
        if token.dep_ not in ("relcl", "parataxis"):
            continue
        # left_edge/right_edge are the first/last tokens of token.subtree.
        if not _is_delimited(doc, token.left_edge.i, token.right_edge.i):
            continue
        if token.dep_ == "parataxis":
            return 1
        if _opens_with_relative_pronoun(list(token.subtree)):
            return 1

    # Fallback: text-based pattern matching for short parentheticals
    for match in _SHORT_COMMA_SPAN_RE.finditer(doc.text):
        span_text = match.group().strip(', ').strip()
        span_words = span_text.split()
        # Skip whitespace-only spans (nothing to parse) and spans that are too long.
        if not span_words or len(span_words) > _MAX_FALLBACK_WORDS:
            continue
        # A verb inside the short span suggests it is a clause.
        if any(tok.pos_ == "VERB" for tok in nlp(span_text)):
            return 1

    return 0

In [None]:
def process_sentences(input_file, output_file, verbose=True):
    """
    Label every clause in a text file for the presence of a clausal supplement.

    Reads ``input_file`` line by line (UTF-8, one clause per line), skips
    blank lines, parses each clause with the module-level spaCy pipeline
    ``nlp`` and classifies it with ``has_clausal_supplement``. The results are
    written to ``output_file`` as a tab-separated file with the header
    ``clause<TAB>supplement``, where ``supplement`` is 0 or 1.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the TSV file to create (overwritten if present).
        verbose: If True (the default), also print every clause and its label
            to the console. Pass False for large inputs, where per-clause
            printing floods the notebook output.
    """
    with open(input_file, "r", encoding="utf-8") as infile, \
         open(output_file, "w", encoding="utf-8") as outfile:

        # Write header to the output file
        outfile.write("clause\tsupplement\n")

        for line in infile:
            sentence = line.strip()

            # Skip empty lines
            if not sentence:
                continue

            # Parse the clause with spaCy and classify it
            doc = nlp(sentence)
            supplements = has_clausal_supplement(doc)

            # A tab inside the clause would add an extra column to this row;
            # replace it with a space in the written field only (the parse
            # above still sees the original text).
            clause_field = sentence.replace("\t", " ")
            outfile.write(f"{clause_field}\t{supplements}\n")

            if verbose:
                print(f"Clause: {sentence}")
                print(f"Clausal Supplement: {supplements}\n")

    print(f"Results have been written to {output_file}")

# Active run: that-clauses (input clause list -> TSV of supplement labels)
input_file, output_file = "/sample_supplements.txt", "/supplements_that-cl.tsv"

# Alternative run: to-infinitive clauses (swap in for the assignment above)
# input_file = "/all-nonfinCC-very-final-no-infm.txt"  # Replace with your input file path
# output_file = "/supplements_to-inf.tsv"  # Replace with your desired output file path
process_sentences(input_file=input_file, output_file=output_file)

[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m

Clause: To get five inches of rain in two hours.
Clausal Supplement: 0

Clause: That he can see this as a great hardship but not a martyrdom.
Clausal Supplement: 0

Clause: To make it into a Christian site with crosses raised high.
Clausal Supplement: 0

Clause: That the financial community is firmly supportive of our project.
Clausal Supplement: 0

Clause: To do away with a regulatory body that has been operational for the last 50 years and that even Europe.
Clausal Supplement: 0

Clause: To play Argentina.
Clausal Supplement: 0

Clause: To be able to go back to back like that.
Clausal Supplement: 0

Clause: To be number one in our group.
Clausal Supplement: 0

Clause: To see how cool and how calculated this guy really is at that age , how big a surprise.
Clausal Supplement: 0

Clause: To gain such a big advantage.
Clausal Supplement: 0

Clause: To look back at the responses that the murder evoked from t