<a href="https://colab.research.google.com/github/iverinaivanova/complexity-factors/blob/main/retrieval-compl-factors/constituents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the en_core_web_trf pipeline so that spacy.load("en_core_web_trf") can find it.
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-curated-transformers<0.3.0,>=0.2.0 (from en-core-web-trf==3.7.3)
  Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_tokenizers-0.0.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.12.0->spacy-curated-transformers<0.3.0

In [None]:
import spacy

# Load spaCy's transformer-based English pipeline once. process_sentences
# relies on this module-level `nlp` object to parse every sentence.
nlp = spacy.load("en_core_web_trf")

  model.load_state_dict(torch.load(filelike, map_location=device))


In [None]:
def count_constituents(doc):
    """
    Return the total number of constituents found in a parsed sentence.

    Constituents are approximated token by token from the part-of-speech
    (``pos_``) and dependency (``dep_``) labels; no phrase structure is built.
    A token is counted as:

    - NP:   POS is NOUN, or dependency is nsubj / dobj / pobj / nsubjpass
    - VP:   POS is VERB and dependency is ROOT
    - AdjP: POS is ADJ and dependency is amod / xcomp
    - AdvP: POS is ADV and dependency is advmod / advcl
    - PP:   POS is ADP and dependency is prep

    Args:
        doc: iterable of tokens exposing ``pos_`` and ``dep_``.

    Returns:
        int: sum of the five per-category counts.
    """
    np_deps = {"nsubj", "dobj", "pobj", "nsubjpass"}
    adjp_deps = {"amod", "xcomp"}
    advp_deps = {"advmod", "advcl"}

    tally = {"NP": 0, "VP": 0, "AdjP": 0, "AdvP": 0, "PP": 0}

    for tok in doc:
        pos = tok.pos_

        # The NP rule is the only one that can fire on the dependency alone.
        if pos == "NOUN" or tok.dep_ in np_deps:
            tally["NP"] += 1

        # The remaining rules each require a different POS tag, so at most
        # one of them can match a given token.
        if pos == "VERB":
            if tok.dep_ == "ROOT":
                tally["VP"] += 1
        elif pos == "ADJ":
            if tok.dep_ in adjp_deps:
                tally["AdjP"] += 1
        elif pos == "ADV":
            if tok.dep_ in advp_deps:
                tally["AdvP"] += 1
        elif pos == "ADP":
            if tok.dep_ == "prep":
                tally["PP"] += 1

    return sum(tally.values())

def check_constituent_range(doc):
    """
    Flag sentences with a mid-sized constituent count.

    Returns 1 when ``count_constituents(doc)`` is strictly greater than 5 and
    strictly less than 10 (i.e. 6 to 9), and 0 for every other count.
    """
    n_constituents = count_constituents(doc)
    # Guard clause: anything at or outside the open interval (5, 10) is rejected.
    if n_constituents <= 5 or n_constituents >= 10:
        return 0
    return 1



In [None]:
def more_constituents(doc):
    """
    Return the total number of constituents found in a parsed sentence.

    This function applies exactly the same NP / VP / AdjP / AdvP / PP counting
    rules as ``count_constituents``. It delegates to that function so the rules
    are defined in one place and the two cannot drift apart.

    Args:
        doc: iterable of tokens exposing ``pos_`` and ``dep_``.

    Returns:
        int: the total constituent count, as computed by ``count_constituents``.
    """
    return count_constituents(doc)

def check_number_const(doc):
    """
    Flag sentences whose total constituent count is strictly greater than 10.

    The count is obtained from ``count_constituents``.

    Returns:
        int: 1 if the count is > 10, otherwise 0. A count of exactly 10
        yields 0 because the comparison is strict.
    """
    total_constituents = count_constituents(doc)
    return 1 if total_constituents > 10 else 0



In [None]:
def process_sentences(input_file, output_file, verbose=True):
    """
    Flag every non-empty line of ``input_file`` by constituent count and save the flags as TSV.

    Each line is stripped, parsed with the module-level ``nlp`` pipeline and
    passed to ``check_constituent_range`` and ``check_number_const``. The
    output file starts with a header row and has three tab-separated columns:
    the sentence, the ``5<Const<10`` flag and the ``Const>10`` flag (each 0 or 1).

    Args:
        input_file: path of a UTF-8 text file with one sentence per line.
        output_file: path of the TSV file to write; opened in "w" mode, so an
            existing file is overwritten.
        verbose: when True (the default), echo each sentence and its two flags
            to the console. Pass False to avoid flooding the notebook output
            on large inputs.
    """
    with open(input_file, "r", encoding="utf-8") as infile, \
         open(output_file, "w", encoding="utf-8") as outfile:
        # Header row naming the three columns.
        outfile.write("Sentence\t5<Const<10\tConst>10\n")

        for line in infile:
            # Only leading/trailing whitespace is removed; the sentence is
            # written verbatim, so an embedded tab would shift the columns.
            sentence = line.strip()

            # Blank lines produce no output row.
            if not sentence:
                continue

            # Parse the sentence with spaCy.
            doc = nlp(sentence)

            # Two independent 0/1 flags computed from the same parse.
            constituents = check_constituent_range(doc)
            more_const = check_number_const(doc)

            outfile.write(f"{sentence}\t{constituents}\t{more_const}\n")

            if verbose:
                print(f"Sentence: {sentence}\n")
                print(f"5<Const<10: {constituents}\n")
                print(f"Const>10: {more_const}\n")

    print(f"Results have been written to {output_file}")

# That-clauses: sentence file to read and TSV file to write.
input_file = "/finiteCCs_cleaned_final.txt"
output_file = "/constituents_that-cl.tsv"

# To-infinitives: uncomment the two lines below to override the paths above
# and process the to-infinitive file instead.
# input_file = "/all-nonfinCC-very-final-no-infm.txt"
# output_file = "/constituents_to-inf.tsv"
# Run the analysis on whichever input/output pair is active.
process_sentences(input_file, output_file)

[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m

Sentence: That they give those themes intellectual or emotional vibrancy , however ,.

5<Const<10: 1

Const>10: 0

Sentence: That the rebel who did more than any other to spread the idea that all men are created equal never freed any of his own slaves -- except for Sally 's -LRB- and perhaps some of his -RRB- relatives --.

5<Const<10: 0

Const>10: 1

Sentence: That Saddam Hussein is a pre - civilized lout , a creature from another age , surely.

5<Const<10: 1

Const>10: 0

Sentence: That the EEOC acted at all.

5<Const<10: 0

Const>10: 0

Sentence: That the Thomasons failed to take Washington by storm.

5<Const<10: 0

Const>10: 0

Sentence: That the MPEG-2 disk looked better than the fuzzy - faced VHS tape.

5<Const<10: 1

Const>10: 0

Sentence: That the excerpt looked sharper on the CD than it did on laser disk.

5<Const<10: 1

Const>10: 0

Sentence: That she is not elegible.

5<Const<10: 0

Const>10: 0

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): this cell comes after the processing cell, and the input/output
# paths used there do not point under /content/drive. Confirm whether the mount
# is still needed; if it is, it should run before the data is read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
