###Extracting Nominalized Adjectives from TAYN

In [2]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import os

# Download NLTK data files (run this once)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Define stop words and determiners to help identify nominalized adjectives
stop_words = set(stopwords.words("english"))
determiners = {"the", "this", "that", "these", "those"}

# Function to extract nominalized adjectives from text
def extract_nominalized_adjectives(text):
    sentences = nltk.sent_tokenize(text)
    nominalized_adjectives = []
    
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        tagged = pos_tag(tokens)
        
        for i in range(len(tagged) - 1):
            word, pos = tagged[i]
            next_word, next_pos = tagged[i + 1]
            
            # Check for a determiner followed by an adjective (likely nominalized)
            if word in determiners and next_pos == "JJ":
                nominalized_adjectives.append(next_word)
    
    return nominalized_adjectives

# File path
file_path = r"C:/Users/amusa/code/learning/TXT files/TAYN.txt"

# Ensure the file exists
if os.path.exists(file_path):
    # Read the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Extract nominalized adjectives
    nominalized_adjectives = extract_nominalized_adjectives(text)
    
    print("Nominalized Adjectives Found:")
    print(nominalized_adjectives)
else:
    print(f"File not found: {file_path}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Nominalized Adjectives Found:
['american', 'unpainted', 'big', 'american', 'average', 'cramped', 'next', 'long', 'last', 'bright', 'other', 'tiny', 'stained', 'small', 'public', 'lumpy', 'american', 'next', 'secondary', 'surprising', 'rich', 'large', 'rich', 'many', 'daily', 'same', 'superior', 'next', 'same', 'third', 'silly', 'fourth', 'only', 'bright', 'only', 'following', 'fifth', 'first', 'next', 'following', 'rust-eaten', 'big', 'other', 'african', 'ghanaian', 'white', 'chinese', 'old', 'calm', 'poor', 'real', 'poor', 'nasty', 'nice', 'old', 'black', 'black', 'black', 'black', 'white', 'big', 'crisp', 'misspelled']


In [4]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import os

# Download NLTK data files (run this once)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Define stop words and determiners to help identify nominalized adjectives
stop_words = set(stopwords.words("english"))
determiners = {"the", "this", "that", "these", "those"}

# Function to extract nominalized adjectives with context
def extract_nominalized_adjectives_with_context(text):
    sentences = nltk.sent_tokenize(text)
    nominalized_adjectives_with_context = []
    
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        tagged = pos_tag(tokens)
        
        for i in range(len(tagged) - 1):
            word, pos = tagged[i]
            next_word, next_pos = tagged[i + 1]
            
            # Check for a determiner followed by an adjective (likely nominalized)
            if word in determiners and next_pos == "JJ":
                start_index = max(0, i - 2)  # Two words before
                end_index = min(len(tokens), i + 3)  # Two words after
                context = " ".join(tokens[start_index:end_index])
                nominalized_adjectives_with_context.append((next_word, context))
    
    return nominalized_adjectives_with_context

# File path
file_path = r"C:/Users/amusa/code/learning/TXT files/TAYN.txt"

# Ensure the file exists
if os.path.exists(file_path):
    # Read the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Extract nominalized adjectives with context
    nominalized_adjectives_with_context = extract_nominalized_adjectives_with_context(text)
    
    print("Nominalized Adjectives with Context:")
    for adjective, context in nominalized_adjectives_with_context:
        print(f"Adjective: {adjective}, Context: {context}")
else:
    print(f"File not found: {file_path}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Nominalized Adjectives with Context:
Adjective: american, Context: you won the american visa
Adjective: unpainted, Context: leaning against the unpainted walls
Adjective: big, Context: comparison to the big car
Adjective: american, Context: members for the american visa
Adjective: average, Context: more than the average salary
Adjective: cramped, Context: came into the cramped basement
Adjective: next, Context: , and the next morning
Adjective: long, Context: , walking the long windy
Adjective: last, Context: it was the last stop
Adjective: bright, Context: restaurant with the bright ,
Adjective: other, Context: less than the other waitresses
Adjective: tiny, Context: rent for the tiny room
Adjective: stained, Context: room with the stained carpet
Adjective: small, Context: besides , the small connecticut
Adjective: public, Context: went to the public library
Adjective: lumpy, Context: sat on the lumpy mattress
Adjective: american, Context: you won the american visa
Adjective: next, Co

In [5]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import os

# Download NLTK data files (run this once)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Define stop words and determiners to help identify nominalized adjectives
stop_words = set(stopwords.words("english"))
determiners = {"the", "this", "that", "these", "those"}

# Function to extract nominalized adjectives with context
def extract_nominalized_adjectives_with_context(text):
    sentences = nltk.sent_tokenize(text)
    nominalized_adjectives_with_context = []
    
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        tagged = pos_tag(tokens)
        
        for i in range(len(tagged) - 2):  # Ensure there's a word after the adjective
            word, pos = tagged[i]
            next_word, next_pos = tagged[i + 1]
            following_word, following_pos = tagged[i + 2]
            
            # Check for a determiner followed by an adjective, where the next word is not a noun
            if word in determiners and next_pos == "JJ" and not following_pos.startswith("NN"):
                start_index = max(0, i - 2)  # Two words before
                end_index = min(len(tokens), i + 3)  # Two words after
                context = " ".join(tokens[start_index:end_index])
                nominalized_adjectives_with_context.append((next_word, context))
    
    return nominalized_adjectives_with_context

# File path
file_path = r"C:/Users/amusa/code/learning/TXT files/TAYN.txt"

# Ensure the file exists
if os.path.exists(file_path):
    # Read the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Extract nominalized adjectives with context
    nominalized_adjectives_with_context = extract_nominalized_adjectives_with_context(text)
    
    print("Nominalized Adjectives with Context (No Noun Following):")
    for adjective, context in nominalized_adjectives_with_context:
        print(f"Adjective: {adjective}, Context: {context}")
else:
    print(f"File not found: {file_path}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Nominalized Adjectives with Context (No Noun Following):
Adjective: average, Context: more than the average salary
Adjective: long, Context: , walking the long windy
Adjective: bright, Context: restaurant with the bright ,
Adjective: many, Context: fat and that many did
Adjective: following, Context: said no the following four
Adjective: following, Context: rooted for the following ,
Adjective: poor, Context: not like the poor fat
Adjective: old, Context: the old white
Adjective: big, Context: the buildings that big around


In [6]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import os

# Download NLTK data files (run this once)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Define stop words and determiners to help identify nominalized adjectives
stop_words = set(stopwords.words("english"))
determiners = {"the", "this", "that", "these", "those"}

# Function to extract nominalized adjectives with context
def extract_nominalized_adjectives_with_context(text):
    sentences = nltk.sent_tokenize(text)
    nominalized_adjectives_with_context = []
    
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        tagged = pos_tag(tokens)
        
        for i in range(len(tagged) - 2):  # Ensure there's a word after the adjective
            word, pos = tagged[i]
            next_word, next_pos = tagged[i + 1]
            following_word, following_pos = tagged[i + 2]
            
            # Check for a determiner followed by an adjective, where the next word is not a noun
            if word in determiners and next_pos == "JJ" and not following_pos.startswith("NN"):
                start_index = max(0, i - 3)  # Three words before
                end_index = min(len(tokens), i + 4)  # Three words after
                context = " ".join(tokens[start_index:end_index])
                nominalized_adjectives_with_context.append((next_word, context))
    
    return nominalized_adjectives_with_context

# File path
file_path = r"C:\Users\amusa\code\learning\TXT files\TAYN.txt"

# Ensure the file exists
if os.path.exists(file_path):
    # Read the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Extract nominalized adjectives with context
    nominalized_adjectives_with_context = extract_nominalized_adjectives_with_context(text)
    
    print("Nominalized Adjectives with Context (No Noun Following):")
    for adjective, context in nominalized_adjectives_with_context:
        print(f"Adjective: {adjective}, Context: {context}")
else:
    print(f"File not found: {file_path}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Nominalized Adjectives with Context (No Noun Following):
Adjective: average, Context: thousand more than the average salary plus
Adjective: long, Context: left , walking the long windy road
Adjective: bright, Context: the restaurant with the bright , clean
Adjective: many, Context: were fat and that many did not
Adjective: following, Context: you said no the following four days
Adjective: following, Context: you rooted for the following , in
Adjective: poor, Context: was not like the poor fat people
Adjective: old, Context: the old white men
Adjective: big, Context: because the buildings that big around your


In [7]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import os

# Download NLTK data files (run this once)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Function to extract adjectives preceded only by "the" and include their context
def extract_adjectives_with_the_context(text):
    sentences = nltk.sent_tokenize(text)
    adjectives_with_context = []
    
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        tagged = pos_tag(tokens)
        
        for i in range(len(tagged) - 2):  # Ensure there's a word after the adjective
            word, pos = tagged[i]
            next_word, next_pos = tagged[i + 1]
            following_word, following_pos = tagged[i + 2]
            
            # Check for "the" followed by an adjective, where the next word is not a noun
            if word == "the" and next_pos == "JJ" and not following_pos.startswith("NN"):
                start_index = max(0, i - 3)  # Three words before "the"
                end_index = min(len(tokens), i + 4)  # Three words after the adjective
                context = " ".join(tokens[start_index:end_index])
                adjectives_with_context.append((next_word, context))
    
    return adjectives_with_context

# File path
file_path = r"C:/Users/amusa/code/learning/TXT files/TAYN.txt"

# Ensure the file exists
if os.path.exists(file_path):
    # Read the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Extract adjectives with "the" and context
    adjectives_with_context = extract_adjectives_with_the_context(text)
    
    print("Adjectives with Context (Preceded by 'The' Only):")
    for adjective, context in adjectives_with_context:
        print(f"Adjective: {adjective}, Context: {context}")
else:
    print(f"File not found: {file_path}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amusa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Adjectives with Context (Preceded by 'The' Only):
Adjective: average, Context: thousand more than the average salary plus
Adjective: long, Context: left , walking the long windy road
Adjective: bright, Context: the restaurant with the bright , clean
Adjective: following, Context: you said no the following four days
Adjective: following, Context: you rooted for the following , in
Adjective: poor, Context: was not like the poor fat people
Adjective: old, Context: the old white men


####Nominalized Adjectives Using Spacy 

In [8]:
import spacy

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Function to extract adjectives preceded by "the" and their context
def extract_adjectives_with_the_context_spacy(text):
    nominalized_adjectives_with_context = []
    doc = nlp(text)
    
    for sent in doc.sents:
        tokens = [token.text.lower() for token in sent]  # Tokenize the sentence
        for i, token in enumerate(tokens):
            # Check if the current token is "the" and is followed by an adjective
            if token == "the" and i + 1 < len(tokens):
                next_token = doc[i + 1]
                if next_token.pos_ == "ADJ":  # Ensure next token is an adjective
                    # Ensure the word after the adjective is NOT a noun
                    if i + 2 < len(tokens) and doc[i + 2].pos_ != "NOUN":
                        # Extract three words before and after for context
                        start_index = max(0, i - 3)  # Three words before "the"
                        end_index = min(len(tokens), i + 4)  # Three words after the adjective
                        context = " ".join(tokens[start_index:end_index])
                        nominalized_adjectives_with_context.append((next_token.text, context))
    
    return nominalized_adjectives_with_context

# File path
file_path = r"C:/Users/amusa/code/learning/TXT files/TAYN.txt"

# Ensure the file exists
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        
    # Extract nominalized adjectives with spaCy
    nominalized_adjectives_with_context = extract_adjectives_with_the_context_spacy(text)
    
    print("Adjectives with Context (Preceded by 'The' Only):")
    for adjective, context in nominalized_adjectives_with_context:
        print(f"Adjective: {adjective}, Context: {context}")
except FileNotFoundError:
    print(f"File not found: {file_path}")


Adjectives with Context (Preceded by 'The' Only):
Adjective: NECK, Context: they trooped into the room in lagos
Adjective: NECK, Context: in comparison to the big car and
Adjective: NECK, Context: you walked into the restaurant with the
Adjective: NECK, Context: many people at the restaurant asked when
Adjective: NECK, Context: he came in the next day and
Adjective: NECK, Context: he came in the third day and
Adjective: NECK, Context: you said no the following four days
Adjective: NECK, Context: and then , the fifth night ,
Adjective: NECK, Context: you prayed for the first time in
Adjective: NECK, Context: the rain , the swampiness , you
Adjective: NECK, Context: back home , the meat pieces you
Adjective: NECK, Context: later , in the shower , you
Adjective: NECK, Context: your mother wrote the letter herself ;
