In [None]:
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch every NLTK resource the pipeline in this notebook relies on.
# Recent NLTK releases look the tokenizer up under 'punkt_tab' and the English
# tagger under 'averaged_perceptron_tagger_eng', while older releases use the
# un-suffixed names, so both generations are requested.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
):
    nltk.download(resource)

In [None]:
import pandas as pd

# Load the CSV and keep only the identifier and review-text columns.
df = pd.read_csv("data.csv", header="infer")[["Id", "Text"]]

In [None]:
# Number of rows in the loaded table.
df.shape[0]

In [None]:
# Work on the first 2000 reviews only; slice the column before converting so
# the whole column is not turned into a Python list first.
reviews = df['Text'].iloc[:2000].tolist()
# Preview a handful of entries instead of echoing every review into the output.
reviews[:5]

In [None]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from collections import Counter

# Fetch the NLTK resources used by the functions in this cell.
# Recent NLTK releases look the tokenizer up under 'punkt_tab' and the English
# tagger under 'averaged_perceptron_tagger_eng', while older releases use the
# un-suffixed names, so both generations are requested.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
):
    nltk.download(resource)

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into a WordNet POS constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag (including an empty one) falls back to ``wordnet.NOUN``.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Only the first character decides the category.
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)

def tokenize_corpus(corpus):
    """Tokenize every document in ``corpus`` with ``word_tokenize``.

    Returns a list with one token list per document, in input order.
    """
    tokenized_docs = []
    for document in corpus:
        tokenized_docs.append(word_tokenize(document))
    return tokenized_docs

def case_fold_corpus(tokenized_doc):
    """Return a new list holding the lower-cased form of each token of one document."""
    lowered_tokens = []
    for token in tokenized_doc:
        lowered_tokens.append(token.lower())
    return lowered_tokens

def get_stop_words():
    """Return the words of NLTK's 'english' stop-word list as a set."""
    english_stop_words = stopwords.words('english')
    return set(english_stop_words)

def remove_stop_words(corpus, stop_words):
    """Filter every document of a tokenized corpus.

    A token is kept only if it is purely alphabetic (``str.isalpha``) and is
    not contained in ``stop_words``. The comparison is case-sensitive; no
    lower-casing happens here.

    Returns a new list of token lists, one per input document.
    """
    def keep(token):
        return token.isalpha() and token not in stop_words

    filtered_corpus = []
    for document in corpus:
        filtered_corpus.append([token for token in document if keep(token)])
    return filtered_corpus

def stem_corpus(tokenized_doc):
    """Stem each token of one document with a fresh ``PorterStemmer``.

    Returns the list of stems in token order.
    """
    porter = PorterStemmer()
    stems = []
    for token in tokenized_doc:
        stems.append(porter.stem(token))
    return stems

def pos_tag_corpus(tokenized_doc):
    """Run ``pos_tag`` over the tokens of one document and return its result unchanged."""
    tagged_tokens = pos_tag(tokenized_doc)
    return tagged_tokens

def lemmatize_corpus(pos_tagged_doc):
    """Lemmatize one POS-tagged document.

    ``pos_tagged_doc`` is iterated as ``(word, tag)`` pairs. Each tag is
    converted with ``get_wordnet_pos`` and passed to a fresh
    ``WordNetLemmatizer`` together with the word.

    Returns the list of lemmas in token order.
    """
    wn_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, treebank_tag in pos_tagged_doc:
        lemmas.append(wn_lemmatizer.lemmatize(token, get_wordnet_pos(treebank_tag)))
    return lemmas

# Main processing
def process_corpus(corpus):
    """Run the whole preprocessing pipeline over a corpus of raw documents.

    Stages, in order: tokenize, lower-case, drop stop words and
    non-alphabetic tokens, then derive stems, POS tags and lemmas from the
    filtered tokens.

    Returns a dict with these keys:
        'tokenized'         -- lower-cased token lists (one per document)
        'no_stop'           -- token lists after stop-word filtering
        'stemmed'           -- stems of the filtered tokens
        'pos_tagged'        -- POS-tagging result for the filtered tokens
        'lemmatized_tokens' -- lemmas of the filtered tokens
        'lemmatized_texts'  -- each document's lemmas joined with single spaces
    """
    lowered_docs = list(map(case_fold_corpus, tokenize_corpus(corpus)))
    filtered_docs = remove_stop_words(lowered_docs, get_stop_words())

    # Stems and POS tags are both computed from the filtered tokens;
    # the lemmas are then derived from the POS-tagged documents.
    stemmed_docs = list(map(stem_corpus, filtered_docs))
    tagged_docs = list(map(pos_tag_corpus, filtered_docs))
    lemma_docs = list(map(lemmatize_corpus, tagged_docs))

    joined_lemmas = []
    for lemmas in lemma_docs:
        joined_lemmas.append(' '.join(lemmas))

    stages = {}
    stages['tokenized'] = lowered_docs
    stages['no_stop'] = filtered_docs
    stages['stemmed'] = stemmed_docs
    stages['pos_tagged'] = tagged_docs
    stages['lemmatized_tokens'] = lemma_docs
    stages['lemmatized_texts'] = joined_lemmas
    return stages


# Preprocess the selected reviews; `d` is the dict of per-stage results
# returned by process_corpus.
d = process_corpus(reviews)



In [23]:
def get_frequent_nouns_from_list(text_list, threshold=10):
    """Collect the nouns that occur at least ``threshold`` times across the texts.

    Each text is split on whitespace and tagged with ``nltk.pos_tag``. Every
    token whose tag starts with "NN" is counted in lower-cased form. A
    one-line summary of the result is printed.

    Args:
        text_list: iterable of strings.
        threshold: minimum number of occurrences for a noun to be kept.

    Returns:
        Set of lower-cased nouns meeting the threshold.
    """
    noun_counts = Counter()
    for text in text_list:
        for token, pos in nltk.pos_tag(text.split()):
            if pos.startswith("NN"):
                noun_counts[token.lower()] += 1

    frequent_nouns = set()
    for noun, occurrences in noun_counts.items():
        if occurrences >= threshold:
            frequent_nouns.add(noun)

    print(f"Found {len(frequent_nouns)} frequent nouns with threshold {threshold}\n")
    return frequent_nouns

In [24]:

def find_noun_modifiers_from_list(text_list, frequent_nouns, window_size=2):
    """Gather adjective/adverb neighbours of frequent nouns.

    Each text is split on whitespace and tagged with ``nltk.pos_tag``. For
    every token tagged "NN*" whose lower-cased form is in ``frequent_nouns``,
    the tokens up to ``window_size`` positions to either side are inspected:

    * a "JJ*" neighbour is recorded as is (lower-cased);
    * an "RB*" neighbour directly followed by a "JJ*" token is recorded as the
      two-word phrase "adverb adjective"; otherwise the adverb is recorded
      alone. The following token is checked even when it lies outside the
      window, and an adjective inside the window is still recorded on its own
      as well.

    Args:
        text_list: iterable of strings.
        frequent_nouns: container of lower-cased nouns to look for.
        window_size: number of positions inspected on each side of the noun.

    Returns:
        Dict mapping each noun to the list of all modifiers collected for it
        (repeats kept). Nouns without any modifier are absent.
    """
    modifiers_by_noun = {}
    for text in text_list:
        tagged = nltk.pos_tag(text.split())
        n_tokens = len(tagged)
        for idx, (token, pos) in enumerate(tagged):
            head = token.lower()
            if not (pos.startswith("NN") and head in frequent_nouns):
                continue

            first = max(0, idx - window_size)
            stop = min(n_tokens, idx + window_size + 1)
            found = []
            for k in range(first, stop):
                if k == idx:
                    continue
                neighbour, neighbour_pos = tagged[k]
                if neighbour_pos.startswith("JJ"):
                    found.append(neighbour.lower())
                elif neighbour_pos.startswith("RB"):
                    adjective_follows = k + 1 < n_tokens and tagged[k + 1][1].startswith("JJ")
                    if adjective_follows:
                        found.append(f"{neighbour.lower()} {tagged[k + 1][0].lower()}")
                    else:
                        found.append(neighbour.lower())

            if found:
                modifiers_by_noun.setdefault(head, []).extend(found)
    return modifiers_by_noun

def print_noun_modifiers(results, top_n=15, top_mods=5):
    """Print the most common modifiers of the most heavily modified nouns.

    Args:
        results: mapping of noun -> list of modifier strings (repeats
            allowed), such as the dict built by find_noun_modifiers_from_list.
        top_n: number of nouns to print.
        top_mods: number of modifiers shown per noun.

    Each noun is printed as "noun: mod (count), mod (count), ..." followed by
    a blank line.
    """
    # Rank nouns by how many modifier occurrences they collected, so `top_n`
    # really selects the top nouns rather than whichever were seen first.
    # sorted() is stable, so ties keep their original insertion order.
    ranked = sorted(results.items(), key=lambda item: len(item[1]), reverse=True)
    for noun, mods in ranked[:top_n]:
        mod_counts = Counter(mods)
        top_mod_list = ', '.join(f"{mod} ({count})" for mod, count in mod_counts.most_common(top_mods))
        print(f"{noun}: {top_mod_list}\n")

In [None]:
# Reuse the preprocessing result already computed as `d` in the cell that
# defines process_corpus, instead of running the whole pipeline over
# `reviews` a second time.
processed = d

# Nouns occurring at least 20 times in the lemmatized texts.
frequent_nouns = get_frequent_nouns_from_list(processed['lemmatized_texts'], threshold=20)

# Adjective/adverb neighbours within two tokens of each frequent noun.
results = find_noun_modifiers_from_list(processed['lemmatized_texts'], frequent_nouns, window_size=2)

print_noun_modifiers(results)