<a href="https://colab.research.google.com/github/koushik980/NLP/blob/main/NLP_F_24_10_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import itertools
import re
import sys

# ---------------------------
#  Example corpus (replace with your data if desired)
# ---------------------------
documents = [
    "The government passed a new law to improve economic growth and increase employment.",
    "A major military war broke out in the region, leading to peace talks and international sanctions.",
    "The football team won their championship after a dramatic penalty shootout in the final match.",
    "Tech companies released new smartphones and cloud services, boosting innovation in AI and data.",
    "Economic analysts predict inflation will fall next quarter as monetary policy tightens.",
    "Soldiers reported on the front lines about the war's impact on civilians and infrastructure.",
    "The startup raised venture capital to scale its machine learning platform for health care.",
    "Local team signed a new striker who scored a hat-trick and delighted the fans.",
    "Diplomats met to negotiate a ceasefire and humanitarian aid to war-affected areas.",
    "Central bank decisions on interest rates influence the stock market and lending."
]

# ---------------------------
#  Utilities
# ---------------------------
def jaccard(set_a, set_b):
    if not set_a and not set_b:
        return 1.0
    inter = len(set_a & set_b)
    uni = len(set_a | set_b)
    return inter / uni if uni > 0 else 0.0

def simple_tokenize(text):
    """Lowercase tokenization: returns list of word tokens (alphanumeric)."""
    return re.findall(r"\b\w+\b", text.lower())

def make_ngrams(tokens, n=2):
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

# ---------------------------
#  Task 1: LDA with 3,5,8 topics
# ---------------------------
def run_lda_and_print(documents, topic_nums=(3,5,8), top_n_words=5):
    print("=== Task 1: LDA Topic Tuning ===\n")
    vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=1)
    X = vectorizer.fit_transform(documents)
    feature_names = np.array(vectorizer.get_feature_names_out())

    lda_topics = {}
    for k in topic_nums:
        lda = LDA(n_components=k, random_state=42, max_iter=100)
        lda.fit(X)
        print(f"--- LDA with {k} topics ---")
        topics = []
        for topic_idx, topic in enumerate(lda.components_):
            top_indices = topic.argsort()[::-1][:top_n_words]
            top_words = feature_names[top_indices]
            topics.append(list(top_words))
            print(f"Topic #{topic_idx+1}: {', '.join(top_words)}")
        lda_topics[k] = topics
        print("")
    # compute separation metric: 1 - mean(pairwise Jaccard on top-10 words)
    def topic_separation_score(topics, top_n=10):
        sets = [set(t[:top_n]) for t in topics]
        pairs = list(itertools.combinations(range(len(sets)), 2))
        if not pairs:
            return 0.0
        sims = [jaccard(sets[i], sets[j]) for i,j in pairs]
        return 1 - np.mean(sims)
    scores = {k: topic_separation_score(tpcs, top_n=10) for k,tpcs in lda_topics.items()}
    best_k = max(scores, key=scores.get)
    for k, sc in scores.items():
        print(f"Separation score for k={k}: {sc:.4f}")
    print(f"\nRecommended k by separation score: {best_k}")
    print("Explanation: separation score = 1 - mean(pairwise Jaccard of top-word sets). Higher => less overlap => clearer separation.\n")
    return lda_topics, scores

# ---------------------------
#  Task 2: WordNet Hypernyms & Hyponyms for a keyword
# ---------------------------
def wordnet_hyper_hypo(keyword='war'):
    print("=== Task 2: WordNet Hypernyms & Hyponyms ===\n")
    hypernyms = set()
    hyponyms = set()
    try:
        import nltk
        from nltk.corpus import wordnet as wn
        # ensure wordnet is downloaded
        try:
            wn.ensure_loaded()
        except Exception:
            nltk.download('wordnet', quiet=True)
        synsets = wn.synsets(keyword)
        if not synsets:
            print(f"No WordNet synsets found for '{keyword}'.\n")
            raise RuntimeError("no synsets")
        # choose first (often most common) synset for demonstration
        s = synsets[0]
        print("Chosen synset:", s, "-", s.definition())
        for h in s.hypernyms():
            for lemma in h.lemmas():
                hypernyms.add(lemma.name().replace('_', ' '))
        for hypo in s.hyponyms():
            for lemma in hypo.lemmas():
                hyponyms.add(lemma.name().replace('_', ' '))
        print("\nHypernyms (broader terms):")
        print(", ".join(sorted(hypernyms)) if hypernyms else "None found")
        print("\nHyponyms (narrower terms):")
        print(", ".join(sorted(hyponyms)) if hyponyms else "None found")
    except Exception as e:
        # fallback mapping if wordnet isn't available
        print("WordNet not available or error occurred:", str(e))
        print("Using fallback illustrative lists.\n")
        hypernyms = {"conflict", "hostility"}
        hyponyms = {"civil war", "world war i", "world war ii", "guerrilla warfare", "cold war", "civil unrest"}
        print("Hypernyms (fallback):", ", ".join(sorted(hypernyms)))
        print("Hyponyms (fallback):", ", ".join(sorted(hyponyms)))
    # short discussion
    print("\nDiscussion: Hyponyms like 'civil war', 'world war i/ii', 'guerrilla warfare', 'cold war' can be used to build fine-grained subtopics:")
    print(" - 'world war i/ii' -> historical global conflicts subtopic")
    print(" - 'civil war' -> internal conflicts subtopic")
    print(" - 'guerrilla warfare' -> tactics/asymmetric warfare subtopic")
    print()
    return hypernyms, hyponyms

# ---------------------------
#  Task 3: Jaccard similarity with bigrams vs unigrams
# ---------------------------
def jaccard_unigram_bigram_compare(doc_a, doc_b, doc_c=None):
    print("=== Task 3: Jaccard Similarity (Unigram vs Bigram) ===\n")
    print("Document A:", doc_a)
    print("Document B:", doc_b)
    if doc_c:
        print("Document C (contrast):", doc_c)
    print("")

    tokens_a = simple_tokenize(doc_a)
    tokens_b = simple_tokenize(doc_b)
    tokens_c = simple_tokenize(doc_c) if doc_c else None

    unigrams_a = set(tokens_a)
    unigrams_b = set(tokens_b)
    unigrams_c = set(tokens_c) if tokens_c else None

    bigrams_a = set(make_ngrams(tokens_a, 2))
    bigrams_b = set(make_ngrams(tokens_b, 2))
    bigrams_c = set(make_ngrams(tokens_c, 2)) if tokens_c else None

    def print_jacc(name1, name2, s1, s2):
        print(f"Jaccard({name1}, {name2}) = {jaccard(s1,s2):.4f} (|I|={len(s1 & s2)}, |U|={len(s1 | s2)})")

    print("Unigram-based Jaccard similarities:")
    print_jacc("A","B", unigrams_a, unigrams_b)
    if unigrams_c is not None:
        print_jacc("A","C", unigrams_a, unigrams_c)

    print("\nBigram-based Jaccard similarities:")
    print_jacc("A","B", bigrams_a, bigrams_b)
    if bigrams_c is not None:
        print_jacc("A","C", bigrams_a, bigrams_c)

    print("\nInterpretation:")
    print("- Unigram Jaccard measures shared words; can be high if documents share vocabulary.")
    print("- Bigram Jaccard measures shared contiguous 2-word phrases; it captures phrase-level similarity and ordering.")
    print("- Bigram similarity often gives a tighter sense of relatedness when phrase structure matters; unigram is coarser.")
    print()

# ---------------------------
#  Main execution
# ---------------------------
if __name__ == "__main__":
    # Task 1
    lda_topics, scores = run_lda_and_print(documents, topic_nums=(3,5,8), top_n_words=5)

    # Task 2
    hypernyms, hyponyms = wordnet_hyper_hypo('war')

    # Task 3 - pick two documents (war-related) and a contrast (tech)
    docA = documents[1]  # war
    docB = documents[8]  # war/peace negotiations
    docC = documents[3]  # tech doc (contrast)
    jaccard_unigram_bigram_compare(docA, docB, docC)

    print("Script finished.")

=== Task 1: LDA Topic Tuning ===

--- LDA with 3 topics ---
Topic #1: team, venture, startup, scale, platform
Topic #2: economic, tightens, quarter, policy, predict
Topic #3: war, new, services, released, smartphones

--- LDA with 5 topics ---
Topic #1: team, new, scored, signed, striker
Topic #2: war, soldiers, reported, lines, impact
Topic #3: new, tech, services, released, innovation
Topic #4: war, venture, scale, startup, platform
Topic #5: economic, tightens, quarter, policy, predict

--- LDA with 8 topics ---
Topic #1: war, new, economic, infrastructure, reported
Topic #2: stock, central, lending, influence, market
Topic #3: war, ceasefire, areas, aid, negotiate
Topic #4: war, new, economic, infrastructure, reported
Topic #5: new, economic, services, released, tech
Topic #6: team, trick, scored, signed, striker
Topic #7: war, new, economic, infrastructure, reported
Topic #8: venture, startup, scale, learning, platform

Separation score for k=3: 1.0000
Separation score for k=5: 0.