# Keyword Analysis: MI-Based Phrase Extraction

This notebook implements the MI (Mutual Information) scoring methodology to extract meaningful domain-specific phrases from the job corpus.

## Overview

1. **Load job corpus** - Load chunks.jsonl and filter to English jobs
2. **Extract n-grams** - Compute unigrams, bigrams, trigrams with frequencies
3. **Compute MI scores** - Calculate PMI/NPMI for each n-gram
4. **Load reference corpus** - Load Switchboard, falling back to OpenSubtitles
5. **Compare with reference corpus** - Compute effect sizes and significance against the reference corpus
6. **Filter meaningful phrases** - Keep domain-specific, cohesive phrases
7. **Save artifacts** - Write meaningful_phrases.json for cluster labeling and phrase_scores.parquet for later analysis

Based on methodology from `meta/KW extraction method/` notebooks.

In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
from tqdm import tqdm

# Input/output locations, all given relative to the current working directory.
ARTIFACTS_DIR = Path('artifacts')
REFERENCE_DIR = Path('meta/KW extraction method/corpora')
CHUNKS_PATH = ARTIFACTS_DIR.joinpath('chunks.jsonl')

# Report up front whether the expected inputs are present on disk.
print(f'Chunks path exists: {CHUNKS_PATH.exists()}')
print(f'Reference dir exists: {REFERENCE_DIR.exists()}')

## 1. Load Job Corpus

In [None]:
# Load chunks: the file is read as JSON Lines, one JSON object per line.
chunks = []
with open(CHUNKS_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # json.loads would reject with a JSONDecodeError.
        if not line.strip():
            continue
        chunks.append(json.loads(line))

print(f'Loaded {len(chunks)} chunks')

# Group chunks by job. Job-level metadata is taken from the first chunk seen
# for each job_key; every chunk contributes its text.
jobs = {}
for ch in chunks:
    jk = ch.get('job_key', '')
    if jk not in jobs:
        # `or {}` also covers a chunk whose `meta` is present but null.
        meta = ch.get('meta') or {}
        jobs[jk] = {
            'job_id': jk,
            'title': meta.get('title', ''),
            'company': meta.get('company', ''),
            'language': meta.get('language', 'unknown'),
            'texts': []
        }
    # A null `text` is stored as '' so the per-job ' '.join(...) cannot fail.
    jobs[jk]['texts'].append(ch.get('text') or '')

print(f'Unique jobs: {len(jobs)}')

# Count languages
lang_counts = Counter(j['language'] for j in jobs.values())
print(f'Languages: {dict(lang_counts.most_common(10))}')

In [None]:
# Keep only jobs whose language tag is exactly 'en'.
english_jobs = [j for j in jobs.values() if j['language'] == 'en']

# Guard the percentage so an empty corpus reports 0.0% instead of raising
# ZeroDivisionError.
english_share = 100 * len(english_jobs) / len(jobs) if jobs else 0.0
print(f'English jobs: {len(english_jobs)} ({english_share:.1f}%)')

# Combine texts per job: one space-joined string per English job.
job_texts = [' '.join(j['texts']) for j in english_jobs]
print(f'Total characters: {sum(len(t) for t in job_texts):,}')

## 2. Extract N-grams

In [None]:
from src.nlp.ngram_extraction import extract_ngrams_from_corpus, count_unigrams

# Build the n-gram records for the job corpus (n = 1..3). This may take a
# minute on a large corpus.
print('Extracting n-grams from job corpus...')
job_ngrams = extract_ngrams_from_corpus(job_texts, n_min=1, n_max=3, min_frequency=2)

# Tally records per n-gram length in a single pass; a length that never
# occurs reads back as 0.
length_counts = Counter(ng["ngram_length"] for ng in job_ngrams)

print(f'Extracted {len(job_ngrams)} unique n-grams')
print(f'Unigrams: {length_counts[1]}')
print(f'Bigrams: {length_counts[2]}')
print(f'Trigrams: {length_counts[3]}')

In [None]:
# Tabular view of the n-gram records (one row per record) for inspection.
job_df = pd.DataFrame(data=job_ngrams)
job_df.head(n=10)

## 3. Compute MI Scores

In [None]:
from src.nlp.mutual_information import analyze_corpus_mi, normalize_mi_by_length

# Score multi-word n-grams only (n = 2..3); unigrams are not passed through
# the MI analysis here.
print('Computing MI scores...')
mi_options = dict(n_min=2, n_max=3, min_frequency=3)
mi_ngrams = analyze_corpus_mi(job_texts, **mi_options)

print(f'N-grams with MI: {len(mi_ngrams)}')

# Length normalisation; later cells read the `mi_z` field from these records.
mi_ngrams = normalize_mi_by_length(mi_ngrams)

mi_df = pd.DataFrame(data=mi_ngrams)
mi_df.head()

In [None]:
# Bigrams ranked by `mi_z`, highest first; rows with no `mi_z` are dropped.
is_bigram = mi_df['ngram_length'] == 2
bigrams = mi_df.loc[is_bigram].dropna(subset=['mi_z'])
print('Top cohesive bigrams:')
bigrams.nlargest(20, 'mi_z').loc[:, ['ngram', 'frequency', 'mi', 'mi_z']]

In [None]:
# Trigrams ranked by `mi_z`, highest first; rows with no `mi_z` are dropped.
is_trigram = mi_df['ngram_length'] == 3
trigrams = mi_df.loc[is_trigram].dropna(subset=['mi_z'])
print('Top cohesive trigrams:')
trigrams.nlargest(20, 'mi_z').loc[:, ['ngram', 'frequency', 'mi', 'mi_z']]

## 4. Load Reference Corpus

In [None]:
def _read_reference_texts(csv_path):
    """Read a reference-corpus CSV and return ``(frame, texts)``.

    Texts are taken from the ``text`` column when the file has one; otherwise
    the first column is used and a note is printed, because that fallback can
    silently pick up an id/index column. Null rows are dropped and the
    remaining values are coerced to ``str`` so downstream code only ever
    receives strings.
    """
    frame = pd.read_csv(csv_path)
    if 'text' in frame.columns:
        text_col = 'text'
    else:
        text_col = frame.columns[0]
        print(f"No 'text' column in {csv_path.name}; using first column {text_col!r}")
    texts = frame[text_col].dropna().astype(str).tolist()
    return frame, texts


# Try Switchboard (conversational English) first, then OpenSubtitles.
switchboard_path = REFERENCE_DIR / 'switchboard_combined.csv'
opensubtitles_path = REFERENCE_DIR / 'opensubtitles.csv'

# Stays None when neither corpus file exists; later cells test `if ref_texts:`.
ref_texts = None

if switchboard_path.exists():
    print('Loading Switchboard corpus...')
    ref_df, ref_texts = _read_reference_texts(switchboard_path)
    print(f'Switchboard shape: {ref_df.shape}')
    print(f'Columns: {list(ref_df.columns)}')
    print(f'Reference texts: {len(ref_texts)}')
else:
    print('Switchboard not found, trying OpenSubtitles...')
    if opensubtitles_path.exists():
        ref_df, ref_texts = _read_reference_texts(opensubtitles_path)
        print(f'OpenSubtitles texts: {len(ref_texts)}')
    else:
        print('No reference corpus found! Skipping comparison.')

# An empty list is falsy, so the comparison is skipped downstream; say so
# explicitly rather than skipping silently.
if ref_texts is not None and not ref_texts:
    print('Reference corpus contains no usable texts; comparison will be skipped.')

In [None]:
# Build n-gram records for the reference corpus, when one was loaded.
# `ref_ngrams` is only defined inside this branch.
if ref_texts:
    print('Extracting n-grams from reference corpus...')

    # Cap the work at the first 50,000 texts for speed (a prefix, not a
    # random sample).
    if len(ref_texts) > 50000:
        ref_sample = ref_texts[:50000]
    else:
        ref_sample = ref_texts
    print(f'Using {len(ref_sample)} reference texts')

    ref_ngrams = extract_ngrams_from_corpus(ref_sample, n_min=1, n_max=3, min_frequency=2)
    print(f'Reference n-grams: {len(ref_ngrams)}')

## 5. Compare with Reference Corpus

In [None]:
from src.nlp.mutual_information import compare_with_reference

# Score the job-corpus n-grams against the reference corpus. Runs only when a
# reference corpus was loaded; `compared` / `compared_df` are otherwise unset.
if ref_texts:
    # Corpus sizes are the summed frequencies of the extracted unigrams.
    # NOTE(review): both n-gram tables were extracted with min_frequency=2, so
    # unigrams below that threshold are presumably absent from these totals --
    # confirm that compare_with_reference does not need exact token counts.
    total_job = sum(ng['frequency'] for ng in job_ngrams if ng['ngram_length'] == 1)
    total_ref = sum(ng['frequency'] for ng in ref_ngrams if ng['ngram_length'] == 1)

    print(f'Job corpus unigrams: {total_job:,}')
    print(f'Reference corpus unigrams: {total_ref:,}')

    # Compare n-grams
    print('Computing effect sizes and significance...')
    compared = compare_with_reference(
        mi_ngrams,
        ref_ngrams,
        total_job,
        total_ref,
    )

    compared_df = pd.DataFrame(compared)
    print(f'Compared n-grams: {len(compared_df)}')

    # Jupyter only auto-renders a cell's last *top-level* expression; inside
    # an `if` block the frame must be displayed explicitly. `display` is
    # provided by IPython in notebook kernels.
    display(compared_df.head())

In [None]:
# Top domain-specific bigrams: flagged significant, positive effect size,
# ranked by effect size. Runs only when a reference corpus was loaded.
if ref_texts:
    is_domain_bigram = (
        (compared_df['ngram_length'] == 2)
        # Elementwise comparison on a column, so `== True` (not `is True`).
        & (compared_df['significant'] == True)
        & (compared_df['effect_size'] > 0)
    )
    domain_bigrams = compared_df[is_domain_bigram].nlargest(30, 'effect_size')

    print('Domain-specific bigrams (overrepresented in job corpus):')
    # Inside an `if` block Jupyter does not auto-render a bare expression, so
    # the table is displayed explicitly (`display` is provided by IPython).
    display(domain_bigrams[['ngram', 'frequency', 'freq_reference', 'effect_size', 'p_value']])

In [None]:
# Top domain-specific trigrams: flagged significant, positive effect size,
# ranked by effect size. Runs only when a reference corpus was loaded.
if ref_texts:
    is_domain_trigram = (
        (compared_df['ngram_length'] == 3)
        # Elementwise comparison on a column, so `== True` (not `is True`).
        & (compared_df['significant'] == True)
        & (compared_df['effect_size'] > 0)
    )
    domain_trigrams = compared_df[is_domain_trigram].nlargest(30, 'effect_size')

    print('Domain-specific trigrams:')
    # Inside an `if` block Jupyter does not auto-render a bare expression, so
    # the table is displayed explicitly (`display` is provided by IPython).
    display(domain_trigrams[['ngram', 'frequency', 'freq_reference', 'effect_size', 'p_value']])

## 6. Filter Meaningful Phrases

In [None]:
from src.nlp.mutual_information import filter_meaningful_phrases

# Select the final phrase list. With a reference corpus the project helper
# applies the MI, effect-size, significance and frequency criteria; without
# one, only the MI-based criteria below are available.
if ref_texts:
    meaningful = filter_meaningful_phrases(
        compared,
        min_mi_z=0.0,
        min_effect_size=0.0,
        require_significant=True,
        min_frequency=3,
        min_doc_frequency=2,
    )
else:
    # Without reference, use MI alone.
    # `.get(key, default)` still returns None when the key is present with a
    # None value, and `None > 0.5` raises TypeError. `or <default>` treats a
    # None value like a missing one. A NaN score compares False and is
    # excluded, as before.
    meaningful = [
        ng for ng in mi_ngrams
        if (ng.get('mi_z') or 0) > 0.5
        and (ng.get('frequency') or 0) >= 3
        and (ng.get('ngram_length') or 1) >= 2
    ]

print(f'Meaningful phrases: {len(meaningful)}')

In [None]:
# View a sample of the meaningful phrases, best-scoring first.
meaningful_df = pd.DataFrame(meaningful)

# Tables are shown with display(...) because Jupyter only auto-renders a
# cell's last top-level expression, not one nested in if/else. `display` is
# provided by IPython in notebook kernels.
if meaningful_df.empty:
    # An empty frame has no 'mi_z' column, so nlargest would raise KeyError.
    print('No meaningful phrases to display.')
elif 'effect_size' in meaningful_df.columns:
    # Sort by combined score (MI + effect size); missing values count as 0.
    meaningful_df['score'] = meaningful_df['mi_z'].fillna(0) + meaningful_df['effect_size'].fillna(0)
    display(meaningful_df.nlargest(50, 'score')[['ngram', 'frequency', 'mi_z', 'effect_size', 'score']])
else:
    display(meaningful_df.nlargest(50, 'mi_z')[['ngram', 'frequency', 'mi_z']])

## 7. Save Artifacts

In [None]:
def _phrase_score(ng):
    """Ranking score for one phrase record: ``mi_z`` plus ``effect_size``.

    A missing, None, non-numeric or NaN component counts as 0, so records
    without an ``effect_size`` are ranked by ``mi_z`` alone.
    """
    score = 0.0
    for key in ('mi_z', 'effect_size'):
        try:
            value = float(ng.get(key))
        except (TypeError, ValueError):
            continue
        if not np.isnan(value):
            score += value
    return score


def _json_default(obj):
    """Convert numpy scalars, which ``json`` cannot encode, to Python values."""
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')


# Rank before truncating so `details` really holds the 1000 best-scoring
# phrases rather than the first 1000 in list order. sorted() is stable, so
# ties keep their original relative order.
ranked = sorted(meaningful, key=_phrase_score, reverse=True)

# Prepare output
output = {
    'phrases': [ng['ngram'] for ng in meaningful],
    'n_phrases': len(meaningful),
    'n_jobs_analyzed': len(english_jobs),
    'details': ranked[:1000],  # Top 1000 by score, with full details
}

# Save to artifacts
output_path = ARTIFACTS_DIR / 'meaningful_phrases.json'
with open(output_path, 'w', encoding='utf-8') as f:
    # `default` is consulted only for objects json cannot encode natively.
    json.dump(output, f, indent=2, ensure_ascii=False, default=_json_default)

print(f'Saved {len(meaningful)} phrases to {output_path}')

In [None]:
# Also save the meaningful phrases with their scores as parquet for future use.
# The frame is rebuilt from `meaningful` instead of probing the kernel
# namespace with dir(), so the saved schema does not depend on which earlier
# cells were run and a stale frame cannot be persisted.
full_df = pd.DataFrame(meaningful)
if {'mi_z', 'effect_size'} <= set(full_df.columns):
    # Combined score (MI + effect size); missing values count as 0.
    full_df['score'] = full_df['mi_z'].fillna(0) + full_df['effect_size'].fillna(0)

parquet_path = ARTIFACTS_DIR / 'phrase_scores.parquet'
full_df.to_parquet(parquet_path, index=False)
print(f'Saved full analysis to {parquet_path}')

## Summary

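The analysis has:
1. Loaded the job corpus and filtered to English jobs
2. Extracted n-grams and computed MI scores
3. Compared with a reference corpus, when one was available, to find domain-specific phrases (otherwise phrases were selected by MI score alone)
4. Saved meaningful phrases to `artifacts/meaningful_phrases.json` and their scores to `artifacts/phrase_scores.parquet`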

These phrases can now be used in cluster labeling to filter out irrelevant TF-IDF keywords.