In [1]:
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

def ppmi_weighting(docs):
    # Tokenize the documents and count co-occurrences
    vectorizer = CountVectorizer(tokenizer=lambda text: text.split(), binary=True)
    X = vectorizer.fit_transform(docs)
    co_occurrence_matrix = (X.T * X)

    # Compute word frequencies and total word count
    word_counts = np.array(X.sum(axis=0)).squeeze()
    total_word_count = word_counts.sum()

    # Initialize PPMI matrix
    ppmi_matrix = np.zeros(co_occurrence_matrix.shape)

    # Compute PMI values
    num_docs = len(docs)
    for i, (word1, idx1) in enumerate(vectorizer.vocabulary_.items()):
        for j, (word2, idx2) in enumerate(vectorizer.vocabulary_.items()):
            co_occurrences = co_occurrence_matrix[idx1, idx2]
            if co_occurrences == 0:
                ppmi_matrix[idx1, idx2] = 0
            else:
                word1_count = word_counts[idx1]
                word2_count = word_counts[idx2]
                pmi = np.log((co_occurrences * num_docs) / (word1_count * word2_count))
                ppmi = max(0, pmi)
                ppmi_matrix[idx1, idx2] = ppmi

    # Normalize PPMI matrix
    ppmi_matrix = normalize(ppmi_matrix, norm='l2', axis=1)

    return ppmi_matrix

# Example usage:
docs = [
    "This is a sample document containing some words.",
    "Another document with different words but containing some similar words."
]

ppmi_matrix = ppmi_weighting(docs)
print(ppmi_matrix)


[[0.5        0.         0.         0.         0.         0.
  0.5        0.5        0.         0.         0.5        0.
  0.         0.        ]
 [0.         0.40824829 0.40824829 0.         0.40824829 0.
  0.         0.         0.40824829 0.         0.         0.40824829
  0.40824829 0.        ]
 [0.         0.40824829 0.40824829 0.         0.40824829 0.
  0.         0.         0.40824829 0.         0.         0.40824829
  0.40824829 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.40824829 0.40824829 0.         0.40824829 0.
  0.         0.         0.40824829 0.         0.         0.40824829
  0.40824829 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.5        0.         0.         0.         0.         0.
  0.5        0.5        0.         0.         

In [2]:
def ppmi_weighting(docs, window_size=5):
    # Tokenize the documents and build co-occurrence matrix
    tokenized_docs = [doc.split() for doc in docs]
    co_occurrence_matrix = defaultdict(lambda: defaultdict(int))

    for doc in tokenized_docs:
        for i, word in enumerate(doc):
            start = max(0, i - window_size)
            end = min(len(doc), i + window_size + 1)
            context = doc[start:end]
            for j in range(len(context)):
                if context[j] != word:
                    co_occurrence_matrix[word][context[j]] += 1

    # Compute word frequencies and total word count
    word_counts = defaultdict(int)
    for doc in tokenized_docs:
        for word in doc:
            word_counts[word] += 1
    total_word_count = sum(word_counts.values())

    # Initialize PPMI matrix
    ppmi_matrix = defaultdict(lambda: defaultdict(float))

    # Compute PMI values
    num_docs = len(docs)
    for word, context in co_occurrence_matrix.items():
        for co_word, co_count in context.items():
            if co_count == 0:
                ppmi_matrix[word][co_word] = 0
            else:
                pmi = np.log((co_count * num_docs) / (word_counts[word] * word_counts[co_word]))
                ppmi = max(0, pmi)
                ppmi_matrix[word][co_word] = ppmi

    # Convert PPMI matrix to numpy array for easier handling
    vocab = sorted(word_counts.keys())
    ppmi_array = np.zeros((len(vocab), len(vocab)))
    for i, word in enumerate(vocab):
        for j, co_word in enumerate(vocab):
            ppmi_array[i][j] = ppmi_matrix[word][co_word]

    # Normalize PPMI matrix
    ppmi_array = normalize(ppmi_array, norm='l2', axis=1)

    return ppmi_array, vocab

# Example usage:
docs = [
    "This is a sample document containing some words.",
    "Another document with different words but containing some similar words."
]

ppmi_matrix, vocab = ppmi_weighting(docs, window_size=5)
print("Vocabulary:", vocab)
print("PPMI Matrix:")
print(ppmi_matrix)

Vocabulary: ['Another', 'This', 'a', 'but', 'containing', 'different', 'document', 'is', 'sample', 'similar', 'some', 'with', 'words', 'words.']
PPMI Matrix:
[[0.         0.         0.         0.5        0.         0.5
  0.         0.         0.         0.         0.         0.5
  0.5        0.        ]
 [0.         0.         0.57735027 0.         0.         0.
  0.         0.57735027 0.57735027 0.         0.         0.
  0.         0.        ]
 [0.         0.57735027 0.         0.         0.         0.
  0.         0.57735027 0.57735027 0.         0.         0.
  0.         0.        ]
 [0.4472136  0.         0.         0.         0.         0.4472136
  0.         0.         0.         0.4472136  0.         0.4472136
  0.4472136  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.4472136  0.         0.         0.4472136  0.         0.
  0.         0.         0.         0.4472

### Sublinear scaling involves taking the logarithm of the word counts or co-occurrence counts to dampen the effect of very frequent words. Let's modify the PPMI weighting function to incorporate sublinear scaling:

In [3]:
def ppmi_weighting(docs, window_size=5):
    # Tokenize the documents and build co-occurrence matrix
    tokenized_docs = [doc.split() for doc in docs]
    co_occurrence_matrix = defaultdict(lambda: defaultdict(int))

    for doc in tokenized_docs:
        for i, word in enumerate(doc):
            start = max(0, i - window_size)
            end = min(len(doc), i + window_size + 1)
            context = doc[start:end]
            for j in range(len(context)):
                if context[j] != word:
                    co_occurrence_matrix[word][context[j]] += 1

    # Compute word frequencies and total word count
    word_counts = defaultdict(int)
    for doc in tokenized_docs:
        for word in doc:
            word_counts[word] += 1
    total_word_count = sum(word_counts.values())

    # Apply sublinear scaling to word counts
    for word, count in word_counts.items():
        word_counts[word] = 1 + np.log(count)  # Applying log function

    # Initialize PPMI matrix
    ppmi_matrix = defaultdict(lambda: defaultdict(float))

    # Compute PMI values
    num_docs = len(docs)
    for word, context in co_occurrence_matrix.items():
        for co_word, co_count in context.items():
            if co_count == 0:
                ppmi_matrix[word][co_word] = 0
            else:
                pmi = np.log((co_count * num_docs) / (word_counts[word] * word_counts[co_word]))
                ppmi = max(0, pmi)
                ppmi_matrix[word][co_word] = ppmi

    # Convert PPMI matrix to numpy array for easier handling
    vocab = sorted(word_counts.keys())
    ppmi_array = np.zeros((len(vocab), len(vocab)))
    for i, word in enumerate(vocab):
        for j, co_word in enumerate(vocab):
            ppmi_array[i][j] = ppmi_matrix[word][co_word]

    # Normalize PPMI matrix
    ppmi_array = normalize(ppmi_array, norm='l2', axis=1)

    return ppmi_array, vocab

# Example usage:
docs = [
    "This is a sample document containing some words.",
    "Another document with different words but containing some similar words."
]

ppmi_matrix, vocab = ppmi_weighting(docs, window_size=5)
print("Vocabulary:", vocab)
print("PPMI Matrix:")
print(ppmi_matrix)

Vocabulary: ['Another', 'This', 'a', 'but', 'containing', 'different', 'document', 'is', 'sample', 'similar', 'some', 'with', 'words', 'words.']
PPMI Matrix:
[[0.         0.         0.         0.49642982 0.         0.49642982
  0.11928842 0.         0.         0.         0.         0.49642982
  0.49642982 0.        ]
 [0.         0.         0.56654895 0.         0.13613753 0.
  0.13613753 0.56654895 0.56654895 0.         0.         0.
  0.         0.        ]
 [0.         0.55633203 0.         0.         0.13368248 0.
  0.13368248 0.55633203 0.55633203 0.         0.13368248 0.
  0.         0.13368248]
 [0.43722925 0.         0.         0.         0.10506296 0.43722925
  0.10506296 0.         0.         0.43722925 0.10506296 0.43722925
  0.43722925 0.10506296]
 [0.         0.21821789 0.21821789 0.21821789 0.         0.21821789
  0.43643578 0.21821789 0.21821789 0.21821789 0.43643578 0.21821789
  0.21821789 0.43643578]
 [0.43966253 0.         0.         0.43966253 0.10564766 0.
  0.10564

In [6]:
from nltk.corpus import wordnet

# Function to get synonyms of a word using WordNet
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

words = ['happy', 'sad', 'big', 'small', 'good', 'bad', 'beautiful', 'ugly', 'fast', 'slow']

# Get synonyms for each word
synonyms_dict = {}
for word in words:
    synonyms_dict[word] = get_synonyms(word)

# Create a small corpus with sentences containing these words and their synonyms
corpus = [
    "I feel {} when I'm {}.".format(synonyms_dict['happy'][0], synonyms_dict['fast'][0]),
    "He looks {} when he's {}.".format(synonyms_dict['sad'][0], synonyms_dict['slow'][0]),
    "The {} dog chased the {} cat.".format(synonyms_dict['big'][0], synonyms_dict['small'][0]),
    "She said it was {} but it turned out to be {}.".format(synonyms_dict['good'][0], synonyms_dict['bad'][0]),
    "The sunset was {} but the weather was {}.".format(synonyms_dict['beautiful'][0], synonyms_dict['ugly'][0])
]

def ppmi_weighting(corpus):
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))
    word_index = {word: i for i, word in enumerate(vocab)}
    total_word_count = 0
    for sentence in corpus:
        words = sentence.split()
        for i, word in enumerate(words):
            if word in vocab:
                total_word_count += 1
                start = max(0, i - window_size)
                end = min(len(words), i + window_size + 1)
                context = words[start:end]
                for j in range(len(context)):
                    if context[j] != word and context[j] in vocab:
                        co_occurrence_matrix[word_index[word]][word_index[context[j]]] += 1

    # Compute PPMI values
    ppmi_matrix = np.zeros_like(co_occurrence_matrix, dtype=float)
    for i in range(len(vocab)):
        for j in range(len(vocab)):
            if co_occurrence_matrix[i][j] == 0:
                ppmi_matrix[i][j] = 0
            else:
                ppmi_matrix[i][j] = max(0, np.log((co_occurrence_matrix[i][j] * total_word_count) /
                                                  (word_counts[i] * word_counts[j])))

    # Normalize PPMI matrix
    ppmi_matrix = normalize(ppmi_matrix, norm='l2', axis=1)

    return ppmi_matrix


# Compute PPMI weighting for the corpus
window_size = 2
vectorizer = CountVectorizer(tokenizer=lambda text: text.split(), binary=True)
X = vectorizer.fit_transform(corpus)
word_counts = np.array(X.sum(axis=0)).squeeze()
vocab = vectorizer.get_feature_names()
ppmi_matrix = ppmi_weighting(corpus)

# Print the PPMI matrix
print("PPMI Matrix:")
print(ppmi_matrix)

# Find the index of each word in the vocabulary
word_indices = {word: i for i, word in enumerate(vocab)}

# Find synonyms with highest PPMI values for each word
for word in words:
    print(f"\nSynonyms with highest PPMI values for '{word}':")
    if word in word_indices:
        word_index = word_indices[word]
        ppmi_scores = ppmi_matrix[word_index]
        top_synonyms_indices = ppmi_scores.argsort()[-5:][::-1]
        top_synonyms = [vocab[idx] for idx in top_synonyms_indices if idx != word_index]
        print(top_synonyms)
    else:
        print(f"No synonyms found for '{word}'")

PPMI Matrix:
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.46885077 ... 0.46885077 0.         0.        ]
 [0.         0.39012075 0.         ... 0.39012075 0.39012075 0.        ]
 ...
 [0.         0.34158248 0.34158248 ... 0.         0.34158248 0.        ]
 [0.         0.         0.46885077 ... 0.46885077 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]

Synonyms with highest PPMI values for 'happy':
['feel', 'when', 'weather', 'beautiful', 'but']

Synonyms with highest PPMI values for 'sad':
No synonyms found for 'sad'

Synonyms with highest PPMI values for 'big':
No synonyms found for 'big'

Synonyms with highest PPMI values for 'small':
No synonyms found for 'small'

Synonyms with highest PPMI values for 'good':
No synonyms found for 'good'

Synonyms with highest PPMI values for 'bad':
No synonyms found for 'bad'

Synonyms with highest PPMI values for 'beautiful':
['sunset', 'was', '

