In [6]:
!pip install numpy==1.24.4 scipy==1.10.1 gensim==4.3.1



In [7]:
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec, KeyedVectors

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download("wordnet")
nltk.download('averaged_perceptron_tagger_eng')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [18]:
df = pd.read_json("/content/News_Category_Dataset_v3.json", lines=True)
df.to_csv("/content/News_Category_Dataset_v3.csv", index=False)
data = pd.read_csv("/content/News_Category_Dataset_v3.csv")

In [19]:
sentences = data['headline']

clean_sentences = sentences.dropna().reset_index(drop=True)

tokenized_sentences = [word_tokenize(str(sentence)) for sentence in clean_sentences]

In [21]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

lemmatizer = WordNetLemmatizer()

lemmatized_sentences = []
for sentence in tokenized_sentences:
  lowered = [word.lower() for word in sentence]
  tagged = pos_tag(lowered)
  lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged]
  lemmatized_sentences.append(lemmatized)

In [22]:
stop_words = set(stopwords.words('english'))

filtered_sentences = []
for sentence in lemmatized_sentences:
    filtered = [word for word in sentence if word not in stop_words]
    filtered_sentences.append(filtered)

In [23]:
wv = api.load('word2vec-google-news-300')



In [24]:
filtered_sentences_cleaned = []
vectorized_sentences = []

for sentence in filtered_sentences:
    sentence_vectors = [wv[word] for word in sentence if word in wv]
    if sentence_vectors:  # for only sentences that dont have NA
        avg_vector = np.mean(sentence_vectors, axis=0)
        vectorized_sentences.append(avg_vector)
        filtered_sentences_cleaned.append(sentence)

In [25]:
def get_sentence_vector(sentence, wv):

    sentence_vectors = [wv[word] for word in sentence if word in wv]
    if sentence_vectors:
        return np.mean(sentence_vectors, axis=0).reshape(1, -1)
    else:
        return None

In [26]:
def find_top_k_similar(input_sentence, wv, vectorized_sentences, filtered_sentences, k=5):
    vec = get_sentence_vector(input_sentence, wv)
    if vec is None:
        return

    similarities = cosine_similarity(vec, np.array(vectorized_sentences))[0]
    top_k_idx = similarities.argsort()[::-1][:k]

    return [(filtered_sentences[i], similarities[i]) for i in top_k_idx]

In [27]:
input_sentence = ["president", "got", "no", "money"]
top_matches = find_top_k_similar(input_sentence, wv, vectorized_sentences, filtered_sentences)

for i, (sentence, score) in enumerate(top_matches, 1):
    print(f"{i}. {' '.join(sentence)} (Similarity: {score:.4f})")

1. guy 's running president want give 'free ' money (Similarity: 0.7809)
2. copycat chick-fil-a sandwich recipe ( hungry sunday ) (Similarity: 0.6981)
3. clothe organization : family 's closet say ( photo ) (Similarity: 0.6963)
4. 'la la land ' win bafta 's top prize , continue hot streak road oscar (Similarity: 0.6809)
5. guy 's get 2 word president , 's put d.c . (Similarity: 0.6600)
