In [3]:
import nltk
import numpy as np
from nltk.corpus import reuters, stopwords
from gensim.models import Word2Vec

# Fetch the NLTK resources used by this script, one nltk.download call per
# package name, in the order listed.
for resource_name in ("reuters", "punkt", "stopwords"):
    nltk.download(resource_name)

# English stop words, kept in a set because they are only used for
# membership tests while filtering tokens.
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

# Step 1: Prepare corpus for Word2Vec training
# Every Reuters document becomes one "sentence": the list of its lower-cased
# tokens, keeping only alphanumeric tokens that are not stop words.
corpus_sentences = []

for fileid in reuters.fileids():
    kept_tokens = []
    for token in reuters.words(fileid):
        # Skip punctuation and any other token that is not purely alphanumeric.
        if not token.isalnum():
            continue
        lowered = token.lower()
        # Stop words are compared in lower case.
        if lowered in stop_words:
            continue
        kept_tokens.append(lowered)
    corpus_sentences.append(kept_tokens)

# Step 2: Train Word2Vec (LOCAL ONLY)
# The training options are collected in one mapping so they can be read at a
# glance; their exact semantics are described in the gensim Word2Vec docs.
word2vec_options = {
    "vector_size": 100,  # length of each learned word vector
    "window": 5,         # context window size
    "min_count": 5,      # minimum word frequency threshold
    "workers": 4,        # number of worker threads
}
model = Word2Vec(sentences=corpus_sentences, **word2vec_options)

# Step 3: Compute document embeddings
# Each document is represented by the mean of the Word2Vec vectors of its
# in-vocabulary tokens; a document with no such token gets an all-zero vector.
document_ids = reuters.fileids()

# corpus_sentences (built in Step 1) already holds the filtered, lower-cased
# tokens of every document in reuters.fileids() order, so reuse it instead of
# reading and tokenizing the whole corpus a second time.
if len(corpus_sentences) != len(document_ids):
    raise ValueError(
        "corpus_sentences is out of sync with reuters.fileids(); "
        "re-run Step 1 before computing document embeddings"
    )

# Give the zero fallback the same dtype as the trained vectors. np.zeros
# defaults to float64, and a single float64 row would upcast the entire
# embedding matrix once the rows are combined with np.array.
embedding_dtype = model.wv.vectors.dtype

document_embeddings = []

for words in corpus_sentences:
    # Tokens dropped from the vocabulary during training have no vector.
    word_vectors = [model.wv[word] for word in words if word in model.wv]

    if word_vectors:
        doc_embedding = np.mean(word_vectors, axis=0)
    else:
        doc_embedding = np.zeros(model.vector_size, dtype=embedding_dtype)

    document_embeddings.append(doc_embedding)

# Persist the document embeddings as a single NumPy array file.
embedding_matrix = np.array(document_embeddings)
np.save("embeddings.npy", embedding_matrix)

# Write the full raw text of every document, one document per line, in
# document_ids order. Newlines inside a document are replaced by spaces.
with open("documents.txt", "w", encoding="utf-8") as out_file:
    for doc_id in document_ids:
        single_line = " ".join(reuters.raw(doc_id).split("\n"))
        print(single_line, file=out_file)

print("✅ embeddings.npy and documents.txt saved successfully!")

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


✅ embeddings.npy and documents.txt saved successfully!
