In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

In [None]:
from gensim.corpora import WikiCorpus
import os

# Source of the English Wikipedia dump. Nothing in this cell fetches it:
# the archive is expected to already sit next to the notebook under the
# URL's final path component.
wiki_dump_url = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"
filename = wiki_dump_url.rsplit("/", 1)[-1]

# Plain-text corpus produced from the dump (one article per line).
output_fname = "wiki_texts.txt"

def process_wiki(inp, outp):
    """Convert Wikipedia xml dump file to text corpus.

    Each article returned by ``WikiCorpus.get_texts()`` is written to
    ``outp`` as one line of space-separated tokens.

    The text is first written to a sibling ``<outp>.part`` file and only
    renamed to ``outp`` once every article has been written. Callers that
    use the existence of ``outp`` as a "corpus is ready" marker therefore
    never see a truncated file left behind by a crash or an interrupted cell.

    Args:
        inp: Path to the ``*.xml.bz2`` Wikipedia dump.
        outp: Path of the text file to create (overwritten on success).

    Raises:
        Whatever ``WikiCorpus`` or the file system raises; the partial file
        is removed before the exception propagates.
    """
    # A non-None dictionary makes WikiCorpus skip its own vocabulary scan
    # (per the gensim docs); only the token stream is needed here.
    wiki = WikiCorpus(inp, dictionary={})
    tmp_outp = os.fspath(outp) + '.part'
    try:
        with open(tmp_outp, 'w', encoding='utf-8') as output:
            for text in wiki.get_texts():
                output.write(' '.join(text) + '\n')
    except BaseException:
        # BaseException so that a notebook interrupt (KeyboardInterrupt)
        # also cleans up the half-written file.
        if os.path.exists(tmp_outp):
            os.remove(tmp_outp)
        raise
    os.replace(tmp_outp, outp)
    print('Processing complete!')

# Build the plain-text corpus once; later runs reuse the cached file.
if os.path.exists(output_fname):
    print(f"{output_fname} already exists. Skipping processing.")
else:
    # Fail early with an actionable message: nothing above downloads the
    # dump, and starting the conversion without it would only surface an
    # opaque error from deep inside the reader.
    if not os.path.exists(filename):
        raise FileNotFoundError(
            f"{filename} not found. Download it from {wiki_dump_url} "
            "and place it next to this notebook before running this cell."
        )
    print("Extracting and processing Wikipedia dump...")
    process_wiki(filename, output_fname)

print(f"Corpus is ready in {output_fname}")

# Step 3: Prepare the corpus for Word2Vec (similar to previous example)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK data used by preprocess(). word_tokenize relies on the
# Punkt sentence models, which older NLTK releases distribute as 'punkt'
# and newer ones as 'punkt_tab'; request both so the notebook works with
# either. NOTE(review): nltk.download is expected to report (not raise on)
# an id the installed version does not know - confirm for very old NLTK.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Set membership keeps the per-token stop-word test cheap.
stop_words = set(stopwords.words('english'))

def preprocess(sentence):
    """Tokenize one line of text for Word2Vec.

    The text is lower-cased and split with ``word_tokenize``; tokens that
    are not purely alphabetic or that appear in ``stop_words`` are dropped.

    Args:
        sentence: Raw text of a single line/document.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    kept = []
    for word in word_tokenize(sentence.lower()):
        if not word.isalpha():
            continue
        if word in stop_words:
            continue
        kept.append(word)
    return kept

# This function will process the corpus in chunks to handle large files
def process_corpus(input_file, chunk_size=1000):
    """Stream a text corpus as batches of preprocessed lines.

    The file is read lazily, so only one batch of lines is held in memory
    at a time.

    Args:
        input_file: Path to a UTF-8 text file, one document per line.
        chunk_size: Maximum number of lines per batch. ``None``, 0 or a
            negative value means "no limit" (the whole file in one batch).

    Yields:
        list: ``preprocess(line)`` for each line of the batch, in file
        order. Every batch except possibly the last holds exactly
        ``chunk_size`` entries.
    """
    from itertools import islice

    # file.readlines(n) treats n as a size hint in characters, not a line
    # count, so it cannot be used to get batches of chunk_size lines.
    limit = chunk_size if chunk_size and chunk_size > 0 else None

    with open(input_file, 'r', encoding='utf-8') as f:
        while True:
            lines = list(islice(f, limit))
            if not lines:
                break
            yield [preprocess(line) for line in lines]


In [None]:
class _SentenceStream:
    """Re-iterable stream of tokenized sentences read from a corpus file.

    Word2Vec walks the corpus once to build the vocabulary and again for
    every training epoch, so it needs an object whose ``__iter__`` starts
    over each time. A bare generator from ``process_corpus`` would be used
    up by ``build_vocab`` and leave ``train`` with nothing to iterate.
    """

    def __init__(self, path):
        self.path = path

    def __iter__(self):
        for chunk in process_corpus(self.path):
            # process_corpus yields batches (lists of token lists), while
            # Word2Vec expects each item to be one sentence (a list of
            # str), so hand the sentences out one by one.
            yield from chunk


# NOTE: every pass re-runs the NLTK preprocessing over the whole file.
preprocessed_corpus = _SentenceStream(output_fname)
model = Word2Vec(vector_size=100, window=5, min_count=5, workers=4)
model.build_vocab(preprocessed_corpus)
model.train(preprocessed_corpus, total_examples=model.corpus_count, epochs=5)

In [None]:
# Five nearest neighbours of "king" in the trained embedding space.
model.wv.most_similar(positive=['king'], topn=5)