# 9. Word2Vec Embeddings using Gensim
 

In [None]:
import gensim.downloader as api

# Fetch the "word2vec-google-news-300" vectors through the gensim downloader
model = api.load("word2vec-google-news-300")

# Example usage: look up one word's vector, then its nearest neighbours
king_vector = model['king']
print(king_vector)  # vector for the word 'king'
king_neighbours = model.most_similar('king')
print(king_neighbours)  # similar words to 'king'


In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
# Fetch the tokenizer data that word_tokenize depends on. Recent NLTK releases
# look up the 'punkt_tab' resource rather than 'punkt', so request both;
# nltk.download() only reports (does not raise) if a package is unavailable.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Sample corpus
corpus = ["This is a sample sentence", "Word2Vec is fun to learn"]
# Lower-case each sentence before splitting it into word tokens
tokenized_corpus = [word_tokenize(sent.lower()) for sent in corpus]

# Train Word2Vec model.
# min_count=1 keeps every token: with a corpus this small, any higher
# threshold would drop most of the vocabulary.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Save and load (round-trip through disk to demonstrate persistence)
model.save("custom_word2vec.model")
model = Word2Vec.load("custom_word2vec.model")

# Trained vectors are accessed through the model's `wv` attribute
print(model.wv['sample'])
print(model.wv.most_similar('sample'))


# 10. IMDB Sentiment Analysis using Word2Vec + LSTM

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.datasets import imdb
import numpy as np

# Load the IMDB reviews (as integer sequences) and the word -> id table
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)
word_index = imdb.get_word_index()

# Invert the table so that ids can be decoded back into words
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

# Decode the first 1000 training reviews back to text; each id is shifted
# down by 3 before lookup and ids without a word become "?"
x_train_text = []
for seq in x_train[:1000]:
    decoded_words = [reverse_word_index.get(i - 3, "?") for i in seq]
    x_train_text.append(" ".join(decoded_words))

# Re-tokenize the decoded text and pad every review to 300 tokens
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train_text)
sequences = tokenizer.texts_to_sequences(x_train_text)
padded = pad_sequences(sequences, maxlen=300)

# Pre-trained Word2Vec vectors used to initialise the embedding layer
w2v = api.load("word2vec-google-news-300")
embedding_dim = 300
vocab_size = 1 + len(tokenizer.word_index)

# Build the embedding matrix: row i holds the Word2Vec vector of the word
# with tokenizer index i; words missing from Word2Vec keep an all-zero row
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word not in w2v:
        continue
    embedding_matrix[i] = w2v[word]

# LSTM classifier on top of the frozen pre-trained embeddings
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=300,
        trainable=False,
    ),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
# model.fit(padded, y_train[:1000], epochs=5, validation_split=0.2)  # Uncomment to train


# 11. IMDB Sentiment Analysis using BERT Classification
 

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Load dataset and the tokenizer matching the pre-trained checkpoint
dataset = load_dataset("imdb")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


# Tokenize
def tokenize_fn(examples):
    """Tokenize the 'text' field of a batch, padding and truncating to the max length."""
    return tokenizer(examples['text'], padding="max_length", truncation=True)


tokenized_ds = dataset.map(tokenize_fn, batched=True)

# Format for PyTorch: expose only the columns listed here, as torch tensors
tokenized_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Load pre-trained BERT with a 2-class classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Training args
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Trainer
# Both subsets are shuffled before taking the first rows: the IMDB splits are
# stored grouped by label, so an unshuffled head of the test split would
# contain a single class and make the evaluation meaningless.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'].shuffle(seed=42).select(range(2000)),  # use subset
    eval_dataset=tokenized_ds['test'].shuffle(seed=42).select(range(500)),  # use subset
)

# Train
trainer.train()
