In [1]:
import numpy as np
import pandas as pd
import re
import math
import matplotlib.pyplot as plt
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.manifold import TSNE
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import time

# Download necessary NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# Load the IMDB dataset
print("Loading IMDB dataset...")

train_review = load_files('./aclImdb/train/', encoding='utf-8')
x_train, y_train = train_review.data, train_review.target

test_review = load_files('./aclImdb/test/', encoding='utf-8')
x_test, y_test = test_review.data, test_review.target

print(f"Loaded {len(x_train)} training samples and {len(x_test)} test samples")
print(f"Labels: {train_review.target_names}")

# Preview a sample
print("\nSample review:")
print(x_train[0][:300], "...")

Loading IMDB dataset...
Loaded 25000 training samples and 25000 test samples
Labels: ['neg', 'pos']

Sample review:
Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mu ...


In [2]:
# Problem 1: Scratch implementation of BoW
print("\n\n===== Problem 1: Scratch implementation of BoW =====")

sentences = [
    "This movie is SOOOO funny!!!",
    "What a movie! I never",
    "best movie ever!!!!! this movie"
]

def create_bow(sentences, use_bigrams=False):
    # Preprocessing - convert to lowercase and remove punctuation
    processed_sentences = []
    for sentence in sentences:
        # Convert to lowercase
        sentence = sentence.lower()
        # Remove punctuation
        sentence = re.sub(r'[^\w\s]', '', sentence)
        processed_sentences.append(sentence)
    
    # Create vocabulary
    vocabulary = set()
    for sentence in processed_sentences:
        words = sentence.split()
        
        # Add unigrams to vocabulary
        vocabulary.update(words)
        
        # Add bigrams to vocabulary if required
        if use_bigrams:
            bigrams = [words[i] + " " + words[i+1] for i in range(len(words)-1)]
            vocabulary.update(bigrams)
    
    # Convert vocabulary to ordered list
    vocabulary = sorted(list(vocabulary))
    
    # Create BoW representation
    bow_matrix = []
    for sentence in processed_sentences:
        words = sentence.split()
        
        # Count unigrams
        bow_vector = {word: 0 for word in vocabulary}
        for word in words:
            if word in bow_vector:
                bow_vector[word] += 1
        
        # Count bigrams if required
        if use_bigrams:
            bigrams = [words[i] + " " + words[i+1] for i in range(len(words)-1)]
            for bigram in bigrams:
                if bigram in bow_vector:
                    bow_vector[bigram] += 1
        
        bow_matrix.append(bow_vector)
    
    return bow_matrix, vocabulary

# Compute unigram BoW
unigram_bow, unigram_vocab = create_bow(sentences, use_bigrams=False)
print("Unigram Vocabulary:", unigram_vocab)
print("\nUnigram BoW Matrix:")
for i, vec in enumerate(unigram_bow):
    print(f"Sentence {i+1}:", {word: count for word, count in vec.items() if count > 0})

# Compute bigram BoW
bigram_bow, bigram_vocab = create_bow(sentences, use_bigrams=True)
print("\nBigram Vocabulary:", bigram_vocab)
print("\nBigram BoW Matrix:")
for i, vec in enumerate(bigram_bow):
    print(f"Sentence {i+1}:", {word: count for word, count in vec.items() if count > 0 and " " in word})



===== Problem 1: Scratch implementation of BoW =====
Unigram Vocabulary: ['a', 'best', 'ever', 'funny', 'i', 'is', 'movie', 'never', 'soooo', 'this', 'what']

Unigram BoW Matrix:
Sentence 1: {'funny': 1, 'is': 1, 'movie': 1, 'soooo': 1, 'this': 1}
Sentence 2: {'a': 1, 'i': 1, 'movie': 1, 'never': 1, 'what': 1}
Sentence 3: {'best': 1, 'ever': 1, 'movie': 2, 'this': 1}

Bigram Vocabulary: ['a', 'a movie', 'best', 'best movie', 'ever', 'ever this', 'funny', 'i', 'i never', 'is', 'is soooo', 'movie', 'movie ever', 'movie i', 'movie is', 'never', 'soooo', 'soooo funny', 'this', 'this movie', 'what', 'what a']

Bigram BoW Matrix:
Sentence 1: {'is soooo': 1, 'movie is': 1, 'soooo funny': 1, 'this movie': 1}
Sentence 2: {'a movie': 1, 'i never': 1, 'movie i': 1, 'what a': 1}
Sentence 3: {'best movie': 1, 'ever this': 1, 'movie ever': 1, 'this movie': 1}


In [3]:
# Problem 2: Calculating TF-IDF
print("\n\n===== Problem 2: Calculating TF-IDF =====")

# Get NLTK stopwords
stop_words = stopwords.words('english')

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stop_words,  # Use NLTK stopwords
    max_features=5000,      # Limit vocabulary size to 5000
    ngram_range=(1, 1),     # Use unigrams
    norm='l2'               # Apply L2 normalization by default
)

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(x_train)

# Transform the test data using the same vocabulary
X_test_tfidf = tfidf_vectorizer.transform(x_test)

print(f"Training data shape: {X_train_tfidf.shape}")
print(f"Test data shape: {X_test_tfidf.shape}")
print(f"Vocabulary size: {len(tfidf_vectorizer.get_feature_names_out())}")

# Preview some of the vocabulary
print("\nSample vocabulary words:")
print(list(tfidf_vectorizer.get_feature_names_out()[:20]))





===== Problem 2: Calculating TF-IDF =====
Training data shape: (25000, 5000)
Test data shape: (25000, 5000)
Vocabulary size: 5000

Sample vocabulary words:
['00', '000', '10', '100', '11', '12', '13', '13th', '14', '15', '16', '17', '18', '1930', '1930s', '1933', '1940', '1950', '1950s', '1960']


In [4]:
# Problem 3: Learning using TF-IDF
print("\n\n===== Problem 3: Learning using TF-IDF =====")

def train_and_evaluate(tfidf_vectorizer_params):
    # Create TF-IDF vectorizer with given parameters
    tfidf_vectorizer = TfidfVectorizer(**tfidf_vectorizer_params)
    
    # Fit and transform the training data
    start_time = time.time()
    X_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
    X_test_tfidf = tfidf_vectorizer.transform(x_test)
    vectorize_time = time.time() - start_time
    
    # Train a logistic regression model
    start_time = time.time()
    classifier = LogisticRegression(random_state=42, max_iter=1000)
    classifier.fit(X_train_tfidf, y_train)
    train_time = time.time() - start_time
    
    # Predict and evaluate
    start_time = time.time()
    y_pred = classifier.predict(X_test_tfidf)
    predict_time = time.time() - start_time
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Print results
    print(f"Parameters: {tfidf_vectorizer_params}")
    print(f"Vocabulary size: {len(tfidf_vectorizer.get_feature_names_out())}")
    print(f"Vectorization time: {vectorize_time:.2f} seconds")
    print(f"Training time: {train_time:.2f} seconds")
    print(f"Prediction time: {predict_time:.2f} seconds")
    print(f"Accuracy: {accuracy:.4f}")
    print("-" * 80)
    
    return accuracy

# Experiment 1: Baseline with default settings
print("Experiment 1: Baseline")
baseline_params = {
    'stop_words': stopwords.words('english'),
    'max_features': 5000,
    'ngram_range': (1, 1)
}
baseline_accuracy = train_and_evaluate(baseline_params)

# Experiment 2: Increase vocabulary size
print("Experiment 2: Larger vocabulary")
large_vocab_params = {
    'stop_words': stopwords.words('english'),
    'max_features': 10000,
    'ngram_range': (1, 1)
}
large_vocab_accuracy = train_and_evaluate(large_vocab_params)

# Experiment 3: Include bigrams
print("Experiment 3: Include bigrams")
bigram_params = {
    'stop_words': stopwords.words('english'),
    'max_features': 5000,
    'ngram_range': (1, 2)
}
bigram_accuracy = train_and_evaluate(bigram_params)

# Experiment 4: No stop words
print("Experiment 4: No stop words")
no_stopwords_params = {
    'stop_words': None,
    'max_features': 5000,
    'ngram_range': (1, 1)
}
no_stopwords_accuracy = train_and_evaluate(no_stopwords_params)

# Summarize results
print("Summary of Results:")
print(f"Baseline (5000 features, unigrams, with stopwords): {baseline_accuracy:.4f}")
print(f"Larger vocabulary (10000 features): {large_vocab_accuracy:.4f}")
print(f"Including bigrams: {bigram_accuracy:.4f}")
print(f"No stopwords: {no_stopwords_accuracy:.4f}")



===== Problem 3: Learning using TF-IDF =====
Experiment 1: Baseline
Parameters: {'stop_words': ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'ot

In [5]:
# Problem 4: Scratch implementation of TF-IDF
print("\n\n===== Problem 4: Scratch implementation of TF-IDF =====")

def compute_tfidf(sentences, standard_formula=True):
    # Preprocessing
    processed_sentences = []
    for sentence in sentences:
        # Convert to lowercase
        sentence = sentence.lower()
        # Remove punctuation
        sentence = re.sub(r'[^\w\s]', '', sentence)
        processed_sentences.append(sentence)
    
    # Tokenize sentences into words
    tokenized_sentences = [sentence.split() for sentence in processed_sentences]
    
    # Build vocabulary
    vocabulary = set()
    for tokens in tokenized_sentences:
        vocabulary.update(tokens)
    vocabulary = sorted(list(vocabulary))
    
    # Calculate term frequency (TF)
    term_freq = []
    for tokens in tokenized_sentences:
        # Count word occurrences
        word_counts = {word: 0 for word in vocabulary}
        for token in tokens:
            if token in word_counts:
                word_counts[token] += 1
        
        if standard_formula:
            # Standard formula: tf(t,d) = n(t,d) / sum(n(s,d))
            total_words = sum(word_counts.values())
            if total_words > 0:  # Avoid division by zero
                tf = {word: count / total_words for word, count in word_counts.items()}
            else:
                tf = word_counts
        else:
            # scikit-learn formula: tf(t,d) = n(t,d)
            tf = word_counts
        
        term_freq.append(tf)
    
    # Calculate document frequency (DF)
    doc_freq = {word: 0 for word in vocabulary}
    for tokens in tokenized_sentences:
        # Count documents containing each word
        unique_tokens = set(tokens)
        for word in unique_tokens:
            if word in doc_freq:
                doc_freq[word] += 1
    
    # Calculate inverse document frequency (IDF)
    num_docs = len(tokenized_sentences)
    if standard_formula:
        # Standard formula: idf(t) = log(N / df(t))
        idf = {word: math.log(num_docs / df) if df > 0 else 0 for word, df in doc_freq.items()}
    else:
        # scikit-learn formula: idf(t) = log((1+N)/(1+df(t))) + 1
        idf = {word: math.log((1 + num_docs) / (1 + df)) + 1 for word, df in doc_freq.items()}
    
    # Calculate TF-IDF
    tfidf_matrix = []
    for tf in term_freq:
        tfidf = {word: tf[word] * idf[word] for word in vocabulary}
        tfidf_matrix.append(tfidf)
    
    return tfidf_matrix, vocabulary

# Test sentences
sentences = [
    "This movie is SOOOO funny!!!",
    "What a movie! I never",
    "best movie ever!!!!! this movie"
]

# Calculate TF-IDF using standard formula
standard_tfidf, vocabulary = compute_tfidf(sentences, standard_formula=True)
print("Standard TF-IDF Formula:")
for i, tfidf in enumerate(standard_tfidf):
    print(f"Sentence {i+1}:")
    for word, value in sorted(tfidf.items(), key=lambda x: -x[1])[:5]:  # Show top 5 highest values
        print(f"  {word}: {value:.4f}")

# Calculate TF-IDF using scikit-learn formula
sklearn_tfidf, _ = compute_tfidf(sentences, standard_formula=False)
print("\nScikit-learn TF-IDF Formula:")
for i, tfidf in enumerate(sklearn_tfidf):
    print(f"Sentence {i+1}:")
    for word, value in sorted(tfidf.items(), key=lambda x: -x[1])[:5]:  # Show top 5 highest values
        print(f"  {word}: {value:.4f}")



===== Problem 4: Scratch implementation of TF-IDF =====
Standard TF-IDF Formula:
Sentence 1:
  funny: 0.2197
  is: 0.2197
  soooo: 0.2197
  this: 0.0811
  a: 0.0000
Sentence 2:
  a: 0.2197
  i: 0.2197
  never: 0.2197
  what: 0.2197
  best: 0.0000
Sentence 3:
  best: 0.2197
  ever: 0.2197
  this: 0.0811
  a: 0.0000
  funny: 0.0000

Scikit-learn TF-IDF Formula:
Sentence 1:
  funny: 1.6931
  is: 1.6931
  soooo: 1.6931
  this: 1.2877
  movie: 1.0000
Sentence 2:
  a: 1.6931
  i: 1.6931
  never: 1.6931
  what: 1.6931
  movie: 1.0000
Sentence 3:
  movie: 2.0000
  best: 1.6931
  ever: 1.6931
  this: 1.2877
  a: 0.0000


In [10]:
# Problem 5: Corpus preprocessing

print("\n\n===== Problem 5: Corpus preprocessing =====")


# def preprocess_text(text):
    
#     # Convert to lowercase
#     text = text.lower()
    
#     # Remove URLs
#     text = re.sub(r'http\S+', '', text)
    
#     # Remove HTML tags
#     text = re.sub(r'<.*?>', '', text)
    
#     # Remove special characters and numbers
#     text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
#     # Tokenize text
#     tokens = word_tokenize(text)
    
#     # Remove extra spaces and empty tokens
#     tokens = [token for token in tokens if token.strip()]
    
#     return tokens

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and replace with spaces
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
    # Simple tokenization by splitting on whitespace
    tokens = text.split()
    
    # Remove empty tokens
    tokens = [token for token in tokens if token.strip()]
    
    return tokens

# Preprocess training and testing data
preprocessed_train = [preprocess_text(review) for review in x_train]
preprocessed_test = [preprocess_text(review) for review in x_test]

# Print statistics
print(f"Number of training reviews: {len(preprocessed_train)}")
print(f"Number of test reviews: {len(preprocessed_test)}")

# Print a sample preprocessed review
sample_idx = 0
print(f"Original review (first 150 chars): {x_train[sample_idx][:150]}...")
print(f"Preprocessed review (first 30 tokens): {preprocessed_train[sample_idx][:30]}...")




===== Problem 5: Corpus preprocessing =====
Number of training reviews: 25000
Number of test reviews: 25000
Original review (first 150 chars): Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It...
Preprocessed review (first 30 tokens): ['zero', 'day', 'leads', 'you', 'to', 'think', 'even', 're', 'think', 'why', 'two', 'boys', 'young', 'men', 'would', 'do', 'what', 'they', 'did', 'commit', 'mutual', 'suicide', 'via', 'slaughtering', 'their', 'classmates', 'it', 'captures', 'what', 'must']...


In [11]:
# Problem 6: Learning Word2Vec
print("\n\n===== Problem 6: Learning Word2Vec =====")

# Set Word2Vec parameters
vector_size = 100   # Dimensionality of the word vectors
window = 5          # Maximum distance between current and predicted word
min_count = 5       # Minimum word frequency to include in vocabulary
workers = 4         # Number of CPU cores to use

# Train Word2Vec model
start_time = time.time()
w2v_model = Word2Vec(
    sentences=preprocessed_train,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    workers=workers,
    sg=1  # Use skip-gram (sg=1) instead of CBOW (sg=0)
)
training_time = time.time() - start_time

# Print model information
print(f"Word2Vec model trained in {training_time:.2f} seconds")
print(f"Vocabulary size: {len(w2v_model.wv.key_to_index)}")

# Save the model (optional)
# w2v_model.save("imdb_word2vec.model")

# Explore some word vectors
common_words = ['movie', 'good', 'bad', 'excellent', 'terrible']
for word in common_words:
    if word in w2v_model.wv:
        print(f"\nVector for '{word}' (first 5 dimensions):")
        print(w2v_model.wv[word][:5])
        
        # Find similar words
        similar_words = w2v_model.wv.most_similar(word, topn=5)
        print(f"Most similar words to '{word}':")
        for similar_word, similarity in similar_words:
            print(f"  {similar_word}: {similarity:.4f}")



===== Problem 6: Learning Word2Vec =====
Word2Vec model trained in 101.37 seconds
Vocabulary size: 28770

Vector for 'movie' (first 5 dimensions):
[ 0.22120029  0.23028044  0.16105013 -0.04356096 -0.1844663 ]
Most similar words to 'movie':
  film: 0.9156
  programme: 0.7718
  flick: 0.7713
  monstrosity: 0.7538
  loooong: 0.7387

Vector for 'good' (first 5 dimensions):
[-0.37467164 -0.08395848 -0.28677136  0.08928741 -0.1305031 ]
Most similar words to 'good':
  decent: 0.8041
  great: 0.7842
  bad: 0.7632
  fine: 0.7312
  nice: 0.7229

Vector for 'bad' (first 5 dimensions):
[-0.14845057 -0.14544067 -0.32181013  0.00300282  0.13404873]
Most similar words to 'bad':
  terrible: 0.8243
  horrible: 0.7884
  awful: 0.7787
  good: 0.7632
  lousy: 0.7443

Vector for 'excellent' (first 5 dimensions):
[ 0.11832446  0.04754427  0.20652366 -0.05519987 -0.15238181]
Most similar words to 'excellent':
  outstanding: 0.8729
  exceptional: 0.8139
  terrific: 0.7853
  superb: 0.7769
  fantastic: 0.776

In [12]:
# Problem 7: Vector Visualization
print("\n\n===== Problem 7: Vector Visualization =====")

def visualize_embeddings(model, words=None, n_words=50):
    """
    Visualize word embeddings using t-SNE
    
    Parameters:
    -----------
    model : Word2Vec model
        Trained Word2Vec model
    words : list, optional
        Specific words to visualize. If None, top n_words by frequency will be used
    n_words : int, optional
        Number of words to visualize if words is None
    """
    # Get embedding matrix
    if words is None:
        # Get the most common words
        words = [word for word, vocab in 
                 sorted(model.wv.key_to_index.items(), 
                        key=lambda item: model.wv.get_vecattr(item[0], "count"),
                        reverse=True)[:n_words]]
    else:
        # Filter out words not in vocabulary
        words = [word for word in words if word in model.wv]
    
    # Extract word vectors
    word_vectors = np.array([model.wv[word] for word in words])
    
    # Apply t-SNE dimensionality reduction
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(words)-1))
    embedded_vectors = tsne.fit_transform(word_vectors)
    
    # Create plot
    plt.figure(figsize=(10, 8))
    
    # Plot all points
    plt.scatter(embedded_vectors[:, 0], embedded_vectors[:, 1], s=10, alpha=0.5)
    
    # Annotate words
    for i, word in enumerate(words):
        plt.annotate(word, xy=(embedded_vectors[i, 0], embedded_vectors[i, 1]),
                     xytext=(5, 2), textcoords='offset points',
                     ha='right', va='bottom', fontsize=9)
    
    plt.title("t-SNE visualization of Word2Vec embeddings")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Define interesting word groups to visualize
sentiment_words = ['great', 'good', 'excellent', 'fantastic', 'terrible', 'bad', 'awful', 'horrible']
movie_words = ['movie', 'film', 'cinema', 'documentary', 'scene', 'actor', 'actress', 'director']
rating_words = ['star', 'rating', 'review', 'recommend', 'suggest', 'watch', 'avoid']

# Filter to include only words in vocabulary
sentiment_words = [word for word in sentiment_words if word in w2v_model.wv]
movie_words = [word for word in movie_words if word in w2v_model.wv]
rating_words = [word for word in rating_words if word in w2v_model.wv]

print(f"Sentiment words in vocabulary: {sentiment_words}")
print(f"Movie words in vocabulary: {movie_words}")
print(f"Rating words in vocabulary: {rating_words}")

print("Visualizing would display plots here, but in a real environment we'd see the t-SNE plots")



===== Problem 7: Vector Visualization =====
Sentiment words in vocabulary: ['great', 'good', 'excellent', 'fantastic', 'terrible', 'bad', 'awful', 'horrible']
Movie words in vocabulary: ['movie', 'film', 'cinema', 'documentary', 'scene', 'actor', 'actress', 'director']
Rating words in vocabulary: ['star', 'rating', 'review', 'recommend', 'suggest', 'watch', 'avoid']
Visualizing would display plots here, but in a real environment we'd see the t-SNE plots


In [13]:
# Problem 8: Movie review classification using Word2Vec
print("\n\n===== Problem 8: Movie review classification using Word2Vec =====")

def create_document_vector(tokens, word_vectors, vector_size=100):
    """
    Create a document vector by averaging word vectors
    
    Parameters:
    ----------
    tokens : list
        List of words in the document
    word_vectors : Word2Vec.wv
        Word vectors from trained Word2Vec model
    vector_size : int
        Dimensionality of word vectors
    
    Returns:
    -------
    numpy.ndarray
        Document vector of shape (vector_size,)
    """
    # Initialize document vector
    doc_vector = np.zeros(vector_size)
    
    # Count words with vectors
    word_count = 0
    
    # Sum all word vectors
    for token in tokens:
        if token in word_vectors:
            doc_vector += word_vectors[token]
            word_count += 1
    
    # Average the vectors
    if word_count > 0:
        doc_vector /= word_count
    
    return doc_vector

def vectorize_documents(documents, word_vectors, vector_size=100):
    """
    Vectorize a list of documents using Word2Vec
    
    Parameters:
    ----------
    documents : list
        List of tokenized documents
    word_vectors : Word2Vec.wv
        Word vectors from trained Word2Vec model
    vector_size : int
        Dimensionality of word vectors
    
    Returns:
    -------
    numpy.ndarray
        Document vectors of shape (n_documents, vector_size)
    """
    # Initialize matrix for all document vectors
    doc_vectors = np.zeros((len(documents), vector_size))
    
    # Process each document
    for i, tokens in enumerate(documents):
        doc_vectors[i] = create_document_vector(tokens, word_vectors, vector_size)
        
    return doc_vectors

# Vectorize documents using our trained Word2Vec model
print("Vectorizing documents...")
start_time = time.time()
X_train_w2v = vectorize_documents(preprocessed_train, w2v_model.wv, vector_size)
X_test_w2v = vectorize_documents(preprocessed_test, w2v_model.wv, vector_size)
vectorize_time = time.time() - start_time
print(f"Vectorization completed in {vectorize_time:.2f} seconds")

# Train a logistic regression model on Word2Vec vectors
print("\nTraining logistic regression on Word2Vec vectors...")
classifier = LogisticRegression(max_iter=1000, random_state=42)
start_time = time.time()
classifier.fit(X_train_w2v, y_train)
train_time = time.time() - start_time

# Predict
start_time = time.time()
y_pred = classifier.predict(X_test_w2v)
predict_time = time.time() - start_time

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Training time: {train_time:.2f} seconds")
print(f"Prediction time: {predict_time:.2f} seconds")
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

# Compare Word2Vec with TF-IDF
print("\nComparison of methods:")
print(f"1. TF-IDF (baseline): {baseline_accuracy:.4f}")
print(f"2. Word2Vec average vectors: {accuracy:.4f}")



===== Problem 8: Movie review classification using Word2Vec =====
Vectorizing documents...
Vectorization completed in 39.76 seconds

Training logistic regression on Word2Vec vectors...
Training time: 0.62 seconds
Prediction time: 0.01 seconds
Accuracy: 0.8535
              precision    recall  f1-score   support

           0       0.85      0.86      0.86     12500
           1       0.86      0.84      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000


Comparison of methods:
1. TF-IDF (baseline): 0.8810
2. Word2Vec average vectors: 0.8535
