In [21]:
import numpy as np
import torch
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Download and load pre-trained GloVe embeddings
from gensim.models import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format('glove.6B.100d.txt', no_header=True)

# Download and load pre-trained ELMo model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ainize/bart-base-cnn")
elmo_model = AutoModelForSequenceClassification.from_pretrained("ainize/bart-base-cnn")

# Define a function to preprocess text data
def preprocess_text(text):
  # Lowercase, remove punctuation, tokenize
  text = text.lower().strip().replace(",", " ").replace(".", " ")
  tokens = tokenizer.tokenize(text, return_tensors="pt")
  return tokens

# Load dataset
from datasets import load_dataset
data = load_dataset("imdb")

# Split data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["train"]["text"], data["train"]["label"], test_size=0.2
)

# Function to get word embeddings (Glove or ELMO)
def get_embeddings(text, model_type="glove"):
  embeddings = []
  for word in text.split():
    if model_type == "glove":
      # Check if word exists in vocabulary
      if word in glove_model.key_to_index:
        embeddings.append(glove_model[word])
      else:
        embeddings.append(np.zeros(100))  # Use zero vector for missing words
    elif model_type == "elmo":
      # Get ELMo embeddings for the entire sentence
      with torch.no_grad():
        inputs = preprocess_text(text)
        encoded_layers = elmo_model(*inputs)
        embeddings = encoded_layers[0].squeeze(0).cpu().numpy()
    else:
      raise ValueError("Invalid model type. Choose 'glove' or 'elmo'.")
  # Average word embeddings to get sentence embedding
  return np.mean(embeddings, axis=0)


Some weights of the model checkpoint at ainize/bart-base-cnn were not used when initializing BartForSequenceClassification: ['lm_head.weight', 'final_logits_bias']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at ainize/bart-base-cnn and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-strea

In [29]:
# Train and evaluate models with Glove and ELMO embeddings
def train_evaluate_model(embeddings, train_labels, test_labels):
  # Train a logistic regression model
  model = LogisticRegression(max_iter=1000)
  model.fit(embeddings, train_labels)
  predictions = model.predict(embeddings)
  accuracy = np.mean(predictions == train_labels)
  return accuracy

In [30]:

# Evaluate Glove and ELMO embeddings
glove_embeddings = [get_embeddings(text, model_type="glove") for text in train_texts]
#elmo_embeddings = [get_embeddings(text, model_type="elmo") for text in train_texts]

glove_accuracy = train_evaluate_model(glove_embeddings, train_labels, test_labels)
#elmo_accuracy = train_evaluate_model(elmo_embeddings, train_labels, test_labels)

print("Glove embedding accuracy:", glove_accuracy)
#print("ELMo embedding accuracy:", elmo_accuracy)

Glove embedding accuracy: 0.7876
