In [77]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim

In [78]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [79]:
def tokenize_text(text):
    tokenized_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        pad_to_max_length=True,
        truncation=True,  # Explicitly activate truncation
        return_attention_mask=True,
        return_tensors='pt'
    )
    return tokenized_text['input_ids'], tokenized_text['attention_mask']

In [80]:
# Define a Support Vector Machine (SVM) classifier
svm_classifier = SVC(kernel='linear')

# Define a neural network for sentiment analysis
class SentimentAnalysisNN(nn.Module):
    def __init__(self):
        super(SentimentAnalysisNN, self).__init__()
        self.embedding = bert_model
        self.fc = nn.Linear(768, 1)  # Output 1 because it's binary sentiment classification

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.embedding(input_ids=input_ids, attention_mask=attention_mask)
        output = self.fc(pooled_output)
        return torch.sigmoid(output)

In [81]:
# Define a neural network for contextual analysis
class ContextualAnalysisNN(nn.Module):
    def __init__(self):
        super(ContextualAnalysisNN, self).__init__()
        self.embedding = bert_model
        self.fc = nn.Linear(768, 1)  # Output 1 because it's binary classification

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.embedding(input_ids=input_ids, attention_mask=attention_mask)
        output = self.fc(pooled_output)
        return torch.sigmoid(output)
    
    # Initialize sentiment analysis neural network
sentiment_model = SentimentAnalysisNN()

# Initialize contextual analysis neural network
contextual_model = ContextualAnalysisNN()

# Define loss function and optimizer for sentiment analysis
sentiment_criterion = nn.BCELoss()
sentiment_optimizer = optim.Adam(sentiment_model.parameters(), lr=0.001)

# Define loss function and optimizer for contextual analysis
contextual_criterion = nn.BCELoss()
contextual_optimizer = optim.Adam(contextual_model.parameters(), lr=0.001)

In [82]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move models and data to the available device
sentiment_model.to(device)
contextual_model.to(device)

ContextualAnalysisNN(
  (embedding): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [83]:
# Training loop for sentiment analysis
def train_sentiment(train_loader, num_epochs=5):
    for _ in range(num_epochs):
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            sentiment_optimizer.zero_grad()

            outputs = sentiment_model(*inputs)
            loss = sentiment_criterion(outputs, labels.unsqueeze(1).float())
            loss.backward()
            sentiment_optimizer.step()

In [84]:
# Training loop for contextual analysis
def train_contextual(train_loader, num_epochs=5):
    for _ in range(num_epochs):  # Use _ instead of epoch as it's not being used
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            contextual_optimizer.zero_grad()

            outputs = contextual_model(inputs)
            loss = contextual_criterion(outputs, labels.unsqueeze(1).float())
            loss.backward()
            contextual_optimizer.step()

In [85]:
# Function to predict sentiment
def predict_sentiment(text):
    input_ids, attention_mask = tokenize_text(text)
    with torch.no_grad():
        output = sentiment_model(input_ids.unsqueeze(0), attention_mask.unsqueeze(0))
    return output.item()

In [86]:
# Function to predict contextual analysis
def predict_contextual(text):
    input_ids, attention_mask = tokenize_text(text)
    with torch.no_grad():
        output = contextual_model(input_ids.unsqueeze(0), attention_mask.unsqueeze(0))
    return output.item()

In [87]:
# Example usage
text = "I did not steal the money."
sentiment_score = predict_sentiment(text)
contextual_score = predict_contextual(text)
print("Sentiment score:", sentiment_score)
print("Contextual score:", contextual_score)

ValueError: too many values to unpack (expected 2)

In [3]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Input texts
input_text_1 = "Young women, 'account for a shocking 74% of all new HIV infections among adolescents in Africa.'"
input_text_2 = "HIV infections are disproportionately affecting young women in Africa."

# Tokenize input texts
input_ids_1 = tokenizer.encode(input_text_1, add_special_tokens=True, return_tensors="pt")
input_ids_2 = tokenizer.encode(input_text_2, add_special_tokens=True, return_tensors="pt")

# Perform forward passes
with torch.no_grad():
    outputs_1 = model(input_ids_1)
    outputs_2 = model(input_ids_2)

# Get the CLS embeddings
cls_embedding_1 = outputs_1.last_hidden_state[:, 0, :]  # CLS embedding for input_text_1
cls_embedding_2 = outputs_2.last_hidden_state[:, 0, :]  # CLS embedding for input_text_2

# Convert CLS embeddings to numpy arrays
cls_embedding_np_1 = cls_embedding_1.numpy()
cls_embedding_np_2 = cls_embedding_2.numpy()

# Calculate cosine similarity
similarity_score = cosine_similarity(cls_embedding_np_1, cls_embedding_np_2)

print("Cosine Similarity Score:", similarity_score[0][0])

Cosine Similarity Score: 0.77243125


In [7]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Input texts
input_text_1 = "Young women, 'account for a shocking 74% of all new HIV infections among adolescents in Africa.'"
input_text_2 = "HIV infections are disproportionately affecting young women in Africa."

# Tokenize input texts
input_ids_1 = tokenizer.encode(input_text_1, add_special_tokens=True, return_tensors="pt")
input_ids_2 = tokenizer.encode(input_text_2, add_special_tokens=True, return_tensors="pt")

# Perform forward passes
with torch.no_grad():
    outputs_1 = model(input_ids_1)
    outputs_2 = model(input_ids_2)

# Get the CLS embeddings
cls_embedding_1 = outputs_1.last_hidden_state[:, 0, :].detach().numpy()  # CLS embedding for input_text_1
cls_embedding_2 = outputs_2.last_hidden_state[:, 0, :].detach().numpy()  # CLS embedding for input_text_2

# Calculate cosine similarity
cosine_similarity_score = cosine_similarity(cls_embedding_1, cls_embedding_2)

# Calculate Euclidean distance
euclidean_distance = euclidean_distances(cls_embedding_1, cls_embedding_2)

# Convert embeddings to sets for Jaccard similarity
set_1 = set(np.where(cls_embedding_1 > 0)[1])
set_2 = set(np.where(cls_embedding_2 > 0)[1])

# Calculate Jaccard similarity manually
intersection = len(set_1.intersection(set_2))
union = len(set_1.union(set_2))
jaccard_similarity = intersection / union

print("Cosine Similarity Score:", cosine_similarity_score[0][0])
print("Euclidean Distance:", euclidean_distance[0][0])
print("Jaccard Similarity Score:", jaccard_similarity)


Cosine Similarity Score: 0.77243125
Euclidean Distance: 10.458046
Jaccard Similarity Score: 0.567287784679089


In [9]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Input text
input_text = "Young women, 'account for a shocking 74% of all new HIV infections among adolescents in Africa.'"

# Tokenize input text
input_ids = tokenizer.encode(input_text, add_special_tokens=True, return_tensors="pt")

# Perform forward pass
with torch.no_grad():
    outputs = model(input_ids)

# Get the contextual embeddings for each token
contextual_embeddings = outputs.last_hidden_state[0]  # Batch dimension is 1

# Convert contextual embeddings to numpy array
contextual_embeddings_np = contextual_embeddings.numpy()

# Get the vocabulary from the tokenizer
vocab = tokenizer.get_vocab()

# Map indices to tokens
indices_to_tokens = {idx: token for token, idx in vocab.items()}

# Find the top similar words for each word and calculate scores
word_scores = {}
for i in range(len(input_ids[0])):
    word = indices_to_tokens[input_ids[0][i].item()]
    word_embedding = contextual_embeddings_np[i].reshape(1, -1)
    similarity_scores = cosine_similarity(word_embedding, contextual_embeddings_np)[0]
    most_similar_indices = np.argsort(similarity_scores)[::-1][1:6]  # Exclude the word itself
    most_similar_words = [indices_to_tokens[idx] for idx in most_similar_indices]
    average_similarity = np.mean(similarity_scores[most_similar_indices])
    word_scores[word] = average_similarity

# Print scores
print("Word Scores:")
for word, score in word_scores.items():
    print(word + ":", score)


Word Scores:
[CLS]: 0.37001008
young: 0.5208982
women: 0.54647523
,: 0.48232278
': 0.45623493
account: 0.67192876
for: 0.7431141
a: 0.7444166
shocking: 0.61947006
74: 0.41476735
%: 0.63859886
of: 0.7561859
all: 0.7466289
new: 0.6660677
hiv: 0.5798317
infections: 0.6483834
among: 0.6813404
adolescents: 0.58263814
in: 0.6916919
africa: 0.5170099
.: 0.32017565
[SEP]: 0.31911638
