In [54]:
import pandas as pd 
import ssl
import nltk
import re
import time 
import numpy as np
import warnings
import matplotlib 
import torch
from sklearn.feature_extraction.text import CountVectorizer

from utils.system import *

from transformers import BertModel, BertTokenizer
from scipy.spatial.distance import cosine
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# To suppress all warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="the specific warning message", module="the_module")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\weigfan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Functions

In [55]:
# Function to split text into sentences
def split_into_sentences(text):
    # Correcting common abbreviations and numbers with periods to avoid incorrect splits
    text = re.sub(r'\b(e.g.|i.e.|etc.)\b', lambda x: x.group().replace('.', ''), text)
    text = re.sub(r'(\d+)\.', r'\1', text)
    text = re.sub(r'\d+', '', text)
    # Tokenize the corrected text into sentences
    return sent_tokenize(text)

# Mean pooling function
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

# Function to get word embedding using mean pooling
def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors="pt")
    with torch.no_grad():  # Disable gradient calculation for efficiency
        outputs = model(**inputs)
    attention_mask = inputs['attention_mask']
    word_embedding = mean_pooling(outputs, attention_mask)
    return word_embedding.squeeze().numpy()

# Function to find top N closest words
def cosine_similarity_for_words(word, word_list):
    word_embedding = get_word_embedding(word)
    closest_words = []

    for other_word in word_list:
        other_word_embedding = get_word_embedding(other_word)
        similarity = 1 - cosine(word_embedding, other_word_embedding)
        closest_words.append((other_word, similarity))

    return closest_words

# Function to find top N closest words
def top_n_closest_words(word, word_list, n=5):
    word_embedding = get_word_embedding(word)
    closest_words = []

    for other_word in word_list:
        other_word_embedding = get_word_embedding(other_word)
        similarity = 1 - cosine(word_embedding, other_word_embedding)
        closest_words.append((other_word, similarity))

    # Sort by similarity
    closest_words.sort(key=lambda x: x[1], reverse=True)

    return closest_words[:n]

### Load Data

In [None]:
# Load BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [None]:
data = pd.read_parquet(get_data() / 'clean_data.parquet.brotli')

### Get Sentence Data

In [58]:
# Get Lonely Articles
lonely_data = data.copy(deep=True)
lonely_data = lonely_data.reset_index(drop=True)
lonely_data['id'] = lonely_data.index

In [59]:
# Split text into sentences
lonely_data['sentences'] = lonely_data['cleaned_article'].apply(split_into_sentences)

### Convert to Word Corpus

In [60]:
corpus = []
for i, text in enumerate(lonely_data['sentences']):
    post = text
    post = ' '.join(post)
    corpus.append(post)

### Get Cosine Similarity (Ngram = 1)

In [61]:
size_of_feature=3000
cv=CountVectorizer(max_features = size_of_feature, stop_words='english', ngram_range=(1, 1))
X=cv.fit_transform(corpus)
feat_dict=cv.get_feature_names_out()
X.shape

(7999, 3000)

In [62]:
# Apply the get_bert_embedding function to your cleaned sentences
start_time = time.time()
word_to_compare = "loneliness"
top_n_closest_wd_loneliness = cosine_similarity_for_words(word_to_compare, feat_dict)

word_to_compare = "lonely"
top_n_closest_wd_lonely = cosine_similarity_for_words(word_to_compare, feat_dict)

word_to_compare = "aloneness"
top_n_closest_wd_aloneness = cosine_similarity_for_words(word_to_compare, feat_dict)

word_to_compare = "isolation"
top_n_closest_wd_isolation = cosine_similarity_for_words(word_to_compare, feat_dict)

total_time = time.time() - start_time
print(f"Total Time: {total_time}")

Total Time: 236.8773136138916


#### Concatenate and Calculate Average Cosine Similarity

In [None]:
df_top_n_closest_wd_loneliness = pd.DataFrame(top_n_closest_wd_loneliness, columns=['word', 'loneliness-similarity'])
df_top_n_closest_wd_lonely = pd.DataFrame(top_n_closest_wd_lonely, columns=['word', 'lonely-similarity'])
df_top_n_closest_wd_aloneness = pd.DataFrame(top_n_closest_wd_aloneness, columns=['word', 'aloneness-similarity'])
df_top_n_closest_wd_isolation = pd.DataFrame(top_n_closest_wd_isolation, columns=['word', 'isolation-similarity'])

df_top_n_closet_wd = df_top_n_closest_wd_loneliness
df_top_n_closet_wd['lonely-similarity'] = df_top_n_closest_wd_lonely['lonely-similarity']
df_top_n_closet_wd['aloneness-similarity'] = df_top_n_closest_wd_aloneness['aloneness-similarity']
df_top_n_closet_wd['isolation-similarity'] = df_top_n_closest_wd_isolation['isolation-similarity']

In [None]:
# Export Data
df_top_n_closet_wd['average'] = df_top_n_closet_wd[['loneliness-similarity', 'lonely-similarity', 'aloneness-similarity','isolation-similarity']].mean(axis=1)
df_top_n_closet_wd_sorted.to_parquet(get_data() / 'bert_word_cosine.parquet.brotli', compression='brotli')

### Get Cosine Similarity (Ngram = 1)

In [76]:
size_of_feature=3000
cv=CountVectorizer(max_features = size_of_feature, stop_words='english', ngram_range=(2, 2))
X=cv.fit_transform(corpus)
feat_dict=cv.get_feature_names_out()
X.shape

(7999, 3000)

### Get Word Embeddings

In [77]:
# Apply the get_bert_embedding function to your cleaned sentences
start_time = time.time()
word_to_compare = "feel lonely"

top_n_closest_phrase = top_n_closest_words(word_to_compare, feat_dict, 2000)
total_time = time.time() - start_time
print(f"Total Time: {total_time}")

Total Time: 67.83481073379517
