In [54]:
import pandas as pd 
import ssl
import nltk
import re
import time 
import numpy as np
import warnings
import matplotlib 
import torch
from sklearn.feature_extraction.text import CountVectorizer

from utils.system import *

from transformers import BertModel, BertTokenizer
from scipy.spatial.distance import cosine
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# To suppress all warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="the specific warning message", module="the_module")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\weigfan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Functions

In [55]:
# Function to split text into sentences
def split_into_sentences(text):
    # Correcting common abbreviations and numbers with periods to avoid incorrect splits
    text = re.sub(r'\b(e.g.|i.e.|etc.)\b', lambda x: x.group().replace('.', ''), text)
    text = re.sub(r'(\d+)\.', r'\1', text)
    text = re.sub(r'\d+', '', text)
    # Tokenize the corrected text into sentences
    return sent_tokenize(text)

# Mean pooling function
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

# Function to get word embedding using mean pooling
def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors="pt")
    with torch.no_grad():  # Disable gradient calculation for efficiency
        outputs = model(**inputs)
    attention_mask = inputs['attention_mask']
    word_embedding = mean_pooling(outputs, attention_mask)
    return word_embedding.squeeze().numpy()

# Function to find top N closest words
def cosine_similarity_for_words(word, word_list):
    word_embedding = get_word_embedding(word)
    closest_words = []

    for other_word in word_list:
        other_word_embedding = get_word_embedding(other_word)
        similarity = 1 - cosine(word_embedding, other_word_embedding)
        closest_words.append((other_word, similarity))

    return closest_words


# Function to find top N closest words
def top_n_closest_words(word, word_list, n=5):
    word_embedding = get_word_embedding(word)
    closest_words = []

    for other_word in word_list:
        other_word_embedding = get_word_embedding(other_word)
        similarity = 1 - cosine(word_embedding, other_word_embedding)
        closest_words.append((other_word, similarity))

    # Sort by similarity
    closest_words.sort(key=lambda x: x[1], reverse=True)

    return closest_words[:n]

In [56]:
# Load BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Ensure the model is in evaluation mode
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [57]:
data = pd.read_parquet(get_data() / 'clean_data.parquet.brotli')
#print(data)

                                                   text  overall_label  \
id                                                                       
0     Red streak girlSo uh yeah she seems pretty chi...              0   
1            Just wanna talkIm here to talk if you want              0   
2     How to get over jealousy of socially active fr...              0   
3     Were all lonely people, right?Saw a post on he...              0   
4     i hate my birthdaymy birthday is in two days a...              1   
...                                                 ...            ...   
7994  HelloHow are you today? \r\r\nAnd how was your...              0   
7995  I have nobody and nothing to live for.At this ...              1   
7996  26M [Friendship] - Clean audio chat - Depressi...              0   
7997  I forgot how to make friendsIt's been so long ...              1   
7998  Please.Hi. If anyone can keep me company tonig...              0   

                                     

### Get Sentence Data

In [58]:
# Get Lonely Articles
lonely_data = data.copy(deep=True)
lonely_data = lonely_data.reset_index(drop=True)
lonely_data['id'] = lonely_data.index

In [59]:
# Split text into sentences
lonely_data['sentences'] = lonely_data['cleaned_article'].apply(split_into_sentences)

### Combined all sentences into one list

In [60]:
corpus = []
for i, text in enumerate(lonely_data['sentences']):
    post = text
    post = ' '.join(post)
    corpus.append(post)

### Get Consine Similarity when ngram = 1

In [61]:
size_of_feature=3000
cv=CountVectorizer(max_features = size_of_feature, stop_words='english', ngram_range=(1, 1))
X=cv.fit_transform(corpus)
feat_dict=cv.get_feature_names_out()
X.shape

(7999, 3000)

### Calculate the Cosine Similarity of each word

In [62]:
# Apply the get_bert_embedding function to your cleaned sentences
start_time = time.time()
word_to_compare = "loneliness"
top_n_closest_wd_loneliness = cosine_similarity_for_words(word_to_compare, feat_dict)

word_to_compare = "lonely"
top_n_closest_wd_lonely = cosine_similarity_for_words(word_to_compare, feat_dict)

word_to_compare = "aloneness"
top_n_closest_wd_aloneness = cosine_similarity_for_words(word_to_compare, feat_dict)

word_to_compare = "isolation"
top_n_closest_wd_isolation = cosine_similarity_for_words(word_to_compare, feat_dict)


total_time = time.time() - start_time
print(f"Total Time: {total_time}")

Total Time: 236.8773136138916


### Combine all Cosine Similarity into one dataframe

In [65]:
df_top_n_closest_wd_loneliness = pd.DataFrame(top_n_closest_wd_loneliness, columns=['word', 'loneliness-similarity'])
df_top_n_closest_wd_lonely = pd.DataFrame(top_n_closest_wd_lonely, columns=['word', 'lonely-similarity'])
df_top_n_closest_wd_aloneness = pd.DataFrame(top_n_closest_wd_aloneness, columns=['word', 'aloneness-similarity'])
df_top_n_closest_wd_isolation = pd.DataFrame(top_n_closest_wd_isolation, columns=['word', 'isolation-similarity'])

df_top_n_closet_wd = df_top_n_closest_wd_loneliness
df_top_n_closet_wd['lonely-similarity'] = df_top_n_closest_wd_lonely['lonely-similarity']
df_top_n_closet_wd['aloneness-similarity'] = df_top_n_closest_wd_aloneness['aloneness-similarity']
df_top_n_closet_wd['isolation-similarity'] = df_top_n_closest_wd_isolation['isolation-similarity']

           word  loneliness-similarity  lonely-similarity  \
0       abandon               0.918922           0.898049   
1     abandoned               0.875125           0.873156   
2       ability               0.890194           0.876321   
3          able               0.868866           0.857165   
4     abortions               0.662040           0.610981   
...         ...                    ...                ...   
2995         yr               0.687149           0.680593   
2996        yrs               0.624784           0.620009   
2997       zero               0.833694           0.855695   
2998       zone               0.834811           0.829768   
2999       zoom               0.809533           0.804235   

      aloneness-similarity  isolation-similarity  
0                 0.774498              0.927264  
1                 0.760706              0.886401  
2                 0.725984              0.889708  
3                 0.675333              0.842308  
4           

### Calaulate the mean of Cosine Similarity for each word

In [72]:
df_top_n_closet_wd['Average'] = df_top_n_closet_wd[['loneliness-similarity', 'lonely-similarity', 'aloneness-similarity','isolation-similarity']].mean(axis=1)
df_top_n_closet_wd_sorted
df_top_n_closet_wd_sorted.to_parquet(get_data() / 'bert_word.parquet.brotli', compression='brotli')

Rows where Average is greater than 0.75 :
            word  loneliness-similarity  lonely-similarity  \
1599  loneliness               1.000000           0.931305   
1600      lonely               0.931305           1.000000   
1607     longing               0.936320           0.946851   
1448   isolation               0.905714           0.890028   
2171   rejection               0.938780           0.911769   
...          ...                    ...                ...   
1701       memes               0.761811           0.773769   
2307      season               0.759253           0.773558   
948     external               0.781475           0.755014   
1004        fees               0.745641           0.747480   
2007   political               0.771576           0.749205   

      aloneness-similarity  isolation-similarity   Average  
1599              0.822874              0.905714  0.914973  
1600              0.786491              0.890028  0.901956  
1607              0.804302    

### Get Consine Similarity when ngram = 2

In [76]:
size_of_feature=3000
cv=CountVectorizer(max_features = size_of_feature, stop_words='english', ngram_range=(2, 2))
X=cv.fit_transform(corpus)
feat_dict=cv.get_feature_names_out()
X.shape

(7999, 3000)

### Get Word Embeddings

In [77]:
# Apply the get_bert_embedding function to your cleaned sentences
start_time = time.time()
word_to_compare = "feel lonely"

top_n_closest_phrase = top_n_closest_words(word_to_compare, feat_dict, 2000)
total_time = time.time() - start_time
print(f"Total Time: {total_time}")

Total Time: 67.83481073379517
