In [55]:
import pandas as pd 
import ssl
import nltk
import re
import time 
import numpy as np
import warnings
import matplotlib 
import torch

from utils.system import *

from transformers import AutoTokenizer, AutoModel
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Suppress every warning for the rest of the session. This single blanket
# filter already covers DeprecationWarning, so no category-specific filter is
# needed (the former message/module filter was an unfilled template).
# NOTE(review): this also hides pandas/transformers warnings that may point at
# real problems; consider narrowing it once the notebook is stable.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\weigfan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Functions

In [132]:
def split_into_sentences(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    Before tokenizing, periods that commonly cause false sentence breaks are
    removed:

    * the abbreviations ``e.g.``, ``i.e.`` and ``etc.`` lose their periods
      (``"e.g."`` becomes ``"eg"``);
    * a period directly following a run of digits is dropped
      (``"1. item"`` becomes ``"1 item"``).

    NOTE: the digit rule also removes the decimal point in values such as
    ``3.14`` and the full stop of a sentence that ends in a number.

    Args:
        text: The string to split. Any non-string value (for example ``None``
            or ``NaN`` from a missing DataFrame cell) yields an empty list.

    Returns:
        list[str]: The sentences returned by ``sent_tokenize`` for the
        pre-processed text.
    """
    if not isinstance(text, str):
        return []

    # The dots are escaped so they match literal periods only. There is no
    # trailing ``\b``: an abbreviation is normally followed by a space or
    # punctuation, and no word boundary exists between "." and another
    # non-word character, so the previous pattern never matched "e.g. ".
    text = re.sub(
        r'\b(?:e\.g\.|i\.e\.|etc\.)',
        lambda match: match.group().replace('.', ''),
        text,
    )
    # Drop a period that immediately follows digits (e.g. list markers "1.").
    text = re.sub(r'(\d+)\.', r'\1', text)
    # Tokenize the corrected text into sentences
    return sent_tokenize(text)

def get_bert_embedding(sentence):
    """Embed ``sentence`` and return the vector at the [CLS] position.

    Uses the notebook-level ``tokenizer`` and ``model`` objects, so the cell
    that creates them must have been executed first.

    Args:
        sentence: Text passed to ``tokenizer`` (with truncation enabled).

    Returns:
        numpy.ndarray: 1-D array taken from position 0 of the sequence axis of
        ``last_hidden_state`` for the first batch entry.
    """
    encoded = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(**encoded)

    # Sequence position 0 holds the [CLS] token representation.
    hidden_states = model_output.last_hidden_state
    cls_vectors = hidden_states[:, 0, :].numpy()
    return cls_vectors[0]

def cosine_similarity_matrix(embeddings, label_embedding):
    """Cosine similarity between every row of ``embeddings`` and one vector.

    Args:
        embeddings: 2-D array with one embedding per row.
        label_embedding: 1-D array compared against each row.

    Returns:
        numpy.ndarray: One score per row. Rows with zero norm score 0.0, and
        every score is 0.0 when ``label_embedding`` itself has zero norm.
    """
    scores = np.zeros(len(embeddings))

    label_length = np.linalg.norm(label_embedding)
    if label_length == 0:
        # A zero label vector has no direction; similarity is defined as 0.
        return scores

    row_lengths = np.linalg.norm(embeddings, axis=1)
    usable = row_lengths != 0

    # Only rows with a non-zero norm are divided, avoiding division by zero.
    dot_products = embeddings[usable] @ label_embedding
    scores[usable] = dot_products / (row_lengths[usable] * label_length)
    return scores

def zero_shot(embedding_data, embedding_col, labels):
    """Score stored embeddings against a list of label texts.

    Each label is embedded with ``get_bert_embedding``; its cosine similarity
    to every row's embedding is written to a new column ``cosine_sim_<i>``,
    where ``i`` is the label's position in ``labels``.

    Args:
        embedding_data: DataFrame holding one embedding array per row.
        embedding_col: Name of the column that contains those arrays.
        labels: Iterable of label strings.

    Returns:
        pandas.DataFrame: A deep copy of ``embedding_data`` with one extra
        similarity column per label; the input frame is not modified.
    """
    print("Retrieving label embeddings...")
    scored = embedding_data.copy(deep=True)
    label_vectors = []
    for label_text in labels:
        label_vectors.append(get_bert_embedding(label_text))

    # Stack the per-row arrays into a single 2-D matrix for vectorised scoring.
    stacked = np.stack(scored[embedding_col].values)

    print("Computing cosine similiarity with label embeddings...")
    for position in range(len(label_vectors)):
        print("Label: ", position)
        scored[f'cosine_sim_{position}'] = cosine_similarity_matrix(stacked, label_vectors[position])
    return scored

# Retrieve the rows with the largest / smallest similarity score for each article
def get_max_rows(group, score_col='cosine_sim_0'):
    """Return the row(s) of ``group`` whose ``score_col`` value is the maximum.

    Intended for ``DataFrame.groupby(...).apply``; extra keyword arguments
    given to ``apply`` (e.g. ``score_col='cosine_sim_mean'``) are forwarded.

    Args:
        group: DataFrame for a single group.
        score_col: Column to rank by. Defaults to ``'cosine_sim_0'``, the
            column this helper has always used.
            NOTE(review): the original comment referred to ``cosine_sim_mean``;
            confirm which score is intended and pass it explicitly if needed.

    Returns:
        pandas.DataFrame: Every row tied for the maximum (possibly several);
        empty when ``score_col`` contains only missing values.
    """
    scores = group[score_col]
    return group[scores == scores.max()]

def get_min_rows(group, score_col='cosine_sim_0'):
    """Return the row(s) of ``group`` whose ``score_col`` value is the minimum.

    Intended for ``DataFrame.groupby(...).apply``; extra keyword arguments
    given to ``apply`` (e.g. ``score_col='cosine_sim_mean'``) are forwarded.

    Args:
        group: DataFrame for a single group.
        score_col: Column to rank by. Defaults to ``'cosine_sim_0'``, the
            column this helper has always used.

    Returns:
        pandas.DataFrame: Every row tied for the minimum (possibly several);
        empty when ``score_col`` contains only missing values.
    """
    scores = group[score_col]
    return group[scores == scores.min()]

In [112]:
# Load the pre-trained BERT model and tokenizer
# `tokenizer` and `model` are read as globals by get_bert_embedding, so this
# cell must run before any cell that computes embeddings.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [114]:
# Load the cleaned articles. `get_data` is not defined in this notebook;
# presumably it comes from the `utils.system` star import and returns the
# project's data directory as a path — TODO confirm.
data = pd.read_parquet(get_data() / 'clean_data.parquet.brotli')

------------------------------------------------------------
red streak girl. so uh yeah she seems pretty chill but i'm too shy to talk to her. i'm cool with my teacher so i was gonna ask for tht persons name but i'm too shy to do tht too ( afraid teacher is gonna think i like the person cause my teacher likes drama and makes assumptions like that daily) so i just sit in the back and draw her idk if rsg has seen them cause she passes by me multiple times during class but if she has she hasnt said anything bout it. is tht weird? i mean its not like i'm lewding them they wear a hoodie and a lot of chains stuff like that so thats wht i draw. sometimes i draw dresses but thats besides the point. idk y i have a hard time talking to people maybe i should start taking my anti anxiety meds again. i don't mind being alone but sometimes i want just want a physical pressensce there. i don't think i explained things well i never do tbh i don't say a lot but theres a lot to say so it comes out it l

### Get Sentence Data

In [116]:
# Get Lonely Articles: copy the raw frame, renumber the rows from 0 and
# expose that row number as an `id` column.
lonely_data = (
    data.copy(deep=True)
    .reset_index(drop=True)
    .assign(id=lambda frame: frame.index)
)

In [117]:
# Split text into sentences and create a new column
# Each cell of `sentences` holds the list returned by split_into_sentences.
lonely_data['sentences'] = lonely_data['cleaned_article'].apply(split_into_sentences)

In [118]:
# Create a new DataFrame with exploded sentences
# One row per sentence; the remaining columns (including `id`) are repeated
# for every sentence of the same article.
sentence_data = lonely_data.explode('sentences').reset_index(drop=True)

In [119]:
# Remove rows that contain a missing value (explode() emits NaN for an article
# whose sentence list is empty).
# NOTE(review): dropna() without `subset` checks every column, so a sentence
# row is also dropped when any other column is missing — confirm this is intended.
sentence_data = sentence_data.dropna()

In [120]:
# Index by article id. The index is not unique: every sentence of an article
# carries that article's id.
sentence_data = sentence_data.set_index('id')

### Get Sentence Embeddings

In [121]:
# Keep only the sentence text. `.copy()` makes the result an independent frame,
# so adding the embedding column in the next step does not assign into a
# slice of the previous frame (which would raise SettingWithCopyWarning, hidden
# here by the blanket warnings filter).
sentence_data = sentence_data[['sentences']].copy()

In [122]:
# Apply the get_bert_embedding function to the cleaned sentences.
# This calls the model once per sentence (no batching); the elapsed wall-clock
# time in seconds is printed afterwards.
start_time = time.time()
sentence_data['bert_emb_sentence'] = sentence_data['sentences'].apply(get_bert_embedding)
total_time = time.time() - start_time
print(f"Total Time: {total_time}")

Total Time: 2069.5610268115997


In [123]:
# Persist the sentence embeddings as a Brotli-compressed Parquet file.
sentence_data.to_parquet(get_data() / 'bert_sentence_emb.parquet.brotli', compression='brotli')

### Get Article Embeddings

In [124]:
# Article-level frame: a copy of the raw data whose existing index values are
# stored as `id` and then used as the index.
# NOTE(review): unlike `lonely_data`, the index is not reset first, so these
# ids equal the sentence-level ids only if `data` already has a 0..n-1 index.
article = (
    data.copy(deep=True)
    .assign(id=lambda frame: frame.index)
    .set_index('id')
)

In [125]:
# Drop every article that has a missing value in any column.
article = article.dropna()

In [126]:
# Apply the get_bert_embedding function to the full cleaned articles.
# get_bert_embedding tokenizes with truncation=True, so an article longer than
# the tokenizer's maximum length is truncated before it is embedded.
start_time = time.time()
article['bert_emb_art'] = article['cleaned_article'].apply(get_bert_embedding)
total_time = time.time() - start_time
print(f"Total Time: {total_time}")

Total Time: 729.6872766017914


In [127]:
# Persist the article embeddings as a Brotli-compressed Parquet file.
article.to_parquet(get_data() / 'bert_article_emb.parquet.brotli', compression='brotli')

### Get Sentence Label Embedding

In [128]:
# Short label sentences. zero_shot embeds each one and scores it against every
# sentence embedding; the list position becomes the suffix of `cosine_sim_<i>`.
label_sentence = ["feeling alone despite being with others.",
                  "a quiet room highlights my loneliness.",
                  "seeking connections, but left alone.",
                  "lost in the crowd, feeling isolated.",
                  "alone with my thoughts every night."]

In [129]:
# Sentence-level scores: adds one `cosine_sim_<i>` column per entry of label_sentence.
cosine_sim = zero_shot(sentence_data, 'bert_emb_sentence', label_sentence)

Retrieving label embeddings...
Computing cosine similiarity with label embeddings...
Label:  0
Label:  1
Label:  2
Label:  3
Label:  4


In [130]:
# Average the per-label similarities. The column list is derived from the
# label list so it stays in sync with the columns zero_shot produced.
cosine_sim['cosine_sim_mean'] = cosine_sim[[f"cosine_sim_{i}" for i in range(len(label_sentence))]].mean(axis=1)

In [133]:
# Per article, keep the embedding of the sentence(s) selected by get_max_rows.
# NOTE(review): the name `max` shadows the built-in; it is kept because the
# merge cell further down refers to it.
max = (
    cosine_sim.groupby('id')
    .apply(get_max_rows)
    .reset_index(level=0, drop=True)[['bert_emb_sentence']]
    .rename(columns={'bert_emb_sentence': 'bert_emb_max'})
)

In [134]:
# Per article, keep the embedding of the sentence(s) selected by get_min_rows.
# NOTE(review): the name `min` shadows the built-in; it is kept because the
# merge cell further down refers to it.
min = (
    cosine_sim.groupby('id')
    .apply(get_min_rows)
    .reset_index(level=0, drop=True)[['bert_emb_sentence']]
    .rename(columns={'bert_emb_sentence': 'bert_emb_min'})
)

In [135]:
# Inner-join the min and max embeddings on the article id index.
# NOTE(review): if either side holds several rows for one id (ties in the
# score), an index join yields every pairing of those rows — confirm that is acceptable.
min_max = pd.merge(min, max, left_index=True, right_index=True, how='inner')

In [136]:
# Persist the per-article min/max sentence embeddings as Brotli-compressed Parquet.
min_max.to_parquet(get_data() / 'bert_sentence_cosine.parquet.brotli', compression='brotli')

### Get Article Label Embedding

In [137]:
# Reload the article embeddings written earlier in this notebook.
art_emb = pd.read_parquet(get_data() / 'bert_article_emb.parquet.brotli')

In [138]:
# Paragraph-length label texts. zero_shot embeds each one and scores it against
# every article embedding; the list position becomes the suffix of `cosine_sim_<i>`.
label_article = [
    "Even when I'm surrounded by people, a deep sense of loneliness clutches at me. It's like being in a room full of conversations, laughter, and life, yet feeling completely detached and isolated. My mind wanders to thoughts that maybe I don't quite fit in, or perhaps I'm fundamentally different. This sense of separation isn’t about physical presence; it's about not feeling connected or understood. It's like being an outsider in my own life, watching others interact with ease, while I'm trapped behind an invisible barrier of disconnection.",
    
    "In the silence of my room, the loudness of my loneliness becomes overwhelming. The quietness echoes the emptiness I feel inside. It's in these moments of solitude that my thoughts become my only company, often leading me down a path of introspection and melancholy. The walls seem to close in, and the lack of sound or distraction brings a stark awareness to my isolation. It's as if the silence speaks a truth I try to avoid in the day's hustle – that I am profoundly alone, even in a world full of people.",
    
    "I often find myself reaching out, trying to make connections with those around me. Yet, despite my efforts, it feels like I'm left alone, standing on the periphery of relationships. Each attempt at conversation, each effort to engage, seems to fall short, leaving me more disheartened. It's a cycle of seeking and hoping, only to be met with the cold reality of solitude. The more I try, the more apparent my isolation becomes, and the more I wonder if there’s something about me that just doesn’t resonate with others.",
    
    "Being in a crowd is a strange experience for me. I’m surrounded by people, yet I’ve never felt more isolated. It's like being adrift in a sea of faces, voices, and emotions, yet not being a part of any of it. I watch as others interact and connect, forming bonds and sharing moments, while I remain just an observer. This feeling of isolation isn't just about being physically alone; it's about feeling disconnected and unseen, even in the midst of a bustling crowd.",
    
    "Nighttime is the hardest for me. As the world quiets down, I'm left alone with my thoughts, which often turn into a whirlpool of loneliness and reflection. The darkness of the night seems to mirror the darkness of my mood. These are the hours when the feeling of solitude is most acute, and my mind relentlessly revisits moments of the day, analyzing and overthinking. The silence of the night is a stark reminder of my solitude, a time when I’m left to confront my deepest feelings of isolation and longing for connection."
]

In [139]:
# Article-level scores: adds one `cosine_sim_<i>` column per entry of label_article.
# NOTE: this rebinds `cosine_sim`, replacing the sentence-level result computed earlier.
cosine_sim = zero_shot(art_emb, 'bert_emb_art', label_article)

Retrieving label embeddings...
Computing cosine similiarity with label embeddings...
Label:  0
Label:  1
Label:  2
Label:  3
Label:  4


In [140]:
# Average the per-label similarities. The column list is derived from the
# label list so it stays in sync with the columns zero_shot produced.
cosine_sim['cosine_sim_art_mean'] = cosine_sim[[f"cosine_sim_{i}" for i in range(len(label_article))]].mean(axis=1)

In [149]:
# Persist the mean and per-label article similarities as Brotli-compressed Parquet.
# The per-label columns are derived from the label list rather than a fixed count.
cosine_sim[['cosine_sim_art_mean'] + [f"cosine_sim_{i}" for i in range(len(label_article))]].to_parquet(get_data() / 'bert_art_cosine.parquet.brotli', compression='brotli')