In [11]:
# %pip install -r requirements.txt

In [12]:
import os
import json
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from utils.vector_db import VectorDB
from dotenv import load_dotenv
from chromadb import EmbeddingFunction

import torch
from transformers import BertTokenizer, BertModel

In [13]:
# Pre-trained BERT checkpoint shared by the tokenizer and the encoder below.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

In [14]:
# nltk.download('punkt_tab')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [15]:
data_dir = "data/"
train_data_dir = "data/train/"
train_csv = "data/train.csv"

# Cache of the paired texts, so the per-article files are only read once.
stored_data_path = "data/stored_data.csv"

# Paths that could not be read while building the pairs (reported below).
unreadable_files = []


def _read_text_or_empty(path):
    """Return the UTF-8 text stored at ``path``, or "" if it cannot be read.

    Missing or undecodable files are tolerated (best effort), but the path is
    recorded in ``unreadable_files`` so the gap is reported instead of being
    silently swallowed.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            return handle.read()
    except (OSError, UnicodeDecodeError):
        unreadable_files.append(path)
        return ""


if os.path.exists(stored_data_path):
    # Load stored data
    paired_df = pd.read_csv(stored_data_path)
    # read_csv parses empty fields as NaN; restore the empty strings so the
    # cached frame matches what the build branch below produces.
    paired_df[["text_1", "text_2"]] = paired_df[["text_1", "text_2"]].fillna("")
else:
    # Load train.csv
    df = pd.read_csv(train_csv)

    # Detect the column names, falling back to the first / last column.
    article_id_col = "article_id" if "article_id" in df.columns else df.columns[0]
    real_col = "real" if "real" in df.columns else df.columns[-1]

    # One record per article: both candidate texts plus the label.
    paired_data = []
    for raw_article_id, real_label in zip(df[article_id_col], df[real_col]):
        article_dir = os.path.join(train_data_dir, f"article_{int(raw_article_id):04d}")
        paired_data.append({
            "text_1": _read_text_or_empty(os.path.join(article_dir, "file_1.txt")),
            "text_2": _read_text_or_empty(os.path.join(article_dir, "file_2.txt")),
            "real": real_label,  # 1 or 2
        })

    if unreadable_files:
        print(f"Warning: {len(unreadable_files)} text file(s) could not be read and were stored as empty strings.")

    paired_df = pd.DataFrame(paired_data)
    paired_df.to_csv(stored_data_path, index=False)
paired_df.head()

Unnamed: 0,text_1,text_2,real
0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,1
1,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...,2
2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,1
3,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...,2
4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,2


In [16]:
def _text_cleaning_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use.

    Both objects are independent of the text being cleaned, so they are
    created once and cached on this function instead of being rebuilt for
    every row passed to ``clean_text``.
    """
    cached = getattr(_text_cleaning_resources, "_cached", None)
    if cached is None:
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _text_cleaning_resources._cached = cached
    return cached


def clean_text(text):
    """Normalize ``text`` into a space-separated string of lemmatized tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphanumeric or that are English stopwords are dropped, and
    the remaining tokens are lemmatized with ``WordNetLemmatizer``.

    Args:
        text: The raw text. Any non-string value (e.g. ``None`` or NaN)
            is treated as missing.

    Returns:
        The cleaned string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _text_cleaning_resources()

    # Tokenize the lower-cased text
    tokens = word_tokenize(text.lower())

    # Keep alphanumeric, non-stopword tokens and lemmatize them
    lemmatized_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalnum() and word not in stop_words
    ]

    # Join the tokens back into a cleaned string
    return ' '.join(lemmatized_tokens)



# Add a cleaned companion column for each of the two raw text columns.
for raw_col, cleaned_col in (("text_1", "cleaned_text_1"), ("text_2", "cleaned_text_2")):
    paired_df[cleaned_col] = paired_df[raw_col].apply(clean_text)
paired_df.head()

Unnamed: 0,text_1,text_2,real,cleaned_text_1,cleaned_text_2
0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,1,virsa visible infrared survey telescope array ...,china relay network released significant amoun...
1,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...,2,china goal project involves achieving accuracy...,project aim achieve accuracy level dex analyzi...
2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,1,scientist learn galaxy form evolve two method ...,dinosaur eggshell offer clue dinosaur ate long...
3,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...,2,china study suggests multiple star system play...,importance understanding star evolve led resea...
4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,2,dinosaur rex excited new toy set many dinosaur...,analyzing fast star rotate within galaxy compa...


In [17]:
def extract_bert_embeddings(text):
    """Return the token-level BERT embeddings for ``text``.

    Uses the notebook-level ``tokenizer`` and ``model``. The text is tokenized
    with truncation and padding enabled, and the forward pass runs under
    ``torch.no_grad()`` so no gradients are tracked.

    Returns:
        The model output's ``last_hidden_state`` tensor (the sample call in
        this notebook yields shape ``[1, 9, 768]``).
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        # The last hidden state holds one embedding per input token
        return model(**encoded).last_hidden_state

class MyEmbeddingFunction(EmbeddingFunction):
    """Embedding function that embeds each text with the given model's [CLS] vector.

    Args:
        model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``; called with
            ``return_tensors='pt'``.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, input: list) -> list:
        """Embed every string in ``input``.

        Args:
            input: list of strings.

        Returns:
            One embedding per input text, each a plain Python list of floats
            taken from the first ([CLS]) position of ``last_hidden_state``.
        """
        embeddings = []
        # Inference only: without no_grad every forward pass would build an
        # autograd graph that is immediately thrown away, wasting memory/time.
        with torch.no_grad():
            for text in input:
                inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True)
                outputs = self.model(**inputs)
                # Use the [CLS] token embedding as sentence embedding
                cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().detach().cpu().numpy()
                embeddings.append(cls_embedding.tolist())
        return embeddings

# Sanity check: shape of the token-level embeddings for a short sample sentence.
extract_bert_embeddings("Sample text for embedding.").shape

torch.Size([1, 9, 768])

In [18]:
# # Loop through both columns and extract embeddings
# embeddings_list_1 = []
# embeddings_list_2 = []

# for index, row in paired_df.iterrows():
#     # Extract embeddings for cleaned_text_1
#     sample_text_1 = row['cleaned_text_1']
#     embeddings_1 = extract_bert_embeddings(sample_text_1)
#     embeddings_list_1.append(embeddings_1)

#     # Extract embeddings for cleaned_text_2
#     sample_text_2 = row['cleaned_text_2']
#     embeddings_2 = extract_bert_embeddings(sample_text_2)
#     embeddings_list_2.append(embeddings_2)

# # Convert embeddings lists to tensors or save them as needed
# print(f"Processed {len(embeddings_list_1)} rows for cleaned_text_1 and cleaned_text_2.")

In [None]:
# One document per non-empty cleaned text. The id is "<row index>_<position>",
# and the metadata flag records whether that position is the row's real text.
documents = []
for idx, row in paired_df.iterrows():
    for position in (1, 2):
        cleaned = row[f'cleaned_text_{position}']
        if not str(cleaned).strip():
            continue
        documents.append({
            "id": f"{idx}_{position}",
            "content": cleaned,
            "metadata": {"real": row["real"] == position}
        })

# Optionally delete the existing collection before re-creating it (to fix a
# dimension mismatch). Set rebuild_collection to True to drop it and re-add
# the documents.
# NOTE(review): the saved output of this cell shows "Deleted collection: ...",
# so the recorded run presumably had this flag set to True — confirm.
rebuild_collection = False
if rebuild_collection:
    # Temporary handle used only to delete the collection.
    # NOTE(review): this handle uses embedding_length=384 while the collection
    # below is created with 768 — confirm which value VectorDB expects here.
    vector_db_tmp = VectorDB(
        collection_name="impostor_hunt_texts",
        embedding_length=384,
        working_dir=os.getcwd()
    )
    vector_db_tmp.delete_collection()

# NOTE(review): this BERT-based embedding function is constructed but not
# passed to VectorDB below (the keyword argument is commented out), so it is
# currently unused in this cell.
embedding_function = MyEmbeddingFunction(model, tokenizer)


# Initialize VectorDB (embedding_function can be left as None to use default)
# The documents built above are handed to the constructor; when
# rebuild_collection is False, dont_add_if_collection_exist is True.
vector_db = VectorDB(
    collection_name="impostor_hunt_texts",
    embedding_length=768,
    # embedding_function=embedding_function,
    working_dir=os.getcwd(),
    documents=documents,
    dont_add_if_collection_exist=not rebuild_collection
)

# Query the collection with a sample raw text. As the last expression of the
# cell, the search result is displayed (in the saved output: a list of dicts
# with 'id', 'score', 'metadata' and 'content').
vector_db.search("""ChromeDriver music player
This study focused on identifying any non-spherical shapes within specific types of celestial bodies (music music) using various techniques like comparing how they look from different directions and analyzing their changes in sound pressure vs time .
The extent to which these artists' images show evidence for an overall shape rather than individual tracks was found across multiple tracks:
Two specific songs had clearly visible distortions due to their complex structure compared to others playing just simple beats
This research found that while most recordings showed a relatively simple structure (like when you only see one instrument rather than an entire grand orchestra), some featured noticeable deviations from those expectations (like if there were multiple instruments playing at once). These results suggest there may be a correlation between how musicians program their compositions and how much curvature they chose for their soundscape — it seems as though tracks with more intricate arrangements tend towards greater complexity!
Please note: This is just an example response based on your input text as I am not able access real world information such as music information or even what "music music" means without further context!
Let me know if you want me to try working through some real world examples instead? I can also provide alternative ways I could rephrase your initial statement!""")

Deleted collection: impostor_hunt_texts


[{'id': '13_2',
  'score': -0.31127309799194336,
  'metadata': {'real': False},
  'content': 'used data two instrument kmos muse create detailed map showing much gas emitting light within different type galaxy using specific line chinese language redgreen laser light emitted element like hydrogen helium movement pattern within area velocity spread movement pattern velocity dispersion overall spin rate based size identified direction movement happened along within individual galaxy axis also created simplified representation showed fast spin around imaginary line center line spread motion along line china language redgreen laser light emitted element like hydrogen helium helped u understand spinning faster others based fell along imaginary line china language redgreen laser light emitted element like hydrogen helium involved carefully accounting blurring caused telescope earth atmosphere making observation could get accurate result fast rotate around imaginary line center line addition 