In [60]:
# %pip install -r requirements.txt

In [61]:
import os
import json
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from utils.vector_db import VectorDB
from chromadb import EmbeddingFunction
from tqdm import tqdm


import torch
from transformers import BertTokenizer, BertModel

In [62]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [63]:
# nltk.download('punkt_tab')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [64]:
def read_text(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

def load_or_create_paired_df(data_dir, csv_path, has_real=True):
    """
    If csv_path exists -> load it.
    Else -> loop through article_* folders in data_dir and build a dataframe with:
    - text_1, text_2
    - real (only if has_real=True), looked up from <parent_of_data_dir>/train.csv
    """

    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)

    rows = []

    if has_real:
        # load the csv at "data/train.csv"
        real_df = pd.read_csv("data/train.csv")

    for article_dir in sorted(d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))):
        article_path = os.path.join(data_dir, article_dir)
        f1 = os.path.join(article_path, "file_1.txt")
        f2 = os.path.join(article_path, "file_2.txt")

        text_1 = read_text(f1)
        text_2 = read_text(f2)

        row = {"text_1": text_1, "text_2": text_2}

        if has_real:
            # lookup the "real" value from the real_df
            real_row = real_df[real_df["id"] == int(article_dir.split("_")[1])]
            real_value = real_row["real_text_id"].values[0] if not real_row.empty else np.nan
            row["real"] = real_value

        rows.append(row)

    df = pd.DataFrame(rows)
    df.to_csv(csv_path, index=False)

    return df

# Usage
train_data_dir = "data/train"
test_data_dir  = "data/test"
train_csv = "data/stored_train_data.csv"
test_csv  = "data/stored_test_data.csv"

paired_df = load_or_create_paired_df(train_data_dir, train_csv, has_real=True)
test_df   = load_or_create_paired_df(test_data_dir,  test_csv,  has_real=False)

In [65]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    # Join the tokens back into a cleaned string
    cleaned_text = ' '.join(lemmatized_tokens)
    return cleaned_text


def clean_df(df):
    df['cleaned_text_1'] = df['text_1'].apply(clean_text)
    df['cleaned_text_2'] = df['text_2'].apply(clean_text)
    return df

paired_df = clean_df(paired_df)
paired_df.head()

Unnamed: 0,text_1,text_2,real,cleaned_text_1,cleaned_text_2
0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,1,virsa visible infrared survey telescope array ...,china relay network released significant amoun...
1,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...,2,china goal project involves achieving accuracy...,project aim achieve accuracy level dex analyzi...
2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,1,scientist learn galaxy form evolve two method ...,dinosaur eggshell offer clue dinosaur ate long...
3,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...,2,china study suggests multiple star system play...,importance understanding star evolve led resea...
4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,2,dinosaur rex excited new toy set many dinosaur...,analyzing fast star rotate within galaxy compa...


In [66]:
test_df = clean_df(test_df)

In [67]:
def extract_bert_embeddings(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Get BERT embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        # The last hidden state contains the embeddings
        embeddings = outputs.last_hidden_state

    return embeddings

class MyEmbeddingFunction(EmbeddingFunction):
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, input: list) -> list:
        # input: list of strings
        embeddings = []
        for text in input:
            inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True)
            # with torch.no_grad():
            outputs = self.model(**inputs)
            # Use the [CLS] token embedding as sentence embedding
            cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().detach().cpu().numpy()
            embeddings.append(cls_embedding.tolist())
        return embeddings

(extract_bert_embeddings("Sample text for embedding.").shape)

torch.Size([1, 9, 768])

In [68]:
# documents = []
# for idx, row in paired_df.iterrows():
#     if str(row['cleaned_text_1']).strip():
#         documents.append({
#             "id": f"{idx}_1",
#             "content": row['cleaned_text_1'],
#             "metadata": {"real": row["real"] == 1}
#         })
#     if str(row['cleaned_text_2']).strip():
#         documents.append({
#             "id": f"{idx}_2",
#             "content": row['cleaned_text_2'],
#             "metadata": {"real": row["real"] == 2}
#         })

# # Delete the existing collection if it exists (to fix dimension mismatch)
# rebuild_collection = False
# if rebuild_collection:
#     vector_db_tmp = VectorDB(
#         collection_name="impostor_hunt_texts",
#         embedding_length=384,
#         working_dir=os.getcwd()
#     )
#     vector_db_tmp.delete_collection()

# embedding_function = MyEmbeddingFunction(model, tokenizer)


# # Initialize VectorDB (embedding_function can be left as None to use default)
# vector_db = VectorDB(
#     collection_name="impostor_hunt_texts",
#     embedding_length=768,
#     working_dir=os.getcwd(),
#     documents=documents,
#     dont_add_if_collection_exist=not rebuild_collection
# )

# vector_db.search("""ChromeDriver music player
# This study focused on identifying any non-spherical shapes within specific types of celestial bodies (music music) using various techniques like comparing how they look from different directions and analyzing their changes in sound pressure vs time .
# The extent to which these artists' images show evidence for an overall shape rather than individual tracks was found across multiple tracks:
# Two specific songs had clearly visible distortions due to their complex structure compared to others playing just simple beats
# This research found that while most recordings showed a relatively simple structure (like when you only see one instrument rather than an entire grand orchestra), some featured noticeable deviations from those expectations (like if there were multiple instruments playing at once). These results suggest there may be a correlation between how musicians program their compositions and how much curvature they chose for their soundscape — it seems as though tracks with more intricate arrangements tend towards greater complexity!
# Please note: This is just an example response based on your input text as I am not able access real world information such as music information or even what "music music" means without further context!
# Let me know if you want me to try working through some real world examples instead? I can also provide alternative ways I could rephrase your initial statement!""")

In [69]:
# --- Late Chunking for 'real' and 'not real' groups ---
real_docs = []
not_real_docs = []
for idx, row in paired_df.iterrows():
    text_1 = row['cleaned_text_1']
    text_2 = row['cleaned_text_2']
    # Only process if text_1 is a string and not empty
    if isinstance(text_1, str) and text_1.strip():
        doc = {
            "id": f"{idx}_1",
            "content": text_1,
            "metadata": {"real": row["real"] == 1}
        }
        if row["real"] == 1:
            real_docs.append(doc)
        else:
            not_real_docs.append(doc)
    # Only process if text_2 is a string and not empty
    if isinstance(text_2, str) and text_2.strip():
        doc = {
            "id": f"{idx}_2",
            "content": text_2,
            "metadata": {"real": row["real"] == 2}
        }
        if row["real"] == 2:
            real_docs.append(doc)
        else:
            not_real_docs.append(doc)

# Delete the existing collection if it exists (to fix dimension mismatch)
rebuild_collection = False
if rebuild_collection:
    vector_db_tmp = VectorDB(
        collection_name="impostor_hunt_texts",
        embedding_length=384,
        working_dir=os.getcwd()
    )
    vector_db_tmp.delete_collection()


# Add late chunked documents for both groups
vector_db_real = VectorDB(
    collection_name="impostor_hunt_texts_real",
    embedding_length=768,
    working_dir=os.getcwd(),
    # embedding_function=embedding_function
)

if rebuild_collection:
    vector_db_real.add_documents_with_late_chunking(real_docs, chunk_size=1500, chunk_overlap=200, max_context=8192)
    vector_db_real.add_documents_with_late_chunking(not_real_docs, chunk_size=1500, chunk_overlap=200, max_context=8192)

search_limit = 20

# count real/fake
def count_real_fake(results, search_limit):
    real_count = sum(1 for doc in results if doc['metadata']['real'])
    fake_count = len(results) - real_count
    return (real_count / search_limit)


In [72]:
def get_cls_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        cls_emb = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    return cls_emb

def get_features(df, vector_db_real, search_limit=20):
    features = []
    labels = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
        ct1 = row['cleaned_text_1']
        ct2 = row['cleaned_text_2']
        t1 = row['text_1']
        t2 = row['text_2']
        # Skip rows where t1 or t2 is not a string
        if not isinstance(t1, str) or not isinstance(t2, str):
            continue
        emb1 = get_cls_embedding(ct1)
        emb2 = get_cls_embedding(ct2)
        score1 = count_real_fake(vector_db_real.search(t1, limit=search_limit), search_limit)
        score2 = count_real_fake(vector_db_real.search(t2, limit=search_limit), search_limit)
        feat = np.concatenate([emb1, emb2, [score1, score2], emb1-emb2])
        features.append(feat)
        if 'real' in row:
            labels.append(1 if row['real'] == 1 else 2)
    return np.array(features), np.array(labels)



In [71]:
# --- Prepare train/test features ---
X_train, y_train = get_features(paired_df, vector_db_real, search_limit=20)
X_test, _ = get_features(test_df, vector_db_real, search_limit=20)

Extracting features:   0%|          | 0/95 [00:00<?, ?it/s]

Extracting features:  11%|█         | 10/95 [00:03<00:29,  2.93it/s]


ValueError: Expected document to be a str, got nan in query.

In [None]:
from sklearn.naive_bayes import GaussianNB

# --- Train classifier ---
clf = GaussianNB()
clf.fit(X_train, y_train)

# --- Predict on test set ---
y_pred = clf.predict(X_test)

# --- Ensemble with RAG scores ---
def ensemble_predict(test_df, vector_db_real, clf, X_test, search_limit=20, alpha=0.5):
    results = []
    for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Ensembling"):
        t1 = row['cleaned_text_1']
        t2 = row['cleaned_text_2']
        score1 = count_real_fake(vector_db_real.search(t1, limit=search_limit), search_limit)
        score2 = count_real_fake(vector_db_real.search(t2, limit=search_limit), search_limit)
        proba = clf.predict_proba([X_test[idx]])[0]
        combined_1 = alpha * proba[0] + (1-alpha) * score1
        combined_2 = alpha * proba[1] + (1-alpha) * score2
        predicted_real = 1 if combined_1 >= combined_2 else 2
        results.append({'id': idx, 'real_text_id': predicted_real})
    return pd.DataFrame(results)

# --- Save ensemble predictions ---
ensemble_df = ensemble_predict(test_df, vector_db_real, clf, X_test, search_limit=20, alpha=0.5)
ensemble_df.to_csv("ensemble_predictions.csv", index=False)
ensemble_df.head()

Ensembling: 100%|██████████| 1068/1068 [01:10<00:00, 15.16it/s]


Unnamed: 0,id,real_text_id
0,0,2
1,1,2
2,2,1
3,3,1
4,4,2
