In [10]:
#%pip install pandas nltk transformers torch
#%pip install -r requirements.txt
%pip install hf_xet

Collecting hf_xet
  Downloading hf_xet-1.1.7-cp37-abi3-win_amd64.whl.metadata (703 bytes)
Downloading hf_xet-1.1.7-cp37-abi3-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------- ----------------- 1.6/2.8 MB 9.3 MB/s eta 0:00:01
   ---------------------------------------- 2.8/2.8 MB 9.6 MB/s  0:00:00
Installing collected packages: hf_xet
Successfully installed hf_xet-1.1.7
Note: you may need to restart the kernel to use updated packages.


In [14]:
import os
import json
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from utils.vector_db import VectorDB
from dotenv import load_dotenv
import torch
from transformers import BertTokenizer, BertModel

In [15]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [17]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vishn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vishn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vishn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vishn\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [18]:
data_dir = "data/"
train_data_dir = "data/train/"
train_csv = "data/train.csv"

stored_data_path = "data/stored_data.csv"

if os.path.exists(stored_data_path):
    # Load stored data
    paired_df = pd.read_csv(stored_data_path)
else:
    # Load train.csv
    df = pd.read_csv(train_csv)

    # Detect correct column names
    article_id_col = "article_id" if "article_id" in df.columns else df.columns[0]
    real_col = "real" if "real" in df.columns else df.columns[-1]

    # Prepare list for paired texts
    paired_data = []

    for idx, row in df.iterrows():
        article_id = f"article_{int(row[article_id_col]):04d}"
        file_1_path = os.path.join(train_data_dir, article_id, "file_1.txt")
        file_2_path = os.path.join(train_data_dir, article_id, "file_2.txt")
        try:
            with open(file_1_path, "r", encoding="utf-8") as f1:
                text_1 = f1.read()
        except Exception:
            text_1 = ""
        try:
            with open(file_2_path, "r", encoding="utf-8") as f2:
                text_2 = f2.read()
        except Exception:
            text_2 = ""
        paired_data.append({
            "text_1": text_1,
            "text_2": text_2,
            "real": row[real_col]  # 1 or 2
        })

    paired_df = pd.DataFrame(paired_data)
    paired_df.to_csv(stored_data_path, index=False)
paired_df.head()

Unnamed: 0,text_1,text_2,real
0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,1
1,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...,2
2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,1
3,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...,2
4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,2


In [19]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    # Join the tokens back into a cleaned string
    cleaned_text = ' '.join(lemmatized_tokens)
    return cleaned_text



paired_df['cleaned_text_1'] = paired_df['text_1'].apply(clean_text)
paired_df['cleaned_text_2'] = paired_df['text_2'].apply(clean_text)
paired_df.head()

Unnamed: 0,text_1,text_2,real,cleaned_text_1,cleaned_text_2
0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,1,virsa visible infrared survey telescope array ...,china relay network released significant amoun...
1,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...,2,china goal project involves achieving accuracy...,project aim achieve accuracy level dex analyzi...
2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,1,scientist learn galaxy form evolve two method ...,dinosaur eggshell offer clue dinosaur ate long...
3,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...,2,china study suggests multiple star system play...,importance understanding star evolve led resea...
4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,2,dinosaur rex excited new toy set many dinosaur...,analyzing fast star rotate within galaxy compa...


In [20]:
def extract_bert_embeddings(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Get BERT embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        # The last hidden state contains the embeddings
        embeddings = outputs.last_hidden_state

    return embeddings

In [25]:
# Loop through both columns and extract embeddings
embeddings_list_1 = []
embeddings_list_2 = []

for index, row in paired_df.iterrows():
    # Extract embeddings for cleaned_text_1
    sample_text_1 = row['cleaned_text_1']
    embeddings_1 = extract_bert_embeddings(sample_text_1)
    embeddings_list_1.append(embeddings_1)

    # Extract embeddings for cleaned_text_2
    sample_text_2 = row['cleaned_text_2']
    embeddings_2 = extract_bert_embeddings(sample_text_2)
    embeddings_list_2.append(embeddings_2)

# Convert embeddings lists to tensors or save them as needed
print(f"Processed {len(embeddings_list_1)} rows for cleaned_text_1 and cleaned_text_2.")

Processed 95 rows for cleaned_text_1 and cleaned_text_2.


In [22]:
from langchain_core.documents import Document

# Prepare documents for both columns
documents = []
for idx, row in paired_df.iterrows():
    doc1 = Document(page_content=row['cleaned_text_1'], metadata={"id": f"{idx}_1", "real": row["real"]})
    doc2 = Document(page_content=row['cleaned_text_2'], metadata={"id": f"{idx}_2", "real": row["real"]})
    documents.extend([doc1, doc2])

# Filter out documents with empty page_content
non_empty_documents = [doc for doc in documents if doc.page_content.strip() != ""]

# Initialize VectorDB
vector_db = VectorDB(
    collection_name="impostor_hunt_texts",
    embedding_length=768,  # Gemini embedding size, adjust if needed
    google_api_key=google_api_key,
    working_dir=os.getcwd(),
    documents=non_empty_documents
)

NameError: name 'google_api_key' is not defined