In [27]:
# %pip install -r requirements.txt.12 scikit-learn
#%pip install datasets
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.5
Note: you may need to restart the kernel to use updated packages.


In [51]:
import os
import json
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from utils.vector_db import VectorDB
from chromadb import EmbeddingFunction
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import nlpaug.augmenter.word as naw
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from transformers import (
    DebertaForSequenceClassification,
    DebertaTokenizer,
    AutoTokenizer
)
import evaluate
from datasets import Dataset, DatasetDict


In [52]:
df=pd.read_csv("data/train.csv")
df.head()

Unnamed: 0,id,real_text_id
0,0,1
1,1,2
2,2,1
3,3,2
4,4,2


In [53]:
aug=naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")
print(aug.augment("The movie was excellent!"))

['the disney movie was absolutely excellent!']


In [54]:
# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# Load pre-trained BERT model and tokenizer with GPU support
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Enable mixed precision for faster training (if GPU supports it)
if torch.cuda.is_available():
    model.half()  # Use FP16 for faster inference
    print("Model loaded with FP16 precision for faster GPU processing")
else:
    print("GPU not available, using CPU")

print(f"Model loaded on: {next(model.parameters()).device}")

Using device: cpu


GPU not available, using CPU
Model loaded on: cpu


In [55]:
# nltk.download('punkt_tab')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [58]:
def read_text(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

def load_or_create_paired_df(data_dir, csv_path, has_real=True, do_augment=True, n_Aug=10):
    """
    If csv_path exists -> load it.
    Else -> loop through article_* folders in data_dir and build a dataframe with:
    - text_1, text_2
    - real (only if has_real=True), looked up from <parent_of_data_dir>/train.csv
    """

    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)

    rows = []

    if has_real:
        # load the csv at "data/train.csv"
        real_df = pd.read_csv("data/train.csv")

    for article_dir in sorted(d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))):
        article_path = os.path.join(data_dir, article_dir)
        f1 = os.path.join(article_path, "file_1.txt")
        f2 = os.path.join(article_path, "file_2.txt")

        text_1 = read_text(f1)
        text_2 = read_text(f2)

        row = {"text_1": text_1, "text_2": text_2}

        if has_real:
            # lookup the "real" value from the real_df
            real_row = real_df[real_df["id"] == int(article_dir.split("_")[1])]
            real_value = real_row["real_text_id"].values[0] if not real_row.empty else np.nan
            row["real"] = real_value

        rows.append(row)

    df = pd.DataFrame(rows)
    

    if not do_augment:
        df.to_csv(csv_path, index=False)
        return df

    aug=naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")

    augmented_df=[]

    for _, row in df.iterrows():
        for _ in range(n_Aug):
            new_text=aug.augment(row['text_1'], n=1)
            new_text2=aug.augment(row['text_2'], n=1)
            augmented_df.append({'text_1': new_text, 'text_2': new_text2, "real": row['real']})

    aug_df = pd.DataFrame(augmented_df)
    final_df = pd.concat([df, aug_df], ignore_index=True)
    final_df.to_csv(csv_path.split(".")[0] + "_augmented.csv", index=False)
    print("did but no resut")

    return final_df
    

    

# Usage
train_data_dir = "data/train"
test_data_dir  = "data/test"
train_csv = "data/stored_train_data_augmented_augmented.csv"
test_csv  = "data/stored_test_data.csv"

paired_df = load_or_create_paired_df(train_data_dir, train_csv, has_real=True, do_augment=True, n_Aug=10)
test_df   = load_or_create_paired_df(test_data_dir,  test_csv,  has_real=False, do_augment=False, n_Aug=10)

In [59]:
paired_df['real'].value_counts()

real
2    539
1    506
Name: count, dtype: int64

In [60]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    # Join the tokens back into a cleaned string
    cleaned_text = ' '.join(lemmatized_tokens)
    return cleaned_text


def clean_df(df):
    df['cleaned_text_1'] = df['text_1'].apply(clean_text)
    df['cleaned_text_2'] = df['text_2'].apply(clean_text)
    return df

paired_df = clean_df(paired_df)

paired_df.head()


Unnamed: 0,text_1,text_2,real,cleaned_text_1,cleaned_text_2
0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,1,virsa visible infrared survey telescope array ...,china relay network released significant amoun...
1,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...,2,china goal project involves achieving accuracy...,project aim achieve accuracy level dex analyzi...
2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,1,scientist learn galaxy form evolve two method ...,dinosaur eggshell offer clue dinosaur ate long...
3,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...,2,china study suggests multiple star system play...,importance understanding star evolve led resea...
4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,2,dinosaur rex excited new toy set many dinosaur...,analyzing fast star rotate within galaxy compa...


In [61]:
test_df = clean_df(test_df)

In [63]:
def extract_bert_embeddings(text, device=None):
    if device is None:
        device = next(model.parameters()).device
    
    # Tokenize input text
    tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get BERT embeddings
    with torch.no_grad():
        if device.type == 'cuda':
            with torch.cuda.amp.autocast():  # Use automatic mixed precision
                outputs = model(**inputs)
        else:
            outputs = model(**inputs)
        # The last hidden state contains the embeddings
        embeddings = outputs.last_hidden_state.cpu()  # Move back to CPU for return

    return embeddings

class MyEmbeddingFunction(EmbeddingFunction):
    def __init__(self, model, tokenizer, device=None):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device if device is not None else next(model.parameters()).device

    def __call__(self, input: list) -> list:
        # input: list of strings
        embeddings = []
        
        # Process in batches for better GPU utilization
        batch_size = 16 if self.device.type == 'cuda' else 4
        
        for i in range(0, len(input), batch_size):
            batch_texts = input[i:i + batch_size]
            
            # Tokenize batch
            inputs = self.tokenizer(
                batch_texts, 
                return_tensors='pt', 
                truncation=True, 
                padding=True, 
                max_length=512
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            
            with torch.no_grad():
                if self.device.type == 'cuda':
                    with torch.cuda.amp.autocast():
                        outputs = self.model(**inputs)
                else:
                    outputs = self.model(**inputs)
                
                # Use the [CLS] token embedding as sentence embedding
                cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                
                for emb in cls_embeddings:
                    embeddings.append(emb.tolist())
        
        return embeddings

# Test the GPU-accelerated embedding function


# Clear GPU cache if using CUDA
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# documents = []
# for idx, row in paired_df.iterrows():
#     if str(row['cleaned_text_1']).strip():
#         documents.append({
#             "id": f"{idx}_1",
#             "content": row['cleaned_text_1'],
#             "metadata": {"real": row["real"] == 1}
#         })
#     if str(row['cleaned_text_2']).strip():
#         documents.append({
#             "id": f"{idx}_2",
#             "content": row['cleaned_text_2'],
#             "metadata": {"real": row["real"] == 2}
#         })

# # Delete the existing collection if it exists (to fix dimension mismatch)
# rebuild_collection = False
# if rebuild_collection:
#     vector_db_tmp = VectorDB(
#         collection_name="impostor_hunt_texts",
#         embedding_length=384,
#         working_dir=os.getcwd()
#     )
#     vector_db_tmp.delete_collection()

# embedding_function = MyEmbeddingFunction(model, tokenizer)


# # Initialize VectorDB (embedding_function can be left as None to use default)
# vector_db = VectorDB(
#     collection_name="impostor_hunt_texts",
#     embedding_length=768,
#     working_dir=os.getcwd(),
#     documents=documents,
#     dont_add_if_collection_exist=not rebuild_collection
# )

# vector_db.search("""ChromeDriver music player
# This study focused on identifying any non-spherical shapes within specific types of celestial bodies (music music) using various techniques like comparing how they look from different directions and analyzing their changes in sound pressure vs time .
# The extent to which these artists' images show evidence for an overall shape rather than individual tracks was found across multiple tracks:
# Two specific songs had clearly visible distortions due to their complex structure compared to others playing just simple beats
# This research found that while most recordings showed a relatively simple structure (like when you only see one instrument rather than an entire grand orchestra), some featured noticeable deviations from those expectations (like if there were multiple instruments playing at once). These results suggest there may be a correlation between how musicians program their compositions and how much curvature they chose for their soundscape — it seems as though tracks with more intricate arrangements tend towards greater complexity!
# Please note: This is just an example response based on your input text as I am not able access real world information such as music information or even what "music music" means without further context!
# Let me know if you want me to try working through some real world examples instead? I can also provide alternative ways I could rephrase your initial statement!""")

In [64]:
# --- Late Chunking for 'real' and 'not real' groups ---
real_docs = []
not_real_docs = []
for idx, row in paired_df.iterrows():
    text_1 = row['cleaned_text_1']
    text_2 = row['cleaned_text_2']
    # Only process if text_1 is a string and not empty
    if isinstance(text_1, str) and text_1.strip():
        doc = {
            "id": f"{idx}_1",
            "content": text_1,
            "metadata": {"real": row["real"] == 1}
        }
        if row["real"] == 1:
            real_docs.append(doc)
        else:
            not_real_docs.append(doc)
    # Only process if text_2 is a string and not empty
    if isinstance(text_2, str) and text_2.strip():
        doc = {
            "id": f"{idx}_2",
            "content": text_2,
            "metadata": {"real": row["real"] == 2}
        }
        if row["real"] == 2:
            real_docs.append(doc)
        else:
            not_real_docs.append(doc)

# Delete the existing collection if it exists (to fix dimension mismatch)
rebuild_collection = False
if rebuild_collection:
    vector_db_tmp = VectorDB(
        collection_name="impostor_hunt_texts",
        embedding_length=384,
        working_dir=os.getcwd()
    )
    vector_db_tmp.delete_collection()


# Add late chunked documents for both groups
vector_db_real = VectorDB(
    collection_name="impostor_hunt_texts_real",
    embedding_length=768,
    working_dir=os.getcwd(),
    # embedding_function=embedding_function
)

if rebuild_collection:
    vector_db_real.add_documents_with_late_chunking(real_docs, chunk_size=1500, chunk_overlap=200, max_context=8192)
    vector_db_real.add_documents_with_late_chunking(not_real_docs, chunk_size=1500, chunk_overlap=200, max_context=8192)

search_limit = 20

# count real/fake
def count_real_fake(results, search_limit):
    real_count = sum(1 for doc in results if doc['metadata']['real'])
    fake_count = len(results) - real_count
    return (real_count / search_limit)


In [65]:
def get_cls_embedding(text, device=None):
    if device is None:
        device = next(model.parameters()).device
    
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        if device.type == 'cuda':
            with torch.cuda.amp.autocast():
                outputs = model(**inputs)
        else:
            outputs = model(**inputs)
        cls_emb = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    return cls_emb

def get_features_gpu_optimized(df, vector_db_real, search_limit=20, batch_size=8):
    """GPU-optimized feature extraction with batching"""
    device = next(model.parameters()).device
    features = []
    labels = []
    
    # Prepare all texts for batch processing
    all_texts_1 = []
    all_texts_2 = []
    valid_indices = []
    
    for idx, row in df.iterrows():
        t1 = row['text_1']
        t2 = row['text_2']
        if isinstance(t1, str) and isinstance(t2, str):
            all_texts_1.append(row['cleaned_text_1'])
            all_texts_2.append(row['cleaned_text_2'])
            valid_indices.append(idx)
    
    print(f"Processing {len(valid_indices)} valid text pairs...")
    
    # Batch process embeddings for better GPU utilization
    all_emb1 = []
    all_emb2 = []
    
    # Process text_1 embeddings in batches
    for i in tqdm(range(0, len(all_texts_1), batch_size), desc="Processing text_1 embeddings"):
        batch_texts = all_texts_1[i:i + batch_size]
        inputs = tokenizer(
            batch_texts, 
            return_tensors='pt', 
            truncation=True, 
            padding=True, 
            max_length=512
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            if device.type == 'cuda':
                with torch.cuda.amp.autocast():
                    outputs = model(**inputs)
            else:
                outputs = model(**inputs)
            batch_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_emb1.extend(batch_emb)
    
    # Process text_2 embeddings in batches
    for i in tqdm(range(0, len(all_texts_2), batch_size), desc="Processing text_2 embeddings"):
        batch_texts = all_texts_2[i:i + batch_size]
        inputs = tokenizer(
            batch_texts, 
            return_tensors='pt', 
            truncation=True, 
            padding=True, 
            max_length=512
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            if device.type == 'cuda':
                with torch.cuda.amp.autocast():
                    outputs = model(**inputs)
            else:
                outputs = model(**inputs)
            batch_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_emb2.extend(batch_emb)
    
    # Clear GPU cache after batch processing
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    # Now process RAG scores and combine features
    for i, idx in tqdm(enumerate(valid_indices), desc="Extracting RAG scores and combining features"):
        row = df.iloc[idx]
        t1 = row['text_1']
        t2 = row['text_2']
        
        emb1 = all_emb1[i]
        emb2 = all_emb2[i]
        
        # Get RAG scores
        score1 = count_real_fake(vector_db_real.search(t1, limit=search_limit), search_limit)
        score2 = count_real_fake(vector_db_real.search(t2, limit=search_limit), search_limit)
        
        # Combine features
        feat = np.concatenate([emb1, emb2, [score1, score2], emb1-emb2])
        features.append(feat)
        
        if 'real' in row:
            labels.append(1 if row['real'] == 1 else 2)
    
    return np.array(features), np.array(labels)

# Keep the original function as backup
def get_features(df, vector_db_real, search_limit=20):
    features = []
    labels = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
        ct1 = row['cleaned_text_1']
        ct2 = row['cleaned_text_2']
        t1 = row['text_1']
        t2 = row['text_2']
        # Skip rows where t1 or t2 is not a string
        if not isinstance(t1, str) or not isinstance(t2, str):
            continue
        emb1 = get_cls_embedding(ct1)
        emb2 = get_cls_embedding(ct2)
        score1 = count_real_fake(vector_db_real.search(t1, limit=search_limit), search_limit)
        score2 = count_real_fake(vector_db_real.search(t2, limit=search_limit), search_limit)
        feat = np.concatenate([emb1, emb2, [score1, score2], emb1-emb2])
        features.append(feat)
        if 'real' in row:
            labels.append(1 if row['real'] == 1 else 2)
    return np.array(features), np.array(labels)

In [66]:
# --- Prepare train/test features ---
X_train, y_train = get_features(paired_df, vector_db_real, search_limit=20)
X_test, _ = get_features(test_df, vector_db_real, search_limit=20)

Extracting features: 100%|██████████| 1045/1045 [10:46<00:00,  1.62it/s]
Extracting features: 100%|██████████| 1045/1045 [10:46<00:00,  1.62it/s]
Extracting features: 100%|██████████| 1068/1068 [08:36<00:00,  2.07it/s]
Extracting features: 100%|██████████| 1068/1068 [08:36<00:00,  2.07it/s]


In [68]:
X_train[0].shape

(2306,)

In [16]:
# # Rename to better reflect what this classifier actually does
# class FeatureClassifier(torch.nn.Module):
#     """
#     Multi-layer neural network for classification using pre-extracted features:
#     - BERT embeddings (768 * 2 = 1536 dims)
#     - RAG similarity scores (2 dims) 
#     - Embedding differences (768 dims)
#     Total input size: 2306 features
#     """
#     def __init__(self, input_size, hidden_size=512, num_classes=2, dropout=0.3):
#         super(FeatureClassifier, self).__init__()
#         self.fc1 = torch.nn.Linear(input_size, hidden_size)
#         self.fc2 = torch.nn.Linear(hidden_size, hidden_size // 2)
#         self.fc3 = torch.nn.Linear(hidden_size // 2, num_classes)
#         self.dropout = torch.nn.Dropout(dropout)
#         self.relu = torch.nn.ReLU()
        
#     def forward(self, x):
#         x = self.relu(self.fc1(x))
#         x = self.dropout(x)
#         x = self.relu(self.fc2(x))
#         x = self.dropout(x)
#         x = self.fc3(x)
#         return x

# # Initialize the feature classifier
# input_size = X_train.shape[1]  # Should be 2306
# print(f"Input feature size: {input_size}")
# feature_classifier = FeatureClassifier(input_size, hidden_size=512, num_classes=2, dropout=0.3).to(device)

# # Training setup
# from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import accuracy_score, classification_report
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, TensorDataset

# # Convert labels to binary (0, 1) for binary classification
# label_encoder = LabelEncoder()
# y_train_encoded = label_encoder.fit_transform(y_train - 1)  # Convert 1,2 to 0,1

# # Convert to tensors
# X_train_tensor = torch.FloatTensor(X_train)
# y_train_tensor = torch.LongTensor(y_train_encoded)
# X_test_tensor = torch.FloatTensor(X_test)

# # Create datasets and dataloaders
# train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# # Training parameters
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(feature_classifier.parameters(), lr=0.001, weight_decay=1e-5)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

# # Training loop
# num_epochs = 50
# best_loss = float('inf')
# patience = 10
# patience_counter = 0

# print(f"Training feature classifier on {len(X_train)} samples...")
# feature_classifier.train()
# for epoch in range(num_epochs):
#     total_loss = 0
#     for batch_x, batch_y in train_loader:
#         batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        
#         optimizer.zero_grad()
#         outputs = feature_classifier(batch_x)
#         loss = criterion(outputs, batch_y)
#         loss.backward()
#         optimizer.step()
        
#         total_loss += loss.item()
    
#     avg_loss = total_loss / len(train_loader)
#     scheduler.step(avg_loss)
    
#     if (epoch + 1) % 10 == 0:
#         print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')
    
#     # Early stopping
#     if avg_loss < best_loss:
#         best_loss = avg_loss
#         patience_counter = 0
#     else:
#         patience_counter += 1
#         if patience_counter >= patience:
#             print(f"Early stopping at epoch {epoch+1}")
#             break

# # Evaluate on test set
# feature_classifier.eval()
# with torch.no_grad():
#     X_test_device = X_test_tensor.to(device)
#     test_outputs = feature_classifier(X_test_device)
#     test_probabilities = torch.softmax(test_outputs, dim=1)
#     y_pred = torch.argmax(test_outputs, dim=1).cpu().numpy()

# # Convert predictions back to original labels (1, 2)
# y_pred_original = label_encoder.inverse_transform(y_pred) + 1

# print(f"Test predictions shape: {y_pred_original.shape}")
# print(f"Unique predictions: {np.unique(y_pred_original)}")

# def ensemble_predict_features(test_df, vector_db_real, feature_classifier, X_test, device, search_limit=20, alpha=0.5):
#     """
#     Enhanced ensemble prediction using the trained feature classifier.
#     Combines classifier probabilities with RAG scores.
#     """
#     results = []
#     feature_classifier.eval()
    
#     # Get all classifier probabilities at once
#     with torch.no_grad():
#         X_test_tensor = torch.FloatTensor(X_test).to(device)
#         classifier_probs = torch.softmax(feature_classifier(X_test_tensor), dim=1).cpu().numpy()
    
#     # Create a mapping from DataFrame index to prediction array index
#     valid_indices = []
#     for idx, row in test_df.iterrows():
#         t1 = row['text_1']
#         t2 = row['text_2']
#         if isinstance(t1, str) and isinstance(t2, str):
#             valid_indices.append(idx)
    
#     print(f"Processing {len(valid_indices)} valid samples out of {len(test_df)} total samples")
#     print(f"Classifier predictions array size: {classifier_probs.shape[0]}")
    
#     # Create mapping from DataFrame index to prediction array index
#     idx_to_pred_idx = {df_idx: pred_idx for pred_idx, df_idx in enumerate(valid_indices)}
    
#     for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Feature Ensemble"):
#         t1 = row['cleaned_text_1']
#         t2 = row['cleaned_text_2']
        
#         # Check if this row has valid predictions
#         if idx not in idx_to_pred_idx:
#             # Handle invalid samples by using simple RAG-based prediction
#             score1 = count_real_fake(vector_db_real.search(str(t1), limit=search_limit), search_limit)
#             score2 = count_real_fake(vector_db_real.search(str(t2), limit=search_limit), search_limit)
#             predicted_real = 1 if score1 >= score2 else 2
#         else:
#             # RAG scores
#             score1 = count_real_fake(vector_db_real.search(t1, limit=search_limit), search_limit)
#             score2 = count_real_fake(vector_db_real.search(t2, limit=search_limit), search_limit)
            
#             # Get classifier probabilities using the correct mapping
#             pred_idx = idx_to_pred_idx[idx]
#             classifier_prob = classifier_probs[pred_idx]
            
#             # Combine classifier predictions with RAG scores
#             # classifier_prob[0] = probability that text_1 is real
#             # classifier_prob[1] = probability that text_2 is real
#             combined_1 = alpha * classifier_prob[0] + (1-alpha) * score1
#             combined_2 = alpha * classifier_prob[1] + (1-alpha) * score2
            
#             predicted_real = 1 if combined_1 >= combined_2 else 2
        
#         results.append({'id': idx, 'real_text_id': predicted_real})
    
#     return pd.DataFrame(results)

# # Generate ensemble predictions
# ensemble_df = ensemble_predict_features(test_df, vector_db_real, feature_classifier, X_test, device, search_limit=20, alpha=0.5)
# ensemble_df.to_csv("feature_ensemble_predictions.csv", index=False)
# print("Feature ensemble predictions saved!")
# ensemble_df.head()

In [17]:


# # --- GridSearchCV on the training split ---
# param_grid = {
#     'C': [0.1, 1, 10, 100],
#     'kernel': ['linear', 'rbf', 'poly'],
#     'gamma': ['scale', 'auto']
# }

# svm = SVC()
# grid_search = GridSearchCV(
#     svm, param_grid, cv=10, scoring='accuracy', verbose=1, n_jobs=-1
# )

# grid_search.fit(X_train, y_train)

# print("\nBest params:", grid_search.best_params_)
# print("Best CV accuracy: {:.4f}".format(grid_search.best_score_))

# # --- Final Model: Predict on Test Data (unlabeled) ---
# y_pred = grid_search.predict(X_test)

# # Build DataFrame like your example
# pred_df = pd.DataFrame({
#     "id": range(len(y_pred)),
#     "real_text_id": y_pred
# })

# # Save to CSV if needed
# pred_df.to_csv("submission1.csv", index=False)
# print(pred_df.head(20))


In [32]:
train_df, val_df=train_test_split(paired_df, test_size=0.2, random_state=42, stratify=paired_df['real'])
val_df.shape

(209, 5)

In [47]:
train_df.head()

Unnamed: 0,text_1,text_2,real,cleaned_text_1,cleaned_text_2
305,[dioramas is designed to observe faint astrono...,[visible and near infrared ( nir ) coverage ha...,1,,
838,[the primary mirror design of the european ext...,[the primary mirror design for the european ex...,1,,
605,"[from the glittering expanse of the cosmos, ta...",[the only modification made to the system was ...,2,,
387,[# # cosmic alchemy : the vlt survey telescope...,[presently the turin vlt survey telescope ( vs...,2,,
423,[the * * wrybeasts * * in the * * o observator...,[the azimuth drive system experienced problems...,2,,


In [None]:
model=DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2)

input= X_train

with torch.no_grad():
    logits=model(**input).logits

predicted_class_id=logits.argmax().item()
model.config.id2label[predicted_class_id]

labels = torch.tensor([1])
loss = model(**input, labels=labels).loss
round(loss.item(), 2)


config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): DebertaIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): DebertaOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): DebertaLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (rel_embeddings): Embedding(1024, 768)
    )
  )
  (pooler): ContextPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
) argument after ** must be a mapping, not numpy.ndarray

model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

In [None]:
model=DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base")

# Load tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

# Example: use a sample from your paired_df for inference
sample_text_1 = paired_df.iloc[0]["cleaned_text_1"] if "cleaned_text_1" in paired_df.columns else paired_df.iloc[0]["text_1"]
inputs = tokenizer(sample_text_1, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits
predicted_class_id = logits.argmax().item()
print(f"Predicted class: {model.config.id2label[predicted_class_id]}")

# Training: set num_labels to match your task
num_labels = len(model.config.id2label)
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=num_labels)

# Example label (replace with your actual label from paired_df)
label = torch.tensor([paired_df.iloc[0]["real"]])
loss = model(**inputs, labels=label).loss
print(f"Loss: {round(loss.item(), 2)}")

In [69]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
inputs = tokenizer("Hi I am vishnu!", return_tensors='pt', truncation=True, padding=True, max_length=512)
inputs

{'input_ids': tensor([[    1, 30086,    38,   524,   748,  1173, 18373,   328,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}