In [1]:
!pip install torch transformers datasets scikit-learn pandas tqdm




In [4]:
import torch
import transformers
import datasets
import sklearn
import pandas as pd
print("All packages installed successfully!")


All packages installed successfully!


In [5]:
# Sanity check: count the physical lines of the dev file before any JSON parsing.
with open("shared_task_dev.jsonl", mode="r", encoding="utf-8") as f:
    content = list(f)  # one string per line, trailing newline kept
print(f"Total lines in file: {len(content)}")


Total lines in file: 19999


In [13]:
import pandas as pd
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load dataset with error handling
def load_jsonl_subset(file_path, sample_size=20000):
    """Read a JSON Lines file and return a reproducible random sample of it.

    Blank lines are ignored. Lines that fail to parse as JSON are reported on
    stdout (1-based line number plus the first 100 characters) and skipped.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file.
        sample_size: Upper bound on the number of rows returned; if fewer
            valid records exist, all of them are returned.

    Returns:
        pandas.DataFrame holding the sampled records (``random_state=42``).

    Raises:
        ValueError: If not a single valid record was found.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for i, raw in enumerate(file):
            line = raw.strip()
            if line:
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError:
                    print(f"Skipping invalid JSON at line {i + 1}: {line[:100]}")
    if len(records) == 0:
        raise ValueError(f"No valid data found in {file_path}")
    df = pd.DataFrame(records)
    n_rows = min(sample_size, len(df))
    return df.sample(n=n_rows, random_state=42)

# Input files (JSONL, one record per line)
train_file_path = "train (1).jsonl"
dev_file_path = "shared_task_dev.jsonl"

# Sample up to 20,000 training records and up to 5,000 dev records
train_df = load_jsonl_subset(train_file_path, sample_size=20000)
dev_df = load_jsonl_subset(dev_file_path, sample_size=5000)
print("loaded datasets")

# Binary target: rows whose label is not a key of label_map are dropped,
# then SUPPORTS is encoded as 1 and REFUTES as 0
label_map = {"SUPPORTS": 1, "REFUTES": 0}
train_df = train_df[train_df['label'].isin(label_map)].copy()
train_df['label'] = train_df['label'].map(label_map)
dev_df = dev_df[dev_df['label'].isin(label_map)].copy()
dev_df['label'] = dev_df['label'].map(label_map)

# The claim text is the only model input
train_texts = train_df['claim'].tolist()
train_labels = train_df['label'].tolist()
dev_texts = dev_df['claim'].tolist()
dev_labels = dev_df['label'].tolist()
print("extracting texts and labels")

# Tokenize each split in one call: truncate at 512 tokens, pad to the longest
# sequence of the split, return PyTorch tensors
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(
    train_texts,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt",
)
dev_encodings = tokenizer(
    dev_texts,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt",
)

# Wrap the encodings and labels as Hugging Face Datasets (tensors converted to lists)
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"].tolist(),
    "attention_mask": train_encodings["attention_mask"].tolist(),
    "labels": train_labels,
})
dev_dataset = Dataset.from_dict({
    "input_ids": dev_encodings["input_ids"].tolist(),
    "attention_mask": dev_encodings["attention_mask"].tolist(),
    "labels": dev_labels,
})
print("converted to hugging face")

# Pretrained BERT encoder with a two-class classification head
bert_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Trainer configuration: evaluate once per epoch, write no checkpoints
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="no",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
)

# Compute metrics
def compute_metrics(pred):
    """Turn a Trainer prediction object into accuracy / F1 / precision / recall.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``;
        precision, recall and F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(y_true, y_hat, average='binary')
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": scores[2],
        "precision": scores[0],
        "recall": scores[1],
    }

# Fine-tune BERT on the training split, scoring the dev split via compute_metrics
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
print("BERT model training complete.")


Skipping invalid JSON at line 1: q
loaded datasets
extracting texts and labels
converted to hugging face


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
Using TF-IDF features with SVM and Random Forest classifiers

File contains data.
q
{"id": 91198, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.", "evidence": [[[108548, null, null, null]]]}
{"id": 194462, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Tilda Swinton is a vegan.", "evidence": [[[227768, null, null, null]]]}
{"id": 137334, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Fox 2000 Pictures released the film Soul Food.", "evidence": [[[289914, 283015, "Soul_Food_-LRB-film-RRB-", 0]], [[291259, 284217, "Soul_Food_-LRB-film-RRB-", 0]], [[293412, 285960, "Soul_Food_-LRB-film-RRB-", 0]], [[337212, 322620, "Soul_Food_-LRB-film-RRB-", 0]], [[337214, 322622, "Soul_Food_-LRB-film-RRB-", 0]]]}
{"id": 166626, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Anne Rice was born in New Jersey.", "evidence": [[[191656, null, null, null], [191657, null, null, null]]

In [16]:
import pandas as pd
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

# Load JSONL file with a subset for processing
def load_jsonl_subset(file_path, sample_size):
    """Load a JSON Lines file and return a reproducible random sample of its rows.

    Blank (or whitespace-only) lines are ignored silently. Lines that are not
    valid JSON are reported on stdout (first 100 characters) and skipped.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file.
        sample_size: Maximum number of rows to return. When the file holds
            fewer valid records, all of them are returned (in shuffled order).

    Returns:
        pandas.DataFrame with one row per sampled record, drawn with
        ``random_state=42``.

    Raises:
        ValueError: If the file contains no valid JSON record.
    """
    print(f"Loading dataset from {file_path}...")
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # An empty line is not a record; without this guard json.loads("")
                # raises and every blank line is reported as "invalid JSON".
                continue
            try:
                data.append(json.loads(line))  # Parse JSON line
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON line: {line[:100]}")
    if not data:
        raise ValueError(f"Error: No valid data found in {file_path}")
    df = pd.DataFrame(data)
    df = df.sample(n=min(sample_size, len(df)), random_state=42)  # Random sampling
    print(f"Loaded {len(df)} rows from {file_path}")
    return df

# Input files
train_file_path = "train (1).jsonl"
dev_file_path = "shared_task_dev.jsonl"

# Sample up to 20,000 rows for training and up to 5,000 rows for evaluation
train_df = load_jsonl_subset(train_file_path, sample_size=20000)
dev_df = load_jsonl_subset(dev_file_path, sample_size=5000)

# Binary encoding of the target: SUPPORTS -> 1, REFUTES -> 0
label_map = {"SUPPORTS": 1, "REFUTES": 0}

# Per split: drop every row whose label is not a key of label_map, then encode
train_df = train_df[train_df['label'].isin(label_map)].copy()
train_df['label'] = train_df['label'].map(label_map)
dev_df = dev_df[dev_df['label'].isin(label_map)].copy()
dev_df['label'] = dev_df['label'].map(label_map)

print("Preprocessing complete. Preparing text data...")

# The claim text is the only feature source
train_texts = train_df['claim'].tolist()
train_labels = train_df['label'].tolist()
dev_texts = dev_df['claim'].tolist()
dev_labels = dev_df['label'].tolist()

# TF-IDF over unigrams and bigrams, capped at 20,000 features.
# The vocabulary is fitted on the training texts only and reused for the dev texts.
print("Applying TF-IDF vectorization...")
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
X_train = vectorizer.fit_transform(train_texts)
X_dev = vectorizer.transform(dev_texts)
y_train = np.array(train_labels)
y_dev = np.array(dev_labels)

print("TF-IDF vectorization complete. Training models...")

# Linear-kernel SVM (probability estimates enabled)
print("Training SVM model...")
svm_model = SVC(kernel='linear', probability=True).fit(X_train, y_train)
print("SVM training complete.")

# Random forest with 100 trees, fixed seed, all CPU cores
print("Training Random Forest model...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(X_train, y_train)
print("Random Forest training complete.")

# Evaluate both models
def evaluate_model(model, X, y, model_name):
    """Score a fitted classifier on a labelled set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True labels aligned with ``X``.
        model_name: Display name used in the progress messages.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``;
        precision, recall and F1 use ``average='binary'``.
    """
    print(f"Evaluating {model_name}...")
    predictions = model.predict(X)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y, predictions, average='binary'
    )
    metrics = {
        "accuracy": accuracy_score(y, predictions),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
    print(f"{model_name} Evaluation Complete.")
    return metrics

# Score both classifiers on the dev split
svm_results = evaluate_model(svm_model, X_dev, y_dev, "SVM")
rf_results = evaluate_model(rf_model, X_dev, y_dev, "Random Forest")

# One row per model, one column per metric
print("Model evaluation complete. Displaying results:")
results_df = pd.DataFrame(
    data=[svm_results, rf_results],
    index=["SVM", "Random Forest"],
)
print(results_df)


Loading dataset from train (1).jsonl...
Loaded 20000 rows from train (1).jsonl
Loading dataset from shared_task_dev.jsonl...
Skipping invalid JSON line: q
Loaded 5000 rows from shared_task_dev.jsonl
Preprocessing complete. Preparing text data...
Applying TF-IDF vectorization...
TF-IDF vectorization complete. Training models...
Training SVM model...
SVM training complete.
Training Random Forest model...
Random Forest training complete.
Evaluating SVM...
SVM Evaluation Complete.
Evaluating Random Forest...
Random Forest Evaluation Complete.
Model evaluation complete. Displaying results:
               accuracy  precision    recall        f1
SVM            0.679380   0.615268  0.965599  0.751616
Random Forest  0.683552   0.621495  0.946619  0.750353


In [None]:
import json

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

# Load JSONL file with a subset for processing
def load_jsonl_subset(file_path, sample_size):
    """Load a JSON Lines file and return a reproducible random sample of its rows.

    Blank (or whitespace-only) lines are ignored silently. Lines that are not
    valid JSON are reported on stdout (first 100 characters) and skipped.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file.
        sample_size: Maximum number of rows to return. When the file holds
            fewer valid records, all of them are returned (in shuffled order).

    Returns:
        pandas.DataFrame with one row per sampled record, drawn with
        ``random_state=42``.

    Raises:
        ValueError: If the file contains no valid JSON record.
    """
    print(f"Loading dataset from {file_path}...")
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # An empty line is not a record; without this guard json.loads("")
                # raises and every blank line is reported as "invalid JSON".
                continue
            try:
                data.append(json.loads(line))  # Parse JSON line
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON line: {line[:100]}")
    if not data:
        raise ValueError(f"Error: No valid data found in {file_path}")
    df = pd.DataFrame(data)
    df = df.sample(n=min(sample_size, len(df)), random_state=42)  # Random sampling
    print(f"Loaded {len(df)} rows from {file_path}")
    return df

# Input files
train_file_path = "train (1).jsonl"
dev_file_path = "shared_task_dev.jsonl"

# Sample up to 20,000 rows for training and up to 5,000 rows for evaluation
train_df = load_jsonl_subset(train_file_path, sample_size=20000)
dev_df = load_jsonl_subset(dev_file_path, sample_size=5000)

# Binary encoding of the target: SUPPORTS -> 1, REFUTES -> 0
label_map = {"SUPPORTS": 1, "REFUTES": 0}

# Per split: drop every row whose label is not a key of label_map, then encode
train_df = train_df[train_df['label'].isin(label_map)].copy()
train_df['label'] = train_df['label'].map(label_map)
dev_df = dev_df[dev_df['label'].isin(label_map)].copy()
dev_df['label'] = dev_df['label'].map(label_map)

print("Preprocessing complete. Preparing text data...")

# The claim text is the only feature source
train_texts = train_df['claim'].tolist()
train_labels = train_df['label'].tolist()
dev_texts = dev_df['claim'].tolist()
dev_labels = dev_df['label'].tolist()

# TF-IDF with sublinear term frequency over 1- to 3-grams, capped at 50,000
# features, English stop words removed.
# NOTE(review): the built-in English stop list appears to include negations such
# as "not" and "no"; confirm that dropping them does not hurt this task.
print("Applying TF-IDF vectorization...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=50000,
    stop_words="english",
    sublinear_tf=True,
)
# Vocabulary is fitted on the training texts only and reused for the dev texts
X_train = vectorizer.fit_transform(train_texts)
X_dev = vectorizer.transform(dev_texts)
y_train = np.array(train_labels)
y_dev = np.array(dev_labels)

print("TF-IDF vectorization complete. Balancing dataset...")

# Keep a handle on the original (un-resampled) training data for cross-validation.
# SMOTE must only see the training part of each CV fold: oversampling before the
# split puts synthetic points derived from held-out samples into the validation
# folds, which inflates the grid-search scores and biases the choice of C.
X_train_raw, y_train_raw = X_train, y_train

# Oversampled copy of the full training set; used to fit the Random Forest below.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_raw, y_train_raw)

# Optimize SVM with Grid Search. SMOTE is a pipeline step, so it is re-fitted on
# the training portion of every fold and skipped when predicting.
print("Optimizing SVM model...")
svm_pipeline = ImbPipeline([
    ("smote", SMOTE(random_state=42)),
    ("svc", SVC(kernel='linear', probability=True)),
])
svm_params = {'svc__C': [0.01, 0.1, 1, 10]}  # "svc__" routes C to the SVC step
svm_grid = GridSearchCV(svm_pipeline, svm_params, cv=3, scoring='f1', n_jobs=-1)
svm_grid.fit(X_train_raw, y_train_raw)
# best_estimator_ is the whole pipeline refitted on all training data
# (SMOTE + SVC with the selected C); its predict() accepts TF-IDF matrices directly.
svm_model = svm_grid.best_estimator_
print(f"Best SVM parameters: {svm_grid.best_params_}")

# Train optimized Random Forest on the oversampled training set
print("Training optimized Random Forest model...")
rf_model = RandomForestClassifier(
    n_estimators=300,  # More trees
    max_depth=50,  # Limit depth
    min_samples_split=5,  # Prevent overfitting
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)
print("Random Forest training complete.")

# Evaluate models
def evaluate_model(model, X, y, model_name):
    """Score a fitted classifier on a labelled set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True labels aligned with ``X``.
        model_name: Display name used in the progress messages.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``;
        precision, recall and F1 use ``average='binary'``.
    """
    print(f"Evaluating {model_name}...")
    predictions = model.predict(X)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y, predictions, average='binary'
    )
    metrics = {
        "accuracy": accuracy_score(y, predictions),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
    print(f"{model_name} Evaluation Complete.")
    return metrics

# Score both classifiers on the dev split
svm_results = evaluate_model(svm_model, X_dev, y_dev, "SVM")
rf_results = evaluate_model(rf_model, X_dev, y_dev, "Random Forest")

# One row per model, one column per metric
print("Model evaluation complete. Displaying results:")
results_df = pd.DataFrame(
    data=[svm_results, rf_results],
    index=["SVM", "Random Forest"],
)
print(results_df)
