In [None]:
#Import & Setup
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast


In [None]:
# Load and merge all datasets

location_relative_path = "../data/processed/cleaned/"

# Every file is read the same way: pipe-delimited, first line skipped
# (presumably a header row -- confirm against the files) and the two columns
# named "text" and "manipulative".
df1, df2, df3, df4 = (
    pd.read_csv(
        location_relative_path + file_name,
        sep="|",
        skiprows=1,
        names=["text", "manipulative"],
    )
    for file_name in (
        "clickbait_data.txt",
        "liar_train.txt",
        "mentalmanip_con.txt",
        "tweets.txt",
    )
)

# Stack the four frames on top of each other under a fresh 0..n-1 index
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

# Cast the label column to int
df["manipulative"] = df["manipulative"].astype(int)

# Report the merged size; the trailing expression renders the first rows
print(f"Total samples: {len(df)}")
df.head()


In [None]:
# Train/Test Split
# 20% of the rows are held out for testing; random_state=42 fixes the shuffle
# so the same partition is produced on every run.
all_texts = df["text"].tolist()
all_labels = df["manipulative"].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

# Print summary (one line per count)
print(
    f"Total samples: {len(df)}",
    f"Training samples: {len(train_texts)}",
    f"Testing samples: {len(test_texts)}",
    sep="\n",
)

# Preview a few training examples: text, its label, then a separator
print("\nSample training texts and labels:")
for i in range(3):
    print(
        f"Text: {train_texts[i]}",
        f"Manipulative: {train_labels[i]}",
        "---",
        sep="\n",
    )

In [None]:
# Tokenization
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Both splits are encoded with the same options. Each split is tokenized in
# its own call, so padding is applied per split rather than across both.
tokenize_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(train_texts, **tokenize_options)
test_encodings = tokenizer(test_texts, **tokenize_options)

# Print summary: which fields the tokenizer produced for each split
print(
    f"Training encodings keys: {list(train_encodings.keys())}",
    f"Test encodings keys: {list(test_encodings.keys())}",
    sep="\n",
)

# Preview a few training samples (first 10 positions of each)
print("\nSample tokenized training input_ids:")
for i in range(3):
    line = f"Sample {i+1}: {train_encodings['input_ids'][i][:10]}"
    print(line)

print("\nSample attention masks:")
for i in range(3):
    line = f"Sample {i+1}: {train_encodings['attention_mask'][i][:10]}"
    print(line)


In [None]:
#Wrap as Torch Dataset
class ManipulationDataset(torch.utils.data.Dataset):
    """Dataset pairing per-sample encodings with their labels.

    Args:
        encodings: Mapping from field name to a sequence indexable by sample
            position (in this notebook, the tokenizer output).
        labels: Sequence indexable by sample position; its length is the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many samples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every field of ``encodings`` is converted to a tensor under its own
        key, and the label is added under ``'labels'``.
        """
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Instantiate datasets: one per split, each pairing encodings with labels
train_dataset = ManipulationDataset(encodings=train_encodings, labels=train_labels)
test_dataset = ManipulationDataset(encodings=test_encodings, labels=test_labels)

# Print summary
print(
    f"Train dataset size: {len(train_dataset)}",
    f"Test dataset size: {len(test_dataset)}",
    sep="\n",
)

# Preview a few samples from train_dataset
print("\nSample training items:")
for i in range(3):
    sample = train_dataset[i]
    for key, val in sample.items():
        if val.dim() == 0:
            # 0-d tensor (e.g. the label): show the bare Python value
            print(f"{key}: {val.item()}")
        else:
            # Non-scalar tensor: show its shape and the first 10 entries
            print(f"{key}: {val.shape} | {val[:10]}")
    print("---")

In [None]:
#(Optional) Save Datasets

# Plain string literal: nothing is interpolated, so no f-string is needed.
# Kept as a str (not a Path) because file paths are built from it with `+`.
tokenized_relative_path = "../data/processed/tokenized/"

# torch.save does not create missing parent directories; make sure the output
# folder exists first (no-op when it is already there).
os.makedirs(tokenized_relative_path, exist_ok=True)

# The whole dataset objects are serialized, so reading these files back needs
# the ManipulationDataset class to be defined in the loading session.
torch.save(train_dataset, tokenized_relative_path + "train_dataset.pt")
torch.save(test_dataset, tokenized_relative_path + "test_dataset.pt")


In [None]:
# Load the dataset
# Read back the saved *test* split so the file matches the variable name
# (tokenized_relative_path is defined in the save cell).
# NOTE(security): weights_only=False performs full unpickling, which can run
# arbitrary code -- only load files written by this notebook / trusted sources.
loaded_test_dataset = torch.load(tokenized_relative_path + "test_dataset.pt", weights_only=False)

# View a sample
sample = loaded_test_dataset[0]
for key, val in sample.items():
    if val.dim() > 0:
        # Non-scalar tensor: show its shape and the first 10 entries
        print(f"{key}: {val.shape} | {val[:10]}")
    else:
        print(f"{key}: {val.item()}")  # For scalar tensors like labels
