In [None]:
from google.colab import files

In [None]:
# Ask Colab for a file upload and keep the result in `uploaded`.
# The recorded output of this cell shows training_all_columnns.csv being saved.
uploaded = files.upload()

Saving training_all_columnns.csv to training_all_columnns.csv


In [None]:
import os
from datetime import datetime, timezone

import pandas as pd

# Helper function to convert a date column to UTC
# Assuming input date format for UFC is a Unix timestamp (in seconds)
def convert_to_utc(date, is_ufc):
    """Normalize one timestamp value to a ``'%Y-%m-%d %H:%M:%S'`` string.

    Args:
        date: The raw cell value. When ``is_ufc`` is true it is interpreted
            as a Unix timestamp in seconds (anything ``int()`` accepts);
            otherwise it must already be a ``'%Y-%m-%d %H:%M:%S'`` string.
        is_ufc: Selects the Unix-timestamp branch when true.

    Returns:
        The formatted string (UTC for the timestamp branch), or ``date``
        unchanged when it cannot be converted. Conversion is best-effort:
        a bad cell is passed through rather than raised.
    """
    fmt = '%Y-%m-%d %H:%M:%S'
    try:
        if is_ufc:
            # Timezone-aware replacement for the deprecated
            # datetime.utcfromtimestamp(); the formatted text is the same.
            return datetime.fromtimestamp(int(date), tz=timezone.utc).strftime(fmt)
        # Round-trip through strptime so only well-formed dates are accepted.
        return datetime.strptime(date, fmt).strftime(fmt)
    except (ValueError, TypeError, OverflowError, OSError):
        # OverflowError/OSError cover infinite or out-of-range timestamps,
        # which would otherwise abort the whole column conversion.
        return date

# Function to process files and create the merged output
def merge_csv_files(file1, file2, output_file):
    """Label two post CSVs, normalize their timestamp column, and write one CSV.

    Rows from a file whose basename contains ``'ct_balanced_posts.csv'`` get
    ``label`` 0; rows from any other file get ``label`` 1. A ``created_ufc``
    column is converted with ``convert_to_utc(value, True)`` into
    ``created_utc`` and then dropped; otherwise an existing ``created_utc``
    column is passed through ``convert_to_utc(value, False)``.

    Args:
        file1: Path of the first input CSV.
        file2: Path of the second input CSV.
        output_file: Path the concatenated CSV is written to (index omitted).
    """

    def _load_labeled(path):
        # Read one input file and tag every row with its class label.
        frame = pd.read_csv(path)
        is_ct = 'ct_balanced_posts.csv' in os.path.basename(path)
        frame['label'] = 0 if is_ct else 1

        column_names = frame.columns
        if 'created_ufc' in column_names:
            # Unix-timestamp column: convert, store as created_utc, drop the source.
            frame['created_utc'] = frame['created_ufc'].apply(convert_to_utc, args=(True,))
            frame = frame.drop(columns=['created_ufc'])
        elif 'created_utc' in column_names:
            # Already formatted dates: re-validate them in place.
            frame['created_utc'] = frame['created_utc'].apply(convert_to_utc, args=(False,))
        return frame

    frames = [_load_labeled(path) for path in (file1, file2)]

    # Stack both inputs with a fresh index and persist without the index column.
    pd.concat(frames, ignore_index=True).to_csv(output_file, index=False)

# The two class-specific inputs and the combined training CSV they produce
file1, file2 = 'ct_balanced_posts.csv', 'sz_balanced_posts.csv'
output_file = 'training_all_columnns.csv'

# Merge, then report where the result was written
merge_csv_files(file1, file2, output_file)
print(f"Merged file saved as: {output_file}")

Merged file saved as: training_all_columnns.csv


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
!pip install optuna
!pip install datasets
import optuna
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the BERT tokenizer
model_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Load dataset
# The upload and merge cells both produce 'training_all_columnns.csv'
# (three n's); read that exact name so a fresh Run All finds the file.
df = pd.read_csv('training_all_columnns.csv')  # Replace with the path to your merged CSV file
def combine_columns(row):
    """Join a row's ``title`` and ``body`` with a single space.

    A missing value (as judged by pandas) contributes an empty string, so
    the separating space is always present in the result.
    """

    def _text_or_empty(field):
        value = row[field]
        return '' if pd.isna(value) else value

    return _text_or_empty('title') + ' ' + _text_or_empty('body')

# Build the model input: one 'text' value per row from combine_columns
df['text'] = df.apply(combine_columns, axis=1)
df = df[['label', 'text']]  # Ensure the dataframe only contains the required columns

# NOTE(review): this is not the last expression of the cell, so the preview is
# presumably never displayed — move it to its own cell if it is meant to be seen.
df.head()

# Split dataset into training and testing
# 20% of the rows are held out; random_state=42 keeps the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization function
def tokenize_data(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(examples['text'], **encode_options)

def _to_model_dataset(frame):
    """Turn a label/text dataframe into a tokenized, torch-formatted Dataset."""
    # Convert the pandas dataframe to a Hugging Face dataset and tokenize it
    dataset = Dataset.from_pandas(frame)
    dataset = dataset.map(tokenize_data, batched=True)

    # The raw string column is no longer needed; the target column becomes "labels"
    dataset = dataset.remove_columns(["text"])
    dataset = dataset.rename_column("label", "labels")

    dataset.set_format("torch")
    return dataset


# Same preparation for both splits
train_dataset = _to_model_dataset(train_df)
test_dataset = _to_model_dataset(test_df)

# Define the compute_metrics function
def compute_metrics(pred):
    """Compute binary-classification metrics for a Trainer evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the argmax over dim 1 is
            taken as the predicted class).

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1`` and the four
        confusion-matrix counts (``true_positives``, ``false_positives``,
        ``true_negatives``, ``false_negatives``) as plain Python ints.
    """
    labels = pred.label_ids
    preds = torch.argmax(torch.tensor(pred.predictions), dim=1).numpy()

    # Standard metrics
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')

    # Confusion matrix for additional metrics.
    # Pinning labels=[0, 1] keeps the matrix 2x2 even when only one class
    # appears in this batch of labels/predictions; without it the matrix
    # collapses to 1x1 and the four-way unpack below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        # Cast the numpy integers to built-in ints so the metrics dict
        # contains only plain Python numbers for the counts.
        'true_positives': int(tp),
        'false_positives': int(fp),
        'true_negatives': int(tn),
        'false_negatives': int(fn)
    }

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: fine-tune a fresh BERT classifier and score it.

    Each trial samples a learning rate (log scale, 1e-5 to 5e-5) and an epoch
    count (2 to 5), trains a new ``BertForSequenceClassification`` on
    ``train_dataset`` and evaluates it once on ``test_dataset``.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The ``eval_accuracy`` value reported by ``trainer.evaluate()``.
    """
    # Suggest values for the hyperparameters.
    # suggest_float(..., log=True) samples the same log-uniform range as the
    # deprecated suggest_loguniform without emitting its warning every trial.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 5)

    # Load a fresh model for each trial so no weights carry over between trials
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy="no",  # No evaluation during training; one explicit evaluate() below
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        learning_rate=learning_rate,
        save_strategy="no",  # Disable checkpoint saving
        report_to=[],  # Disable reporting/logging
    )

    # Initialize Trainer
    # NOTE(review): the score being maximized is computed on test_dataset, so
    # the selected hyperparameters are tuned to the held-out split — consider
    # carving a separate validation split for the search.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
    accuracy = eval_results["eval_accuracy"]

    # Return the accuracy for Optuna to optimize
    return accuracy

# Run the Optuna study
# direction="maximize" because objective() returns an accuracy.
study = optuna.create_study(direction="maximize")
# NOTE(review): in the recorded run a single trial took roughly 16-39 minutes,
# so the 15 trials below occupy this cell for several hours.
study.optimize(objective, n_trials=15)  # Adjust n_trials as needed

# Get the best hyperparameters
print("Best hyperparameters: ", study.best_params)


Using device: cuda


Map:   0%|          | 0/25022 [00:00<?, ? examples/s]

Map:   0%|          | 0/6256 [00:00<?, ? examples/s]

[I 2025-01-07 03:27:20,578] A new study created in memory with name: no-name-132c4096-8f66-418d-9fed-89400c9679f5
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6152
1000,0.5705
1500,0.5376
2000,0.4775


[I 2025-01-07 03:50:58,520] Trial 0 finished with value: 0.7010869565217391 and parameters: {'learning_rate': 1.2474458267354724e-05, 'num_train_epochs': 3}. Best is trial 0 with value: 0.7010869565217391.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6183
1000,0.5751
1500,0.5446
2000,0.4933


[I 2025-01-07 04:14:32,413] Trial 1 finished with value: 0.701886189258312 and parameters: {'learning_rate': 1.1085106664709586e-05, 'num_train_epochs': 3}. Best is trial 1 with value: 0.701886189258312.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6183
1000,0.5681
1500,0.5249
2000,0.4278


[I 2025-01-07 04:38:06,017] Trial 2 finished with value: 0.6962915601023018 and parameters: {'learning_rate': 1.9838367301348504e-05, 'num_train_epochs': 3}. Best is trial 1 with value: 0.701886189258312.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6204
1000,0.5646
1500,0.5105


[I 2025-01-07 04:54:01,098] Trial 3 finished with value: 0.699968030690537 and parameters: {'learning_rate': 3.362485724535835e-05, 'num_train_epochs': 2}. Best is trial 1 with value: 0.701886189258312.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6143
1000,0.558
1500,0.5068


[I 2025-01-07 05:09:56,092] Trial 4 finished with value: 0.6972506393861893 and parameters: {'learning_rate': 2.934117299513861e-05, 'num_train_epochs': 2}. Best is trial 1 with value: 0.701886189258312.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6206
1000,0.5631
1500,0.5136
2000,0.3524


[I 2025-01-07 05:33:30,298] Trial 5 finished with value: 0.6870204603580563 and parameters: {'learning_rate': 3.641118833832878e-05, 'num_train_epochs': 3}. Best is trial 1 with value: 0.701886189258312.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6221
1000,0.5713
1500,0.5203
2000,0.3656
2500,0.2981
3000,0.186


[I 2025-01-07 06:04:45,474] Trial 6 finished with value: 0.676150895140665 and parameters: {'learning_rate': 2.7946685690150645e-05, 'num_train_epochs': 4}. Best is trial 1 with value: 0.701886189258312.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6168
1000,0.5714
1500,0.5342


[I 2025-01-07 06:20:42,136] Trial 7 finished with value: 0.7006074168797954 and parameters: {'learning_rate': 1.7585743625414313e-05, 'num_train_epochs': 2}. Best is trial 1 with value: 0.701886189258312.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6216
1000,0.5645
1500,0.5135
2000,0.3387


[I 2025-01-07 06:44:18,175] Trial 8 finished with value: 0.694693094629156 and parameters: {'learning_rate': 3.8505212469214096e-05, 'num_train_epochs': 3}. Best is trial 1 with value: 0.701886189258312.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6188
1000,0.5688
1500,0.5289
2000,0.3852
2500,0.3166
3000,0.2057
3500,0.1454


[I 2025-01-07 07:23:10,284] Trial 9 finished with value: 0.684462915601023 and parameters: {'learning_rate': 2.478983021338264e-05, 'num_train_epochs': 5}. Best is trial 1 with value: 0.701886189258312.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6169
1000,0.5731
1500,0.5426
2000,0.4768
2500,0.4478
3000,0.3861
3500,0.347


[I 2025-01-07 08:02:02,490] Trial 10 finished with value: 0.6892583120204604 and parameters: {'learning_rate': 1.068768026165213e-05, 'num_train_epochs': 5}. Best is trial 1 with value: 0.701886189258312.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6175
1000,0.5745
1500,0.545
2000,0.4882
2500,0.4672
3000,0.4202


[I 2025-01-07 08:33:15,713] Trial 11 finished with value: 0.6930946291560103 and parameters: {'learning_rate': 1.0191010943083958e-05, 'num_train_epochs': 4}. Best is trial 1 with value: 0.701886189258312.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6153
1000,0.5717
1500,0.5379
2000,0.464


[I 2025-01-07 08:56:50,770] Trial 12 finished with value: 0.7022058823529411 and parameters: {'learning_rate': 1.4145760631193807e-05, 'num_train_epochs': 3}. Best is trial 12 with value: 0.7022058823529411.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6168
1000,0.5704
1500,0.5338
2000,0.45
2500,0.413
3000,0.3469


[I 2025-01-07 09:28:05,620] Trial 13 finished with value: 0.6970907928388747 and parameters: {'learning_rate': 1.4420399646229783e-05, 'num_train_epochs': 4}. Best is trial 12 with value: 0.7022058823529411.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6153
1000,0.5704
1500,0.5327
2000,0.4485


[I 2025-01-07 09:51:40,758] Trial 14 finished with value: 0.7006074168797954 and parameters: {'learning_rate': 1.566655243473692e-05, 'num_train_epochs': 3}. Best is trial 12 with value: 0.7022058823529411.


Best hyperparameters:  {'learning_rate': 1.4145760631193807e-05, 'num_train_epochs': 3}


In [None]:
from google.colab import runtime
# NOTE(review): presumably releases the Colab runtime once the study above has
# finished — confirm that everything needed has been saved before this runs.
runtime.unassign()