In [3]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

In [4]:
# Colab-only setup: mount Google Drive and point `data_dir` at the folder
# that the loading code below reads the `*_x.csv` / `*_y.csv` files from.
# (When running outside Colab, set `data_dir` to a local copy of the data instead.)
from google.colab import drive
drive.mount('/content/drive')
data_dir = '/content/drive/My Drive/Kaggle- FDL'

Mounted at /content/drive


In [5]:
def load_data(data_dir, data_type):
    """Read one data split from ``data_dir``.

    Two files are read: ``{data_type}_x.csv`` (first column used as the
    index) and ``{data_type}_y.csv``.

    Args:
        data_dir: Directory containing the CSV files.
        data_type: Split prefix used to build the file names, e.g. ``'train'``
            or ``'val'``.

    Returns:
        ``(texts, labels)`` for every split except ``'val'``; for ``'val'``
        the full label rows are appended: ``(texts, labels, label_rows)``.

        * ``texts`` - one list per row of the x file (its non-index columns).
        * ``labels`` - the value at position 15 of every row of the y file
          (presumably the binary toxicity target - confirm against the CSV
          header).
        * ``label_rows`` - every row of the y file as a plain list.
    """
    texts_path = os.path.join(data_dir, f'{data_type}_x.csv')
    labels_path = os.path.join(data_dir, f'{data_type}_y.csv')

    texts = pd.read_csv(texts_path, index_col=0).values.tolist()

    label_rows = pd.read_csv(labels_path).values.tolist()
    labels = [row[15] for row in label_rows]

    if data_type != 'val':
        return texts, labels
    return texts, labels, label_rows

In [6]:
# Load the training and validation splits. For 'val', load_data additionally
# returns the complete label rows (kept in `val_labels_all`).
train_texts, train_labels = load_data(data_dir,'train')
val_texts, val_labels, val_labels_all = load_data(data_dir,'val')

In [7]:
class BertDataset(Dataset):
    """Dataset that tokenizes a single example on access.

    Each item is a dict with the flattened ``input_ids`` and
    ``attention_mask`` produced by ``tokenizer`` and the example's ``label``
    wrapped in a tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the raw examples and tokenization settings.

        Args:
            texts: Sequence of examples; each one must be a ``str`` or a
                ``list`` of ``str``.
            labels: Sequence of labels, indexed in parallel with ``texts``.
            tokenizer: Callable invoked with the text and the keyword
                arguments ``return_tensors``, ``max_length``, ``padding`` and
                ``truncation``.
            max_length: Length passed to the tokenizer for padding/truncation.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Raises:
            TypeError: if the text is neither a ``str`` nor a list of ``str``
                (the offending value is printed first).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        is_string = isinstance(text, str)
        is_string_list = isinstance(text, list) and all(isinstance(t, str) for t in text)
        if not (is_string or is_string_list):
            print(f'ID{idx}:{text},The type is {type(text)}')
            raise TypeError("Text input is of incorrect type.")

        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output carries a leading batch axis; flatten drops it.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label),
        }

In [8]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear head."""

    def __init__(self, bert_model_name, num_classes):
        """Build the network.

        Args:
            bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
            num_classes: Number of output logits of the linear head.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.3)
        # Head input width follows the encoder's configured hidden size.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        The encoder's ``pooler_output`` is passed through dropout and then
        through the linear head.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))

In [9]:
def worst_group_accuracy(prediction, y):
    """
        Compute the worst group accuracy, with the groups being defined by ['male', 'female', 'LGBTQ',
        'christian', 'muslim', 'other_religions', 'black', 'white'] for positive and negative toxicity.

        Every category is split into the rows where its flag is 0 and the rows where it is 1;
        the accuracy of `pred > 0.5` against the `y` column is computed for each of those
        sixteen groups and the smallest one is returned.

        arguments:
            prediction [pandas.DataFrame]: dataframe with 2 columns (index and pred)
            y [pandas.DataFrame]: dataframe containing the metadata; must hold the eight
                category columns and a `y` column, indexed compatibly with `prediction`
        returns:
            wga [float]: worst group accuracy (NaN if any of the groups is empty,
                because the mean over an empty group is NaN)
    """
    # Work on a copy so the caller's metadata frame does not silently gain a 'pred' column.
    scored = y.copy()
    scored.loc[prediction.index, 'pred'] = prediction['pred']

    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']
    accuracies = []
    for category in categories:
        for label in [0, 1]:
            group = scored.loc[scored[category] == label]
            # Predictions are thresholded at 0.5 and compared with the target column.
            group_accuracy = (group['y'] == (group['pred'] > 0.5)).mean()
            accuracies.append(group_accuracy)
    wga = np.min(accuracies)
    return wga

In [10]:

def train(model, data_loader, optimizer, scheduler, device):
    """Run one pass over ``data_loader``, updating ``model`` in place.

    For every batch the cross-entropy loss between the model's logits and the
    batch labels is back-propagated, then the optimizer and the scheduler are
    each stepped once.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        data_loader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'label'``.
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device the batch tensors are moved to.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()
    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        targets = batch['label'].to(device)
        criterion(logits, targets).backward()
        optimizer.step()
        scheduler.step()

In [11]:
def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    The predicted class of each example is the index of its largest logit.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        data_loader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'label'``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, report)`` - the results of ``accuracy_score`` and
        ``classification_report`` over all batches.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            targets = batch['label'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            batch_preds = torch.max(logits, dim=1)[1]
            predicted += batch_preds.cpu().tolist()
            expected += targets.cpu().tolist()

    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [12]:
def evaluate_wgc(model, data_loader, device, metadata=None):
    """Evaluate ``model`` and additionally report the worst-group accuracy.

    The previous implementation tried to rebuild the group metadata from
    ``data_loader.dataset.labels``. That could never succeed: with scalar
    class labels the structure check raised (only after the full inference
    pass), and the frame it built had no ``y`` column, which
    ``worst_group_accuracy`` requires. The group flags are therefore taken
    from the explicit ``metadata`` argument and validated *before* inference.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        data_loader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'label'``. Rows of ``metadata`` are
            matched to predictions by position, so the loader must yield the
            examples in the same order as ``metadata`` (i.e. not shuffled).
        device: Device the batch tensors are moved to.
        metadata: Anything ``pandas.DataFrame`` accepts that yields the
            columns 'male', 'female', 'LGBTQ', 'christian', 'muslim',
            'other_religions', 'black' and 'white', one row per example.

    Returns:
        ``(accuracy, report, wga)`` - ``accuracy_score`` and
        ``classification_report`` over all batches, plus the value returned
        by ``worst_group_accuracy``.

    Raises:
        ValueError: if ``metadata`` is missing, lacks a group column, or its
            row count differs from the number of predictions.
    """
    group_columns = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']

    # Fail fast: check the metadata before spending time on inference.
    if metadata is None:
        raise ValueError(
            "data_loader.dataset.labels does not have the expected structure. "
            "Pass the per-example group flags through the `metadata` argument."
        )
    # reset_index returns a new frame with positional row labels, so the
    # caller's object is left untouched by the column added further down.
    metadata_df = pd.DataFrame(metadata).reset_index(drop=True)
    missing = [column for column in group_columns if column not in metadata_df.columns]
    if missing:
        raise ValueError(f"metadata is missing the group columns {missing}.")

    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())

    if len(metadata_df) != len(predictions):
        raise ValueError(
            f"metadata has {len(metadata_df)} rows but {len(predictions)} predictions were produced."
        )

    # Score the groups against the very labels used for the overall accuracy.
    metadata_df['y'] = actual_labels
    pred_df = pd.DataFrame({'pred': predictions}, index=list(range(len(predictions))))

    WGC = worst_group_accuracy(pred_df, y=metadata_df)
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions), WGC


'\ndef evaluate(model, data_loader, device):\n    model.eval()\n    predictions = []\n    actual_labels = []\n\n\n    with torch.no_grad():\n        for batch in tqdm(data_loader):\n            input_ids = batch[\'input_ids\'].to(device)\n            attention_mask = batch[\'attention_mask\'].to(device)\n            labels = batch[\'label\'].to(device)\n            outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n            _, preds = torch.max(outputs, dim=1)\n            predictions.extend(preds.cpu().tolist())\n            actual_labels.extend(labels.cpu().tolist())\n\n    ids = list(range(len(predictions)))\n    pred_df = pd.DataFrame({\'pred\': predictions}, index=ids)\n\n    # Construct metadata_df from the dataset\'s labels, assuming it has the correct structure\n    if isinstance(data_loader.dataset.labels, list) and all(isinstance(item, list) for item in data_loader.dataset.labels):\n        # If data_loader.dataset.labels is a list of lists\n        metad

In [13]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single text and return the result as a string.

    Args:
        text: Input handed to ``tokenizer``.
        model: Module called with ``input_ids`` and ``attention_mask``; it is
            switched to eval mode.
        tokenizer: Callable producing ``'input_ids'`` and ``'attention_mask'``
            tensors.
        device: Device the encoded tensors are moved to.
        max_length: Padding/truncation length passed to the tokenizer.

    Returns:
        ``"1"`` when the largest logit is at index 1, otherwise ``"0"``.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )

    with torch.no_grad():
        logits = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
        predicted = torch.max(logits, dim=1)[1]

    if predicted.item() == 1:
        return "1"
    return "0"

In [22]:
# Set up parameters
# Pretrained checkpoint used for both the tokenizer and the encoder.
bert_model_name = 'bert-base-uncased'
# Number of output logits of the classifier head.
num_classes = 2
# Padding/truncation length handed to the tokenizer by BertDataset.
max_length = 128
# Batch size of the training and validation DataLoaders.
batch_size = 64
# Number of passes over the training set; also sizes the LR schedule.
num_epochs = 10
# Initial learning rate of the optimizer.
learning_rate = 2e-5

In [15]:
# Build the tokenizer, wrap both splits in BertDataset and create the loaders.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = BertDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = BertDataset(val_texts, val_labels, tokenizer, max_length)
# Only the training loader shuffles; the validation loader keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [23]:
# Train the classifier, validating after every epoch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = BERTClassifier(bert_model_name, num_classes).to(device)

# `transformers.AdamW` is deprecated in favour of the PyTorch implementation
# (the old call had to silence its deprecation warning). `eps` and
# `weight_decay` are pinned to the values the transformers class defaulted to,
# so the optimizer configuration stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# The scheduler is stepped once per batch, so it spans every batch of every epoch.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    # To keep a checkpoint per epoch, save model.state_dict() here with torch.save.
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

cuda
Epoch 1/10


100%|██████████| 4204/4204 [27:30<00:00,  2.55it/s]
100%|██████████| 706/706 [02:19<00:00,  5.07it/s]


Validation Accuracy: 0.9219
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     40125
           1       0.65      0.65      0.65      5055

    accuracy                           0.92     45180
   macro avg       0.80      0.80      0.80     45180
weighted avg       0.92      0.92      0.92     45180

Epoch 2/10


100%|██████████| 4204/4204 [27:28<00:00,  2.55it/s]
100%|██████████| 706/706 [02:19<00:00,  5.07it/s]


Validation Accuracy: 0.9225
              precision    recall  f1-score   support

           0       0.96      0.95      0.96     40125
           1       0.65      0.66      0.66      5055

    accuracy                           0.92     45180
   macro avg       0.80      0.81      0.81     45180
weighted avg       0.92      0.92      0.92     45180

Epoch 3/10


100%|██████████| 4204/4204 [27:29<00:00,  2.55it/s]
100%|██████████| 706/706 [02:19<00:00,  5.07it/s]


Validation Accuracy: 0.9222
              precision    recall  f1-score   support

           0       0.95      0.96      0.96     40125
           1       0.66      0.62      0.64      5055

    accuracy                           0.92     45180
   macro avg       0.81      0.79      0.80     45180
weighted avg       0.92      0.92      0.92     45180

Epoch 4/10


100%|██████████| 4204/4204 [27:29<00:00,  2.55it/s]
100%|██████████| 706/706 [02:19<00:00,  5.07it/s]


Validation Accuracy: 0.9203
              precision    recall  f1-score   support

           0       0.95      0.96      0.96     40125
           1       0.66      0.60      0.63      5055

    accuracy                           0.92     45180
   macro avg       0.80      0.78      0.79     45180
weighted avg       0.92      0.92      0.92     45180

Epoch 5/10


100%|██████████| 4204/4204 [27:29<00:00,  2.55it/s]
100%|██████████| 706/706 [02:19<00:00,  5.07it/s]


Validation Accuracy: 0.9153
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     40125
           1       0.62      0.64      0.63      5055

    accuracy                           0.92     45180
   macro avg       0.79      0.79      0.79     45180
weighted avg       0.92      0.92      0.92     45180

Epoch 6/10


100%|██████████| 4204/4204 [27:29<00:00,  2.55it/s]
100%|██████████| 706/706 [02:19<00:00,  5.08it/s]


Validation Accuracy: 0.9162
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     40125
           1       0.63      0.62      0.62      5055

    accuracy                           0.92     45180
   macro avg       0.79      0.79      0.79     45180
weighted avg       0.92      0.92      0.92     45180

Epoch 7/10


100%|██████████| 4204/4204 [27:29<00:00,  2.55it/s]
100%|██████████| 706/706 [02:19<00:00,  5.07it/s]


Validation Accuracy: 0.9185
              precision    recall  f1-score   support

           0       0.95      0.96      0.95     40125
           1       0.65      0.59      0.62      5055

    accuracy                           0.92     45180
   macro avg       0.80      0.78      0.79     45180
weighted avg       0.92      0.92      0.92     45180

Epoch 8/10


100%|██████████| 4204/4204 [27:29<00:00,  2.55it/s]
100%|██████████| 706/706 [02:18<00:00,  5.09it/s]


Validation Accuracy: 0.9200
              precision    recall  f1-score   support

           0       0.95      0.96      0.96     40125
           1       0.66      0.60      0.63      5055

    accuracy                           0.92     45180
   macro avg       0.80      0.78      0.79     45180
weighted avg       0.92      0.92      0.92     45180

Epoch 9/10


100%|██████████| 4204/4204 [27:27<00:00,  2.55it/s]
100%|██████████| 706/706 [02:19<00:00,  5.07it/s]


Validation Accuracy: 0.9195
              precision    recall  f1-score   support

           0       0.95      0.96      0.95     40125
           1       0.65      0.60      0.62      5055

    accuracy                           0.92     45180
   macro avg       0.80      0.78      0.79     45180
weighted avg       0.92      0.92      0.92     45180

Epoch 10/10


100%|██████████| 4204/4204 [27:29<00:00,  2.55it/s]
100%|██████████| 706/706 [02:19<00:00,  5.05it/s]

Validation Accuracy: 0.9194
              precision    recall  f1-score   support

           0       0.95      0.96      0.95     40125
           1       0.65      0.61      0.63      5055

    accuracy                           0.92     45180
   macro avg       0.80      0.78      0.79     45180
weighted avg       0.92      0.92      0.92     45180






In [24]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Function to apply sentiment prediction on a text
def apply_sentiment_prediction(row):
    """Return the predicted class ("0" or "1") for the text in ``row['string']``.

    Uses the notebook-level ``model``, ``tokenizer`` and ``device``.
    """
    return predict_sentiment(row['string'], model, tokenizer, device)

# Load the test set from the same folder as the training data
# (previously the Drive path was hard-coded a second time here).
df = pd.read_csv(os.path.join(data_dir, 'test_x.csv'))

# Apply the sentiment prediction function (one forward pass per row)
df['pred'] = df.apply(apply_sentiment_prediction, axis=1)
print(df)

# The submission needs an 'ID' column. The CSV already ships an 'index'
# column, so renaming it is enough; calling reset_index() on top of that only
# added a stray 'level_0' column. Fall back to the row index if the column
# is absent.
if 'index' in df.columns:
    df_reset = df.rename(columns={'index': 'ID'})
else:
    df_reset = df.reset_index().rename(columns={'index': 'ID'})

# Select only the 'ID' and 'pred' columns
df_to_export = df_reset[['ID', 'pred']]

# Export to CSV, without the Pandas index
df_to_export.to_csv('new-prediction.csv', index=False)

         index                                             string pred
0            0  OH yes - Were those evil Christian Missionarie...    0
1            1  He's considered a good candidate for a cyber-s...    0
2            2  Lela, you admit no records exist to support yo...    0
3            3  I'll take the iffy libertarian over the guy wh...    1
4            4       Shouldn't your handle be Republic of Uranus?    0
...        ...                                                ...  ...
133777  133777  Is it better to be dead, and broke? Alive and ...    0
133778  133778  When you say speaking in code, you are adding ...    0
133779  133779  At least twice trained law enforcement officer...    0
133780  133780  I've not said this before, but Kizla you are a...    1
133781  133781  The Democrat party aided and abetted by it's M...    1

[133782 rows x 3 columns]
