In [1]:
import numpy as np
import pandas as pd

In [2]:
handcoded = pd.read_csv("handcoded.csv")

In [3]:
data0 = pd.read_csv("fullweibo_1011.csv")

In [4]:
joined_data = handcoded.merge(data0[["weibo_id", "content_cleaned"]], on='weibo_id', how='left')
handcoded1=joined_data[["weibo_id","content","content_cleaned","Labeled"]]

In [5]:
handcoded1.to_csv("handcoded1.csv", encoding="utf-8-sig", index= False)

In [6]:
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
from torch._C import *
import torch
from torch import nn
from torch.optim import Adam

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = df['Labeled'].tolist()
        self.texts = [tokenizer(text, padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['content_cleaned']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.sigmoid(linear_output)

        return final_layer

def train(model, train_data, val_data, learning_rate, epochs):
    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=10, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=10)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):
            total_acc_train = 0
            total_loss_train = 0   
            total_pre_rec_train = 0
            total_pre_train = 0
            total_rec_train = 0
            for train_input, train_label in tqdm(train_dataloader):
                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)[:, 0]

                batch_loss = criterion(output, train_label.float())
                total_loss_train += batch_loss.item()

                output_label = torch.round(output)
                acc = (output_label == train_label).sum()
                total_acc_train += acc

                total_pre_rec_train += ((output_label == 1) & (train_label == 1)).sum()
                total_pre_train += output_label.sum()
                total_rec_train += train_label.sum()

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()

            total_acc_val = 0
            total_loss_val = 0
            total_pre_rec_val = 0
            total_pre_val = 0
            total_rec_val = 0
            with torch.no_grad():
                for val_input, val_label in val_dataloader:
                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)[:, 0]

                    batch_loss = criterion(output, val_label.float())
                    total_loss_val += batch_loss.item()

                    output_label = torch.round(output)
                    acc = (output_label == val_label).sum()
                    total_acc_val += acc

                    total_pre_rec_val += ((output_label == 1) & (val_label == 1)).sum()
                    total_pre_val += output_label.sum()
                    total_rec_val += val_label.sum()

            print(f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Train Precision: {total_pre_rec_train / total_pre_train: .3f} | Train Recall: {total_pre_rec_train / total_rec_train: .3f}')
            print(f'Epochs: {epoch_num + 1} | Valid Loss: {total_loss_val / len(val_data): .3f} | Valid Accuracy: {total_acc_val / len(val_data): .3f} | Valid Precision: {total_pre_rec_val / total_pre_val: .3f} | Valid Recall: {total_pre_rec_val / total_rec_val: .3f}')
            
            torch.save(model.state_dict(), "models/model_%d.pt" % (epoch_num + 1))

def evaluate(model, test_data):
    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()
    
    answer = []
    total_acc_test = 0
    total_pre_rec_test = 0
    total_pre_test = 0
    total_rec_test = 0
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)[:, 0]
            answer.append(output)

            output_label = torch.round(output)
            acc = (output_label == test_label).sum()
            total_acc_test += acc

            total_pre_rec_test += ((output_label == 1) & (test_label == 1)).sum()
            total_pre_test += output_label.sum()
            total_rec_test += test_label.sum()

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    print(f'Test Precision: {total_pre_rec_test / total_pre_test: .3f}')
    print(f'Test Recall: {total_pre_rec_test / total_rec_test: .3f}')

    return answer

def predict(model, test_data):
    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()
    
    answer = []
    

    with torch.no_grad():
        a=0
        for test_input, test_label in test_dataloader:
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)[:, 0]
            answer.append(output)
            if (a%50==0):
                print(f'labelled {a*2} comments')
            a+=1
            

    return answer

In [8]:
handcoded1["content_cleaned"] = handcoded1["content_cleaned"].astype(str)


In [9]:
from sklearn.model_selection import train_test_split

In [12]:
df_data_train, df_data_test = train_test_split(
    handcoded1[["Labeled", "content_cleaned"]], test_size=0.3, random_state=1027)

df_data_train_train, df_data_train_val = np.split(df_data_train, [int(0.7*len(df_data_train))])

In [14]:
device = torch.device('cpu')

In [16]:
## Fine Tune the old bert using new data

model_10 = BertClassifier()
model_10.load_state_dict(torch.load("model_10.pt", map_location=device))

EPOCHS = 10
LR = 1e-6

train(model_10, df_data_train_train, df_data_train_val, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [07:09<00:00, 28.65s/it]


Epochs: 1 | Train Loss:  0.060 | Train Accuracy:  0.782 | Train Precision:  0.857 | Train Recall:  0.526
Epochs: 1 | Valid Loss:  0.054 | Valid Accuracy:  0.825 | Valid Precision:  0.800 | Valid Recall:  0.696


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [07:08<00:00, 28.54s/it]


Epochs: 2 | Train Loss:  0.047 | Train Accuracy:  0.830 | Train Precision:  0.833 | Train Recall:  0.702
Epochs: 2 | Valid Loss:  0.052 | Valid Accuracy:  0.841 | Valid Precision:  0.882 | Valid Recall:  0.652


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [07:39<00:00, 30.66s/it]


Epochs: 3 | Train Loss:  0.042 | Train Accuracy:  0.823 | Train Precision:  0.844 | Train Recall:  0.667
Epochs: 3 | Valid Loss:  0.053 | Valid Accuracy:  0.825 | Valid Precision:  0.875 | Valid Recall:  0.609


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [07:10<00:00, 28.71s/it]


Epochs: 4 | Train Loss:  0.040 | Train Accuracy:  0.850 | Train Precision:  0.872 | Train Recall:  0.719
Epochs: 4 | Valid Loss:  0.048 | Valid Accuracy:  0.857 | Valid Precision:  0.889 | Valid Recall:  0.696


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [07:18<00:00, 29.25s/it]


Epochs: 5 | Train Loss:  0.039 | Train Accuracy:  0.850 | Train Precision:  0.857 | Train Recall:  0.737
Epochs: 5 | Valid Loss:  0.047 | Valid Accuracy:  0.825 | Valid Precision:  0.833 | Valid Recall:  0.652


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [07:57<00:00, 31.86s/it]


Epochs: 6 | Train Loss:  0.036 | Train Accuracy:  0.857 | Train Precision:  0.860 | Train Recall:  0.754
Epochs: 6 | Valid Loss:  0.049 | Valid Accuracy:  0.841 | Valid Precision:  0.882 | Valid Recall:  0.652


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [07:12<00:00, 28.85s/it]


Epochs: 7 | Train Loss:  0.036 | Train Accuracy:  0.864 | Train Precision:  0.894 | Train Recall:  0.737
Epochs: 7 | Valid Loss:  0.048 | Valid Accuracy:  0.841 | Valid Precision:  0.842 | Valid Recall:  0.696


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [07:04<00:00, 28.29s/it]


Epochs: 8 | Train Loss:  0.033 | Train Accuracy:  0.878 | Train Precision:  0.915 | Train Recall:  0.754
Epochs: 8 | Valid Loss:  0.052 | Valid Accuracy:  0.841 | Valid Precision:  0.882 | Valid Recall:  0.652


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [06:59<00:00, 27.97s/it]


Epochs: 9 | Train Loss:  0.033 | Train Accuracy:  0.891 | Train Precision:  0.936 | Train Recall:  0.772
Epochs: 9 | Valid Loss:  0.049 | Valid Accuracy:  0.841 | Valid Precision:  0.882 | Valid Recall:  0.652


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [07:39<00:00, 30.64s/it]


Epochs: 10 | Train Loss:  0.032 | Train Accuracy:  0.891 | Train Precision:  0.936 | Train Recall:  0.772
Epochs: 10 | Valid Loss:  0.047 | Valid Accuracy:  0.825 | Valid Precision:  0.833 | Valid Recall:  0.652


In [18]:
evaluate(model_10, df_data_test)

Test Accuracy:  0.778
Test Precision:  0.688
Test Recall:  0.423


[tensor([0.0908, 0.0689]),
 tensor([0.9274, 0.1219]),
 tensor([0.0816, 0.8519]),
 tensor([0.6288, 0.0935]),
 tensor([0.9747, 0.2040]),
 tensor([0.9022, 0.1536]),
 tensor([0.1414, 0.9445]),
 tensor([0.0956, 0.0775]),
 tensor([0.1451, 0.1783]),
 tensor([0.0930, 0.1340]),
 tensor([0.0840, 0.0893]),
 tensor([0.8459, 0.0827]),
 tensor([0.0974, 0.1912]),
 tensor([0.1157, 0.1353]),
 tensor([0.1814, 0.1645]),
 tensor([0.1114, 0.0471]),
 tensor([0.4069, 0.1608]),
 tensor([0.5322, 0.9803]),
 tensor([0.5279, 0.1378]),
 tensor([0.1891, 0.2213]),
 tensor([0.1493, 0.1096]),
 tensor([0.1453, 0.9078]),
 tensor([0.1059, 0.9610]),
 tensor([0.2236, 0.1142]),
 tensor([0.9418, 0.1362]),
 tensor([0.1290, 0.2116]),
 tensor([0.2305, 0.2697]),
 tensor([0.1936, 0.1797]),
 tensor([0.1731, 0.1505]),
 tensor([0.1337, 0.2676]),
 tensor([0.1500, 0.0864]),
 tensor([0.3246, 0.3097]),
 tensor([0.0862, 0.0589]),
 tensor([0.2266, 0.0778]),
 tensor([0.0801, 0.3262]),
 tensor([0.9481, 0.2108]),
 tensor([0.5837, 0.1777]),
 