In [None]:
!pip uninstall -y torch

In [None]:
!pip install torch==1.13.1+cpu torchvision==0.14.1+cpu torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cpu

In [54]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     -------------- ------------------------- 0.6/1.5 MB 11.5 MB/s eta 0:00:01
     ---------------------------------------  1.5/1.5 MB 18.9 MB/s eta 0:00:01
     ---------------------------------------- 1.5/1.5 MB 16.0 MB/s eta 0:00:00
Collecting joblib
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Installing collected packages: joblib, nltk
Successfully installed joblib-1.2.0 nltk-3.8.1


In [1]:
import numpy as np
import pandas as pd

In [2]:
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from torch._C import *

In [4]:
import torch
from torch import nn
from torch.optim import Adam



## Data Preprocessing

In [68]:
import re
import csv
import pickle
from nltk.corpus import stopwords

import pkuseg
seg = pkuseg.pkuseg()

import string
puncs = string.punctuation + "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
puncs_remover = str.maketrans(puncs, ' '* len(puncs))


In [79]:
weibo_posts1 = pd.read_csv("full_weibo - less than 50.csv")
weibo_posts2 = pd.read_csv("full_weibo_over50.csv")
weibo_posts=pd.concat([weibo_posts1, weibo_posts2], ignore_index=True)

In [80]:
weibo_posts

Unnamed: 0.1,Unnamed: 0,user_id,weibo_id,post time,content
0,0.0,6510420346,M38tQzxAX,2022-08-28 16:23,美国海军表示，两艘军舰周日（28日）通过台湾海峡，这是自美国众议院议长佩洛西访台加剧美中紧张...
1,1.0,6510420346,M2l4pbhZr,2022-08-23 10:36,抱歉，作者已设置仅展示半年内微博，此微博已不可见。 赞[0] 原文转发[0] 原文评论[0...
2,2.0,6510420346,M1l3ppo2K,2022-08-16 20:43,特别喜欢这种滑坡言论今天可以穿和服明天可以穿军服后天可以在中国的地盘建厕所大后天军国主义的铁...
3,3.0,6510420346,M19V9kOIw,2022-08-15 16:23,抱歉，作者已设置仅展示半年内微博，此微博已不可见。 赞[0] 原文转发[0] 原文评论[0...
4,4.0,6510420346,M116er4R9,2022-08-14 17:55,抱歉，作者已设置仅展示半年内微博，此微博已不可见。 赞[0] 原文转发[0] 原文评论[0...
...,...,...,...,...,...
9885,,7295943862,LF9w0aBMP,2022-08-02 18:09,【#美台勾连挑衅中方正当防卫#】美台勾连挑衅在先，中方正当防卫在后。妄图#干涉中国主权问题1...
9886,,7295943862,LF92IFi8p,2022-08-02 16:56,倡议：1、不要拍摄、上传网络有关我军人员、装备、设施、阵地和工事的任何信息！因为这些不经意拍...
9887,,7295943862,LF7h1CCL0,2022-08-02 12:26,哈哈哈哈哈哈哈哈，好主意 原图 赞[6589] 原文转发[580] 原文评论[438]转发...
9888,,7295943862,LF77k5YBs,2022-08-02 12:02,#厦门航空对部分航班调整# 请问大家今晚要不要等着啊，我已经买了一堆好吃的了，就等下班回家见...


In [81]:
weibo_posts=weibo_posts.drop(['Unnamed: 0'],axis=1)
# initial df
weibo_posts['content'] = weibo_posts['content'].fillna('')

In [82]:
# remove useless words in comments
rep = {'_x1f1e8__x1f1f3_': '国旗', '¡评论配图': '', 'O网页链接': '', '¡查看动图': ''}
rep = dict((re.escape(k), v) for k, v in rep.items()) 
pattern = re.compile("|".join(rep.keys()))
replace_function = lambda m: rep[re.escape(m.group(0))]

In [83]:
weibo_posts

Unnamed: 0,user_id,weibo_id,post time,content
0,6510420346,M38tQzxAX,2022-08-28 16:23,美国海军表示，两艘军舰周日（28日）通过台湾海峡，这是自美国众议院议长佩洛西访台加剧美中紧张...
1,6510420346,M2l4pbhZr,2022-08-23 10:36,抱歉，作者已设置仅展示半年内微博，此微博已不可见。 赞[0] 原文转发[0] 原文评论[0...
2,6510420346,M1l3ppo2K,2022-08-16 20:43,特别喜欢这种滑坡言论今天可以穿和服明天可以穿军服后天可以在中国的地盘建厕所大后天军国主义的铁...
3,6510420346,M19V9kOIw,2022-08-15 16:23,抱歉，作者已设置仅展示半年内微博，此微博已不可见。 赞[0] 原文转发[0] 原文评论[0...
4,6510420346,M116er4R9,2022-08-14 17:55,抱歉，作者已设置仅展示半年内微博，此微博已不可见。 赞[0] 原文转发[0] 原文评论[0...
...,...,...,...,...
9885,7295943862,LF9w0aBMP,2022-08-02 18:09,【#美台勾连挑衅中方正当防卫#】美台勾连挑衅在先，中方正当防卫在后。妄图#干涉中国主权问题1...
9886,7295943862,LF92IFi8p,2022-08-02 16:56,倡议：1、不要拍摄、上传网络有关我军人员、装备、设施、阵地和工事的任何信息！因为这些不经意拍...
9887,7295943862,LF7h1CCL0,2022-08-02 12:26,哈哈哈哈哈哈哈哈，好主意 原图 赞[6589] 原文转发[580] 原文评论[438]转发...
9888,7295943862,LF77k5YBs,2022-08-02 12:02,#厦门航空对部分航班调整# 请问大家今晚要不要等着啊，我已经买了一堆好吃的了，就等下班回家见...


In [84]:
df_data_all = weibo_posts.copy()

# like Part 1s
df_data_all["comment_content_process"] = df_data_all["content"].map(lambda x: pattern.sub(replace_function, x))
# remove @
df_data_all["comment_content_process"] = df_data_all["comment_content_process"].map(lambda x: re.sub('@[^ ]+ ', '', x))
# remove puncs
df_data_all["comment_content_process"] = df_data_all["comment_content_process"].map(lambda x: x.translate(puncs_remover))
# seg the comments
df_data_all['comment_content_process'] = df_data_all['comment_content_process'].map(lambda x: ' '.join(seg.cut(x)))

In [86]:
data=df_data_all.drop(['content'],axis=1)
data['true']=0
data

Unnamed: 0,user_id,weibo_id,post time,comment_content_process,true
0,6510420346,M38tQzxAX,2022-08-28 16:23,美国 海军 表示 两 艘 军舰 周日 28日 通过 台湾 海峡 这是 自 美国 众议院 议长...,0
1,6510420346,M2l4pbhZr,2022-08-23 10:36,抱歉 作者 已 设置 仅 展示 半年 内 微博 此 微博 已 不可 见 赞 0 原文 转发 ...,0
2,6510420346,M1l3ppo2K,2022-08-16 20:43,特别 喜欢 这种 滑坡 言论 今天 可以 穿 和服 明天 可以 穿 军服 后天 可以 在 中...,0
3,6510420346,M19V9kOIw,2022-08-15 16:23,抱歉 作者 已 设置 仅 展示 半年 内 微博 此 微博 已 不可 见 赞 0 原文 转发 ...,0
4,6510420346,M116er4R9,2022-08-14 17:55,抱歉 作者 已 设置 仅 展示 半年 内 微博 此 微博 已 不可 见 赞 0 原文 转发 ...,0
...,...,...,...,...,...
9885,7295943862,LF9w0aBMP,2022-08-02 18:09,美 台 勾连 挑衅 中方 正当 防卫 美 台 勾连 挑衅 在先 中方 正当 防卫 在 后 妄...,0
9886,7295943862,LF92IFi8p,2022-08-02 16:56,倡议 1 不要 拍摄 上传 网络 有关 我军 人员 装备 设施 阵地 和 工事 的 任何 信...,0
9887,7295943862,LF7h1CCL0,2022-08-02 12:26,哈哈 哈哈 哈哈 哈哈 好 主意 原图 赞 6589 原文 转发 580 原文 评论 438...,0
9888,7295943862,LF77k5YBs,2022-08-02 12:02,厦门 航空 对 部分 航班 调整 请问 大家 今晚 要不 要 等 着 啊 我 已经 买 了 ...,0


## Labeling

In [100]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = df['true'].tolist()
        self.texts = [tokenizer(text, padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['comment_content_process']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.sigmoid(linear_output)

        return final_layer

def train(model, train_data, val_data, learning_rate, epochs):
    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=10, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=10)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):
            total_acc_train = 0
            total_loss_train = 0   
            total_pre_rec_train = 0
            total_pre_train = 0
            total_rec_train = 0
            for train_input, train_label in tqdm(train_dataloader):
                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)[:, 0]

                batch_loss = criterion(output, train_label.float())
                total_loss_train += batch_loss.item()

                output_label = torch.round(output)
                acc = (output_label == train_label).sum()
                total_acc_train += acc

                total_pre_rec_train += ((output_label == 1) & (train_label == 1)).sum()
                total_pre_train += output_label.sum()
                total_rec_train += train_label.sum()

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()

            total_acc_val = 0
            total_loss_val = 0
            total_pre_rec_val = 0
            total_pre_val = 0
            total_rec_val = 0
            with torch.no_grad():
                for val_input, val_label in val_dataloader:
                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)[:, 0]

                    batch_loss = criterion(output, val_label.float())
                    total_loss_val += batch_loss.item()

                    output_label = torch.round(output)
                    acc = (output_label == val_label).sum()
                    total_acc_val += acc

                    total_pre_rec_val += ((output_label == 1) & (val_label == 1)).sum()
                    total_pre_val += output_label.sum()
                    total_rec_val += val_label.sum()

            print(f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Train Precision: {total_pre_rec_train / total_pre_train: .3f} | Train Recall: {total_pre_rec_train / total_rec_train: .3f}')
            print(f'Epochs: {epoch_num + 1} | Valid Loss: {total_loss_val / len(val_data): .3f} | Valid Accuracy: {total_acc_val / len(val_data): .3f} | Valid Precision: {total_pre_rec_val / total_pre_val: .3f} | Valid Recall: {total_pre_rec_val / total_rec_val: .3f}')
            
            torch.save(model.state_dict(), "models/model_%d.pt" % (epoch_num + 1))

def evaluate(model, test_data):
    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()
    
    answer = []
    total_acc_test = 0
    total_pre_rec_test = 0
    total_pre_test = 0
    total_rec_test = 0
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)[:, 0]
            answer.append(output)

            output_label = torch.round(output)
            acc = (output_label == test_label).sum()
            total_acc_test += acc

            total_pre_rec_test += ((output_label == 1) & (test_label == 1)).sum()
            total_pre_test += output_label.sum()
            total_rec_test += test_label.sum()

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    print(f'Test Precision: {total_pre_rec_test / total_pre_test: .3f}')
    print(f'Test Recall: {total_pre_rec_test / total_rec_test: .3f}')

    return answer

def predict(model, test_data):
    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()
    
    answer = []
    

    with torch.no_grad():
        a=0
        for test_input, test_label in test_dataloader:
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)[:, 0]
            answer.append(output)
            if (a%50==0):
                print(f'labelled {a*2} comments')
            a+=1
            

    return answer


In [101]:
###Training

# df_data_train = pd.read_csv("train_all.csv", index_col=0)
# df_data_train["comment_content_process"] = df_data_train["comment_content_process"].astype(str)

# EPOCHS = 10
# model = BertClassifier()
# LR = 1e-6
# df_data_train_train, df_data_train_val = np.split(df_data_train, [int(0.7*len(df_data_train))])

# train(model, df_data_train_train, df_data_train_val, LR, EPOCHS)

In [102]:
# ## Evaluate
# device = torch.device('cpu')
# model = BertClassifier()
# model.load_state_dict(torch.load("model_10.pt", map_location=device))

# answer = torch.cat(evaluate(model, df_data_test))
# df_data_test["bert"]= answer.cpu()

In [93]:
#Predict

data["comment_content_process"] = data["comment_content_process"].astype(str)


device = torch.device('cpu')
model = BertClassifier()
model.load_state_dict(torch.load("model_10.pt", map_location=device))


answer = torch.cat(predict(model, t1))
data["bert"]= answer.cpu()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([0.1035, 0.0811])
labelled 0 comments
tensor([0.0498, 0.0409])
tensor([0.0652, 0.0893])
tensor([0.7449, 0.0538])
tensor([0.1901, 0.0643])
tensor([0.0712, 0.1868])
tensor([0.0820, 0.0391])
tensor([0.0511, 0.0968])
tensor([0.0338, 0.7655])
tensor([0.1273, 0.0499])
tensor([0.0426, 0.0717])
tensor([0.0701, 0.0631])
tensor([0.3680, 0.0756])
tensor([0.0539, 0.0455])
tensor([0.0240, 0.0560])
tensor([0.0539, 0.0285])
tensor([0.0589, 0.0457])
tensor([0.0688, 0.4125])
tensor([0.0593, 0.0468])
tensor([0.0478, 0.1963])
tensor([0.1003, 0.0741])
tensor([0.0368, 0.1314])
tensor([0.0556, 0.0407])
tensor([0.1083, 0.0660])
tensor([0.1350, 0.2945])
tensor([0.0360, 0.1219])
tensor([0.6204, 0.8760])
tensor([0.9417, 0.9136])
tensor([0.0744, 0.0334])
tensor([0.0533, 0.7047])
tensor([0.0340, 0.0509])
tensor([0.7889, 0.0910])
tensor([0.0190, 0.8574])
tensor([0.1089, 0.1127])
tensor([0.1020, 0.4320])
tensor([0.8685, 0.0799])
tensor([0.0777, 0.3778])
tensor([0.0445, 0.0927])
tensor([0.5571, 0.6733])
tenso

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t1["bert"]= answer.cpu()


In [94]:
t1

Unnamed: 0,user_id,weibo_id,post time,comment_content_process,true,bert
0,6510420346,M38tQzxAX,2022-08-28 16:23,美国 海军 表示 两 艘 军舰 周日 28日 通过 台湾 海峡 这是 自 美国 众议院 议长...,0,0.103533
1,6510420346,M2l4pbhZr,2022-08-23 10:36,抱歉 作者 已 设置 仅 展示 半年 内 微博 此 微博 已 不可 见 赞 0 原文 转发 ...,0,0.081071
2,6510420346,M1l3ppo2K,2022-08-16 20:43,特别 喜欢 这种 滑坡 言论 今天 可以 穿 和服 明天 可以 穿 军服 后天 可以 在 中...,0,0.049751
3,6510420346,M19V9kOIw,2022-08-15 16:23,抱歉 作者 已 设置 仅 展示 半年 内 微博 此 微博 已 不可 见 赞 0 原文 转发 ...,0,0.040946
4,6510420346,M116er4R9,2022-08-14 17:55,抱歉 作者 已 设置 仅 展示 半年 内 微博 此 微博 已 不可 见 赞 0 原文 转发 ...,0,0.065228
...,...,...,...,...,...,...
195,2786215997,LmLZFtLBd,2022-04-03 21:10,辉瑞口 服药 深圳 小规模 临床 显示 初步 疗效 个 很 大 疑问 如果 有效 美国 一百...,0,0.209183
196,2786215997,Lms9E6YSE,2022-04-01 18:40,上海 第一 批 封控区 核酸 筛查 完成 🙏🙏 🙏 希望 浦西 人民 上海 人民 全国 人民...,0,0.626840
197,2786215997,LmqL0jFEK,2022-04-01 15:06,浦东 卫健委 上海 六 院 上海 六 院 删除 丁丁 保卫战 文章 看看 人家 的 六 院 ...,0,0.075638
198,2786215997,LmjK6Ewb7,2022-03-31 21:15,上海 买 菜 到底 难 不难 几 个 买 菜 的 APP 平台 不 把 手机 戳破 连 根葱...,0,0.040631


In [99]:
data.to_csv("labelled_data.csv",encoding='utf_8_sig')