# Config

In [1]:
import os
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import djwtool

Config = {
    'device': "torch.device('cuda' if torch.cuda.is_available() else 'cpu')",
    'csv_file_path': 'data/test_right.csv',
    'label_column': '微博主分类标注',

    'Vectorizer_max_length': 140,
    'Vectorizer_model_name': 'hfl/chinese-roberta-wwm-ext-large',

    'train_ratio': 0.6,
    'val_ratio': 0.2,
    'batch_size': 64,

    'epochs': 10,
    'model_save_path': 'test_right_model.pth'
}


folders_to_create = ['./model', './tensor']
for folder in folders_to_create:
    if not os.path.exists(folder):
        os.makedirs(folder)
    else:
        print(f'文件夹 {folder} 已存在')
print("=" * 100)

文件夹 ./model 已存在
文件夹 ./tensor 已存在


# 数据预处理

In [2]:
csv = djwtool.CSVProcessor(Config['csv_file_path'])
# csv.df = csv.df[csv.df['微博主分类标注'].isin(['网民', '自媒体'])]
values= ['网民', '自媒体', '大V名人']
csv.df = csv.df[csv.df['微博主分类标注'].isin(values)]
label_mapping = csv.generate_label_mapping(Config['label_column'])
print(label_mapping)
csv.label_numerization(label_mapping, Config['label_column'])  
num_classes=len(label_mapping)
print(num_classes)

column_names = ['关注', '粉丝', '微博']
for column_name in column_names:
    csv.df[column_name] = csv.df[column_name].apply(lambda x: f'{int(x):06}' if pd.notnull(x) else '000000')
csv.fill_nan_with_value()
csv.df['认证'] = csv.df['认证'].replace('无', '无V')

csv.str_length_normalization('博主标记',26)
csv.str_length_normalization('简介',50)
csv.str_length_normalization('工作信息',12)
csv.str_length_normalization('标签和其他',25)

all_to_merge = ['认证', '关注', '粉丝', '微博','博主标记', '简介', '工作信息', '标签和其他']
# all_to_merge = ['认证', '昵称']
csv.df['text'] = csv.apply_merge_to_columns(all_to_merge)

text_column = csv.df['text']
print('text:')
print(text_column)
print("=" * 100)

tokenizer = djwtool.TextTokenizer(Config['Vectorizer_model_name'],
                                Config['Vectorizer_max_length'])
input_ids_list, attention_mask_list = tokenizer.tokenize_dataframe(csv.df['text'])
label_list = list(csv.df['label_num'])
input_ids_tensor = torch.tensor(input_ids_list)
attention_mask_tensor = torch.tensor(attention_mask_list)
label_tensor = torch.tensor(label_list)
print('input_ids_tensor:',input_ids_tensor.shape)
print('attention_mask_tensor:',attention_mask_tensor.shape)
print('label_tensor:',label_tensor.shape)
print("=" * 100)


{'超话粉丝大咖': 0, '公务员': 1, '大V名人': 2, '党委': 3, '基层组织': 4, '政府': 5, '媒体': 6, '企事业单位': 7, '赛事活动': 8, '社会组织': 9, '社区组织': 10, '司法机关': 11, '网民': 12, '学校': 13, '演艺娱乐明星': 14}
15
text:
0        金V/000274/000320/013621/超话粉丝大咖（朱正廷超话）000000000...
1        无V/000857/012365/013157/超话粉丝大咖（陈立农超话）000000000...
2        金V/000749/000241/002130/超话粉丝大咖（周杰伦超话）000000000...
3        金V/000412/000948/048019/超话粉丝大咖（陈伟霆超话）000000000...
4        无V/000516/000885/013115/超话粉丝大咖（宋茜超话）0000000000...
                               ...                        
12081    无V/000458/005589/003409/著名电影、电视剧观众。00000000000...
12082    金V/000775/127134427/010903/女主持人000000000000000...
12083    无V/000200/008303/000135/8。23。24000000000000000...
12084    蓝V/000107/048572/001365/庐山西海国家级风景名胜区官方微博000000...
12085    无V/000145/101107/000601/稀捍行动官方微博00000000000000...
Name: text, Length: 10906, dtype: object
input_ids_tensor: torch.Size([10906, 140])
attention_mask_tensor: torch.Size([10906, 140])
label_tensor: torch.Size([10906])


# 模型训练

In [3]:
td = djwtool.TrainDataset(input_ids_tensor, attention_mask_tensor, label_tensor)
train_loader, val_loader, test_loader = td.prepare_dataloaders(Config['train_ratio'],                                                                                                                     
                                                               Config['val_ratio'],
                                                               Config['batch_size']
                                                               )

print('train_loader检查:')
for batch_idx, (ids, attention_mask , target) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}:")
    print("ids shape:", ids.shape)
    print("attention_mask shape:", attention_mask.shape)
    print("Target shape:", target.shape)

    if batch_idx == 0:  
        break
print("=" * 100)


model = djwtool.BERTVectorizer(Config['Vectorizer_model_name'],num_classes) 
# optimizer = optim.Adam(model.parameters(), lr=1e-2,  weight_decay=5e-4)
# optimizer=optim.RMSprop(model.parameters(),lr=0.001,alpha=0.99,momentum=0,weight_decay=0)
optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.5, weight_decay=5e-4)
# scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, Config['epochs'])
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,  'min',
#                                                     factor=0.5, #学习率下降的因子factor=0.5, 
#                                                     verbose=True,#每次更新都会打印一条消息 
#                                                     patience=2,#有2个epochs的平均损失没有变化，学习率将
#                                                     min_lr=0.00000001,# 学习率的下限
#                                                     threshold=0.001)#小于这个数表示平均损失没有下降
# scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[75, 150], gamma=0.5)
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=15, T_mult=2, eta_min=0.00001, last_epoch=-1 , verbose=False)
criterion = nn.CrossEntropyLoss()
print('训练过程:')
trainer = djwtool.Trainer(model, train_loader, val_loader,optimizer,
                          criterion, scheduler, Config['epochs'], Config['model_save_path'])
trainer.train()
print("=" * 100)

train_loader检查:
Batch 1:
ids shape: torch.Size([64, 140])
attention_mask shape: torch.Size([64, 140])
Target shape: torch.Size([64])
训练过程:
Epoch [1/10] Train Loss: 0.953258 Train Acc: 0.73 Val Loss: 0.331646 Val Acc: 0.91 Learning Rate: 0.010000
	Time: 00:59
Epoch [2/10] Train Loss: 0.251863 Train Acc: 0.93 Val Loss: 0.236498 Val Acc: 0.93 Learning Rate: 0.009910
	Time: 00:59
Epoch [3/10] Train Loss: 0.181921 Train Acc: 0.95 Val Loss: 0.228831 Val Acc: 0.95 Learning Rate: 0.009906
	Time: 00:59
Epoch [4/10] Train Loss: 0.143619 Train Acc: 0.96 Val Loss: 0.222779 Val Acc: 0.94 Learning Rate: 0.009902
	Time: 00:57
Epoch [5/10] Train Loss: 0.119983 Train Acc: 0.97 Val Loss: 0.210463 Val Acc: 0.95 Learning Rate: 0.009903
	Time: 00:59
Epoch [6/10] Train Loss: 0.093381 Train Acc: 0.97 Val Loss: 0.238315 Val Acc: 0.94 Learning Rate: 0.009902
	Time: 00:56
Epoch [7/10] Train Loss: 0.076391 Train Acc: 0.98 Val Loss: 0.258303 Val Acc: 0.94 Learning Rate: 0.009904
	Time: 00:57
Epoch [8/10] Train Lo

# 模型测试

In [4]:
print('模型测试:')

evaluator = djwtool.ModelEvaluator(model, test_loader, label_mapping)
evaluator.test_accuracy()
evaluator.accuracy_of_label()

模型测试:
Accuracy of the network on the test items: 95.05 %
Accuracy of 超话粉丝大咖 : 99.48 %
Accuracy of   公务员 : 78.57 %
Accuracy of  大V名人 : 97.77 %
Accuracy of    党委 : 96.07 %
Accuracy of  基层组织 : 92.59 %
Accuracy of    政府 : 95.45 %
Accuracy of    媒体 : 90.50 %
Accuracy of 企事业单位 : 93.44 %
Accuracy of  赛事活动 : 94.59 %
Accuracy of  社会组织 : 93.55 %
Accuracy of  社区组织 : 93.12 %
Accuracy of  司法机关 : 99.43 %
Accuracy of    网民 : 93.44 %
Accuracy of    学校 : 95.61 %
Accuracy of 演艺娱乐明星 : 100.00 %


In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

class ModelEvaluator:
    def __init__(self, y_true, y_pred_prob, threshold=0.5):
        self.y_true = y_true
        self.y_pred_prob = y_pred_prob
        self.threshold = threshold
        self.y_pred = (self.y_pred_prob >= self.threshold).astype(int)

    def calculate_metrics(self):
        accuracy = accuracy_score(self.y_true, self.y_pred)
        precision = precision_score(self.y_true, self.y_pred)
        recall = recall_score(self.y_true, self.y_pred)
        f1 = f1_score(self.y_true, self.y_pred)
        auc = roc_auc_score(self.y_true, self.y_pred_prob)
        confusion = confusion_matrix(self.y_true, self.y_pred)
        class_report = classification_report(self.y_true, self.y_pred)

        return {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'AUC': auc,
            'Confusion Matrix': confusion,
            'Classification Report': class_report
        }

    def plot_roc_curve(self):
        fpr, tpr, _ = roc_curve(self.y_true, self.y_pred_prob)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_score(self.y_true, self.y_pred_prob))
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc='lower right')
        plt.show()

# Example usage
y_true = [1, 0, 1, 1, 0, 0, 1, 0]
y_pred_prob = [0.8, 0.6, 0.9, 0.7, 0.3, 0.4, 0.6, 0.2]

evaluator = ModelEvaluator(y_true, y_pred_prob)
metrics = evaluator.calculate_metrics()
print(metrics)

evaluator.plot_roc_curve()


TypeError: '>=' not supported between instances of 'list' and 'float'

# 模型预测

In [1]:
import os
import torch
import pandas as pd
import djwtool


Config_pdc = {

    'device': "torch.device('cuda' if torch.cuda.is_available() else 'cpu')",
    'csv_file_path': 'pcd.csv',
    'num_classes': 2,
    'model_path': 'model/model.pth',
    'batch_size': 64,
    'Vectorizer_max_length': 140,
    'Vectorizer_model_name': 'hfl/chinese-roberta-wwm-ext-large'

}


csv = djwtool.CSVProcessor(Config_pdc['csv_file_path'])
values= ['网民', '自媒体', '大V名人']
csv.df = csv.df[csv.df['微博主分类标注'].isin(values)]

column_names = ['关注', '粉丝', '微博']
for column_name in column_names:
    csv.df[column_name] = csv.df[column_name].apply(lambda x: f'{int(x):06}' if pd.notnull(x) else '000000')
csv.fill_nan_with_value()
csv.df['认证'] = csv.df['认证'].replace('无', '无V')

csv.str_length_normalization('博主标记',26)
csv.str_length_normalization('简介',50)
csv.str_length_normalization('工作信息',12)
csv.str_length_normalization('标签和其他',25)

all_to_merge = ['认证', '关注', '粉丝', '微博','博主标记', '简介', '工作信息', '标签和其他']
# all_to_merge = ['认证', '昵称']
csv.df['text'] = csv.apply_merge_to_columns(all_to_merge)

text_column = csv.df['text']
print('text:')
print(text_column)
print("=" * 100)

tokenizer = djwtool.TextTokenizer(Config_pdc['Vectorizer_model_name'],
                                Config_pdc['Vectorizer_max_length'])
input_ids_list, attention_mask_list = tokenizer.tokenize_dataframe(csv.df['text'])
input_ids_tensor = torch.tensor(input_ids_list)
attention_mask_tensor = torch.tensor(attention_mask_list)

print('input_ids_tensor:',input_ids_tensor.shape)
print('attention_mask_tensor:',attention_mask_tensor.shape)
model_pdc = djwtool.BERTVectorizer(Config_pdc['Vectorizer_model_name'],Config_pdc['num_classes']) 

  self.df= pd.read_csv(csv_file_path)


text:
1190     无V/000103/000238/000247/无000000000000000000000...
1191     无V/000492/001472/001755/无000000000000000000000...
1192     无V/000422/000388/004231/无000000000000000000000...
1193     无V/001051/000792/015672/交流、学习、进步！0000000000000...
1194     无V/001027/000885/001504/无000000000000000000000...
                               ...                        
84303    金V/000440/001052/001128/无000000000000000000000...
84304    金V/000387/011607/000915/无000000000000000000000...
84305    金V/000123/010828/000410/无000000000000000000000...
84306    金V/000066/101477/000251/无000000000000000000000...
84307    金V/000093/053154/000059/无000000000000000000000...
Name: text, Length: 83003, dtype: object
input_ids_tensor: torch.Size([83003, 140])
attention_mask_tensor: torch.Size([83003, 140])


In [2]:
pdc_dataset = djwtool.PredictionDataset(input_ids_tensor, attention_mask_tensor)
pdc_dataloader = pdc_dataset.prepare_dataloader(64)

result = djwtool.Prediction(model_pdc, Config_pdc['model_path'])
pdc_list = []
pdc_list = result.predict(pdc_dataloader)

FileNotFoundError: [Errno 2] No such file or directory: 'model/model.pth'