# Config

In [1]:
import os
import torch
import pandas as pd
import torch.optim as optim
import torch.nn as nn
import tool
import numpy as np

Config = {
    'device': "cuda:1" if torch.cuda.is_available() else "cpu",
    'csv_file_path': 'data/test_right.csv',
    'label_column': '微博主分类标注',

    'Vectorizer_max_length': 140,
    'Vectorizer_model_name': 'hfl/chinese-roberta-wwm-ext-large',

    'train_ratio': 0.6,
    'val_ratio': 0.2,
    'batch_size': 64,

    'epochs':6,
    'model_save_path': 'similar-最后3分类.pth'
}


folders_to_create = ['./model', './tensor']
for folder in folders_to_create:
    if not os.path.exists(folder):
        os.makedirs(folder)
    else:
        print(f'文件夹 {folder} 已存在')
print("=" * 100)

文件夹 ./model 已存在
文件夹 ./tensor 已存在


# 数据预处理(数据量大小)

In [49]:
csv = tool.CSVProcessor(Config['csv_file_path'])
values= ['明星红人', '民主党派']
csv.df = csv.df[~csv.df['微博主分类标注'].isin(values)]
label_counts = csv.df['微博主分类标注'].value_counts()
# print(label_counts)
# print("=" * 100)
# 分大小数据
threshold = 600
csv.df['target_column'] = csv.df['微博主分类标注'].apply(lambda x: '小数据' if label_counts[x] < threshold else '大数据')
small_data_count = (csv.df['target_column'] == '小数据').sum()
large_data_count = (csv.df['target_column'] == '大数据').sum()

print(f"小数据数量：{small_data_count}")
print(f"大数据数量：{large_data_count}")
print("=" * 100)

values= ['小数据']
csv.df = csv.df[~csv.df['target_column'].isin(values)]
csv.df['target_column'] = csv.df['微博主分类标注']


def map_labels(row):
    if row['微博主分类标注'] in ['大V名人', '网民', '党委', '媒体', '自媒体', '企事业单位']:
        return 'similar'
    else:
        return 'distinctive'
    
csv.df['target_column'] = csv.df.apply(map_labels, axis=1)

valeus = ['distinctive', ]
csv.df = csv.df[~csv.df['target_column'].isin(valeus)]
csv.df['target_column'] = csv.df['微博主分类标注']

valeus = ['党委','网民','自媒体']
csv.df = csv.df[~csv.df['微博主分类标注'].isin(valeus)]

# # conditions = [ csv.df['微博主分类标注'].isin(values) ]
# # choices = ['其他']
# # csv.df['target_column'] = np.select(conditions, choices, default=csv.df['微博主分类标注'])

# label_counts = csv.df['微博主分类标注'].value_counts()
# print(label_counts)
# # values= ['网民', '自媒体']
# # conditions = [ csv.df['微博主分类标注'].isin(values) ]
# # choices = ['网自']
# # csv.df['target_column'] = np.select(conditions, choices, default=csv.df['微博主分类标注'])
# label_mapping = {'媒体': '其他',  '大V名人': '其他', '企事业单位':'其他',
#                  '网民': '分出网友+自媒体', '自媒体': '分出网友+自媒体'}

# 在新的一列中根据映射为数据添加新标签
csv.df['target_column'] = csv.df['微博主分类标注']

小数据数量：1299
大数据数量：10780


In [50]:
# 列操作
label_mapping = csv.generate_label_mapping('target_column')
print(label_mapping)
csv.label_numerization(label_mapping, 'target_column')  
num_classes=len(label_mapping)
print(num_classes)

column_names = ['关注', '粉丝', '微博']
for column_name in column_names:
    csv.df[column_name] = csv.df[column_name].apply(lambda x: f'{int(x):06}' if pd.notnull(x) else '000000')
csv.fill_nan_with_value()
csv.df['认证'] = csv.df['认证'].replace('无', '无V')

csv.str_length_normalization('博主标记',26)
csv.str_length_normalization('简介',50)
csv.str_length_normalization('工作信息',12)
csv.str_length_normalization('标签和其他',25)

all_to_merge = ['认证', '关注', '粉丝', '微博','博主标记', '简介', '工作信息', '标签和其他']
csv.df['text'] = csv.apply_merge_to_columns(all_to_merge)

text_column = csv.df['text']
print('text:')
print(text_column)
print("=" * 100)

{'大V名人': 0, '媒体': 1, '企事业单位': 2}
3
text:
195      金V/002326/5313506/007832/母婴育儿博主 魏婷 美容专家 中央台评委 ...
196      金V/003297/11895480/030788/微博2017十大影响力军事大V 知名军事...
197      金V/000183/7420283/107015/知名美食博主 微博知名美食帐号000000...
198      金V/000238/11989942/002004/演员，代表作《余罪》《追龙》《西虹市首富...
199      金V/001798/9426455/029562/知名电视剧博主 电视剧视频自媒体00000...
                               ...                        
12065    蓝V/001589/932745/031263/小康杂志社官方微博0000000000000...
12073    金V/000975/5360289/070568/微博知名搞笑博主0000000000000...
12074    金V/001071/5008876/002515/BTV主持人悦悦0000000000000...
12076    蓝V/000201/155537/009029/中国石油化工股份有限公司西北油田分公司官方微...
12084    蓝V/000107/048572/001365/庐山西海国家级风景名胜区官方微博000000...
Name: text, Length: 2669, dtype: object


In [4]:
# 向量化
tokenizer = tool.TextTokenizer(Config['Vectorizer_model_name'],
                                Config['Vectorizer_max_length'])
input_ids_list, attention_mask_list = tokenizer.tokenize_dataframe(csv.df['text'])
label_list = list(csv.df['label_num'])
input_ids_tensor = torch.tensor(input_ids_list)
attention_mask_tensor = torch.tensor(attention_mask_list)
label_tensor = torch.tensor(label_list)
print('input_ids_tensor:',input_ids_tensor.shape)
print('attention_mask_tensor:',attention_mask_tensor.shape)
print('label_tensor:',label_tensor.shape)
print("=" * 100)

input_ids_tensor: torch.Size([2669, 140])
attention_mask_tensor: torch.Size([2669, 140])
label_tensor: torch.Size([2669])


In [5]:
td = tool.TrainDataset(input_ids_tensor, attention_mask_tensor, label_tensor)
train_loader, val_loader, test_loader = td.prepare_dataloaders(Config['train_ratio'],                                                                                                                     
                                                               Config['val_ratio'],
                                                               Config['batch_size']
                                                               )
print('train_loader检查:')
for batch_idx, (ids, attention_mask , target) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}:")
    print("ids shape:", ids.shape)
    print("attention_mask shape:", attention_mask.shape)
    print("Target shape:", target.shape)

    if batch_idx == 0:  
        break
print("=" * 100)

train_loader检查:
Batch 1:
ids shape: torch.Size([64, 140])
attention_mask shape: torch.Size([64, 140])
Target shape: torch.Size([64])


# 模型训练

In [6]:
model = tool.BERTVectorizer(Config['Vectorizer_model_name'], num_classes, Config['device'])
# optimizer = optim.Adam(model.parameters(), lr=1e-2,  weight_decay=5e-4)
# optimizer=optim.RMSprop(model.parameters(),lr=0.001,alpha=0.99,momentum=0,weight_decay=0)
optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.5, weight_decay=5e-4)
# scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, Config['epochs'])
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,  'min',
#                                                     factor=0.5, #学习率下降的因子factor=0.5, 
#                                                     verbose=True,#每次更新都会打印一条消息 
#                                                     patience=2,#有2个epochs的平均损失没有变化，学习率将
#                                                     min_lr=0.00000001,# 学习率的下限
#                                                     threshold=0.001)#小于这个数表示平均损失没有下降
# scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[75, 150], gamma=0.5)
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=15, T_mult=2, eta_min=0.00001, last_epoch=-1 , verbose=False)
criterion = nn.CrossEntropyLoss()
print('训练过程:')
trainer = tool.Trainer(model, train_loader, val_loader,optimizer,
                          criterion, scheduler, Config['epochs'], Config['model_save_path'], Config['device'])
trainer.train()
print("=" * 100)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext-large were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


训练过程:
Epoch [1/6] Train Loss: 0.670032 Train Acc: 0.72 Val Loss: 3.854492 Val Acc: 0.35 Learning Rate: 0.010000
	Time: 00:30
Epoch [2/6] Train Loss: 0.553901 Train Acc: 0.84 Val Loss: 1.854348 Val Acc: 0.64 Learning Rate: 0.009987
	Time: 00:38
Epoch [3/6] Train Loss: 0.259350 Train Acc: 0.91 Val Loss: 0.093242 Val Acc: 0.97 Learning Rate: 0.009955
	Time: 00:38
Epoch [4/6] Train Loss: 0.117857 Train Acc: 0.96 Val Loss: 0.098465 Val Acc: 0.98 Learning Rate: 0.009896
	Time: 00:39
Epoch [5/6] Train Loss: 0.083267 Train Acc: 0.97 Val Loss: 0.281654 Val Acc: 0.92 Learning Rate: 0.009896
	Time: 00:28
Epoch [6/6] Train Loss: 0.074207 Train Acc: 0.98 Val Loss: 0.145064 Val Acc: 0.96 Learning Rate: 0.009908
	Time: 00:28


# 模型测试

In [7]:
import torch
from sklearn.metrics import precision_score, recall_score, f1_score

print('模型测试:')
model = tool.BERTVectorizer(Config['Vectorizer_model_name'],num_classes, Config['device']) 
device = Config['device']
model = model.to(device)
model.load_state_dict(torch.load('model/similar-最后3分类.pth'))
model.eval()
true_labels = []
predicted_labels = []

with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, 1)

        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())

# 计算精确率、召回率和 F1 分数
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Precision: {precision:.3f}")
print(f"recall: {recall:.3f}")
print(f"F1: {f1:.3f}")
evaluator = tool.ModelEvaluator(model, test_loader, label_mapping, Config['device'])
evaluator.test_accuracy()
evaluator.accuracy_of_label()


模型测试:


Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext-large were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Precision: 0.954
recall: 0.951
F1: 0.951
Accuracy of the network on the test items: 95.14 %
Accuracy of  大V名人 : 98.03 %
Accuracy of    媒体 : 97.88 %
Accuracy of 企事业单位 : 90.21 %


# 模型预测


In [1]:
import os
import torch
import pandas as pd
import tool
import tqdm


Config_pdc = {
    'csv_file_path': 'data/程序-头条1组链接.csv',
    'num_classes': 24,
    'device': "cuda:1" if torch.cuda.is_available() else "cpu",
    'model_save_path': 'model/24class.pth',
    'batch_size': 64,
    'Vectorizer_max_length': 90,
    'Vectorizer_model_name': 'hfl/chinese-roberta-wwm-ext-large'
}

original_dict = {'超话粉丝大咖': 0, '公务员': 1, '大V名人': 2, '党委': 3, '国防军委': 4, 
                 '基层组织': 5, '政府': 6, '检验检测': 7, '媒体': 8, '民主党派': 9, '明星红人': 10, 
                 '企事业单位': 11, '赛事活动': 12, '社会组织': 13, '社区组织': 14, '司法机关': 15,
                 '外国政府机构': 16, '网民': 17, '行业专家': 18, '学校': 19, '研究机构': 20,
                 '演艺娱乐明星': 21, '政协人大': 22, '自媒体': 23}
reverse_dict = {value: key for key, value in original_dict.items()}
print(reverse_dict)

{0: '超话粉丝大咖', 1: '公务员', 2: '大V名人', 3: '党委', 4: '国防军委', 5: '基层组织', 6: '政府', 7: '检验检测', 8: '媒体', 9: '民主党派', 10: '明星红人', 11: '企事业单位', 12: '赛事活动', 13: '社会组织', 14: '社区组织', 15: '司法机关', 16: '外国政府机构', 17: '网民', 18: '行业专家', 19: '学校', 20: '研究机构', 21: '演艺娱乐明星', 22: '政协人大', 23: '自媒体'}


In [2]:
csv_pdc = tool.CSVProcessor(Config_pdc['csv_file_path'])
csv_pdc.str_length_normalization('用户名',20)
csv_pdc.str_length_normalization('用户认证',30)
csv_pdc.str_length_normalization('认证信息',40)
all_to_merge = ['认证信息', '用户名', '用户认证']
csv_pdc.df['text'] = csv_pdc.apply_merge_to_columns(all_to_merge)
print(csv_pdc.df['text'])

  self.df= pd.read_csv(csv_file_path)


0          车快评00000000000000000/评论车市风云 点评汽车文化000000000000...
1          车快评00000000000000000/评论车市风云 点评汽车文化000000000000...
2          车快评00000000000000000/评论车市风云 点评汽车文化000000000000...
3                                       勿燥.00000000000000000
4                                       勿燥.00000000000000000
                                 ...                        
1006874                                 孤独的探路者00000000000000
1006875                                 孤独的探路者00000000000000
1006876                                 网事随风2900000000000000
1006877                                 静看云起0000000000000000
1006878                                 菩提青年感悟00000000000000
Name: text, Length: 1006879, dtype: object


In [21]:
csv_pdc = tool.CSVProcessor(Config_pdc['csv_file_path'])
values= ['网民', '自媒体']
csv_pdc.df = csv_pdc.df[csv_pdc.df['微博主分类标注'].isin(values)]
print(csv_pdc.df.shape[0])

column_names = ['关注', '粉丝', '微博']
for column_name in column_names:
    csv_pdc.df[column_name] = csv_pdc.df[column_name].apply(lambda x: f'{int(x):06}' if pd.notnull(x) else '000000')
csv_pdc.fill_nan_with_value()
csv_pdc.df['认证'] = csv_pdc.df['认证'].replace('无', '无V')

csv_pdc.str_length_normalization('博主标记',26)
csv_pdc.str_length_normalization('简介',50)
csv_pdc.str_length_normalization('工作信息',12)
csv_pdc.str_length_normalization('标签和其他',25)

all_to_merge = ['认证', '关注', '粉丝', '微博','博主标记', '简介', '工作信息', '标签和其他']
# all_to_merge = ['认证', '昵称']
csv_pdc.df['text'] = csv_pdc.apply_merge_to_columns(all_to_merge)

text_column = csv_pdc.df['text']
print('text:')
print(text_column)
print(csv_pdc.df.shape[0])
print("=" * 100)

1806
text:
2565     无V/000167/000092/009016/超话粉丝大咖（蔡徐坤超话）000000000...
2566     金V/000450/000203/006766/超话粉丝大咖（王晰超话）0000000000...
2567     无V/002485/000731/006925/你看见了这个样子的世界 你要对善良的人好一点...
2568     金V/000247/000600/007034/财经博主000000000000000000...
2569     金V/000570/000209/033078/超话粉丝大咖（易烊千玺超话）00000000...
                               ...                        
12071    无V/000222/007977/000151/北京女神范文化传媒有限公司 平面模特0000...
12075    无V/000380/004579/000482/视频自媒体00000000000000000...
12079    金V/000112/841975/003449/北京协和医院营养科主治医师 李宁 健康博主0...
12081    无V/000458/005589/003409/著名电影、电视剧观众。00000000000...
12083    无V/000200/008303/000135/8。23。24000000000000000...
Name: text, Length: 1806, dtype: object
1806


In [None]:
tokenizer = tool.TextTokenizer(Config_pdc['Vectorizer_model_name'],
                                Config_pdc['Vectorizer_max_length'])
input_ids_list, attention_mask_list = tokenizer.tokenize_dataframe(csv_pdc.df['text'])
input_ids_tensor = torch.tensor(input_ids_list)
attention_mask_tensor = torch.tensor(attention_mask_list)

print('input_ids_tensor:',input_ids_tensor.shape)
print('attention_mask_tensor:',attention_mask_tensor.shape)

In [4]:
pdc_dataset = tool.PredictionDataset(input_ids_tensor, attention_mask_tensor)
pdc_dataloader = pdc_dataset.prepare_dataloader(64)

model_pdc = tool.BERTVectorizer(Config_pdc['Vectorizer_model_name'],Config_pdc['num_classes'], Config_pdc['device']) 
result = tool.Prediction(model_pdc, Config_pdc['model_save_path'],Config_pdc['device'])
predictions = []
predictions = result.predict(pdc_dataloader)
print(len(predictions))

1006879


In [5]:
predicted_label = [reverse_dict[number] for number in predictions]
# print(predicted_label)
print(csv_pdc.df.shape[0])
csv_pdc.df['pdc'] = predicted_label


1006879


In [None]:
csv_pdc.df.to_csv('new_file.csv', index=False)

In [10]:
csv_pdc.df.to_csv('头条预测.csv',encoding='utf_8_sig', index=False)

In [13]:
csv_pdc.df.to_csv('new_dataframe.csv',  encoding='utf_8_sig', index=False)
equal_rows = csv_pdc.df[csv_pdc.df['微博主分类标注'] == csv_pdc.df['pdc']]
num_equal_rows = len(equal_rows)

# 计算比重
total_rows = len(csv_pdc.df)
equal_rows_ratio = num_equal_rows / total_rows

print(f"Equal rows ratio: {equal_rows_ratio:.2%}")

Equal rows ratio: 90.09%


In [17]:
predicted_labels = predictions
label_mapping = csv_pdc.generate_label_mapping('微博主分类标注')
print(label_mapping)
csv_pdc.label_numerization(label_mapping, '微博主分类标注')  
num_classes=len(label_mapping)
print(num_classes)
true_labels = csv_pdc.df['label_num']
class_0_predicted = [pred for pred, true in zip(predicted_labels, true_labels) if true == 0]
class_1_predicted = [pred for pred, true in zip(predicted_labels, true_labels) if true == 1]

# 计算每个类别的准确率
accuracy_class_0 = sum([1 for pred, true in zip(class_0_predicted, true_labels) if pred == true]) / len(class_0_predicted)
accuracy_class_1 = sum([1 for pred, true in zip(class_1_predicted, true_labels) if pred == true]) / len(class_1_predicted)

print("Accuracy for class 0:", accuracy_class_0)
print("Accuracy for class 1:", accuracy_class_1)

{'网民': 0, '自媒体': 1}
2
Accuracy for class 0: 0.9392499810983139
Accuracy for class 1: 0.939917695473251


In [18]:
# 假设你有模型的预测结果和真实标签
predicted_labels = [1, 0, 1, 1, 0, 1, 0, 1, 0, 0]
true_labels = [1, 0, 1, 0, 0, 1, 1, 0, 1, 0]

# 统计原始标签为0且被正确预测的数量
correct_predictions_for_class_0 = sum([1 for pred, true in zip(predicted_labels, true_labels) if true == 1 and pred == true])

print("Correct predictions for class 1:", correct_predictions_for_class_0)


Correct predictions for class 1: 3
