# Config

In [1]:
import os
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import djwtool

Config = {
    'device': "torch.device('cuda' if torch.cuda.is_available() else 'cpu')",
    'csv_file_path': 'pcd.csv',
    'label_column': '微博主分类标注',

    'Vectorizer_max_length': 140,
    'Vectorizer_model_name': 'hfl/chinese-roberta-wwm-ext-large',

    'train_ratio': 0.6,
    'val_ratio': 0.2,
    'batch_size': 64,

    'epochs': 3,
    'model_save_path': 'test_right_model.pth'
}


folders_to_create = ['./model', './tensor']
for folder in folders_to_create:
    if not os.path.exists(folder):
        os.makedirs(folder)
    else:
        print(f'文件夹 {folder} 已存在')
print("=" * 100)

文件夹 ./model 已存在
文件夹 ./tensor 已存在


# 数据预处理

In [2]:
csv = djwtool.CSVProcessor(Config['csv_file_path'])
# csv.df = csv.df[csv.df['微博主分类标注'].isin(['网民', '自媒体'])]
values= ['网民', '自媒体']
csv.df = csv.df[csv.df['微博主分类标注'].isin(values)]
label_mapping = csv.generate_label_mapping(Config['label_column'])
print(label_mapping)
csv.label_numerization(label_mapping, Config['label_column'])  
num_classes=len(label_mapping)
print(num_classes)

column_names = ['关注', '粉丝', '微博']
for column_name in column_names:
    csv.df[column_name] = csv.df[column_name].apply(lambda x: f'{int(x):06}' if pd.notnull(x) else '000000')
csv.fill_nan_with_value()
csv.df['认证'] = csv.df['认证'].replace('无', '无V')

csv.str_length_normalization('博主标记',26)
csv.str_length_normalization('简介',50)
csv.str_length_normalization('工作信息',12)
csv.str_length_normalization('标签和其他',25)

all_to_merge = ['认证', '关注', '粉丝', '微博','博主标记', '简介', '工作信息', '标签和其他']
# all_to_merge = ['认证', '昵称']
csv.df['text'] = csv.apply_merge_to_columns(all_to_merge)

text_column = csv.df['text']
print('text:')
print(text_column)
print("=" * 100)

tokenizer = djwtool.TextTokenizer(Config['Vectorizer_model_name'],
                                Config['Vectorizer_max_length'])
input_ids_list, attention_mask_list = tokenizer.tokenize_dataframe(csv.df['text'])
label_list = list(csv.df['label_num'])
input_ids_tensor = torch.tensor(input_ids_list)
attention_mask_tensor = torch.tensor(attention_mask_list)
label_tensor = torch.tensor(label_list)
print('input_ids_tensor:',input_ids_tensor.shape)
print('attention_mask_tensor:',attention_mask_tensor.shape)
print('label_tensor:',label_tensor.shape)
print("=" * 100)


  self.df= pd.read_csv(csv_file_path)


{'网民': 0, '自媒体': 1}
2
text:
1190     无V/000103/000238/000247/无000000000000000000000...
1191     无V/000492/001472/001755/无000000000000000000000...
1192     无V/000422/000388/004231/无000000000000000000000...
1193     无V/001051/000792/015672/交流、学习、进步！0000000000000...
1194     无V/001027/000885/001504/无000000000000000000000...
                               ...                        
84303    金V/000440/001052/001128/无000000000000000000000...
84304    金V/000387/011607/000915/无000000000000000000000...
84305    金V/000123/010828/000410/无000000000000000000000...
84306    金V/000066/101477/000251/无000000000000000000000...
84307    金V/000093/053154/000059/无000000000000000000000...
Name: text, Length: 83003, dtype: object
input_ids_tensor: torch.Size([83003, 140])
attention_mask_tensor: torch.Size([83003, 140])
label_tensor: torch.Size([83003])


# 模型训练

In [3]:
td = djwtool.TrainDataset(input_ids_tensor, attention_mask_tensor, label_tensor)
train_loader, val_loader, test_loader = td.prepare_dataloaders(Config['train_ratio'],                                                                                                                     
                                                               Config['val_ratio'],
                                                               Config['batch_size']
                                                               )

print('train_loader检查:')
for batch_idx, (ids, attention_mask , target) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}:")
    print("ids shape:", ids.shape)
    print("attention_mask shape:", attention_mask.shape)
    print("Target shape:", target.shape)

    if batch_idx == 0:  
        break
print("=" * 100)


model = djwtool.BERTVectorizer(Config['Vectorizer_model_name'],num_classes) 
# optimizer = optim.Adam(model.parameters(), lr=1e-2,  weight_decay=5e-4)
# optimizer=optim.RMSprop(model.parameters(),lr=0.001,alpha=0.99,momentum=0,weight_decay=0)
optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.5, weight_decay=5e-4)
# scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, Config['epochs'])
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,  'min',
#                                                     factor=0.5, #学习率下降的因子factor=0.5, 
#                                                     verbose=True,#每次更新都会打印一条消息 
#                                                     patience=2,#有2个epochs的平均损失没有变化，学习率将
#                                                     min_lr=0.00000001,# 学习率的下限
#                                                     threshold=0.001)#小于这个数表示平均损失没有下降
# scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[75, 150], gamma=0.5)
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=15, T_mult=2, eta_min=0.00001, last_epoch=-1 , verbose=False)
criterion = nn.CrossEntropyLoss()
print('训练过程:')
trainer = djwtool.Trainer(model, train_loader, val_loader,optimizer,
                          criterion, scheduler, Config['epochs'], Config['model_save_path'])
trainer.train()
print("=" * 100)

train_loader检查:
Batch 1:
ids shape: torch.Size([64, 140])
attention_mask shape: torch.Size([64, 140])
Target shape: torch.Size([64])
训练过程:
Epoch [1/3] Train Loss: 0.050892 Train Acc: 0.98 Val Loss: 0.037965 Val Acc: 0.98 Learning Rate: 0.010000
	Time: 07:14
Epoch [2/3] Train Loss: 0.034015 Train Acc: 0.98 Val Loss: 0.039522 Val Acc: 0.98 Learning Rate: 0.009894
	Time: 07:14
Epoch [3/3] Train Loss: 0.031325 Train Acc: 0.99 Val Loss: 0.031048 Val Acc: 0.98 Learning Rate: 0.009894
	Time: 07:14


# 模型测试

In [4]:
print('模型测试:')

evaluator = djwtool.ModelEvaluator(model, test_loader, label_mapping)
evaluator.test_accuracy()
evaluator.accuracy_of_label()

模型测试:
Accuracy of the network on the test items: 98.66 %
Accuracy of    网民 : 99.10 %
Accuracy of   自媒体 : 89.00 %


In [None]:
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, confusion_matrix, classification_report
# import matplotlib.pyplot as plt

# class ModelEvaluator:
#     def __init__(self, y_true, y_pred_prob, threshold=0.5):
#         self.y_true = y_true
#         self.y_pred_prob = y_pred_prob
#         self.threshold = threshold
#         self.y_pred = (self.y_pred_prob >= self.threshold).astype(int)

#     def calculate_metrics(self):
#         accuracy = accuracy_score(self.y_true, self.y_pred)
#         precision = precision_score(self.y_true, self.y_pred)
#         recall = recall_score(self.y_true, self.y_pred)
#         f1 = f1_score(self.y_true, self.y_pred)
#         auc = roc_auc_score(self.y_true, self.y_pred_prob)
#         confusion = confusion_matrix(self.y_true, self.y_pred)
#         class_report = classification_report(self.y_true, self.y_pred)

#         return {
#             'Accuracy': accuracy,
#             'Precision': precision,
#             'Recall': recall,
#             'F1 Score': f1,
#             'AUC': auc,
#             'Confusion Matrix': confusion,
#             'Classification Report': class_report
#         }

#     def plot_roc_curve(self):
#         fpr, tpr, _ = roc_curve(self.y_true, self.y_pred_prob)
#         plt.figure(figsize=(8, 6))
#         plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_score(self.y_true, self.y_pred_prob))
#         plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
#         plt.xlim([0.0, 1.0])
#         plt.ylim([0.0, 1.05])
#         plt.xlabel('False Positive Rate')
#         plt.ylabel('True Positive Rate')
#         plt.title('Receiver Operating Characteristic')
#         plt.legend(loc='lower right')
#         plt.show()

# # Example usage
# y_true = [1, 0, 1, 1, 0, 0, 1, 0]
# y_pred_prob = [0.8, 0.6, 0.9, 0.7, 0.3, 0.4, 0.6, 0.2]

# evaluator = ModelEvaluator(y_true, y_pred_prob)
# metrics = evaluator.calculate_metrics()
# print(metrics)

# evaluator.plot_roc_curve()


# 模型预测

In [1]:
import os
import torch
import pandas as pd
import djwtool
import tqdm


Config_pdc = {
    'device': "torch.device('cuda' if torch.cuda.is_available() else 'cpu')",
    'csv_file_path': 'pcd.csv',
    'num_classes': 2,
    'model_save_path': 'model/test_right_model.pth',
    'batch_size': 20,
    'Vectorizer_max_length': 140,
    'Vectorizer_model_name': 'hfl/chinese-roberta-wwm-ext-large'
}

original_dict = {'网民': 0, '自媒体': 1}
reverse_dict = {value: key for key, value in original_dict.items()}
print(reverse_dict)

{0: '网民', 1: '自媒体'}


In [2]:
csv_pdc = djwtool.CSVProcessor(Config_pdc['csv_file_path'])
values= ['网民', '自媒体']
csv_pdc.df = csv_pdc.df[csv_pdc.df['微博主分类标注'].isin(values)]
print(csv_pdc.df.shape[0])

column_names = ['关注', '粉丝', '微博']
for column_name in column_names:
    csv_pdc.df[column_name] = csv_pdc.df[column_name].apply(lambda x: f'{int(x):06}' if pd.notnull(x) else '000000')
csv_pdc.fill_nan_with_value()
csv_pdc.df['认证'] = csv_pdc.df['认证'].replace('无', '无V')

csv_pdc.str_length_normalization('博主标记',26)
csv_pdc.str_length_normalization('简介',50)
csv_pdc.str_length_normalization('工作信息',12)
csv_pdc.str_length_normalization('标签和其他',25)

all_to_merge = ['认证', '关注', '粉丝', '微博','博主标记', '简介', '工作信息', '标签和其他']
# all_to_merge = ['认证', '昵称']
csv_pdc.df['text'] = csv_pdc.apply_merge_to_columns(all_to_merge)

text_column = csv_pdc.df['text']
print('text:')
print(text_column)
print(csv_pdc.df.shape[0])
print("=" * 100)

  self.df= pd.read_csv(csv_file_path)


83003
text:
1190     无V/000103/000238/000247/无000000000000000000000...
1191     无V/000492/001472/001755/无000000000000000000000...
1192     无V/000422/000388/004231/无000000000000000000000...
1193     无V/001051/000792/015672/交流、学习、进步！0000000000000...
1194     无V/001027/000885/001504/无000000000000000000000...
                               ...                        
84303    金V/000440/001052/001128/无000000000000000000000...
84304    金V/000387/011607/000915/无000000000000000000000...
84305    金V/000123/010828/000410/无000000000000000000000...
84306    金V/000066/101477/000251/无000000000000000000000...
84307    金V/000093/053154/000059/无000000000000000000000...
Name: text, Length: 83003, dtype: object
83003


In [3]:
tokenizer = djwtool.TextTokenizer(Config_pdc['Vectorizer_model_name'],
                                Config_pdc['Vectorizer_max_length'])
input_ids_list, attention_mask_list = tokenizer.tokenize_dataframe(csv_pdc.df['text'])
input_ids_tensor = torch.tensor(input_ids_list)
attention_mask_tensor = torch.tensor(attention_mask_list)

print('input_ids_tensor:',input_ids_tensor.shape)
print('attention_mask_tensor:',attention_mask_tensor.shape)

input_ids_tensor: torch.Size([83003, 140])
attention_mask_tensor: torch.Size([83003, 140])


In [15]:
pdc_dataset = djwtool.PredictionDataset(input_ids_tensor, attention_mask_tensor)
pdc_dataloader = pdc_dataset.prepare_dataloader(64)

model_pdc = djwtool.BERTVectorizer(Config_pdc['Vectorizer_model_name'],Config_pdc['num_classes']) 
result = djwtool.Prediction(model_pdc, Config_pdc['model_save_path'])
predictions = []
predictions = result.predict(pdc_dataloader)
print(len(predictions))

83003


In [5]:
predicted_label = [reverse_dict[number] for number in predictions]
# print(predicted_label)
print(csv_pdc.df.shape[0])
csv_pdc.df['pdc'] = predicted_label


83003


In [13]:
csv_pdc.df.to_csv('new_dataframe.csv',  encoding='utf_8_sig', index=False)
equal_rows = csv_pdc.df[csv_pdc.df['微博主分类标注'] == csv_pdc.df['pdc']]
num_equal_rows = len(equal_rows)

# 计算比重
total_rows = len(csv_pdc.df)
equal_rows_ratio = num_equal_rows / total_rows

print(f"Equal rows ratio: {equal_rows_ratio:.2%}")

Equal rows ratio: 90.09%


In [17]:
predicted_labels = predictions
label_mapping = csv_pdc.generate_label_mapping('微博主分类标注')
print(label_mapping)
csv_pdc.label_numerization(label_mapping, '微博主分类标注')  
num_classes=len(label_mapping)
print(num_classes)
true_labels = csv_pdc.df['label_num']
class_0_predicted = [pred for pred, true in zip(predicted_labels, true_labels) if true == 0]
class_1_predicted = [pred for pred, true in zip(predicted_labels, true_labels) if true == 1]

# 计算每个类别的准确率
accuracy_class_0 = sum([1 for pred, true in zip(class_0_predicted, true_labels) if pred == true]) / len(class_0_predicted)
accuracy_class_1 = sum([1 for pred, true in zip(class_1_predicted, true_labels) if pred == true]) / len(class_1_predicted)

print("Accuracy for class 0:", accuracy_class_0)
print("Accuracy for class 1:", accuracy_class_1)

{'网民': 0, '自媒体': 1}
2
Accuracy for class 0: 0.9392499810983139
Accuracy for class 1: 0.939917695473251


In [18]:
# 假设你有模型的预测结果和真实标签
predicted_labels = [1, 0, 1, 1, 0, 1, 0, 1, 0, 0]
true_labels = [1, 0, 1, 0, 0, 1, 1, 0, 1, 0]

# 统计原始标签为0且被正确预测的数量
correct_predictions_for_class_0 = sum([1 for pred, true in zip(predicted_labels, true_labels) if true == 1 and pred == true])

print("Correct predictions for class 1:", correct_predictions_for_class_0)


Correct predictions for class 1: 3
