# 功能说明

- 读取文件'V:\20240920\all_in_one_1\2_processed_data_BERT.csv'
- 把'letter'列的文本逐一送给指定模型进行判断
- 将模型判断的结果写入新加的特定列中
- 将源文件的内容和模型得到的结果写入文件'V:\20240920\all_in_one_1\3_add_emotion.csv'中

## 运行环境设置

先下载模型文件，以便离线使用

**powershell**    

- $env:HF_HUB_OFFLINE="0"
- $env:HF_DATASETS_OFFLINE="0"
- $env:HF_ENDPOINT = "https://hf-mirror.com"   
- huggingface-cli download --resume-download --repo-type model SamLowe/roberta-base-go_emotions --local-dir "V:/huggingface/model/SamLowe/roberta-base-go_emotions"   

## 导入需要的package

In [1]:
from transformers import pipeline
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
import torch
import chardet

# code all-in-one

## SamLowe/roberta-base-go_emotions 

<font color=red size=3> 这个模型最多只支持512个输入tokens，否则报错，为此pipeline()需要增加参数 truncation=True 来截断输入的超长文本</font>

In [7]:
from transformers import pipeline
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
import torch
import chardet

# NOTE: an earlier run reported Windows-1252 for this file; the encoding used
# below is whatever chardet guesses from the raw bytes on the current run.
input_file = r'V:\20240920\all_in_one_1\2_processed_data_BERT.csv'

# Detect the file encoding from the raw bytes of the whole file.
with open(input_file, 'rb') as f:
    raw_data = f.read()

result = chardet.detect(raw_data)
file_encoding = result['encoding']

print(f"----文件 {input_file} 的编码检测结果是: {file_encoding}")

# Read the CSV file: first with the detected encoding, then fall back to GBK
# when decoding fails. Every failure is reported and then re-raised, because
# continuing without `df` would only fail later with an unrelated NameError.
try:
    df = pd.read_csv(input_file, encoding=file_encoding)
except UnicodeDecodeError:
    print("检测到的编码无法正确读取文件，尝试其他编码...")
    try:
        df = pd.read_csv(input_file, encoding='gbk')
    except Exception as e:
        print(f"文件读取失败，错误信息: {e}")
        raise
    else:
        print("以 GBK 编码读取文件成功！")
except Exception as e:
    print(f"文件读取失败，错误信息: {e}")
    raise
else:
    print(f"以编码检测的结果 {file_encoding} 读取文件成功！")


# Report how many rows were loaded and how many of them have no 'letter' text.
missing_before = df['letter'].isna().sum()
print("\nData statistics:")
print(f"Total number of texts: {len(df)}")
print(f"Number of None values: {missing_before}")

# Keep only the rows that have a 'letter' value; per the original note, rows
# with an empty 'letter' make the model step fail. The index is then reset so
# the remaining rows are labelled 0..n-1 again (the previous labels are kept
# in a new 'index' column).
has_letter = df['letter'].notna()
df_clean = df[has_letter].reset_index()
valid_count = len(df_clean)
print(f"Number of valid texts after removing None: {valid_count}")

# Repeat the statistics on the cleaned data as a sanity check.
print("\n Data statistics after Remove rows with None values :")
print(f" Total number of texts: {valid_count}")
print(f" Number of None values: {df_clean['letter'].isna().sum()}")

# Names of the emotion columns that are appended to the data, in output order.
columns = (
    "admiration amusement anger annoyance approval caring confusion "
    "curiosity desire disappointment disapproval disgust embarrassment "
    "excitement fear gratitude grief joy love nervousness optimism pride "
    "realization relief remorse sadness surprise neutral"
).split()

# Work on an independent copy of the cleaned data and append one NaN-filled
# column per emotion label; the scores are filled in afterwards.
df_result = df_clean.copy(deep=True).assign(**dict.fromkeys(columns, np.nan))

column_names = df_result.columns.tolist()
print(f"\n---columns name list of df_result is: ----\n {column_names}")

# Choose the compute device: the current CUDA device when one is available,
# otherwise fall back explicitly to the CPU.
if not torch.cuda.is_available():
    print("当前没有可用的GPU设备")
    device_id = "cpu"
    print("使用CPU")
else:
    device_id = torch.cuda.current_device()
    print("当前GPU设备ID有:", device_id)
    print("使用的GPU设备ID:", device_id)

# Build the emotion classifier from the locally downloaded model files.
# truncation=True is essential: the notebook notes that this model accepts at
# most 512 input tokens and raises an error on longer texts.
pipeline_options = {
    "task": "text-classification",
    "model": r"V:\huggingface\model\SamLowe\roberta-base-go_emotions",
    "top_k": None,
    "device": device_id,
    "truncation": True,
}
classifier = pipeline(**pipeline_options)

# Fill the emotion columns of df_result with the scores returned by the model.
# The texts are iterated directly, so no assumption is made about the index
# labels, and the label list in `columns` drives the lookup instead of one
# hand-written assignment per label.
score_rows = []
for letter_text in df_result['letter']:
    model_outputs = classifier(letter_text)
    # model_outputs[0] is a list of {'label': ..., 'score': ...} entries;
    # turn it into a label -> score mapping for direct lookup.
    scores_by_label = {item['label']: item['score'] for item in model_outputs[0]}
    # A label missing from the model output raises KeyError, as before.
    score_rows.append([scores_by_label[label] for label in columns])

# Write all scores in one index-aligned assignment instead of one scalar
# .loc write per cell. Skipped for an empty frame so the columns keep their
# NaN placeholders.
if score_rows:
    df_result[columns] = pd.DataFrame(score_rows, columns=columns, index=df_result.index)

# Write the source columns together with the emotion scores to a CSV file,
# leaving out the DataFrame index.
output_file = r'V:\20240920\all_in_one_1\3_add_emotion.csv'
df_result.to_csv(output_file, index=False)

# Final expression of the cell: preview the first three rows.
df_result.head(3)

----文件 V:\20240920\all_in_one_1\2_processed_data_BERT.csv 的编码检测结果是: utf-8
以编码检测的结果 utf-8 读取文件成功！

Data statistics:
Total number of texts: 5583
Number of None values: 0
Number of valid texts after removing None: 5583

 Data statistics after Remove rows with None values :
 Total number of texts: 5583
 Number of None values: 0

---columns name list of df_result is: ----
 ['index', 'Unnamed: 0', 'Coder1', 'Coder2', 'ResponseID', 'NA_column', 'letter', 'number_of_characters', 'coding1', 'coding2', 'coding3', 'Mostmoderntheoriesofdecisionmakingrecognizethefactthatdecisionsd', 'EFFORT', 'Age', 'DScores', 'IATTaken', 'ReasonifMissing', 'OUT', 'Gender', 'WhatisyourgenderidentityOtherpleasespecifyText', 'RACE_all', 'Race_primary', 'RACE_FINAL', 'RACE_COMBINEASIAN', 'RACE_OUT', 'Inwhatcountrywasyourmotherborn', 'Inwhatcountrywasyourfatherborn', 'Listthecountrieswhereyourgrandparentswereborn', 'ChooseOneRaceOtherpeoplebelievemorethanonetermdescribesthem.Ifth', 'POLITICS_scale', 'Politics', 'Politi

Device set to use cpu


Unnamed: 0.1,index,Unnamed: 0,Coder1,Coder2,ResponseID,NA_column,letter,number_of_characters,coding1,coding2,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,0,0,Diya Basu,Alexandra Weiss,R_005dEWNt4jenOql,6108957f175a2eb650221b8e,"Greeting Senator,\n \n Women's Global Empower...",560.0,1,1.0,...,0.005607,0.000668,0.062789,0.004094,0.017382,0.002048,0.000658,0.001718,0.000797,0.296698
1,1,2,Anna Owens,Isha Gupta,R_00y8lNWUZCArLjP,A245CPNDUDHUYX,Hello:\n \n I feel that this bill should be re...,289.0,1,1.0,...,0.001567,0.000568,0.043968,0.001635,0.010425,0.003761,0.003688,0.002551,0.000864,0.040688
2,2,3,Ashley Kim,Quincy Lherisson,R_018hCVjTzrhwNQB,A1JM5XNB4NCZR6,Dear Senator\n This bell is important for advo...,290.0,-1,-1.0,...,0.000758,0.000584,0.008293,0.00056,0.06788,0.001395,0.002252,0.005963,0.000527,0.738031


In [None]:
# Preview the first two rows of the frame as originally loaded.
df.head(n=2)

# test code