In [1]:
import pandas as pd

In [2]:
import torch
from transformers import BertTokenizer
from transformers import BertConfig
from transformers import BertForSequenceClassification

In [3]:
# Select the compute device used by every later cell: the first CUDA GPU
# when one is visible to PyTorch, otherwise the CPU.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2080 Ti


In [4]:
# Mapping from MBTI type string to the integer class id used by the classifier.
# The position of each name in the tuple below IS its class id, so the order
# must not be changed.
label_dict = {
    mbti_type: class_id
    for class_id, mbti_type in enumerate((
        'INFJ', 'ENTP', 'INTP', 'INTJ',
        'ENTJ', 'ENFJ', 'INFP', 'ENFP',
        'ISFP', 'ISTP', 'ISFJ', 'ISTJ',
        'ESTP', 'ESFP', 'ESTJ', 'ESFJ',
    ))
}

In [5]:
# Tokenizer matching the uncased BERT base checkpoint (input is lower-cased).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

# Sequence classifier with one output logit per entry in `label_dict`.
# The classification head created here is randomly initialised; the
# fine-tuned weights are loaded from the local checkpoint below.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                        num_labels=len(label_dict),
                                                        output_attentions=False,
                                                        output_hidden_states=False)

model.to(device)

# Directory holding the fine-tuned checkpoints.
# Alternative run kept for reference:
#opt_save_dic = "C:/Users/JenMing/desktop/dc-bot/data_volume_0517_Nsoftmax_bert-base_batch_size =12/"
opt_save_dic = "C:/Users/JenMing/jupyter_nb/data_volume_otos/"

# Load the fine-tuned weights onto the device selected earlier.  Mapping to
# `device` (instead of a hard-coded CUDA device) keeps this cell working when
# the notebook falls back to the CPU.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# NOTE: strict=False tolerates missing/unexpected keys; check the result
# displayed below ("All keys matched successfully") to confirm that the whole
# checkpoint was actually applied.
# Alternative checkpoint kept for reference:
#model.load_state_dict(torch.load(opt_save_dic+'finetuned_BERT_epoch_11.model', map_location=device),strict=False)
model.load_state_dict(torch.load(opt_save_dic+'finetuned_BERT_epoch_8.model', map_location=device), strict=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

In [6]:
# Read the MBTI test split from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='C:/Users/JenMing/Desktop/MBTI/MBTI_test.csv')
df.head(5)

Unnamed: 0,type,posts
0,INFJ,What has been the most life-changing experienc...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,"Of course, to which I say I know; that's my bl..."
3,INTJ,"'Dear certain personality, I enjoyed our con..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [7]:
# Build the output frame: same rows and columns as `df`, but with the `posts`
# column blanked so it can later be filled with per-sentence predictions.
# `assign` returns a NEW DataFrame, so `df` keeps its original posts and does
# not have to be re-read from disk (plain `df_new = df` would alias the same
# object and wipe `df['posts']` as well).
df_new = df.assign(posts=None)
df_new.head()

Unnamed: 0,type,posts
0,INFJ,
1,ENTP,
2,INTP,
3,INTJ,
4,ENTJ,


In [8]:
# Classify every "|||"-separated sentence of every row in `df['posts']` and
# store, per row, the list of predicted MBTI type strings in `df_new['posts']`.

# Class index -> MBTI type string; built once instead of once per sentence.
reversed_label_dict = {v: k for k, v in label_dict.items()}

# Put the model in evaluation mode once, before any forward pass.
model.eval()

predicted_types_per_row = []
for posts in df['posts']:
    row_predictions = []

    for sentence in posts.split("|||"):
        # Tokenize and encode the sentence.
        encoded_inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")
        input_ids = encoded_inputs["input_ids"].to(device)
        attention_mask = encoded_inputs["attention_mask"].to(device)

        # Forward pass without gradient tracking (inference only).
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # The highest logit gives the predicted class index; map it back to
        # the corresponding MBTI type string.
        predicted_index = torch.argmax(outputs.logits, dim=1).item()
        row_predictions.append(reversed_label_dict[predicted_index])

    predicted_types_per_row.append(row_predictions)

# Assign the whole column in one step.  The previous element-wise
# `df_new['posts'][n] = ...` was chained indexing, which raises
# SettingWithCopyWarning and does not update the frame when pandas
# Copy-on-Write is enabled.
df_new['posts'] = pd.Series(predicted_types_per_row, index=df_new.index, dtype=object)

# Alternative output location kept for reference:
#df_new.to_csv(f"C:/Users/JenMing/Desktop/MBTI/LSTM/mbti_to_LSTM_DF.csv", index=False)
df_new.to_csv("C:/Users/JenMing/Desktop/MBTI/one_type_one_sentence/mbti_to_LSTM_DF.csv", index=False)

# Last expression of the cell, so the preview is actually displayed.
df_new.head()