In [2]:
# load pandas pickle file
import pandas as pd
import numpy as np
import pickle
import os
def extract(path):
    print(os.getcwd())
    file = open(path, 'rb')
    object_file = pickle.load(file)
    file.close()
    return object_file

def save_dataset(item, dir, name):
    if not os.path.exists(dir):
        os.makedirs(dir)
    path = dir+"/"+name+".pickle"
    pickle.dump(item, open(path, 'wb'))



In [19]:
from transformers import pipeline
from tqdm import tqdm

classifier = pipeline("text-classification", model="nateraw/bert-base-uncased-emotion", return_all_scores=True, max_length=512)

def convert_list_to_dict(list_of_dicts):
    result_dict = {}
    for item in list_of_dicts:
        label = item.get("label")
        score = item.get("score")
        if label is not None and score is not None:
            result_dict[label] = score
    return result_dict

def convert_emotion(sentences):
    # Assuming classifier and convert_list_to_dict are defined elsewhere

    # Initialize an empty list to store individual DataFrames
    dfs = []

    # Process each sentence in the list
    for sentence in tqdm(sentences):
        emo = classifier(sentence)  # Assuming classifier returns emotions for a single sentence
        out = convert_list_to_dict(emo[0])  # Assuming convert_list_to_dict processes the emotion list
        df = pd.DataFrame(out, index=[0])
        dfs.append(df)

    # Concatenate the list of DataFrames into a single DataFrame
    result_df = pd.concat(dfs, ignore_index=True)

    return result_df


print(convert_emotion(["I am happy","he"]))

  0%|          | 0/2 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 2/2 [00:00<00:00, 11.35it/s]

    sadness       joy      love     anger      fear  surprise
0  0.001000  0.995372  0.001425  0.001084  0.000497  0.000623
1  0.059335  0.116308  0.022859  0.736262  0.049569  0.015667





# personality

In [20]:

mbti = pd.read_csv('../../corpus/mbti.csv')
sentence_list = mbti['posts'].tolist()
emotion_df_list = convert_emotion(sentence_list)
mbti_ready = pd.concat([mbti, emotion_df_list], axis=1)
mbti_ready.head()
save_dataset(mbti_ready, "../../corpus/emotion-aware-personality", "bert-based-uncased-mbti")

100%|██████████| 8675/8675 [49:42<00:00,  2.91it/s] 


# sentiment


In [None]:
imdb = pd.read_csv('../../corpus/imdb.csv')
imdb['sentiment'] = imdb['sentiment'].replace('positive', 1)
imdb['sentiment'] = imdb['sentiment'].replace('negative', 0)
sentence_list = imdb['review'].tolist()
emotion_df_list = convert_emotion(sentence_list)
imdb_ready = pd.concat([imdb, emotion_df_list], axis=1)
imdb_ready.head()

save_dataset(imdb_ready, "../../corpus/emotion-aware-sentiment", "bert-based-uncased-imdb")

In [15]:
moviereview = pd.read_csv('../../corpus/movie-review.csv')
sentence_list = moviereview['content'].tolist()
emotion_df_list = convert_emotion(sentence_list)
moviereview_ready = pd.concat([moviereview, emotion_df_list], axis=1)
save_dataset(moviereview_ready, "../../corpus/emotion-aware-sentiment", "bert-based-uncased-moviereview")

# depression

In [16]:
sdcnl = pd.read_csv('../../corpus/sdcnl.csv')

# merge title and selftext column together, both have string value
sdcnl['text'] = sdcnl['title'].astype(str) + " | " + sdcnl['selftext'].astype(str)

# drop all column except text and is_suicide column
sdcnl = sdcnl[['text', 'is_suicide']]

sdcnl.head()

sentence_list = sdcnl['text'].tolist()
emotion_df_list = convert_emotion(sentence_list)
sdcnl_ready = pd.concat([sdcnl, emotion_df_list], axis=1)
sdcnl_ready.head()
save_dataset(sdcnl_ready, "../../corpus/emotion-aware-depression", "bert-based-uncased-sdcnl")

100%|██████████| 1895/1895 [05:43<00:00,  5.52it/s]


In [17]:
twitter = pd.read_csv('../../corpus/mental-health-twitter.csv')
twitter = twitter[["post_text","label"]]
sentence_list = twitter['post_text'].tolist()
emotion_df_list = convert_emotion(sentence_list)
twitter_ready = pd.concat([twitter, emotion_df_list], axis=1)
twitter_ready.head()
save_dataset(twitter_ready, "../../corpus/emotion-aware-depression", "bert-based-uncased-twitter")

100%|██████████| 20000/20000 [23:46<00:00, 14.02it/s]


# personality

In [4]:
from sklearn.model_selection import train_test_split
dataset = extract("../../corpus/emotion-aware-personality/roberta-base-mbti.pickle")


dataset["E"] = dataset['type'].apply(lambda x: 1 if x[0] == 'E' else 0)
dataset["O"] = dataset['type'].apply(lambda x: 1 if x[1] == 'N' else 0)
dataset["A"] = dataset['type'].apply(lambda x: 1 if x[2] == 'F' else 0)
dataset["C"] = dataset['type'].apply(lambda x: 1 if x[3] == 'J' else 0)

dataset = dataset.drop(['type'], axis=1)

#save
save_dataset(dataset, "../../corpus/emotion-aware-personality", "roberta-base-mbti")
dataset.head()


/Users/jingjietan/Desktop/PRaware/model_aware/emotion


Unnamed: 0,filename,posts,sadness,curiosity,caring,neutral,remorse,disappointment,approval,confusion,...,embarrassment,gratitude,disgust,surprise,excitement,pride,E,O,A,C
0,0,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0.68965,0.188368,0.126145,0.101215,0.092729,0.046889,0.032194,0.026312,...,0.002361,0.00196,0.00196,0.001827,0.001702,0.000448,0,1,1,1
1,1,'I'm finding the lack of me in these posts ver...,0.00308,0.021786,0.00186,0.108906,0.000811,0.020744,0.043765,0.021139,...,0.004704,0.000575,0.005139,0.004308,0.009194,0.001004,1,1,0,0
2,2,'Good one _____ https://www.youtube.com/wat...,0.013163,0.443897,0.008598,0.075922,0.003276,0.01823,0.053725,0.321709,...,0.00151,0.087438,0.002965,0.01067,0.005236,0.00036,0,1,0,0
3,3,"'Dear INTP, I enjoyed our conversation the o...",0.010823,0.209207,0.017589,0.113511,0.005305,0.058189,0.160561,0.206875,...,0.001786,0.001879,0.002025,0.002681,0.009717,0.001332,0,1,0,1
4,4,'You're fired.|||That's another silly misconce...,0.002441,0.086538,0.001938,0.149465,0.001085,0.014468,0.063388,0.058207,...,0.003577,0.000209,0.006866,0.002247,0.002406,0.000445,1,1,0,1
