In [2]:
# load pandas pickle file
import pandas as pd
import numpy as np
import pickle
import os
def extract(path):
    """Load and return the object stored in a pickle file.

    Prints the current working directory first, which helps when the
    relative ``path`` does not resolve as expected.

    Args:
        path: Location of the pickle file to read.

    Returns:
        The unpickled Python object.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    print(os.getcwd())
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project produced itself.
    # The context manager closes the handle even when unpickling fails.
    with open(path, 'rb') as file:
        return pickle.load(file)

def save_dataset(item, dir, name):
    """Pickle ``item`` to ``<dir>/<name>.pickle``, creating ``dir`` if needed.

    Args:
        item: Any picklable object (the notebook passes DataFrames).
        dir: Target directory; missing parent directories are created.
        name: File name without the ``.pickle`` extension.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir, exist_ok=True)
    path = os.path.join(dir, name + ".pickle")
    # Close the handle explicitly so the data is flushed to disk before the
    # function returns, instead of relying on garbage collection.
    with open(path, 'wb') as file:
        pickle.dump(item, file)


In [2]:
from transformers import pipeline
from tqdm import tqdm

# Emotion classifier shared by every cell below. return_all_scores=True makes
# each call return a score for every label rather than only the top one.
# truncation=True is passed explicitly: supplying max_length alone makes the
# tokenizer warn and fall back to the same 'longest_first' truncation anyway.
classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    return_all_scores=True,
    max_length=512,
    truncation=True,
)

def convert_list_to_dict(list_of_dicts):
    """Collapse a list of ``{"label": ..., "score": ...}`` dicts into one dict.

    Entries whose "label" or "score" is missing (or ``None``) are skipped.
    When a label occurs more than once, the last score wins.

    Args:
        list_of_dicts: Iterable of dicts carrying "label" and "score" keys.

    Returns:
        A dict mapping each label to its score, in first-seen order.
    """
    pairs = ((entry.get("label"), entry.get("score")) for entry in list_of_dicts)
    return {
        label: score
        for label, score in pairs
        if label is not None and score is not None
    }

def convert_emotion(sentences):
    """Score each sentence with the emotion classifier.

    Uses the notebook-level ``classifier`` pipeline and
    ``convert_list_to_dict``. Sentences are classified one at a time, with a
    tqdm progress bar.

    Args:
        sentences: Iterable of texts to classify.

    Returns:
        A DataFrame with one row per input sentence (default 0..n-1 index) and
        one column per emotion label holding that label's score. An empty
        input yields an empty DataFrame.
    """
    # Collect plain dict records and build the frame once, rather than
    # creating a one-row DataFrame per sentence and concatenating them all.
    records = []
    for sentence in tqdm(sentences):
        emo = classifier(sentence)
        # For a single input the pipeline returns a one-element list whose
        # item is the list of {"label", "score"} dicts.
        records.append(convert_list_to_dict(emo[0]))

    return pd.DataFrame(records)


# Smoke test: classify two short sample sentences and print the resulting
# per-label score table (one row per sentence).
print(convert_emotion(["I am happy","he"]))

  return self.fget.__get__(instance, owner)()
  0%|          | 0/2 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 2/2 [00:00<00:00, 14.40it/s]

      anger   disgust      fear       joy   neutral   sadness  surprise
0  0.001381  0.000329  0.000380  0.965741  0.002609  0.013528  0.016032
1  0.076094  0.195284  0.016262  0.011164  0.626966  0.050076  0.024154





# personality

In [3]:

# Score every MBTI post with the emotion classifier and store the combined
# frame as a pickle.
mbti = pd.read_csv('../../corpus/mbti.csv')
sentence_list = mbti['posts'].tolist()
emotion_df_list = convert_emotion(sentence_list)
# Column-wise concat aligns on the index; both frames use the default
# 0..n-1 index, so rows pair up positionally.
mbti_ready = pd.concat([mbti, emotion_df_list], axis=1)
save_dataset(mbti_ready, "../../corpus/emotion-aware-personality", "distill-roberta-mbti")
# Preview goes last: a bare expression only renders when it ends the cell.
mbti_ready.head()

100%|██████████| 8675/8675 [29:08<00:00,  4.96it/s]


In [4]:
from sklearn.model_selection import train_test_split
dataset = extract("../../corpus/emotion-aware-personality/distill-roberta-mbti.pickle")

# This cell writes its result back over the pickle it just read. After the
# first run the 'type' column is gone, so derive the labels only while it is
# still present; that keeps the cell safe to re-run.
if 'type' in dataset.columns:
    # One binary label per letter position of the four-letter type string:
    # 1 when the letter matches, 0 otherwise.
    dataset["E"] = dataset['type'].apply(lambda x: 1 if x[0] == 'E' else 0)
    dataset["O"] = dataset['type'].apply(lambda x: 1 if x[1] == 'N' else 0)
    dataset["A"] = dataset['type'].apply(lambda x: 1 if x[2] == 'F' else 0)
    dataset["C"] = dataset['type'].apply(lambda x: 1 if x[3] == 'J' else 0)

    dataset = dataset.drop(['type'], axis=1)

    #save
    save_dataset(dataset, "../../corpus/emotion-aware-personality", "distill-roberta-mbti")

dataset.head()

/Users/jingjietan/Desktop/PRaware/model_aware/emotion


Unnamed: 0,filename,posts,anger,disgust,fear,joy,neutral,sadness,surprise,E,O,A,C
0,0,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0.008167,0.001831,0.019847,0.056881,0.194236,0.636635,0.082404,0,1,1,1
1,1,'I'm finding the lack of me in these posts ver...,0.010766,0.006015,0.904504,0.005772,0.034155,0.008474,0.030315,1,1,0,0
2,2,'Good one _____ https://www.youtube.com/wat...,0.042915,0.024796,0.769279,0.011808,0.05141,0.042462,0.05733,0,1,0,0
3,3,"'Dear INTP, I enjoyed our conversation the o...",0.022988,0.010755,0.02005,0.212752,0.178759,0.437566,0.117131,0,1,0,1
4,4,'You're fired.|||That's another silly misconce...,0.37068,0.105621,0.037517,0.012559,0.342921,0.057501,0.073202,1,1,0,1


# sentiment

In [4]:
# Load the IMDB reviews and encode the sentiment labels as integers
# ('positive' -> 1, 'negative' -> 0); other values are left as they are.
imdb = pd.read_csv('../../corpus/imdb.csv')
imdb['sentiment'] = (
    imdb['sentiment']
    .replace('positive', 1)
    .replace('negative', 0)
)

# Score each review, append the emotion columns, and store the result.
sentence_list = imdb['review'].tolist()
emotion_df_list = convert_emotion(sentence_list)
imdb_ready = pd.concat([imdb, emotion_df_list], axis="columns")
save_dataset(
    imdb_ready,
    "../../corpus/emotion-aware-sentiment",
    "distill-roberta-imdb",
)

100%|██████████| 50000/50000 [48:14<00:00, 17.28it/s] 


In [12]:
# Score each movie review, append the emotion columns, and store the result.
moviereview = pd.read_csv('../../corpus/movie-review.csv')
sentence_list = moviereview['content'].tolist()
emotion_df_list = convert_emotion(sentence_list)
moviereview_ready = pd.concat(
    [moviereview, emotion_df_list],
    axis="columns",
)
save_dataset(
    moviereview_ready,
    "../../corpus/emotion-aware-sentiment",
    "distill-roberta-moviereview",
)

100%|██████████| 2000/2000 [04:48<00:00,  6.94it/s]


# depression

In [15]:
sdcnl = pd.read_csv('../../corpus/sdcnl.csv')

# Merge the title and selftext columns into a single text column.
# NOTE(review): astype(str) turns a missing value into the literal string
# "nan" -- confirm whether empty titles/bodies occur and should be blanked.
sdcnl['text'] = sdcnl['title'].astype(str) + " | " + sdcnl['selftext'].astype(str)

# Keep only the text and is_suicide columns.
sdcnl = sdcnl[['text', 'is_suicide']]

# Score each text, append the emotion columns, and store the result.
sentence_list = sdcnl['text'].tolist()
emotion_df_list = convert_emotion(sentence_list)
sdcnl_ready = pd.concat([sdcnl, emotion_df_list], axis=1)
save_dataset(sdcnl_ready, "../../corpus/emotion-aware-depression", "distill-roberta-sdcnl")
# Preview goes last: a bare expression only renders when it ends the cell.
sdcnl_ready.head()

100%|██████████| 1895/1895 [02:40<00:00, 11.80it/s]


In [16]:
# Score each tweet, append the emotion columns, and store the result.
twitter = pd.read_csv('../../corpus/mental-health-twitter.csv')
# Keep only the text and its label.
twitter = twitter[["post_text","label"]]
sentence_list = twitter['post_text'].tolist()
emotion_df_list = convert_emotion(sentence_list)
twitter_ready = pd.concat([twitter, emotion_df_list], axis=1)
save_dataset(twitter_ready, "../../corpus/emotion-aware-depression", "distill-roberta-twitter")
# Preview goes last: a bare expression only renders when it ends the cell.
twitter_ready.head()

100%|██████████| 20000/20000 [11:33<00:00, 28.85it/s]
