In [2]:
# load pandas pickle file
import pandas as pd
import numpy as np
import pickle
import os
def extract(path):
    print(os.getcwd())
    file = open(path, 'rb')
    object_file = pickle.load(file)
    file.close()
    return object_file

def save_dataset(item, dir, name):
    if not os.path.exists(dir):
        os.makedirs(dir)
    path = dir+"/"+name+".pickle"
    pickle.dump(item, open(path, 'wb'))


In [24]:
from transformers import pipeline
from tqdm import tqdm

classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None, max_length=512)

def convert_list_to_dict(list_of_dicts):
    result_dict = {}
    for item in list_of_dicts:
        label = item.get("label")
        score = item.get("score")
        if label is not None and score is not None:
            result_dict[label] = score
    return result_dict

def convert_emotion(sentences):
    # Assuming classifier and convert_list_to_dict are defined elsewhere

    # Initialize an empty list to store individual DataFrames
    dfs = []

    # Process each sentence in the list
    for sentence in tqdm(sentences):
        emo = classifier(sentence)  # Assuming classifier returns emotions for a single sentence
        out = convert_list_to_dict(emo[0])  # Assuming convert_list_to_dict processes the emotion list
        df = pd.DataFrame(out, index=[0])
        dfs.append(df)

    # Concatenate the list of DataFrames into a single DataFrame
    result_df = pd.concat(dfs, ignore_index=True)

    return result_df


print(convert_emotion(["I am happy","he"]))

  0%|          | 0/2 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 2/2 [00:00<00:00, 23.90it/s]

        joy   neutral  approval    relief  admiration  excitement  gratitude  \
0  0.895246  0.035179  0.028167  0.026642    0.020664    0.016066   0.015526   
1  0.002303  0.963529  0.014937  0.000378    0.003578    0.002336   0.001188   

     caring  amusement      love  ...    desire  disappointment  curiosity  \
0  0.014913   0.013467  0.012827  ...  0.002370        0.002130   0.001743   
1  0.001146   0.001991  0.001160  ...  0.000982        0.002786   0.001232   

   nervousness  surprise   remorse     grief  embarrassment      fear  \
0     0.001603  0.001498  0.001199  0.001178       0.000986  0.000782   
1     0.000328  0.000820  0.000313  0.000402       0.000548  0.001327   

    disgust  
0  0.000671  
1  0.001774  

[2 rows x 28 columns]





# personality

In [28]:

mbti = pd.read_csv('../../corpus/mbti.csv')
sentence_list = mbti['posts'].tolist()
emotion_df_list = convert_emotion(sentence_list)
mbti_ready = pd.concat([mbti, emotion_df_list], axis=1)
mbti_ready.head()
save_dataset(mbti_ready, "../../corpus/emotion-aware-personality", "roberta-base-mbti")

100%|██████████| 8675/8675 [49:00<00:00,  2.95it/s] 


In [None]:
from sklearn.model_selection import train_test_split
dataset = extract("../../corpus/emotion-aware-personality/roberta-base-mbti.pickle")


dataset["E"] = dataset['type'].apply(lambda x: 1 if x[0] == 'E' else 0)
dataset["O"] = dataset['type'].apply(lambda x: 1 if x[1] == 'N' else 0)
dataset["A"] = dataset['type'].apply(lambda x: 1 if x[2] == 'F' else 0)
dataset["C"] = dataset['type'].apply(lambda x: 1 if x[3] == 'J' else 0)

dataset = dataset.drop(['type'], axis=1)

#save
save_dataset(dataset, "../../corpus/emotion-aware-personality", "roberta-base-mbti")
dataset.head()

# sentiment

In [14]:
imdb = pd.read_csv('../../corpus/imdb.csv')
imdb['sentiment'] = imdb['sentiment'].replace('positive', 1)
imdb['sentiment'] = imdb['sentiment'].replace('negative', 0)
sentence_list = imdb['review'].tolist()
emotion_df_list = convert_emotion(sentence_list)
imdb_ready = pd.concat([imdb, emotion_df_list], axis=1)
save_dataset(imdb_ready, "../../corpus/emotion-aware-sentiment", "roberta-base-imdb.pickle")

100%|██████████| 50000/50000 [2:39:10<00:00,  5.24it/s]  


In [15]:
moviereview = pd.read_csv('../../corpus/movie-review.csv')
sentence_list = moviereview['content'].tolist()
emotion_df_list = convert_emotion(sentence_list)
moviereview_ready = pd.concat([moviereview, emotion_df_list], axis=1)
save_dataset(moviereview_ready, "../../corpus/emotion-aware-sentiment", "roberta-base-moviereview")

100%|██████████| 2000/2000 [08:26<00:00,  3.95it/s]


# depression

In [8]:
sdcnl = pd.read_csv('../../corpus/sdcnl.csv')

# merge title and selftext column together, both have string value
sdcnl['text'] = sdcnl['title'].astype(str) + " | " + sdcnl['selftext'].astype(str)

# drop all column except text and is_suicide column
sdcnl = sdcnl[['text', 'is_suicide']]

sdcnl.head()

sentence_list = sdcnl['text'].tolist()
emotion_df_list = convert_emotion(sentence_list)
sdcnl_ready = pd.concat([sdcnl, emotion_df_list], axis=1)
sdcnl_ready.head()
save_dataset(sdcnl_ready, "../../corpus/emotion-aware-depression", "roberta-base-sdcnl")




100%|██████████| 1895/1895 [05:30<00:00,  5.73it/s]


In [13]:
twitter = pd.read_csv('../../corpus/mental-health-twitter.csv')
twitter = twitter[["post_text","label"]]
sentence_list = twitter['post_text'].tolist()
emotion_df_list = convert_emotion(sentence_list)
twitter_ready = pd.concat([twitter, emotion_df_list], axis=1)
twitter_ready.head()
save_dataset(twitter_ready, "../../corpus/emotion-aware-depression", "roberta-base-twitter")

100%|██████████| 20000/20000 [22:53<00:00, 14.56it/s]
