In [9]:
# load pandas pickle file
import pandas as pd
import numpy as np
import pickle
import os
def extract(path):
    print(os.getcwd())
    file = open(path, 'rb')
    object_file = pickle.load(file)
    file.close()
    return object_file

def save_dataset(item, dir, name):
    if not os.path.exists(dir):
        os.makedirs(dir)
    path = dir+"/"+name+".pickle"
    pickle.dump(item, open(path, 'wb'))


In [5]:
from transformers import pipeline
from tqdm import tqdm

classifier = pipeline("text-classification", model="JanSt/albert-base-v2_mbti-classification",top_k=None, max_length=512)

def convert_list_to_dict(list_of_dicts):
    result_dict = {}
    for item in list_of_dicts:
        label = item.get("label")
        score = item.get("score")
        if label is not None and score is not None:
            result_dict[label] = score
    return result_dict

def convert_mbti_list(sentences):
    # Assuming classifier and convert_list_to_dict are defined elsewhere

    # Initialize an empty list to store individual DataFrames
    dfs = []

    # Process each sentence in the list
    for sentence in tqdm(sentences):
        emo = classifier(sentence)  # Assuming classifier returns emotions for a single sentence
        out = convert_list_to_dict(emo[0])  # Assuming convert_list_to_dict processes the emotion list
        df = pd.DataFrame(out, index=[0])
        dfs.append(df)

    # Concatenate the list of DataFrames into a single DataFrame
    result_df = pd.concat(dfs, ignore_index=True)

    max_column = result_df.idxmax(axis=1)

    # Add a new column to the DataFrame with the column name of the maximum value
    result_df['highest'] = max_column

    return result_df




print(convert_mbti_list(["I am happy","he"]))

  0%|          | 0/2 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 2/2 [00:00<00:00, 15.29it/s]

       ENFP      ENTJ      INFJ      ENFJ      ISFJ      ENTP      ISFP  \
0  0.291670  0.239024  0.199571  0.068912  0.060027  0.037044  0.031224   
1  0.008041  0.059717  0.018833  0.002206  0.012591  0.413074  0.015419   

       ESFJ      INFP      ESTJ      ESFP      ISTJ      INTJ      ESTP  \
0  0.022395  0.015866  0.015733  0.014663  0.001525  0.001282  0.000503   
1  0.001411  0.005133  0.264850  0.000103  0.000252  0.124362  0.022159   

       ISTP      INTP highest  
0  0.000377  0.000184    ENFP  
1  0.018450  0.033400    ENTP  





# sentiment


In [6]:
imdb = pd.read_csv('../../corpus/imdb.csv')
imdb['sentiment'] = imdb['sentiment'].replace('positive', 1)
imdb['sentiment'] = imdb['sentiment'].replace('negative', 0)
sentence_list = imdb['review'].tolist()
mbti_df_list = convert_mbti_list(sentence_list)
imdb_ready = pd.concat([imdb, mbti_df_list], axis=1)
imdb_ready.head()
save_dataset(imdb_ready, "../../corpus/personality-aware-sentiment", "albert-base-list-imdb")

100%|██████████| 50000/50000 [2:23:11<00:00,  5.82it/s]  


In [7]:
imdb_highest_list_ready = pd.concat([imdb['review'], mbti_df_list["highest"]], axis=1)
imdb_highest_list_ready['E'] = imdb_highest_list_ready['highest'].str.contains('E').astype(int)
imdb_highest_list_ready['F'] = imdb_highest_list_ready['highest'].str.contains('F').astype(int)
imdb_highest_list_ready['N'] = imdb_highest_list_ready['highest'].str.contains('N').astype(int)
imdb_highest_list_ready['J'] = imdb_highest_list_ready['highest'].str.contains('J').astype(int)
imdb_highest_list_ready.drop("highest", axis=1, inplace=True)
save_dataset(imdb_highest_list_ready, "../../corpus/personality-aware-sentiment", "albert-base-highest-imdb")

In [17]:
moviereview = pd.read_csv('../../corpus/movie-review.csv')
sentence_list = moviereview['content'].tolist()
mbti_df_list = convert_mbti_list(sentence_list)
moviereview_ready = pd.concat([moviereview, mbti_df_list], axis=1)
moviereview_ready.head()
save_dataset(moviereview_ready, "../../corpus/personality-aware-sentiment", "albert-base-list-moviereview")

100%|██████████| 2000/2000 [06:37<00:00,  5.04it/s]


In [4]:
moviereview_highest_list_ready = pd.concat([moviereview_ready['content'], mbti_df_list["highest"]], axis=1)
moviereview_highest_list_ready['E'] = moviereview_highest_list_ready['highest'].str.contains('E').astype(int)
moviereview_highest_list_ready['F'] = moviereview_highest_list_ready['highest'].str.contains('F').astype(int)
moviereview_highest_list_ready['N'] = moviereview_highest_list_ready['highest'].str.contains('N').astype(int)
moviereview_highest_list_ready['J'] = moviereview_highest_list_ready['highest'].str.contains('J').astype(int)
moviereview_highest_list_ready.drop("highest", axis=1, inplace=True)
save_dataset(moviereview_highest_list_ready, "../../corpus/personality-aware-sentiment", "albert-base-highest-moviereview")

In [3]:
import pickle
moviereview_highest_list_ready  = pickle.load(open("../../corpus/personality-aware-sentiment/albert-base-list-moviereview.pickle", "rb"))
moviereview_highest_list_ready = moviereview_highest_list_ready[['content', 'highest', 'label']]
save_dataset(moviereview_highest_list_ready, "../../corpus/personality-aware-sentiment", "albert-base-highest-moviereview")

# depression

In [21]:
sdcnl = pd.read_csv('../../corpus/sdcnl.csv')

# merge title and selftext column together, both have string value
sdcnl['text'] = sdcnl['title'].astype(str) + " | " + sdcnl['selftext'].astype(str)

# drop all column except text and is_suicide column
sdcnl = sdcnl[['text', 'is_suicide']]

sdcnl.head()

sentence_list = sdcnl['text'].tolist()
mbti_df_list = convert_mbti_list(sentence_list)
sdcnl_ready = pd.concat([sdcnl, mbti_df_list], axis=1)
sdcnl_ready.head()
save_dataset(sdcnl_ready, "../../corpus/personality-aware-depression", "albert-base-list-sdcnl")

100%|██████████| 1895/1895 [02:49<00:00, 11.18it/s]


In [12]:
sdcnl_highest_list_ready = pd.concat([sdcnl_ready['text'], mbti_df_list["highest"]], axis=1)
sdcnl_highest_list_ready['E'] = sdcnl_highest_list_ready['highest'].str.contains('E').astype(int)
sdcnl_highest_list_ready['F'] = sdcnl_highest_list_ready['highest'].str.contains('F').astype(int)
sdcnl_highest_list_ready['N'] = sdcnl_highest_list_ready['highest'].str.contains('N').astype(int)
sdcnl_highest_list_ready['J'] = sdcnl_highest_list_ready['highest'].str.contains('J').astype(int)
sdcnl_highest_list_ready.drop("highest", axis=1, inplace=True)
save_dataset(sdcnl_highest_list_ready, "../../corpus/personality-aware-depression", "albert-base-highest-sdcnl")

In [11]:
import pickle
sdcnl_highest_list_ready = pickle.load(open("../../corpus/personality-aware-depression/albert-base-list-sdcnl.pickle", "rb"))
sdcnl_highest_list_ready = sdcnl_highest_list_ready[['text', 'highest', 'is_suicide']]
sdcnl_highest_list_ready.head()

Unnamed: 0,text,highest,is_suicide
0,Need help | Hi I don't really know how to phra...,ENFP,0
1,feeling so overwhelmed and hopeless | i have b...,INFP,1
2,"Nothing matters anymore, getting worse | Hi..I...",ESTJ,0
3,Who’s tired of hearing bullshit | The shit lik...,ENTJ,1
4,I wish I was someone else. | I wish I was pret...,ISFP,0


In [23]:
twitter = pd.read_csv('../../corpus/mental-health-twitter.csv')
twitter = twitter[["post_text","label"]]
sentence_list = twitter['post_text'].tolist()
mbti_df_list = convert_mbti_list(sentence_list)
twitter_ready = pd.concat([twitter, mbti_df_list], axis=1)
twitter_ready.head()
save_dataset(twitter_ready, "../../corpus/personality-aware-depression", "albert-base-list-twitter")

100%|██████████| 20000/20000 [13:39<00:00, 24.41it/s]


In [5]:
#twitter_highest_list_ready = pd.concat([twitter_ready['post_text'], mbti_df_list["highest"]], axis=1)
twitter_highest_list_ready['E'] = twitter_highest_list_ready['highest'].str.contains('E').astype(int)
twitter_highest_list_ready['F'] = twitter_highest_list_ready['highest'].str.contains('F').astype(int)
twitter_highest_list_ready['N'] = twitter_highest_list_ready['highest'].str.contains('N').astype(int)
twitter_highest_list_ready['J'] = twitter_highest_list_ready['highest'].str.contains('J').astype(int)
twitter_highest_list_ready.drop("highest", axis=1, inplace=True)
save_dataset(twitter_highest_list_ready, "../../corpus/personality-aware-depression", "albert-base-highest-twitter")