In [2]:
import pandas as pd

## load data
- only preserve stories whose length is between 500 and 800

In [23]:
df_essays = pd.read_csv("./essays2007.csv")

df_essays['essay_length'] = df_essays.text.apply(lambda x: len(x.split()))
df_essays['personality_type'] = df_essays.cAGR.astype(str) + df_essays.cCON.astype(str) + df_essays.cEXT.astype(str) + df_essays.cOPN.astype(str) + df_essays.cNEU.astype(str)
print(df_essays.shape)

# df_essays.groupby('personality_type').count()

(2468, 90)


## story detection + lexicon detection
- pick essays predicted as `personal story` by LLMs (`run_story_detection.py`)
- pick essays whose length is between 700 and 800
- pick essays without explicit mentions of personality traits

In [24]:
import json
import pandas as pd
import os

def read_json_predictions(folder_name):
    "read json files into a csv file"

    all_rows = []
    for filename in range(2468):
        file_path = os.path.join(folder_name, "{}.json".format(filename))
        with open(file_path) as f:
            json_obj = json.load(f)
            file_id, prediction = json_obj['file_name'], json_obj['annotation'].lower()
            if prediction == "yes":
                all_rows.append([file_id, True])
            else:
                all_rows.append([file_id, False])
    
    return pd.DataFrame(all_rows, columns=['Filename', 'has_story'])                                

output_folder = "../outputs/gpt-3.5-turbo-0613/temp0.0/essays2007"
df_story_pred = read_json_predictions(output_folder)
df_story_pred.head()

Unnamed: 0,Filename,has_story
0,0,True
1,1,True
2,2,True
3,3,True
4,4,True


In [28]:
df_essays_stories = df_essays.merge(df_story_pred, on="Filename")
print(df_essays_stories.shape)
df_essays_stories = df_essays_stories[df_essays_stories.has_story]
print(df_essays_stories.shape)

print("Check personality type == 32")
print(len(df_essays_stories.personality_type.unique()))

(2468, 91)
(1840, 91)
Check personality type == 32
32


In [29]:
# sampling article
def contains_personality_lexicon(text):
    for lex in ["extrover,", "introver", "agreeabl", "antagonis", "conscientious", "unconscientious", "neuroti", "emotionally stabl", "open to experience", "closed to experience"]:
        if lex in text:
            return True
    return False


print(df_essays_stories.shape)
df_essays_stories = df_essays_stories[(df_essays_stories.essay_length > 700) & (df_essays_stories.essay_length < 800)]
print(df_essays_stories.shape)
df_essays_stories['has_lexicon'] = df_essays_stories.text.apply(contains_personality_lexicon)
df_essays_stories = df_essays_stories[~df_essays_stories.has_lexicon]
print(df_essays_stories.shape)
print(len(df_essays_stories.personality_type.unique()))


(1840, 91)
(208, 91)
(205, 92)
32


In [30]:
df_list = []
for personality_type in df_essays_stories.personality_type.unique():
    df_ = df_essays_stories[df_essays_stories.personality_type == personality_type]
    df_list.append(df_.sample(1))

df_sample = pd.concat(df_list)
df_sample.to_csv("essays2007_32_sample.csv", index=False)
df_sample.shape


(32, 92)

In [31]:
df_sample

Unnamed: 0,cAGR,cCON,cEXT,cOPN,cNEU,text,Filename,Segment,WC,WPS,...,Exclam,Dash,Quote,Apostro,Parenth,OtherP,essay_length,personality_type,has_story,has_lexicon
1449,1,1,1,1,0,"Well, today was a bad day. Not bad in the sens...",1449,1,734,9.29,...,0.0,0.0,1.09,4.36,0.41,0.14,735,11110,True,False
152,0,0,0,0,0,I finally got into this thing. that took forev...,152,1,734,15.29,...,0.0,0.14,0.0,2.45,0.0,0.0,734,0,True,False
2228,0,1,1,1,1,"It's only the second week of school, actually ...",2228,1,786,18.28,...,0.0,0.0,0.0,5.73,0.0,0.0,786,1111,True,False
1470,0,0,1,1,0,I'm really use to writing for long periods of ...,1470,1,762,12.1,...,0.0,0.0,1.05,4.86,0.26,0.0,760,110,True,False
552,1,0,0,0,1,I know that I have assignments to take care of...,552,1,789,16.1,...,0.0,0.0,0.0,0.51,0.0,0.0,792,10001,True,False
1022,1,0,0,1,0,"man, I feel really weird right now. I don't kn...",1022,1,753,8.86,...,0.0,0.13,0.27,3.19,0.0,0.0,754,10010,True,False
59,0,1,0,0,1,"September 0, 0000 I am not too sure what to ...",59,1,731,18.74,...,0.14,0.14,0.27,2.46,1.09,0.0,732,1001,True,False
754,1,0,1,1,1,I have way too much to do this week I am never...,754,1,802,10.84,...,0.0,1.12,0.25,3.87,0.0,0.0,799,10111,True,False
918,1,0,1,0,1,Well here I am sitting at the computer typing ...,918,1,776,43.11,...,0.0,1.03,0.0,2.96,0.0,0.0,776,10101,True,False
1113,0,0,0,0,1,I get sick to my stomach during your class and...,1113,1,708,12.21,...,0.0,0.0,1.69,3.53,0.28,0.14,711,1,True,False
