<a href="https://colab.research.google.com/github/joanna-jaeeun/Emotion-Diary-Analysis-NLP-LLM/blob/main/Analysis_of_Student_Emotion_Diaries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP and LLM-based Analysis of Student Emotion Diaries

## Load packages

In [22]:
# import moduels
import numpy as np
import pandas as pd
import re
import csv
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
import json
import hashlib
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from openai import OpenAI
from neo4j_graphrag.llm import OpenAILLM
import neo4j
from neo4j_graphrag.generation.prompts import ERExtractionTemplate
from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter
from neo4j_graphrag.experimental.components.types import Neo4jGraph

import torch
os.environ["WANDB_DISABLED"] = "true"
from transformers import AutoTokenizer, AutoModelForAudioClassification, TrainingArguments, Trainer, AutoModelForSequenceClassification
from sklearn.metrics import precision_recall_fscore_support

plt.rcParams['font.family'] = 'NanumGothic'

## Mount Google Drive


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data Preprocessing

### Digitalization

In [None]:
# Converted handwritten diary into .txt file, using Google AI OCR techniques
import os

GOOGLE_DRIVE_PATH_MODELS = '/content/drive/MyDrive/Colab Notebooks/LLM_Project'
print(os.listdir(GOOGLE_DRIVE_PATH_MODELS))

### Constructing Dataframe

In [35]:
# Open text data
file_path = '/content/drive/MyDrive/Colab Notebooks/LLM_Project/raw_data.txt'
with open(file_path, 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Devide text by name
blocks = re.split(r'\n(?=[가-힣]{2,4}\n일기 \d+\n)',raw_text) #re.split(pattern, string)

rows = []
word = []

for block in blocks :
    word.append(str.split(block, '\n'))

# Create Dataframe
raw_data = pd.DataFrame(columns=['Name', 'Number', 'Date', 'Weather', 'Emotion', 'Diary title' ,'Diary'])

# Extract componets from each dairy entry and insert them into dataframe(raw_data)
def extract_value(text, prefix):
    if text.startswith(prefix):
        return text[len(prefix):].strip()
    return ''


for row in word:

    name = row[0]
    number = row[1]


    date = ''
    weather = ''
    emotion = ''
    title = ''
    diary_text = ''

    for text in row:
        if text.startswith('날짜:'):
            date = extract_value(text, '날짜:')
        elif text.startswith('날씨:'):
            weather = extract_value(text, '감정:')
        elif text.startswith('감정:'):
            emotion = extract_value(text, '감정:')
        elif text.startswith('제목:'):
            title = extract_value(text, '제목:')
        elif text.startswith('내용:'):
            diary_text = extract_value(text, '내용:')

    raw_data.loc[len(raw_data)] = [name, number, date, weather, emotion, title, diary_text]

### Data Privacy
- hashlib

In [11]:
## To protect data privacy, students' names have been replaced with nicknames.

# Students real name list
students_real_names = list(raw_data['Name'].unique())

# Get full name and surname
name_variants = []
for name in students_real_names:
    if len(name) > 1:
        # Full name
        name_variants.append(name)
        # First name (My students’ first names start with the initial letter of their surname.)
        name_variants.append(name[1:])
    else:
        name_variants.append(name)

#Generate nicknames using hashlib
def hash_string(input_str):
    return hashlib.sha256(input_str.encode()).hexdigest()[:7]

nicknames = [hash_string(students_real_names[i]) for i in range(len(students_real_names))]

# name - nick name mapping
nickname_map = {}
for real_name, nick in zip(students_real_names, nicknames):
    nickname_map[real_name] = nick
    if len(real_name) > 1:
        nickname_map[real_name[1:]] = nick

# nickname_map eg : {real name : nickname}


# The 'Name' and 'Diary' columns contain real names.
# Transform all real names into nicknames for privacy.

df_privacy = raw_data.copy()

# Define function
def replace_names(text):
    if pd.isna(text):
        return text
    # change serveral names using regular expressions

    # mapping dictionary
    adjusted_map = {
        ('**' if k == '*' else k): v for k, v in nickname_map.items()  #One student has one letter of lastname, so it counfused
    }

    # regular expression
    pattern = re.compile("|".join(re.escape(k) for k in adjusted_map.keys()))

    # change function
    return pattern.sub(lambda x: adjusted_map[x.group()], text)


# Diary columns
df_privacy['Diary'] = df_privacy['Diary'].apply(replace_names)

# Name coulums
df_privacy['Name'] = df_privacy['Name'].map(nickname_map).fillna(df_privacy['Name'])

In [90]:
df_privacy[11:15]

Unnamed: 0,Name,Number,Date,Weather,Emotion,Diary title,Diary
11,7.53e+218,일기 1,,,"행복하다, 기쁘다",그림,오늘 미술시간에 그림을 그리는데 처음 스케치할 때는 친구들이 별로 말을 안 했는데 ...
12,7.53e+218,일기 2,,,"즐겁다, 답답하다(다음에 또 하고 싶다)",오목,오늘은 오목을 했다. 8a2b9ef강 나랑 먼저 하고 다음은 91ab0aa이랑 나랑...
13,7.53e+218,일기 3,2025-07-07,,"속상하다, 화나다, 억울하다, 기분이 나쁘다, 화가 부글부글한다",생존수영,오늘은 생존수영을 배웠다. 근데 6ee5fe2이가 벽에 안 붙어서 두 번 말했는데 ...
14,7.53e+218,일기 4,2025-07-08,,"무섭다, 울고 싶다, 긴장하다",놀람,오늘은 새우등 뜨기랑 해파리 뜨기랑 페트병 들고 뜨기를 했는데 너무 무서워서 훌쩍이...


### Data Cleaning

In [None]:
diary = df_privacy.copy()
# Remove rows with missing emotion values by index
diary.drop(index=[8, 205, 210], inplace=True)


# For some rows with missing emotions, infer emotion from the first word of 'Diary title'
for idx in diary.iloc[190:200].index:
    diary.loc[idx, 'Emotion'] = diary.loc[idx, "Diary title"].split(" ")[0]

# diary.loc[190:199, 'Emotion'] = diary.loc[190:199, 'Diary title'].apply(lambda x: x.strip()[0])

# Reset the index to remove gaps and have consecutive numbering
diary.reset_index(drop=True, inplace=True)

In [89]:
diary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257 entries, 0 to 256
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         257 non-null    object
 1   Number       257 non-null    object
 2   Date         257 non-null    object
 3   Weather      257 non-null    object
 4   Emotion      257 non-null    object
 5   Diary title  257 non-null    object
 6   Diary        257 non-null    object
dtypes: object(7)
memory usage: 14.2+ KB


## Data analysis

In [24]:
# Peer relationships are one of the most influential factors affecting students' emotions in school life.

student_names = list(diary.Name.unique())
student_names

# Select only diary entries that contain students' names to maximize analysis efficiency
pattern = '|'.join(map(re.escape, student_names))
diary2 = diary[diary['Diary'].str.contains(pattern, na= False)]
# Only 38 diary entries left
diary2

# Change text format for Neo4j input
text_list = []
for idx, row in diary2.iterrows() :
    text_list.append(row['Name'] + " : " + row['Diary'])
text_list[10]

'0fa9737 : 오늘은 4교시에 체육을 했다. 피구팀을 정했는데 8a2b9ef팀이 안돼서 아쉬웠지만 즐거웠다.'

In [50]:
# .env file
load_dotenv(dotenv_path="/content/drive/MyDrive/Colab Notebooks/LLM_Project/.env")

# API key
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [None]:
prompt_template = ERExtractionTemplate()
llm = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})

graph_json = ''
graph_history = ''
for user_input in text_list + ['finish']:
    if(user_input == 'finish'):
        with neo4j.GraphDatabase.driver("bolt://44.199.251.177:7687", auth=("neo4j", 'tune-disk-rain')) as driver:
            writer = Neo4jWriter(driver)
            graph = Neo4jGraph(
                nodes= graph_json['nodes'],
                relationships= graph_json['relationships']
            )
            await writer.run(graph)
            break

    prompt = prompt_template.format(
        schema = """
        '0f368b7', '71db26b', '3a693a0', 'dac2109', 'dc88476', 'f14a686', '2aa8348', '7685386', '826175f', '1668616', '2040613', 'e0dabba',
       '58931c4', '6a4eec6', 'd43e7e4', '23de8b0', '2926bfc', '45156c0', '8fcaf23', '9909af3', '9642e41', 'ec4a490', 'a2424b0', 'd9a5b82', '6c632a1'
        These are student name
        All 25 student nodes must always appear """,
        text = user_input
            + """(Continue extracting the graph for the following Input text.
                Ensure you retain the existing nodes and relationships from the graph history
                and add only new nodes and relationships.\n
                Graph History : """ + graph_history + ")",
        examples = ''
    )

    print(prompt)
    response = llm.invoke(prompt) # Extrach graph
    print(response.content)
    graph_history = response.content

    # Sometimes the response includes ```json```, so we split based on {}
    graph_json = json.loads(graph_history[graph_history.find("{"):graph_history.rfind("}")+1])

In [None]:
# Check the outcome at Neo4j Sandbox

## Modeling

### Psedo-Labeling

In [25]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="sangrimlee/bert-base-multilingual-cased-nsmc", framework='pt')

# Manually correct the mislabeled data
# negative_words dictionary
negative_words = ["슬프", "아쉽", "아쉬", "화나다", "울고", "무섭", "긴장", "귀찮", "떨리다", "아프", "외롭"]
# positive_words dictionary
positive_words = ["상상", "궁금", "신난", "뿌듯", "흐뭇", "즐겁", "신기", "소망", "감사"]
nutural_words = ["그저", "그렇다"]

# Get the scores for each emotion words
def get_score_morphs(df, tag_colname):
    all_scores = []

    for i in range(len(df)):
        sentence = df[tag_colname][i]
        score = []
        # To get the scores for each emotion word, split the text using commas
        for part in sentence.split(","):
            part_clean = part.strip()
            result = classifier(part_clean)[0]
            # If the word is in the emotion dictionary(negative), treat it as negative.
            if any(neg_word in part_clean for neg_word in negative_words):
                adjusted_score = round(-abs(result['score']), 3)

            # If the word is in the emotion dictionary(negative), treat it as negative.
            elif any(pos_word in part_clean for pos_word in positive_words):
                adjusted_score = round(abs(result['score']), 3)
            else:
                # normal
                if result['label'].lower() == 'negative':
                    adjusted_score = round(-result['score'], 3)
                else:
                    adjusted_score = round(result['score'], 3)
            score.append(adjusted_score)
        all_scores.append(score)
    return all_scores

scores = get_score_morphs(diary, "Emotion")
scores[:10]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/932 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


[[0.855, 0.963],
 [0.855, 0.963],
 [-0.652, -0.614],
 [0.973, 0.96, 0.855, 0.963, 0.844],
 [-0.599, -0.912],
 [0.977, 0.96, 0.963],
 [0.973, 0.96, 0.855, 0.963],
 [0.977, 0.96, 0.855, 0.963],
 [-0.599],
 [0.963]]

In [None]:
# labeling

diary_label = diary.copy()
def label_from_scores(all_scores):
    for i in range(len(all_scores)):
        all_positive = all(n > 0 for n in all_scores[i])
        all_negative = all(n < 0 for n in all_scores[i])

        # If all words are positive, label as 1
        if all_positive :
            label = 1
        # If all words are negative, label as 0
        elif all_negative :
            label = 0
        # If positive and negative are mixed, choose the one with the higher absolute value
        else :
            max_abs_num = max(all_scores[i], key = abs)
            lable =  1 if max_abs_num > 0 else 0

        diary_label.loc[i, 'label'] = label
    return diary_label

diary_label = label_from_scores(scores)


In [87]:
diary_label[11:15]

Unnamed: 0,Name,Number,Date,Weather,Emotion,Diary title,Diary,label
11,7.53e+218,일기 2,,,"즐겁다, 답답하다(다음에 또 하고 싶다)",오목,오늘은 오목을 했다. 8a2b9ef강 나랑 먼저 하고 다음은 91ab0aa이랑 나랑...,1.0
12,7.53e+218,일기 3,2025-07-07,,"속상하다, 화나다, 억울하다, 기분이 나쁘다, 화가 부글부글한다",생존수영,오늘은 생존수영을 배웠다. 근데 6ee5fe2이가 벽에 안 붙어서 두 번 말했는데 ...,0.0
13,7.53e+218,일기 4,2025-07-08,,"무섭다, 울고 싶다, 긴장하다",놀람,오늘은 새우등 뜨기랑 해파리 뜨기랑 페트병 들고 뜨기를 했는데 너무 무서워서 훌쩍이...,0.0
14,7.53e+218,일기 5,2025-07-09,,"행복하다, 설레다, 재미있다",생존수영,"오늘은 구명조끼 배우기를 했다. 단계는 1, 2, 3, 4, 5, 6단계가 있었다....",1.0


### Sentiment Anaysis - BERT (Pretrained model VS Fine-tuned)

In [None]:
# # Check the ratio
# result_label['label'].value_counts()

In [28]:
train_data = diary_label[['Diary', 'label']].sample(frac = 0.8, random_state = 42)
test_data = diary_label[['Diary', 'label']].drop(train_data.index)

In [39]:
# Pretrained model

# Load model
model_name = "beomi/KcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Create pipeline(text classification)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Classification
results = []
for idx, row in test_data.iterrows() :
    result = classifier(row['Diary'])[0].get("label")
    results.append(result)

# # Correct the label format
test_data['prediction'] = results
test_data['pre_trained_label'] = test_data['prediction'].apply(lambda x : 0 if x == 'LABEL_0' else 1)
test_data.drop(['prediction'], axis = 1)

# Evaluation
y_true = test_data['label'] #real
y_pred = test_data['pre_trained_label'] #predict

acc = accuracy_score (y_true, y_pred)
f1 = f1_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

print(f"Accuracy: {acc}, F1: {f1}, Precision: {precision}, Recall: {recall}")

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Accuracy: 0.5882352941176471, F1: 0.6956521739130435, Precision: 0.6153846153846154, Recall: 0.8


In [40]:
# Fine tuning

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('device :', device)

model_name = "beomi/KcELECTRA-base"

# Tokenising
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_train_sentences = tokenizer(
    list(train_data['Diary']),
    return_tensors = "pt",
    max_length = 128,
    padding = True,
    truncation = True,
    add_special_tokens = True)

tokenized_test_sentences = tokenizer(
    list(test_data['Diary']),
    return_tensors = "pt",
    max_length = 128,
    padding = True,
    truncation = True,
    add_special_tokens = True)

# Load Dataset
class CurseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


    def __len__(self):
        return len(self.labels)

train_label= train_data["label"].values
test_label=test_data["label"].values

train_dataset = CurseDataset(tokenized_train_sentences, train_label)
test_dataset = CurseDataset(tokenized_test_sentences, test_label)

# Training
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2)
model.to(device)

training_args = TrainingArguments(
    output_dir = './',
    num_train_epochs= 3,
    per_device_train_batch_size= 16,
    per_device_eval_batch_size= 64,
    logging_dir = '.logs',
    logging_steps = 5,
    save_total_limit = 2,
    report_to=["none"]
)

# Evaluation
def compute_metrics(pred) :
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _  = precision_recall_fscore_support(labels, preds, average = 'binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy_score' : acc,
        'f1': f1,
        'precision' : precision,
        'recall' : recall
    }

# Trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=train_dataset,
    eval_dataset = test_dataset,
    compute_metrics = compute_metrics
)

device : cpu


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
trainer.train()

Step,Training Loss
5,0.6766
10,0.5945
15,0.5746
20,0.6407
25,0.6085
30,0.5625
35,0.5939


TrainOutput(global_step=39, training_loss=0.5957284860121899, metrics={'train_runtime': 704.2759, 'train_samples_per_second': 0.877, 'train_steps_per_second': 0.055, 'total_flos': 24136328219040.0, 'train_loss': 0.5957284860121899, 'epoch': 3.0})

In [42]:
trainer.evaluate(eval_dataset = test_dataset)

{'eval_loss': 0.6503643989562988,
 'eval_accuracy_score': 0.5882352941176471,
 'eval_f1': 0.7407407407407407,
 'eval_precision': 0.5882352941176471,
 'eval_recall': 1.0,
 'eval_runtime': 15.6394,
 'eval_samples_per_second': 3.261,
 'eval_steps_per_second': 0.064,
 'epoch': 3.0}

## Generating products (Prompt engineering)
- using GPT-3.5-turbo

In [73]:
diary_LLM = diary_label.copy()

# Sentiment analysis and finding cause of the emotion

def analyze_emotion_and_cause(text):
    # 1. Sentiment analysis
    emotion_response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content":
             "Detect one or more emotions present in the input text and provide a score (e.g., from 0 to 1) indicating the intensity or confidence level for each detected emotion. "
             "Choose only from the following emotions: neutral, happy, joy, sadness, anger, worry. "
             "Example output format: {\"happy\": 0.8 , \"worry\": 0.6}"},
            {"role": "user", "content": text}
        ]
    )
    emotion_text = emotion_response.choices[0].message.content.strip()

    try:
        # Parsing with dictionary
        emotion_dict = json.loads(emotion_text)
    except:
        print("Emotion parsing error:", emotion_text)
        return {}, {}

    # 2. Finding cause of the emotion
    cause_prompt = f"""
    Given the following sentence and detected emotions, explain the cause for each emotion.
    Sentence: "{text}"
    Detected Emotions: {emotion_dict}

    Format your answer as a JSON dictionary mapping each emotion to its cause. Don't forget comma in JSON dictionary
    Example:

   {{"happy": "being able to play with Student14", "worry": "not being able to play with Student14"}}
    Answer in English only.
    """
    cause_response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": cause_prompt}
        ]
    )
    cause_text = cause_response.choices[0].message.content.strip()

    try:
        cause_dict = json.loads(cause_text)
    except:
        print("Cause parsing error:", cause_text)
        return emotion_dict, {}

    return emotion_dict, cause_dict

In [62]:
# Example of generated answer
analyze_emotion_and_cause(diary_LLM['Diary'][13])

({'worry': 0.9, 'fear': 0.8, 'sadness': 0.7},
 {'worry': "The experience of carrying a shrimp spine and jellyfish while walking was so terrifying that the individual felt overwhelmed with fear and started crying. They also felt a tingling sensation as if their hands and feet were paralyzed, causing them to shake uncontrollably. The fear was so intense that they couldn't even bring themselves to stand up with the plastic bottle. Eventually, they managed to pick up the plastic bottle despite feeling scared and wanting to cry.",
  'fear': "The individual felt extreme fear while attempting to walk with the plastic bottle and ended up sitting outside due to the overwhelming feeling of fear. The fear was so strong that they couldn't control their emotions and wanted to cry."})

In [74]:
# Make 'emotion' and 'emotion_cause' columns in the dataframe
emotion_results = []
cause_results = []
#
for diary_text in diary_LLM['Diary']:
    try:
        emotion, cause = analyze_emotion_and_cause(diary_text)
    except Exception as e:
        emotion, cause = "error", str(e)

    emotion_results.append(emotion)
    cause_results.append(cause)

#
diary_LLM['emotion'] = emotion_results
diary_LLM['emotion_cause'] = cause_results

Cause parsing error: {
    "joy": "The cause of joy in the sentence is the experience of participating in a survival swimming lesson involving rescue practices. The person found it more enjoyable to be the rescuer than the one being rescued."
    "neutral": "The cause of neutrality in the sentence could be the overall description of the activities without any strong positive or negative emotions attached."
}
Cause parsing error: {
    "happy": "The speaker felt happy because they had a science class that involved performing experiments. Despite encountering difficult parts, they still found it enjoyable."
    "joy": "The speaker experienced joy from the overall experience of conducting experiments in their science class, even though it presented some challenges. They were able to derive satisfaction and pleasure from the activities."
}
Cause parsing error: {
    "happy": "The cause for feeling happy is that the person enjoyed doing physical education today. They mention that it was enj

In [85]:
diary_LLM[11:15]

Unnamed: 0,Name,Number,Date,Weather,Emotion,Diary title,Diary,label,emotion,emotion_cause
11,7.53e+218,일기 2,,,"즐겁다, 답답하다(다음에 또 하고 싶다)",오목,오늘은 오목을 했다. 8a2b9ef강 나랑 먼저 하고 다음은 91ab0aa이랑 나랑...,1.0,{'neutral': 1.0},{'neutral': 'The speaker is describing a serie...
12,7.53e+218,일기 3,2025-07-07,,"속상하다, 화나다, 억울하다, 기분이 나쁘다, 화가 부글부글한다",생존수영,오늘은 생존수영을 배웠다. 근데 6ee5fe2이가 벽에 안 붙어서 두 번 말했는데 ...,0.0,"{'anger': 0.8, 'sadness': 0.7, 'worry': 0.4}",{'anger': 'Feeling frustrated because the pers...
13,7.53e+218,일기 4,2025-07-08,,"무섭다, 울고 싶다, 긴장하다",놀람,오늘은 새우등 뜨기랑 해파리 뜨기랑 페트병 들고 뜨기를 했는데 너무 무서워서 훌쩍이...,0.0,"{'worry': 0.9, 'sadness': 0.7}",{'worry': 'The cause of worry in the sentence ...
14,7.53e+218,일기 5,2025-07-09,,"행복하다, 설레다, 재미있다",생존수영,"오늘은 구명조끼 배우기를 했다. 단계는 1, 2, 3, 4, 5, 6단계가 있었다....",1.0,"{'neutral': 0.9, 'joy': 0.2, 'worry': 0.4}",{'neutral': 'Learning how to wear a life jacke...


In [80]:
diary_LLM2 = diary_LLM.copy()

# Ready for adding emotion scores

columns = ['neutral', 'happy', 'joy', 'sadness', 'anger', 'worry']
zero = pd.DataFrame(0, index=range(257), columns=columns)

diary_LLM2 = pd.concat([diary_LLM2, zero],  axis = 1)


# Fill in the value if the corresponding key exists in the emotion dictionary for each emotion.
emotion_keys = ['happy', 'worry', 'joy', 'sadness', 'anger', 'neutral']

for key in emotion_keys:
    diary_LLM2[key] = diary_LLM2.apply(lambda row: row['emotion'].get(key) if pd.notnull(row['emotion']) and isinstance(row['emotion'], dict) else None, axis=1)

# Fillna
diary_LLM2[['happy', 'worry', 'joy', 'sadness', 'anger', 'neutral']] = \
    diary_LLM2[['happy', 'worry', 'joy', 'sadness', 'anger', 'neutral']].fillna(0)

In [84]:
diary_LLM2[11:15]

Unnamed: 0,Name,Number,Date,Weather,Emotion,Diary title,Diary,label,emotion,emotion_cause,neutral,happy,joy,sadness,anger,worry
11,7.53e+218,일기 2,,,"즐겁다, 답답하다(다음에 또 하고 싶다)",오목,오늘은 오목을 했다. 8a2b9ef강 나랑 먼저 하고 다음은 91ab0aa이랑 나랑...,1.0,{'neutral': 1.0},{'neutral': 'The speaker is describing a serie...,1.0,0.0,0.0,0.0,0.0,0.0
12,7.53e+218,일기 3,2025-07-07,,"속상하다, 화나다, 억울하다, 기분이 나쁘다, 화가 부글부글한다",생존수영,오늘은 생존수영을 배웠다. 근데 6ee5fe2이가 벽에 안 붙어서 두 번 말했는데 ...,0.0,"{'anger': 0.8, 'sadness': 0.7, 'worry': 0.4}",{'anger': 'Feeling frustrated because the pers...,0.0,0.0,0.0,0.7,0.8,0.4
13,7.53e+218,일기 4,2025-07-08,,"무섭다, 울고 싶다, 긴장하다",놀람,오늘은 새우등 뜨기랑 해파리 뜨기랑 페트병 들고 뜨기를 했는데 너무 무서워서 훌쩍이...,0.0,"{'worry': 0.9, 'sadness': 0.7}",{'worry': 'The cause of worry in the sentence ...,0.0,0.0,0.0,0.7,0.0,0.9
14,7.53e+218,일기 5,2025-07-09,,"행복하다, 설레다, 재미있다",생존수영,"오늘은 구명조끼 배우기를 했다. 단계는 1, 2, 3, 4, 5, 6단계가 있었다....",1.0,"{'neutral': 0.9, 'joy': 0.2, 'worry': 0.4}",{'neutral': 'Learning how to wear a life jacke...,0.9,0.0,0.2,0.0,0.0,0.4


In [91]:
diary_LLM2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257 entries, 0 to 256
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           257 non-null    object 
 1   Number         257 non-null    object 
 2   Date           257 non-null    object 
 3   Weather        257 non-null    object 
 4   Emotion        257 non-null    object 
 5   Diary title    257 non-null    object 
 6   Diary          257 non-null    object 
 7   label          257 non-null    float64
 8   emotion        257 non-null    object 
 9   emotion_cause  257 non-null    object 
 10  neutral        257 non-null    float64
 11  happy          257 non-null    float64
 12  joy            257 non-null    float64
 13  sadness        257 non-null    float64
 14  anger          257 non-null    float64
 15  worry          257 non-null    float64
dtypes: float64(7), object(9)
memory usage: 32.3+ KB


## Emotion Calendar - Visualization(WeWeb)