In [None]:
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"

import pandas as pd
from google.colab import drive
import random

# Location of the source CSV inside the mounted Google Drive
file_path = '/content/drive/MyDrive/data_science/year5_2.csv'

# Mount Google Drive at /content/drive, then read the CSV (cp949-encoded) from it
drive.mount('/content/drive')
df = pd.read_csv(file_path, encoding='cp949')

# Integer code assigned to each emotion label; 'neutral' is deliberately absent
emotion_map = dict(
    happiness=1,
    sadness=2,
    angry=3,
    fear=4,
    disgust=5,
    surprise=6,
)

# Calculate the main emotion of every row.
#
# Each row carries five (emotion, intensity) slots. A slot's score is
# emotion_map[emotion] * intensity, and the emotion of the highest-scoring slot
# becomes the row's main emotion. A slot is ignored when its label is 'neutral',
# is not a key of emotion_map (missing values, unrecognised labels), or when its
# intensity is missing. Rows left without any candidate slot get None.

# The slot column names are the same for every row, so build them once.
# Slot 4's intensity column is spelled without a space, unlike the other slots.
slot_columns = [
    (f'{slot}번 감정', f'{slot}번 감정세기' if slot != 4 else '4번감정세기')
    for slot in range(1, 6)
]

main_emotions = []

for _, row in df.iterrows():
    scores = []
    for emotion_col, intensity_col in slot_columns:
        emotion = row[emotion_col]

        # Only labels defined in emotion_map may become a main emotion; a
        # zero-weight fallback could otherwise win when every score is 0.
        if emotion == 'neutral' or emotion not in emotion_map:
            continue

        intensity = row[intensity_col]

        # A NaN intensity would give a NaN score. NaN compares False against
        # everything, which would make the max() below depend on slot order.
        if pd.isna(intensity):
            continue

        scores.append((emotion, emotion_map[emotion] * intensity))

    if scores:
        # max() returns the first maximal item, so ties go to the earliest slot.
        top_emotion = max(scores, key=lambda pair: pair[1])[0]
        main_emotions.append(top_emotion)
    else:
        main_emotions.append(None)

# Add main emotion column to the DataFrame
df['main_emotion'] = main_emotions

# Tally the rows per main emotion; rows with a missing main emotion are left out
emotion_counts = (
    df['main_emotion']
    .value_counts(dropna=True)
    .rename_axis('emotion')
    .reset_index(name='count')
)

# Show the per-emotion counts
display(emotion_counts)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,emotion,count
0,sadness,4245
1,happiness,3390
2,disgust,3370
3,surprise,3023
4,angry,2747
5,fear,2173


In [None]:
# # Undersample only if class count >= median_count
# balanced_df = []

# median_count = int(df['main_emotion'].value_counts().median())

# for emotion, group in df[df['main_emotion'].notna()].groupby('main_emotion'):
#     if len(group) >= median_count:
#         sampled = group.sample(n=median_count, random_state=42)
#     else:
#         sampled = group
#     balanced_df.append(sampled)

# # Concatenate all groups into a single DataFrame
# balanced_df = pd.concat(balanced_df).reset_index(drop=True)

# # Show new class distribution
# balanced_counts = balanced_df['main_emotion'].value_counts().reset_index()
# balanced_counts.columns = ['emotion', 'count']

# display(balanced_counts)

# Down-sample every class that has at least as many rows as 'fear' to exactly
# the 'fear' row count; classes with fewer rows are kept whole.

# Number of rows whose main emotion is 'fear'
fear_count = int((df['main_emotion'] == 'fear').sum())

# Rows without a main emotion take no part in the balancing
labelled_rows = df.dropna(subset=['main_emotion'])

# Sample each class with a fixed seed, then stack the pieces into one
# DataFrame with a fresh 0..n-1 index
balanced_df = pd.concat(
    [
        group.sample(n=fear_count, random_state=42) if len(group) >= fear_count else group
        for _, group in labelled_rows.groupby('main_emotion')
    ]
).reset_index(drop=True)

# Tally the rows per main emotion after balancing and show the result
balanced_counts = (
    balanced_df['main_emotion']
    .value_counts()
    .rename_axis('emotion')
    .reset_index(name='count')
)

display(balanced_counts)

Unnamed: 0,emotion,count
0,angry,2173
1,disgust,2173
2,fear,2173
3,happiness,2173
4,sadness,2173
5,surprise,2173


In [None]:
# Destination on Google Drive for the balanced dataset
save_path = '/content/drive/MyDrive/data_science/balanced_emotion_data.csv'

# Write the CSV as UTF-8 with a BOM ('utf-8-sig'), leaving out the index column
balanced_df.to_csv(save_path, index=False, encoding='utf-8-sig')

print(f"saved to: {save_path}")


saved to: /content/drive/MyDrive/data_science/balanced_emotion_data.csv
