In [None]:
import pandas as pd
#from google.colab import drive
import os
import opensmile

In [None]:
#drive.mount('/content/drive')

Labeling / Sampling

In [None]:
# file_path = '/content/drive/MyDrive/???.csv'
# df = pd.read_csv(file_path, encoding='cp949')

In [None]:
# Integer code assigned to each non-neutral emotion label; the code is used as
# a multiplier when an annotation is scored below.
emotion_map = {
    label: code
    for code, label in enumerate(
        ['happiness', 'sadness', 'angry', 'fear', 'disgust', 'surprise'],
        start=1,
    )
}

main_emotions = []

for _, record in df.iterrows():
    # One (label, score) candidate per non-neutral annotation slot 1..5.
    candidates = []
    for slot in range(1, 6):
        label = record[f'{slot}번 감정']
        if label == 'neutral':
            continue

        # The intensity column of slot 4 is spelled without a space.
        strength_col = '4번감정세기' if slot == 4 else f'{slot}번 감정세기'

        # Labels missing from emotion_map are given code 0.
        # NOTE(review): score = emotion code (1-6) x intensity, so at equal
        # intensity a label with a larger code always outranks one with a
        # smaller code -- confirm this weighting is intended.
        candidates.append(
            (label, emotion_map.get(label, 0) * record[strength_col])
        )

    # Highest score wins (the first candidate on ties); None if all slots are neutral.
    if candidates:
        main_emotions.append(max(candidates, key=lambda pair: pair[1])[0])
    else:
        main_emotions.append(None)

df['main_emotion'] = main_emotions

# Per-emotion row counts, excluding rows without a main emotion.
emotion_counts = (
    df['main_emotion']
    .value_counts(dropna=True)
    .reset_index()
)
emotion_counts.columns = ['emotion', 'count']

display(emotion_counts)

In [None]:
# Down-sample every emotion class that is at least as large as the median
# class size to exactly that size; smaller classes are kept whole.
median_count = int(df['main_emotion'].value_counts().median())

# Collect one frame per emotion, then concatenate once at the end.
balanced_parts = []
for emotion, group in df[df['main_emotion'].notna()].groupby('main_emotion'):
    if len(group) >= median_count:
        # Fixed seed keeps the drawn sample reproducible across runs.
        sampled = group.sample(n=median_count, random_state=42)
    else:
        sampled = group
    # Must run once per group: appending after the loop would keep only the
    # last emotion and silently drop every other class.
    balanced_parts.append(sampled)

balanced_df = pd.concat(balanced_parts).reset_index(drop=True)

In [None]:
# Keep only the merge key, the utterance text and the label for the text side.
text_columns = ["wav_id", "발화문", "main_emotion"]
txt_df = balanced_df[text_columns]

openSMILE

In [None]:
# drive.mount('/content/drive')
# audio_folder = '/content/drive/MyDrive/???'

# Every .wav file (case-insensitive suffix) found anywhere under audio_folder.
audio_files = [
    os.path.join(dirpath, fname)
    for dirpath, _, filenames in os.walk(audio_folder)
    for fname in filenames
    if fname.lower().endswith('.wav')
]

# eGeMAPSv02 functionals extractor.
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)

# One row per file: the file's basename (extension included) followed by its
# flattened feature values.
results = [
    [os.path.basename(path)] + smile.process_file(path).values.flatten().tolist()
    for path in audio_files
]

columns = ['wav_id', *smile.feature_names]
audio_df = pd.DataFrame(results, columns=columns)

Merge

In [None]:
# Inner join of acoustic features and text/labels on wav_id; rows without a
# match on both sides are dropped. This rebinds `df` to the merged frame.
# NOTE(review): audio_df's wav_id is the file basename including the ".wav"
# suffix -- confirm txt_df's wav_id uses the same form, otherwise this join
# comes back empty.
df = audio_df.merge(txt_df, on="wav_id", how="inner")

Modeling

In [None]:
%cd Mecab-ko-for-Google-Colab

!bash install_mecab-ko_on_colab_light_220429.sh

!pip install pandas scikit-learn catboost

In [None]:
import shutil
# Delete the MeCab installer checkout. The path is relative to the current
# working directory, and any failure (including a missing directory) is ignored.
shutil.rmtree('Mecab-ko-for-Google-Colab', ignore_errors=True)

from konlpy.tag import Mecab
# Instantiate the MeCab tagger right after importing it.
mecab = Mecab()

# NOTE(review): os and pandas are already imported in the first cell, and
# TfidfVectorizer is imported twice below. AdaBoostClassifier, SVC, Okt and
# google.colab.files are not used in the cells visible here.
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from google.colab import files

from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC

from konlpy.tag import Okt

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Target labels: the main emotion of every merged row.
y = df['main_emotion']

# Tagger instance used by `tokenize` below.
mecab = Mecab()


def tokenize(text):
    """Return the morphemes of `text` as produced by the module-level `mecab` tagger."""
    morphemes = mecab.morphs(text)
    return morphemes

# Encode the string labels up front so the train/test split can be stratified
# before any text feature is fitted.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Split row positions rather than the finished feature matrix: the TF-IDF
# vocabulary and IDF weights must be learned from the training rows only,
# otherwise the held-out utterances leak into the features and inflate the
# test scores. Same test_size / random_state / stratify as a direct split of
# the matrix, so the row partition itself is unchanged.
row_positions = np.arange(len(df))
train_idx, test_idx = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y_enc
)

tfidf = TfidfVectorizer(
    tokenizer=tokenize,
    preprocessor=lambda x: x,  # identity: bypass the default preprocessing (lowercasing)
    token_pattern=None,        # not used with a custom tokenizer; None avoids the warning
    max_features=500,
    ngram_range=(1,2)
)
utterances = df['발화문'].fillna('')
tfidf.fit(utterances.iloc[train_idx])
X_text = tfidf.transform(utterances).toarray()
print("TF-IDF shape:", X_text.shape)


# Every column that is not the key, the raw text or the label is treated as an
# acoustic feature.
audio_cols = [c for c in df.columns if c not in ['wav_id', '발화문', 'main_emotion']]
X_audio   = df[audio_cols].values


# Acoustic features first, TF-IDF features second (the order used at inference).
X = np.hstack([X_audio, X_text])


X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y_enc[train_idx], y_enc[test_idx]


def train_eval(name, model):
    """Fit `model` on the training split and print its report on the test split.

    Reads the module-level X_train / y_train / X_test / y_test arrays and the
    fitted label encoder `le`, whose classes are used as report row names.

    Parameters
    ----------
    name : label printed in the report header.
    model : estimator exposing fit(X, y) and predict(X).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"=== {name} Classification Report ===")
    report = classification_report(y_test, predictions, target_names=le.classes_)
    print(report)


# CatBoost with training output silenced and a fixed seed.
catboost_params = dict(verbose=0, random_state=42)
cb = CatBoostClassifier(**catboost_params)
train_eval('CatBoost', cb)

External Data

In [None]:
# file_path = '/content/drive/MyDrive/???.csv'
# final_df = pd.read_csv(file_path, encoding='cp949')

# Build the external feature matrix with the already-fitted vectorizer and the
# same acoustic columns, in the same order as at training time
# (acoustic features first, TF-IDF features second).
X_text_final = tfidf.transform(final_df['발화문'].fillna('')).toarray()

X_audio_final = final_df[audio_cols].values

X_final = np.hstack([X_audio_final, X_text_final])

# CatBoost returns multiclass predictions as a column vector of shape (n, 1);
# flatten to 1-D, which is what LabelEncoder.inverse_transform expects
# (a 2-D input triggers a DataConversionWarning).
y_final_pred_enc = np.ravel(cb.predict(X_final))
y_final_pred = le.inverse_transform(y_final_pred_enc)

final_df['predicted_emotion'] = y_final_pred