In [1]:
import pandas as pd
from tqdm import tqdm
import json
from uuid import uuid4
from collections import Counter


from datasets import load_dataset

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import optuna



def process(split='train'):    
   
    utterance = []
    ids = []
    label = []
    act = []
    
    # Apply the function to all examples in the dataset
    dataset = load_dataset('daily_dialog', split=split)
    
    for i in tqdm(range(len(dataset))):
        example = dataset[i]
        did = uuid4()
        for j in range(len(example['dialog'])):
            text = example['dialog'][j]
            # add previous sentnce xontext
            #if j > 1:
            #    text = str(example['emotion'][j - 1]) + ' ' + example['dialog'][j - 1] + ' ' + text
            utterance.append(example['dialog'][j])
            act.append(example['act'][j])
            label.append(example['emotion'][j])
            ids.append(did)

    data = {
        'text': utterance,
        'label': label,
        'attr': act,
        'id': ids
    }

    df = pd.DataFrame(data=data)

    return df

df_train = process(split='train')
print('n train', len(df_train))
df_valid = process(split='validation')
df_test = process(split='test')

# improves macro f1
rus = RandomOverSampler(random_state=42)
df_train, _ = rus.fit_resample(df_train, df_train.label)

counts = Counter(df_train.label)
print('train label dist.', counts)


def objective(trial):
    
    # hyper params
    alpha = trial.suggest_float('alpha', 1e-5, 1e-3, log=True)

    clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=alpha, n_jobs=-1)
    #clf = RandomForestClassifier(n_estimators=200, max_depth=200)

    count_vect = CountVectorizer()

    X_train_counts = count_vect.fit_transform(df_train.text.to_list())
    X_valid_counts = count_vect.transform(df_valid.text.to_list())
    X_test_counts = count_vect.transform(df_test.text.to_list())

    tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tf_transformer.transform(X_train_counts)
    X_valid_tfidf = tf_transformer.transform(X_valid_counts)
    X_test_tfidf = tf_transformer.transform(X_test_counts)

    clf.fit(X_train_tfidf, df_train.label)

    y_pred = clf.predict(X_valid_tfidf)
    y_true = df_valid.label
    report = classification_report(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    return f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)
study.best_params  # E.g. {'x': 2.002108042}
print('f1', study.best_value, study.best_params) # alpha 1e-5


# test

clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=study.best_params['alpha'], n_jobs=-1)
#clf = RandomForestClassifier(n_estimators=200, max_depth=200)

count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(df_train.text.to_list())
X_valid_counts = count_vect.transform(df_valid.text.to_list())
X_test_counts = count_vect.transform(df_test.text.to_list())

tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tfidf = tf_transformer.transform(X_train_counts)
X_valid_tfidf = tf_transformer.transform(X_valid_counts)
X_test_tfidf = tf_transformer.transform(X_test_counts)

clf.fit(X_train_tfidf, df_train.label)

y_pred = clf.predict(X_test_tfidf)
y_true = df_test.label
report = classification_report(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')

print(report)
print(f1)

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 11118/11118 [00:04<00:00, 2348.94it/s]


n train 87170


Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 1000/1000 [00:00<00:00, 3734.71it/s]
Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 1000/1000 [00:00<00:00, 2367.62it/s]
[I 2023-07-08 22:25:37,461] A new study created in memory with name: no-name-9ff07253-6612-40a1-90e1-31af7aa9b27a


train label dist. Counter({0: 72143, 4: 72143, 6: 72143, 3: 72143, 2: 72143, 5: 72143, 1: 72143})


[I 2023-07-08 22:25:57,436] Trial 0 finished with value: 0.2654404801708495 and parameters: {'alpha': 1.1591775170936925e-05}. Best is trial 0 with value: 0.2654404801708495.
[I 2023-07-08 22:26:13,182] Trial 1 finished with value: 0.22539544529688643 and parameters: {'alpha': 0.0009084114012365891}. Best is trial 0 with value: 0.2654404801708495.
[I 2023-07-08 22:26:28,471] Trial 2 finished with value: 0.2668859065697529 and parameters: {'alpha': 3.648797749566661e-05}. Best is trial 2 with value: 0.2668859065697529.
[I 2023-07-08 22:26:43,532] Trial 3 finished with value: 0.22609603930185668 and parameters: {'alpha': 0.0008515958665917744}. Best is trial 2 with value: 0.2668859065697529.
[I 2023-07-08 22:26:58,764] Trial 4 finished with value: 0.24703164098580618 and parameters: {'alpha': 0.00027157985967521874}. Best is trial 2 with value: 0.2668859065697529.
[I 2023-07-08 22:27:13,899] Trial 5 finished with value: 0.26658931331404356 and parameters: {'alpha': 4.238463505737148e-05}

f1 0.2674667406586527 {'alpha': 2.7098768803559325e-05}
              precision    recall  f1-score   support

           0       0.93      0.61      0.74      6321
           1       0.12      0.53      0.19       118
           2       0.10      0.49      0.17        47
           3       0.07      0.47      0.12        17
           4       0.41      0.64      0.50      1019
           5       0.11      0.61      0.19       102
           6       0.12      0.60      0.20       116

    accuracy                           0.61      7740
   macro avg       0.27      0.57      0.30      7740
weighted avg       0.82      0.61      0.68      7740

0.3011851318987442


In [2]:
import pandas as pd
from tqdm import tqdm
import json
from uuid import uuid4
from collections import Counter


from datasets import load_dataset

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import optuna



def process(split='train'):    
   
    utterance = []
    ids = []
    label = []
    act = []
    
    # Apply the function to all examples in the dataset
    dataset = load_dataset('daily_dialog', split=split)
    
    for i in tqdm(range(len(dataset))):
        example = dataset[i]
        did = uuid4()
        for j in range(len(example['dialog'])):
            text = example['dialog'][j]
            # add previous sentnce xontext
            if j > 1:
                text = str(example['emotion'][j - 1]) + ' ' + example['dialog'][j - 1] + ' ' + text
            utterance.append(text)
            act.append(example['act'][j])
            label.append(example['emotion'][j])
            ids.append(did)

    data = {
        'text': utterance,
        'label': label,
        'attr': act,
        'id': ids
    }

    df = pd.DataFrame(data=data)

    return df

df_train = process(split='train')
print('n train', len(df_train))
df_valid = process(split='validation')
df_test = process(split='test')

# improves macro f1
rus = RandomOverSampler(random_state=42)
df_train, _ = rus.fit_resample(df_train, df_train.label)

counts = Counter(df_train.label)
print('train label dist.', counts)


def objective(trial):
    
    # hyper params
    alpha = trial.suggest_float('alpha', 1e-5, 1e-3, log=True)

    clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=alpha, n_jobs=-1)
    #clf = RandomForestClassifier(n_estimators=200, max_depth=200)

    count_vect = CountVectorizer()

    X_train_counts = count_vect.fit_transform(df_train.text.to_list())
    X_valid_counts = count_vect.transform(df_valid.text.to_list())
    X_test_counts = count_vect.transform(df_test.text.to_list())

    tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tf_transformer.transform(X_train_counts)
    X_valid_tfidf = tf_transformer.transform(X_valid_counts)
    X_test_tfidf = tf_transformer.transform(X_test_counts)

    clf.fit(X_train_tfidf, df_train.label)

    y_pred = clf.predict(X_valid_tfidf)
    y_true = df_valid.label
    report = classification_report(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    return f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)
study.best_params  # E.g. {'x': 2.002108042}
print('f1', study.best_value, study.best_params) # alpha 1e-5


# test

clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=study.best_params['alpha'], n_jobs=-1)
#clf = RandomForestClassifier(n_estimators=200, max_depth=200)

count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(df_train.text.to_list())
X_valid_counts = count_vect.transform(df_valid.text.to_list())
X_test_counts = count_vect.transform(df_test.text.to_list())

tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tfidf = tf_transformer.transform(X_train_counts)
X_valid_tfidf = tf_transformer.transform(X_valid_counts)
X_test_tfidf = tf_transformer.transform(X_test_counts)

clf.fit(X_train_tfidf, df_train.label)

y_pred = clf.predict(X_test_tfidf)
y_true = df_test.label
report = classification_report(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')

print(report)
print(f1)

Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 11118/11118 [00:09<00:00, 1120.41it/s]


n train 87170


Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 1000/1000 [00:00<00:00, 9290.51it/s]
Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 1000/1000 [00:00<00:00, 8765.53it/s]
[I 2023-07-09 01:28:32,648] A new study created in memory with name: no-name-0d70db42-108c-4205-9e4a-58c44a67101a


train label dist. Counter({0: 72143, 4: 72143, 6: 72143, 3: 72143, 2: 72143, 5: 72143, 1: 72143})


[I 2023-07-09 01:28:47,126] Trial 0 finished with value: 0.24618219470468522 and parameters: {'alpha': 0.00013132734867429819}. Best is trial 0 with value: 0.24618219470468522.
[I 2023-07-09 01:29:01,175] Trial 1 finished with value: 0.25256649893575417 and parameters: {'alpha': 3.8428252929126535e-05}. Best is trial 1 with value: 0.25256649893575417.
[I 2023-07-09 01:29:15,772] Trial 2 finished with value: 0.25147192556141496 and parameters: {'alpha': 3.256888988030764e-05}. Best is trial 1 with value: 0.25256649893575417.
[I 2023-07-09 01:29:29,600] Trial 3 finished with value: 0.24112161054586606 and parameters: {'alpha': 0.00016703551682687728}. Best is trial 1 with value: 0.25256649893575417.
[I 2023-07-09 01:29:43,532] Trial 4 finished with value: 0.22494208198752697 and parameters: {'alpha': 0.0004753666110985064}. Best is trial 1 with value: 0.25256649893575417.
[I 2023-07-09 01:29:57,219] Trial 5 finished with value: 0.21345867877936403 and parameters: {'alpha': 0.000927403106