In [41]:
import pandas as pd
from tqdm import tqdm
import json
from uuid import uuid4
from collections import Counter


from datasets import load_dataset

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from tqdm import tqdm_notebook

import optuna

from sentence_transformers import SentenceTransformer

import torch



def process(split='train'):    
   
    utterance = []
    ids = []
    label = []
    act = []
    
    # Apply the function to all examples in the dataset
    dataset = load_dataset('daily_dialog', split=split)
    
    for i in tqdm(range(len(dataset))):
        example = dataset[i]
        did = uuid4()
        for j in range(len(example['dialog'])):
            text = example['dialog'][j]
            # add previous sentnce xontext
            if j > 1:
                text = 'emotion' + ' '+ str(example['emotion'][j - 1]) + ' ' + example['dialog'][j - 1] + ' ' + text
            utterance.append(example['dialog'][j])
            act.append(example['act'][j])
            label.append(example['emotion'][j])
            ids.append(did)

    data = {
        'text': utterance,
        'label': label,
        'attr': act,
        'id': ids
    }

    df = pd.DataFrame(data=data)

    return df

df_train = process(split='train')
print('n train', len(df_train))
df_valid = process(split='validation')
df_test = process(split='test')

# improves macro f1
rus = RandomUnderSampler(random_state=42)
df_train, _ = rus.fit_resample(df_train, df_train.label)

counts = Counter(df_train.label)
print('train label dist.', counts)

model = SentenceTransformer('all-MiniLM-L6-v2')

#df_train = df_train.sample(100)
#df_valid = df_valid.sample(100)
#df_test = df_test.sample(100)

train_sentences = df_train.text.to_numpy()
valid_sentences = df_valid.text.to_numpy()
test_sentences = df_test.text.to_numpy()

print('start encode train')
x_train = model.encode(train_sentences, show_progress_bar=True, convert_to_numpy=True, output_value='token_embeddings')
x_train = [torch.mean(emb, dim=0) for emb in x_train]
print('end encode train')
x_valid = model.encode(valid_sentences, show_progress_bar=True, convert_to_numpy=True, output_value='token_embeddings')
x_valid = [torch.mean(emb, dim=0) for emb in x_valid]
x_test = model.encode(test_sentences, show_progress_bar=True, convert_to_numpy=True, output_value='token_embeddings')
x_test = [torch.mean(emb, dim=0) for emb in x_test]


Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 11118/11118 [00:02<00:00, 5052.83it/s]


n train 87170


Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 1000/1000 [00:00<00:00, 8460.23it/s]
Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 1000/1000 [00:00<00:00, 5427.85it/s]


train label dist. Counter({0: 146, 1: 146, 2: 146, 3: 146, 4: 146, 5: 146, 6: 146})
start encode train


Batches:  12%|█▎        | 4/32 [00:02<00:20,  1.36it/s]


KeyboardInterrupt: 

In [None]:
def objective(trial):
    
    # hyper params
    alpha = trial.suggest_float('alpha', 1e-5, 1e-3, log=True)

    clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=alpha, n_jobs=-1)
    #clf = RandomForestClassifier(n_estimators=200, max_depth=200)

    clf.fit(x_train, df_train.label)

    y_pred = clf.predict(x_valid)
    y_true = df_valid.label
    report = classification_report(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    return f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
study.best_params  # E.g. {'x': 2.002108042}
print('f1', study.best_value, study.best_params) # alpha 1e-5

[I 2023-07-08 23:55:53,013] A new study created in memory with name: no-name-de358597-0299-4ed2-8fd0-137bcdca5a63
[I 2023-07-08 23:55:53,256] Trial 0 finished with value: 0.15433936352023417 and parameters: {'alpha': 0.00010250889765052279}. Best is trial 0 with value: 0.15433936352023417.
[I 2023-07-08 23:55:53,532] Trial 1 finished with value: 0.1534808126888668 and parameters: {'alpha': 3.2831944386549645e-05}. Best is trial 0 with value: 0.15433936352023417.
[I 2023-07-08 23:55:53,887] Trial 2 finished with value: 0.15315808226193003 and parameters: {'alpha': 2.515064166233515e-05}. Best is trial 0 with value: 0.15433936352023417.
[I 2023-07-08 23:55:54,219] Trial 3 finished with value: 0.16493068954191642 and parameters: {'alpha': 0.00018125258580456262}. Best is trial 3 with value: 0.16493068954191642.
[I 2023-07-08 23:55:54,579] Trial 4 finished with value: 0.1703931587716208 and parameters: {'alpha': 0.0004015411557473965}. Best is trial 4 with value: 0.1703931587716208.
[I 202

f1 0.19595822511638655 {'alpha': 0.0009636453926599403}


# use sentence embedding

In [None]:
import pandas as pd
from tqdm import tqdm
import json
from uuid import uuid4
from collections import Counter


from datasets import load_dataset

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from tqdm import tqdm_notebook

import optuna

from sentence_transformers import SentenceTransformer

import torch



def process(split='train'):    
   
    utterance = []
    ids = []
    label = []
    act = []
    
    # Apply the function to all examples in the dataset
    dataset = load_dataset('daily_dialog', split=split)
    
    for i in tqdm(range(len(dataset))):
        example = dataset[i]
        did = uuid4()
        for j in range(len(example['dialog'])):
            text = example['dialog'][j]
            # add previous sentnce xontext
            if j > 1:
                text = 'emotion' + ' '+ str(example['emotion'][j - 1]) + ' ' + example['dialog'][j - 1] + ' ' + text
            utterance.append(example['dialog'][j])
            act.append(example['act'][j])
            label.append(example['emotion'][j])
            ids.append(did)

    data = {
        'text': utterance,
        'label': label,
        'attr': act,
        'id': ids
    }

    df = pd.DataFrame(data=data)

    return df

df_train = process(split='train')
print('n train', len(df_train))
df_valid = process(split='validation')
df_test = process(split='test')

# improves macro f1
rus = RandomUnderSampler(random_state=42)
df_train, _ = rus.fit_resample(df_train, df_train.label)

counts = Counter(df_train.label)
print('train label dist.', counts)

model = SentenceTransformer('all-MiniLM-L6-v2')

#df_train = df_train.sample(100)
#df_valid = df_valid.sample(100)
#df_test = df_test.sample(100)

train_sentences = df_train.text.to_numpy()
valid_sentences = df_valid.text.to_numpy()
test_sentences = df_test.text.to_numpy()

print('start encode train')
x_train = model.encode(train_sentences, show_progress_bar=True, convert_to_numpy=True)
print('end encode train')
x_valid = model.encode(valid_sentences, show_progress_bar=True, convert_to_numpy=True)
x_test = model.encode(test_sentences, show_progress_bar=True, convert_to_numpy=True)

def objective(trial):
    
    # hyper params
    alpha = trial.suggest_float('alpha', 1e-5, 1e-3, log=True)

    clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=alpha, n_jobs=-1)
    #clf = RandomForestClassifier(n_estimators=200, max_depth=200)

    clf.fit(x_train, df_train.label)

    y_pred = clf.predict(x_valid)
    y_true = df_valid.label
    report = classification_report(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    return f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
study.best_params  # E.g. {'x': 2.002108042}
print('f1', study.best_value, study.best_params) # alpha 1e-5

Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 11118/11118 [00:03<00:00, 3494.39it/s]


n train 87170


Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 1000/1000 [00:00<00:00, 6681.27it/s]
Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 1000/1000 [00:00<00:00, 6007.79it/s]


train label dist. Counter({0: 146, 1: 146, 2: 146, 3: 146, 4: 146, 5: 146, 6: 146})
start encode train


Batches: 100%|██████████| 32/32 [00:11<00:00,  2.90it/s]


end encode train


Batches: 100%|██████████| 253/253 [01:20<00:00,  3.15it/s]
Batches: 100%|██████████| 242/242 [01:21<00:00,  2.98it/s]
[I 2023-07-09 00:02:26,675] A new study created in memory with name: no-name-4926be8f-ec88-4a42-a7ff-914f7133c5f8
[I 2023-07-09 00:02:27,053] Trial 0 finished with value: 0.21667155232086896 and parameters: {'alpha': 0.0001073442180651878}. Best is trial 0 with value: 0.21667155232086896.
[I 2023-07-09 00:02:27,420] Trial 1 finished with value: 0.160913689581787 and parameters: {'alpha': 2.321442042303829e-05}. Best is trial 0 with value: 0.21667155232086896.
[I 2023-07-09 00:02:27,702] Trial 2 finished with value: 0.1854458838842257 and parameters: {'alpha': 0.00017678521043400515}. Best is trial 0 with value: 0.21667155232086896.
[I 2023-07-09 00:02:27,964] Trial 3 finished with value: 0.20055975753972088 and parameters: {'alpha': 0.0002985568390672121}. Best is trial 0 with value: 0.21667155232086896.
[I 2023-07-09 00:02:28,293] Trial 4 finished with value: 0.1693856

f1 0.24261841266072953 {'alpha': 0.0004557870520195258}


# over sample

In [47]:
import pandas as pd
from tqdm import tqdm
import json
from uuid import uuid4
from collections import Counter


from datasets import load_dataset

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from tqdm import tqdm_notebook

import optuna

from sentence_transformers import SentenceTransformer

import torch

import numpy as np



def process(split='train'):    
   
    utterance = []
    ids = []
    label = []
    act = []
    
    # Apply the function to all examples in the dataset
    dataset = load_dataset('daily_dialog', split=split)
    
    for i in tqdm(range(len(dataset))):
        example = dataset[i]
        did = uuid4()
        for j in range(len(example['dialog'])):
            text = example['dialog'][j]
            # add previous sentnce xontext
            if j > 1:
                text = 'emotion' + ' '+ str(example['emotion'][j - 1]) + ' ' + example['dialog'][j - 1] + ' ' + text
            utterance.append(example['dialog'][j])
            act.append(example['act'][j])
            label.append(example['emotion'][j])
            ids.append(did)

    data = {
        'text': utterance,
        'label': label,
        'attr': act,
        'id': ids
    }

    df = pd.DataFrame(data=data)

    return df

df_train = process(split='train')
print('n train', len(df_train))
df_valid = process(split='validation')
df_test = process(split='test')

# improves macro f1


counts = Counter(df_train.label)
print('train label dist.', counts)

model = SentenceTransformer('all-MiniLM-L6-v2')

#df_train = df_train.sample(10000).reset_index(drop=True)
#df_valid = df_valid.sample(100)
#df_test = df_test.sample(100)

train_sentences = df_train.text.to_numpy()
valid_sentences = df_valid.text.to_numpy()
test_sentences = df_test.text.to_numpy()

print('start encode train')
x_train = model.encode(train_sentences, show_progress_bar=True, convert_to_numpy=True)

# over sample the embeddings
rus = RandomOverSampler(random_state=42)
df_train['index_0'] = df_train.index
df_train, _ = rus.fit_resample(df_train, df_train.label)
x_train = np.array(x_train)
x_train = x_train[df_train.index_0]
print(x_train.shape)

counts = Counter(df_train.label)
print('train label dist.', counts)

print('end encode train')
x_valid = model.encode(valid_sentences, show_progress_bar=True, convert_to_numpy=True)
x_test = model.encode(test_sentences, show_progress_bar=True, convert_to_numpy=True)



def objective(trial):
    
    # hyper params
    alpha = trial.suggest_float('alpha', 1e-5, 1e-3, log=True)

    clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=alpha, n_jobs=-1)
    #clf = RandomForestClassifier(n_estimators=200, max_depth=200)

    clf.fit(x_train, df_train.label)

    y_pred = clf.predict(x_valid)
    y_true = df_valid.label
    report = classification_report(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    return f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
study.best_params  # E.g. {'x': 2.002108042}
print('f1', study.best_value, study.best_params) # alpha 1e-5

Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 11118/11118 [00:01<00:00, 7423.93it/s]


n train 87170


Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 1000/1000 [00:00<00:00, 5953.23it/s]
Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 1000/1000 [00:00<00:00, 7061.84it/s]


train label dist. Counter({0: 72143, 4: 11182, 6: 1600, 5: 969, 1: 827, 2: 303, 3: 146})
start encode train


Batches: 100%|██████████| 2725/2725 [11:52<00:00,  3.82it/s]


(505001, 384)
train label dist. Counter({0: 72143, 4: 72143, 6: 72143, 3: 72143, 2: 72143, 5: 72143, 1: 72143})
end encode train


Batches: 100%|██████████| 253/253 [01:14<00:00,  3.38it/s]
Batches: 100%|██████████| 242/242 [01:06<00:00,  3.65it/s]
[I 2023-07-09 00:54:00,558] A new study created in memory with name: no-name-aed3d2c2-6925-4b5a-81f3-a9bc3a641a3f
[I 2023-07-09 00:55:07,383] Trial 0 finished with value: 0.24773563732609719 and parameters: {'alpha': 2.2531478892692823e-05}. Best is trial 0 with value: 0.24773563732609719.
[I 2023-07-09 00:55:35,380] Trial 1 finished with value: 0.24368029983462125 and parameters: {'alpha': 0.0001445699249619176}. Best is trial 0 with value: 0.24773563732609719.
[I 2023-07-09 00:55:56,095] Trial 2 finished with value: 0.242599166419652 and parameters: {'alpha': 0.0003524786712356965}. Best is trial 0 with value: 0.24773563732609719.
[I 2023-07-09 00:56:16,093] Trial 3 finished with value: 0.2459058949687854 and parameters: {'alpha': 0.0002779924733301762}. Best is trial 0 with value: 0.24773563732609719.
[I 2023-07-09 00:56:37,617] Trial 4 finished with value: 0.2449871

f1 0.2634924765255851 {'alpha': 1.0115481228917037e-05}


In [48]:
clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=1.0115481228917037e-05, n_jobs=-1)

clf.fit(x_train, df_train.label)

y_pred = clf.predict(x_test)
y_true = df_test.label
report = classification_report(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')
print(f1)
print(report)

0.25843587584681377
              precision    recall  f1-score   support

           0       0.95      0.51      0.66      6321
           1       0.11      0.51      0.18       118
           2       0.05      0.38      0.08        47
           3       0.03      0.47      0.05        17
           4       0.39      0.65      0.49      1019
           5       0.08      0.65      0.15       102
           6       0.12      0.63      0.20       116

    accuracy                           0.53      7740
   macro avg       0.25      0.54      0.26      7740
weighted avg       0.83      0.53      0.61      7740

