In [1]:
!pip install wandb --upgrade



In [2]:
import os

import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import wandb

In [3]:
# Display the installed W&B client version as the cell output.
wandb.__version__

'0.17.3'

In [4]:
# Authenticate against the W&B cloud without storing the API key in the notebook.
# The key is read from the WANDB_API_KEY environment variable; when it is unset,
# key=None lets wandb.login() fall back to its interactive prompt.
# NOTE(review): an API key was previously written directly into this cell.
# Treat that key as leaked: revoke it in the W&B settings and issue a new one.
wandb.login(
    key=os.environ.get("WANDB_API_KEY"),
    host="https://api.wandb.ai",  # same target as the CLI's --cloud flag
    relogin=True,                 # same effect as the CLI's --relogin flag
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /aiffel/.netrc


In [5]:
# from wandb.keras import WandbCallback
from wandb.integration.keras import WandbCallback # for 0.17.3

from sklearn.preprocessing import LabelEncoder

In [8]:
# Read the training CSV into a DataFrame and preview its first five rows.
# (Use "train.csv" instead when the file sits next to the notebook.)
train_data_path = "./dktc/data/train.csv"
train_data = pd.read_csv(filepath_or_buffer=train_data_path)
train_data.head(n=5)

Unnamed: 0,idx,class,conversation
0,0,협박 대화,지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...
1,1,협박 대화,길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...
2,2,기타 괴롭힘 대화,너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...
3,3,갈취 대화,어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...
4,4,갈취 대화,저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...


In [9]:
# The four conversation categories that appear in the 'class' column.
CLASS_NAMES = ['협박 대화', '갈취 대화', '직장 내 괴롭힘 대화', '기타 괴롭힘 대화']

# LabelEncoder assigns integer ids by the sorted order of the labels
# (inspect encoder.classes_), not by their position in CLASS_NAMES.
encoder = LabelEncoder()
encoder.fit(CLASS_NAMES)

# Keep the original text labels in their own column so this cell can be re-run:
# calling transform() on an already-encoded 'class' column fails because the
# integers are not labels the encoder was fitted on.
if 'class_name' not in train_data.columns:
    train_data['class_name'] = train_data['class']
train_data['class'] = encoder.transform(train_data['class_name'])

corpus = train_data["conversation"]

In [10]:
def tokenize(corpus, num_words=1000, maxlen=20, filters=' '):
    """Fit a Keras tokenizer on ``corpus`` and return padded id sequences.

    Args:
        corpus: Iterable of raw text strings to fit on and to encode.
        num_words: Vocabulary size limit passed to the Keras ``Tokenizer``.
            Must stay consistent with the embedding layer's input dimension
            (``vocab`` in ``default_config``).
        maxlen: Length every sequence is padded or truncated to.
        filters: Characters the tokenizer strips before splitting.
            NOTE(review): with the default ``' '`` nothing but the space is
            stripped, so punctuation and the ``\\n`` turn separators appear to
            stay inside tokens - confirm this is intended.

    Returns:
        A ``(tensor, tokenizer)`` tuple: the 2-D integer array of shape
        ``(len(corpus), maxlen)`` and the fitted tokenizer.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        filters=filters,
        oov_token="<unk>")
    tokenizer.fit_on_texts(corpus)
    sequences = tokenizer.texts_to_sequences(corpus)
    # Padding is appended at the end; truncation is left at the pad_sequences
    # default. NOTE(review): that default drops tokens from the start of long
    # conversations - confirm the tail is the part worth keeping.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, padding='post', maxlen=maxlen)

    print(tensor, tokenizer)
    return tensor, tokenizer


tensor, tokenizer = tokenize(corpus)

[[  1 210   1 ...   1  41   1]
 [  1   1   1 ...   1   1 396]
 [  1  96   1 ...   1 125   1]
 ...
 [485   3 113 ...  27   1   1]
 [  1   1 152 ... 350   1   1]
 [  1   1   1 ... 156 114   1]] <keras_preprocessing.text.Tokenizer object at 0x79c3679260a0>


In [11]:
# Print the vocabulary entries up to and including index 10.
for rank, word in tokenizer.index_word.items():
    print(rank, ":", word)
    if rank >= 10:
        break

1 : <unk>
2 : 내가
3 : 너
4 : 좀
5 : 다
6 : 왜
7 : 나
8 : 진짜
9 : 야
10 : 지금


In [12]:
# Positional train / validation / test split: rows [0, 3500) train,
# [3500, 3850) validation, everything after that test.
TRAIN_END, VAL_END = 3500, 3850
labels = train_data['class']

X_train, y_train = tensor[:TRAIN_END], labels[:TRAIN_END]
X_val, y_val = tensor[TRAIN_END:VAL_END], labels[TRAIN_END:VAL_END]
X_test, y_test = tensor[VAL_END:], labels[VAL_END:]

In [13]:
# wandb.login(key=os.environ["WANDB_API_KEY"])  # read the key from the environment; never paste it here
# wandb.login()

# Sweep

# 1. 변수 - sweep_config

In [14]:
# Search space handed to wandb.sweep(): random search that minimises val_loss.
sweep_config = {
    "name": "sweep_test_nlp",
    "metric": {"name": "val_loss", "goal": "minimize"},
    "method": "random",
    "parameters": {
        # Sample on a log scale: the range spans two orders of magnitude, and a
        # plain uniform draw would place roughly 90% of the trials above 0.01.
        "learning_rate": {
            "distribution": "log_uniform_values",
            "min": 0.001,
            "max": 0.1,
        },
        "epoch": {
            "distribution": "int_uniform",
            "min": 5,
            "max": 10,
        },
        # All remaining hyperparameters are fixed; train() fills them in from
        # default_config via run.config.setdefaults().
    },
}


# 2. 상수 - wandb.config.setdefaults

In [15]:
# Fixed hyperparameters; a sweep overrides only the keys it samples itself.
default_config = dict(
    # model shape
    vocab=1000,
    embeddings=64,
    units_1=256,
    units_2=256,
    units_3=1024,
    class_num=4,
    # optimisation
    learning_rate=0.005,
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
    # training loop
    epoch=5,
    batch_size=32,
)


In [19]:
def train(config=None):
    """Train and evaluate the GRU classifier inside a single W&B run.

    Meant to be passed as ``function`` to ``wandb.agent``. The run's config is
    the sweep-provided values completed with ``default_config``. The function
    reads the module-level ``X_train``/``y_train``, ``X_val``/``y_val`` and
    ``X_test``/``y_test`` arrays.

    Args:
        config: Optional initial configuration forwarded to ``wandb.init``.

    Raises:
        ValueError: If ``config.optimizer`` is not one of the supported names.
            (Previously any value other than ``'adam'`` left ``optimizer``
            unbound and failed later with an ``UnboundLocalError``.)
    """
    # Optimizers selectable through config.optimizer.
    optimizer_classes = {
        "adam": keras.optimizers.Adam,
        "sgd": keras.optimizers.SGD,
        "rmsprop": keras.optimizers.RMSprop,
    }

    with wandb.init(config=config) as run:
        # Fill in every hyperparameter that the sweep did not set.
        run.config.setdefaults(default_config)

        # When launched by wandb.agent, the swept values have already been
        # placed in the run config by the sweep controller.
        config = run.config

        # Fail fast, before building the model, on an unknown optimizer name.
        if config.optimizer not in optimizer_classes:
            raise ValueError(
                f"Unsupported optimizer {config.optimizer!r}; "
                f"expected one of {sorted(optimizer_classes)}"
            )
        optimizer = optimizer_classes[config.optimizer](
            learning_rate=config.learning_rate)

        # Embedding -> two stacked GRUs -> dense head with a softmax over classes.
        model = keras.models.Sequential()
        model.add(keras.layers.Embedding(config.vocab, config.embeddings))
        model.add(keras.layers.GRU(units=config.units_1, return_sequences=True))
        model.add(keras.layers.GRU(units=config.units_2))
        model.add(keras.layers.Dense(config.units_3, activation='relu'))
        model.add(keras.layers.Dense(config.class_num, activation='softmax'))

        model.compile(optimizer=optimizer,
                      loss=config.loss,
                      metrics=config.metrics)

        # WandbCallback reports the per-epoch Keras metrics to the run; the
        # sweep's target metric (val_loss) comes from this validation data.
        model.fit(X_train, y_train,
                  epochs=config.epoch,
                  batch_size=config.batch_size,
                  validation_data=(X_val, y_val),
                  callbacks=[WandbCallback()])

        test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)

        # Record the held-out test result on the run, as percentages.
        run.log({"Test Accuracy Rate: ": round(test_accuracy * 100, 2),
                 "Test Error Rate: ": round((1 - test_accuracy) * 100, 2)})

In [20]:
# wandb.init(
#     project = "test_wandb",
#     # name="test_run",
#     config = default_config,
#     # settings=wandb.Settings(start_method='thread', _disable_stats=True)
# )
# #     config = default_config

In [21]:
# entity와 project에 본인의 아이디와 프로젝트명을 입력하세요
sweep_id = wandb.sweep(sweep_config,

#                        entity = "coronarita1991", # entity를 넣으면

                       # wandb: ERROR Error while calling W&B API: permission denied (<Response [403]>)

                       entity = "talk-cleaner", # 팀명으로 넣으면 Create sweep까지는 이뤄진다. 개별 케이스를 고려해야 할 듯
                       project = "LMS_test"
                       )

# run the sweep
wandb.agent(sweep_id,
            function=train,
            count=2)

Create sweep with ID: xl5u8uhg
Sweep URL: https://wandb.ai/talk-cleaner/LMS_test/sweeps/xl5u8uhg


[34m[1mwandb[0m: Agent Starting Run: k4ieiwrl with config:
[34m[1mwandb[0m: 	epoch: 8
[34m[1mwandb[0m: 	learning_rate: 0.02847995013908747




Epoch 1/8




INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104922-k4ieiwrl/files/model-best/assets


INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104922-k4ieiwrl/files/model-best/assets
[34m[1mwandb[0m: Adding directory to artifact (/aiffel/aiffel/wandb/run-20240625_104922-k4ieiwrl/files/model-best)... Done. 0.0s


Epoch 2/8




INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104922-k4ieiwrl/files/model-best/assets


INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104922-k4ieiwrl/files/model-best/assets
[34m[1mwandb[0m: Adding directory to artifact (/aiffel/aiffel/wandb/run-20240625_104922-k4ieiwrl/files/model-best)... Done. 0.0s


Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
4/4 - 0s - loss: 1.3848 - accuracy: 0.2100


VBox(children=(Label(value='25.058 MB of 25.058 MB uploaded (0.016 MB deduped)\r'), FloatProgress(value=1.0, m…

0,1
Test Accuracy Rate:,▁
Test Error Rate:,▁
accuracy,▁▁▄▂▇██▇
epoch,▁▂▃▄▅▆▇█
loss,█▁▁▁▁▁▁▁
val_accuracy,▅█▁▁▁▁▁▅
val_loss,█▁▅█▇▇▃▃

0,1
Test Accuracy Rate:,21.0
Test Error Rate:,79.0
accuracy,0.27714
best_epoch,1.0
best_val_loss,1.38449
epoch,7.0
loss,1.38557
val_accuracy,0.26
val_loss,1.38716


[34m[1mwandb[0m: Agent Starting Run: nwijzn3l with config:
[34m[1mwandb[0m: 	epoch: 9
[34m[1mwandb[0m: 	learning_rate: 0.09646096971657524


Epoch 1/9




INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets


INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets
[34m[1mwandb[0m: Adding directory to artifact (/aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best)... Done. 0.1s


Epoch 2/9




INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets


INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets
[34m[1mwandb[0m: Adding directory to artifact (/aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best)... Done. 0.0s


Epoch 3/9




INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets


INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets
[34m[1mwandb[0m: Adding directory to artifact (/aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best)... Done. 0.0s


Epoch 4/9




INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets


INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets
[34m[1mwandb[0m: Adding directory to artifact (/aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best)... Done. 0.0s


Epoch 5/9




INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets


INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets
[34m[1mwandb[0m: Adding directory to artifact (/aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best)... Done. 0.0s


Epoch 6/9
Epoch 7/9
Epoch 8/9




INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets


INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets
[34m[1mwandb[0m: Adding directory to artifact (/aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best)... Done. 0.0s


Epoch 9/9




INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets


INFO:tensorflow:Assets written to: /aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best/assets
[34m[1mwandb[0m: Adding directory to artifact (/aiffel/aiffel/wandb/run-20240625_104953-nwijzn3l/files/model-best)... Done. 0.0s


4/4 - 0s - loss: 1.3829 - accuracy: 0.2100


VBox(children=(Label(value='87.700 MB of 87.700 MB uploaded (0.094 MB deduped)\r'), FloatProgress(value=1.0, m…

0,1
Test Accuracy Rate:,▁
Test Error Rate:,▁
accuracy,▁▃▃▄█▃▂▁▅
epoch,▁▂▃▄▅▅▆▇█
loss,█▁▁▁▁▁▁▁▁
val_accuracy,▃▃▁▃▃▁███
val_loss,█▇▅▃▃▄▅▂▁

0,1
Test Accuracy Rate:,21.0
Test Error Rate:,79.0
accuracy,0.26371
best_epoch,8.0
best_val_loss,1.38869
epoch,8.0
loss,1.39009
val_accuracy,0.26
val_loss,1.38869


- BrokenPipeError: [Errno 32] Broken pipe 발생