# **HATESPEECH MULTILABEL CLASSIFICATION**

## **INSTALL AND IMPORT LIBRARY**

In [None]:
!pip -q install datasets
!pip -q install transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.model_selection import train_test_split
from google.colab import files
from google.colab import drive
import pandas as pd
import re
import os
import numpy as np
import tensorflow as tf
import shutil
from sklearn.metrics import classification_report, accuracy_score

## **LOAD AND TRANSFORM DATASET**

In [None]:
# Download the "keelezibel/hate-speech-indo" dataset from the Hugging Face Hub.
# NOTE: `df` is a DatasetDict here; a later cell rebinds the same name to a pandas DataFrame.
df = load_dataset("keelezibel/hate-speech-indo")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/228 [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/4.41M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15127 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3782 [00:00<?, ? examples/s]

In [None]:
df

DatasetDict({
    train: Dataset({
        features: ['text', 'hs', 'abusive', 'hs_individual', 'hs_group', 'hs_religion', 'hs_race', 'hs_physical', 'hs_gender', 'hs_other', 'hs_weak', 'hs_moderate', 'hs_strong'],
        num_rows: 15127
    })
    test: Dataset({
        features: ['text', 'hs', 'abusive', 'hs_individual', 'hs_group', 'hs_religion', 'hs_race', 'hs_physical', 'hs_gender', 'hs_other', 'hs_weak', 'hs_moderate', 'hs_strong'],
        num_rows: 3782
    })
})

In [None]:
# Convert each split of the DatasetDict into its own pandas DataFrame.
df_train = df['train'].to_pandas()
df_test = df['test'].to_pandas()

In [None]:
# Stack train on top of test so the preprocessing below runs once over all rows.
# With ignore_index=True the train rows get index labels 0..len(df_train)-1 and the
# test rows follow. This rebinds `df` from the DatasetDict to a DataFrame.
df = pd.concat([df_train, df_test], ignore_index=True)
df.head()

In [None]:
df.info()

## **PREPROCESSING DATA**

In [None]:
def remove_duplicate_user_rt(text):
    """Collapse consecutive repeats of the placeholder tokens "USER" and "RT".

    The text is split on whitespace. A "USER" or "RT" token is dropped when the
    token kept immediately before it is the same placeholder; every other token
    is kept as-is, including ordinary words that repeat.

    Args:
        text: Raw text to clean.

    Returns:
        The remaining tokens joined with single spaces.
    """
    placeholders = ("USER", "RT")
    kept = []

    for token in text.split():
        # Only a placeholder that directly repeats the previously kept token is skipped.
        is_repeat = token in placeholders and bool(kept) and kept[-1] == token
        if not is_repeat:
            kept.append(token)

    return " ".join(kept)

In [None]:
df["text"] = df["text"].apply(remove_duplicate_user_rt)

In [None]:
def clean_text(text):
    """Strip placeholder tokens, escaped byte sequences and extra whitespace.

    Three substitutions are applied in order:
      1. the whole words USER, RT, www and https are removed (case-insensitive);
      2. literal escape sequences of the form ``\\xNN`` (a backslash, "x" and two
         hex digits, e.g. the textual residue of an emoji) are removed;
      3. every run of whitespace becomes one space and the ends are trimmed.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    substitutions = (
        (r"\b(USER|RT|www|https)\b", "", re.IGNORECASE),
        (r"\\x[a-fA-F0-9]{2}", "", 0),
        (r"\s+", " ", 0),
    )
    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

In [None]:
df["text"] = df["text"].apply(clean_text)

In [None]:
df.head()

In [None]:
def letters_only(text):
  """Keep only ASCII letters: every run of other characters becomes one space,
  then the result is trimmed and lower-cased."""
  spaced = re.sub(r"[^a-zA-Z]+", " ", text)
  return spaced.strip().lower()

In [None]:
df["text"] = df["text"].apply(letters_only)

In [None]:
df.head()

Unnamed: 0,text,hs,abusive,hs_individual,hs_group,hs_religion,hs_race,hs_physical,hs_gender,hs_other,hs_weak,hs_moderate,hs_strong
0,keracunan tiap nete,0,1,0,0,0,0,0,0,0,0,0,0
1,cebong akan terus membela junjungannya walaupu...,1,1,0,1,1,0,0,0,0,0,1,0
2,terus bagaimana dgn uu md bang,0,0,0,0,0,0,0,0,0,0,0,0
3,jangan jahatin anak orang n baru juga sekali u...,1,1,1,0,0,0,1,0,0,1,0,0
4,derita nun jauh disana akan sama dirasakan per...,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
!wget -O kamusalay.csv https://raw.githubusercontent.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection/master/new_kamusalay.csv

--2024-12-07 07:30:42--  https://raw.githubusercontent.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection/master/new_kamusalay.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 285941 (279K) [application/octet-stream]
Saving to: ‘kamusalay.csv’


2024-12-07 07:30:42 (10.5 MB/s) - ‘kamusalay.csv’ saved [285941/285941]



In [None]:
 df_slang = pd.read_csv('kamusalay.csv',header=None, encoding="latin1")

In [None]:
df_slang.head()

Unnamed: 0,0,1
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali


In [None]:
df_slang = df_slang.rename(columns={0: "nonformal", 1: "formal"})

In [None]:
# Map each non-formal (slang) spelling to its formal replacement.
# zip() walks the two columns in row order, so no per-element, label-based
# Series lookup is needed; as with the dict comprehension, a later duplicate key wins.
slang_dict = dict(zip(df_slang.nonformal, df_slang.formal))

In [None]:
def replace_slang(sentence):
    """Replace every whitespace-separated token that is a key of the global
    ``slang_dict`` with its mapped value; other tokens are left unchanged.

    Args:
        sentence: Text to normalise.

    Returns:
        The tokens, after substitution, joined with single spaces.
    """
    return ' '.join(slang_dict.get(word, word) for word in sentence.split())

df['text'] = df['text'].apply(replace_slang)
df.head()

Unnamed: 0,text,hs,abusive,hs_individual,hs_group,hs_religion,hs_race,hs_physical,hs_gender,hs_other,hs_weak,hs_moderate,hs_strong
0,keracunan tiap nete,0,1,0,0,0,0,0,0,0,0,0,0
1,cebong akan terus membela junjungannya walaupu...,1,1,0,1,1,0,0,0,0,0,1,0
2,terus bagaimana dengan undang undang md bang,0,0,0,0,0,0,0,0,0,0,0,0
3,jangan menjahati anak orang dan baru juga seka...,1,1,1,0,0,0,1,0,0,1,0,0
4,derita nun jauh di sana akan sama dirasakan pe...,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Keep the text column and the eight target label columns.
# .copy() makes df_multi an independent frame rather than a slice of `df`, so the
# in-place replace/dropna calls that follow do not raise SettingWithCopyWarning.
df_multi = df[['text', 'abusive', 'hs_individual', 'hs_group', 'hs_religion', 'hs_race', 'hs_physical', 'hs_gender', 'hs_other']].copy()

In [None]:
# Turn empty / placeholder strings into NaN so they count as missing values.
# Rebinding the result (instead of inplace=True) avoids mutating a slice of `df`,
# which is what triggered SettingWithCopyWarning.
df_multi = df_multi.replace(["", "None", "null"], np.nan)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_multi.replace(["", "None", "null"], np.nan, inplace=True)


In [None]:
df_multi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18909 entries, 0 to 18908
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text           18844 non-null  object
 1   abusive        18909 non-null  int64 
 2   hs_individual  18909 non-null  int64 
 3   hs_group       18909 non-null  int64 
 4   hs_religion    18909 non-null  int64 
 5   hs_race        18909 non-null  int64 
 6   hs_physical    18909 non-null  int64 
 7   hs_gender      18909 non-null  int64 
 8   hs_other       18909 non-null  int64 
dtypes: int64(8), object(1)
memory usage: 1.3+ MB


In [None]:
# Drop rows whose text is missing. Rebinding the result (instead of inplace=True)
# avoids SettingWithCopyWarning; the surviving rows keep their original index labels.
df_multi = df_multi.dropna(subset=['text'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_multi.dropna(subset=['text'], inplace=True)


In [None]:
df_multi.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18844 entries, 0 to 18908
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text           18844 non-null  object
 1   abusive        18844 non-null  int64 
 2   hs_individual  18844 non-null  int64 
 3   hs_group       18844 non-null  int64 
 4   hs_religion    18844 non-null  int64 
 5   hs_race        18844 non-null  int64 
 6   hs_physical    18844 non-null  int64 
 7   hs_gender      18844 non-null  int64 
 8   hs_other       18844 non-null  int64 
dtypes: int64(8), object(1)
memory usage: 1.4+ MB


In [None]:
# Rebuild the dataset's original train / held-out partition.
# Rows were dropped from df_multi after the concat, so a positional cut
# (iloc[:15127]) no longer lands on the train/test boundary and would move rows
# from the original test split into the training set. The index labels were
# preserved by dropna, and labels 0..len(df_train)-1 are exactly the original
# train rows, so split on the label instead.
n_train_rows = len(df_train)
is_train_row = df_multi.index < n_train_rows
df_multi_train = df_multi[is_train_row]
df_multi_val_test = df_multi[~is_train_row]

In [None]:
# Split the held-out rows 80/20 into validation and test sets; random_state fixes the shuffle.
df_multi_val, df_multi_test = train_test_split(df_multi_val_test, test_size=0.2, random_state=42)

In [None]:
# Persist the three splits as CSV files (without the index column) in the working directory.
df_multi_train.to_csv('multilabel_train.csv', index=False)
df_multi_val.to_csv('multilabel_val.csv', index=False)
df_multi_test.to_csv('multilabel_test.csv', index=False)

## **TOKENIZING**

In [None]:
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

def tokenize_text(text, max_length=500):
    """
    Tokenize a collection of strings with the IndoBERT tokenizer.

    Args:
        text: Object exposing ``to_list()`` that yields the strings to encode
            (callers in this notebook pass a DataFrame column).
        max_length: Upper bound on the encoded length; longer inputs are truncated.

    Returns:
        Tuple ``(token_ids, attention_mask)`` taken from the tokenizer output,
        requested as TensorFlow tensors.
    """
    batch = tokenizer(
        text.to_list(),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )
    return batch['input_ids'], batch['attention_mask']

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Tokenize each split on its own. Because tokenize_text pads with padding=True,
# the padded sequence length can differ between splits.
multitrain_token_ids, multitrain_attention_mask = tokenize_text(df_multi_train['text'])
multival_token_ids, multival_attention_mask = tokenize_text(df_multi_val['text'])
multitest_token_ids, multitest_attention_mask = tokenize_text(df_multi_test['text'])

In [None]:
multitrain_token_ids

<tf.Tensor: shape=(15127, 85), dtype=int32, numpy=
array([[    2, 16931,  2591, ...,     0,     0,     0],
       [    2, 28731,   225, ...,     0,     0,     0],
       [    2,   944,   916, ...,     0,     0,     0],
       ...,
       [    2,  6931,  3894, ...,     0,     0,     0],
       [    2,   599,   176, ...,     0,     0,     0],
       [    2,   304,   731, ...,     0,     0,     0]], dtype=int32)>

In [None]:
df_multi_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15127 entries, 0 to 15181
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text           15127 non-null  object
 1   abusive        15127 non-null  int64 
 2   hs_individual  15127 non-null  int64 
 3   hs_group       15127 non-null  int64 
 4   hs_religion    15127 non-null  int64 
 5   hs_race        15127 non-null  int64 
 6   hs_physical    15127 non-null  int64 
 7   hs_gender      15127 non-null  int64 
 8   hs_other       15127 non-null  int64 
dtypes: int64(8), object(1)
memory usage: 1.2+ MB


In [None]:
df_multi_val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2973 entries, 18422 to 18364
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text           2973 non-null   object
 1   abusive        2973 non-null   int64 
 2   hs_individual  2973 non-null   int64 
 3   hs_group       2973 non-null   int64 
 4   hs_religion    2973 non-null   int64 
 5   hs_race        2973 non-null   int64 
 6   hs_physical    2973 non-null   int64 
 7   hs_gender      2973 non-null   int64 
 8   hs_other       2973 non-null   int64 
dtypes: int64(8), object(1)
memory usage: 232.3+ KB


In [None]:
df_multi_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 744 entries, 17886 to 17688
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text           744 non-null    object
 1   abusive        744 non-null    int64 
 2   hs_individual  744 non-null    int64 
 3   hs_group       744 non-null    int64 
 4   hs_religion    744 non-null    int64 
 5   hs_race        744 non-null    int64 
 6   hs_physical    744 non-null    int64 
 7   hs_gender      744 non-null    int64 
 8   hs_other       744 non-null    int64 
dtypes: int64(8), object(1)
memory usage: 58.1+ KB


In [None]:
# Column 0 is `text`; every remaining column is a label. Convert them to float32 tensors.
label_multi_train = tf.convert_to_tensor(df_multi_train.iloc[:, 1:].values, dtype=tf.float32)
label_multi_val = tf.convert_to_tensor(df_multi_val.iloc[:, 1:].values, dtype=tf.float32)
label_multi_test = tf.convert_to_tensor(df_multi_test.iloc[:, 1:].values, dtype=tf.float32)

In [None]:
label_multi_train

<tf.Tensor: shape=(15127, 8), dtype=float32, numpy=
array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [None]:
print(label_multi_train.shape[1])

8


In [None]:
# Build one tf.data.Dataset per split. Each element is a pair:
# ({"input_ids": ..., "attention_mask": ...}, labels) for a single example.
formatted_multi_train = tf.data.Dataset.from_tensor_slices(({"input_ids":multitrain_token_ids, "attention_mask":multitrain_attention_mask}, label_multi_train))
formatted_multi_val = tf.data.Dataset.from_tensor_slices(({"input_ids":multival_token_ids, "attention_mask":multival_attention_mask}, label_multi_val))
formatted_multi_test = tf.data.Dataset.from_tensor_slices(({"input_ids":multitest_token_ids, "attention_mask":multitest_attention_mask}, label_multi_test))

In [None]:
for element in formatted_multi_train.take(5):
    print(element)

({'input_ids': <tf.Tensor: shape=(85,), dtype=int32, numpy=
array([    2, 16931,  2591,  3187, 30357,     3,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(85,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
batch_size = 8
# Only the training set is shuffled; the buffer spans the whole split.
formatted_multi_train = formatted_multi_train.shuffle(buffer_size=len(df_multi_train)).batch(batch_size)
# Validation and test keep their order (the evaluation cell reads the test labels
# in a separate pass and relies on that order matching the predictions).
formatted_multi_val = formatted_multi_val.batch(batch_size)
formatted_multi_test = formatted_multi_test.batch(batch_size)

## **CONNECT TO GOOGLE DRIVE**

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Checkpoints are written to the mounted Google Drive folder.
checkpoint_dir = '/content/drive/MyDrive/model'
os.makedirs(checkpoint_dir, exist_ok=True)
# Path template; "{epoch:02d}" is filled in with str.format() when a checkpoint is saved.
checkpoint_filepath = os.path.join(checkpoint_dir, "indobert_epoch_{epoch:02d}.ckpt")

In [None]:
def save_checkpoint(model, optimizer, epoch, wait, best_val_loss, filepath):
  """Save the model weights and a small text sidecar with the training state.

  The weights are written with ``model.save_weights(filepath)``. The sidecar
  ``filepath + '.meta'`` holds three lines: epoch, wait, best_val_loss.

  Args:
    model: Object exposing ``save_weights(path)``.
    optimizer: Accepted but not used; no optimizer state is written.
    epoch: Epoch value recorded on the first line.
    wait: Early-stopping counter recorded on the second line.
    best_val_loss: Best validation loss recorded on the third line.
    filepath: Target path for the weights.
  """
  model.save_weights(filepath)

  meta_path = filepath + '.meta'
  with open(meta_path, 'w') as meta_file:
    meta_file.writelines(f"{value}\n" for value in (epoch, wait, best_val_loss))

def load_checkpoint(model, optimizer, checkpoint_dir):
  """Restore the most recent checkpoint found in ``checkpoint_dir``, if any.

  Args:
    model: Model whose weights are loaded via ``model.load_weights``.
    optimizer: Accepted but not used; no optimizer state is restored.
    checkpoint_dir: Directory passed to ``tf.train.latest_checkpoint``.

  Returns:
    Tuple ``(initial_epoch, wait, best_val_loss)``. When a checkpoint exists,
    the values come from its ``.meta`` sidecar and the stored epoch is
    incremented by one so training continues with the following epoch.
    Otherwise ``(0, 0, inf)`` is returned.
  """
  latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
  if not latest_ckpt:
    print("Tidak menemukan checkpoint. Mulai dari awal...")
    return 0, 0, float('inf')

  print(f"memuat checkpoint dari {latest_ckpt}")
  model.load_weights(latest_ckpt)

  # Sidecar layout: line 0 = epoch, line 1 = wait, line 2 = best validation loss.
  with open(latest_ckpt + '.meta', 'r') as meta_file:
    meta_lines = meta_file.readlines()
  saved_epoch = int(meta_lines[0].strip())
  wait = int(meta_lines[1].strip())
  best_val_loss = float(meta_lines[2].strip())
  return saved_epoch + 1, wait, best_val_loss

## **FINE-TUNING IndoBERT**

In [None]:
model = TFAutoModelForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p1", num_labels=label_multi_train.shape[1])

tf_model.h5:   0%|          | 0.00/656M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
optimizer = Adam(learning_rate=3e-5)
loss = BinaryCrossentropy(from_logits=True) # from_logits=True: the loss receives raw logits and applies the sigmoid itself, one binary term per label

In [None]:
tolerance = 3     # early-stopping patience: epochs without improvement before stopping
min_delta = 1e-4  # minimum drop in validation loss that counts as an improvement
num_epoch = 4
# Resume from the latest checkpoint if there is one. load_checkpoint restores the
# model weights only, so the optimizer starts from a fresh state after a resume.
initial_epoch, wait, best_loss = load_checkpoint(model, optimizer, checkpoint_dir)

for epoch in range(initial_epoch, num_epoch):
    print(f"Epoch {epoch+1}/{num_epoch}")
    print("="*10)

    # ---- training pass ----
    train_loss_total = 0.0
    train_step = 0

    for step, (x_batch_train, y_batch_train) in enumerate(formatted_multi_train):
        with tf.GradientTape() as tape:
            logits = model(x_batch_train, training=True).logits  # forward pass
            loss_value = loss(y_batch_train, logits)

        # Backward pass and parameter update.
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        train_loss_total += float(loss_value)
        train_step += 1

        if train_step % 100 == 0:
            print(f"Training loss (step {step+1}): {loss_value:.4f}")

    # Epoch average, computed once after the pass; max() guards an empty dataset.
    train_loss_avg = train_loss_total / max(train_step, 1)

    # ---- validation pass ----
    # Model selection and early stopping must use the validation split; the test
    # split stays untouched until the final evaluation.
    val_loss_total = 0.0
    val_step = 0

    for x_batch_val, y_batch_val in formatted_multi_val:
        val_logits = model(x_batch_val, training=False).logits
        val_loss_value = loss(y_batch_val, val_logits)

        val_loss_total += float(val_loss_value)
        val_step += 1

    # Plain Python float, so it is written to the checkpoint metadata as a number.
    val_loss_avg = val_loss_total / max(val_step, 1)
    print("="*10)
    print(f"Training loss: {train_loss_avg:.4f}")
    print(f"Val loss: {val_loss_avg:.4f}")

    # ---- early stopping bookkeeping ----
    if val_loss_avg < best_loss - min_delta:
        best_loss = val_loss_avg
        wait = 0
        print("Loss berkurang!")
    else:
        wait += 1
        print("Loss meningkat!")

    if wait >= tolerance:
        print("Early stopping.....")
        break

    # The stored epoch is 0-based; load_checkpoint resumes at epoch + 1.
    save_checkpoint(model, optimizer, epoch, wait, best_loss, checkpoint_filepath.format(epoch=epoch + 1))

Tidak menemukan checkpoint. Mulai dari awal...
Epoch 1/4
Training loss (step 100): 0.2592
Training loss (step 200): 0.2105
Training loss (step 300): 0.2424
Training loss (step 400): 0.3443
Training loss (step 500): 0.1200
Training loss (step 600): 0.2072
Training loss (step 700): 0.2446
Training loss (step 800): 0.1394
Training loss (step 900): 0.2119
Training loss (step 1000): 0.0932
Training loss (step 1100): 0.2970
Training loss (step 1200): 0.1484
Training loss (step 1300): 0.1517
Training loss (step 1400): 0.1994
Training loss (step 1500): 0.1087
Training loss (step 1600): 0.0868
Training loss (step 1700): 0.2817
Training loss (step 1800): 0.0986
Training loss: 0.1990
Val loss: 0.1406
Loss berkurang!
Epoch 2/4
Training loss (step 100): 0.3025
Training loss (step 200): 0.0603
Training loss (step 300): 0.0422
Training loss (step 400): 0.1086
Training loss (step 500): 0.0472
Training loss (step 600): 0.0375
Training loss (step 700): 0.1564
Training loss (step 800): 0.0689
Training lo

## **EVALUATION**

In [None]:
threshold = 0.5

# Logits -> per-label probabilities -> 0/1 decisions at the chosen threshold.
preds = model.predict(formatted_multi_test)
probs = tf.sigmoid(preds.logits).numpy()
preds_label = (probs > threshold).astype(int)

# Collect the ground-truth rows batch by batch, in dataset order.
true_labels = np.array(
    [row for _, batch_labels in formatted_multi_test for row in batch_labels.numpy()]
)

target_names = [
    'abusive', 'hs_individual', 'hs_group', 'hs_religion',
    'hs_race', 'hs_physical', 'hs_gender', 'hs_other',
]

# Per-label precision/recall/F1, plus accuracy_score over the full label rows.
report = classification_report(true_labels, preds_label, target_names=target_names, zero_division=0)
accuracy = accuracy_score(true_labels, preds_label)

print(report)
print(f"Accuracy: {accuracy}")

               precision    recall  f1-score   support

      abusive       0.94      0.93      0.93       285
hs_individual       0.84      0.83      0.84       191
     hs_group       0.89      0.75      0.81       104
  hs_religion       0.94      0.92      0.93        49
      hs_race       0.88      0.90      0.89        31
  hs_physical       0.93      0.68      0.79        19
    hs_gender       0.83      0.59      0.69        17
     hs_other       0.90      0.83      0.86       191

    micro avg       0.90      0.85      0.87       887
    macro avg       0.89      0.80      0.84       887
 weighted avg       0.90      0.85      0.87       887
  samples avg       0.48      0.47      0.47       887

Accuracy: 0.8360215053763441


## **SAVE MODEL AND TOKENIZER**

In [None]:
# Write the fine-tuned model and its tokenizer into the same local folder.
model.save_pretrained('hs_multilabel_bert')
tokenizer.save_pretrained('hs_multilabel_bert')

# Zip the folder and hand the archive to the browser via the Colab download helper.
shutil.make_archive('hs_multilabel_bert', 'zip', 'hs_multilabel_bert')
files.download('hs_multilabel_bert.zip')