In [1]:
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
mail = pd.read_csv('EDA3_1.csv')

mail.dropna(axis=0, inplace=True)
# mail = mail.applymap(str)
mail = mail.replace('spam', 1)
mail = mail.replace('ham', 0)
mail['label'].unique()

array([1, 0])

In [3]:
mail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60264 entries, 0 to 60263
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    60264 non-null  object
 1   label   60264 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 941.8+ KB


In [4]:
from sklearn.model_selection import train_test_split

x = mail['text'].to_list()
y = mail['label'].to_list()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2022, shuffle = True)

In [5]:
from transformers import BertTokenizerFast

HUGGINGFACE_MODEL_PATH = "klue/bert-base"

# Load Tokenizer
tokenizer = BertTokenizerFast.from_pretrained(HUGGINGFACE_MODEL_PATH)

# Tokenizing
train_encodings = tokenizer(x_train, truncation=True, padding=True)
test_encodings = tokenizer(x_test, truncation=True, padding=True)

In [6]:
# trainset-set
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))

# validation-set
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

In [7]:
from transformers import TFBertForSequenceClassification

num_labels = 2
model = TFBertForSequenceClassification.from_pretrained(HUGGINGFACE_MODEL_PATH, num_labels=num_labels, from_pt=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

es = EarlyStopping(
    monitor="val_accuracy", 
    min_delta=0.001, # the threshold that triggers the termination (acc should at least improve 0.001)
    patience=2)

tf.keras.callbacks.ModelCheckpoint(
    filepath = 'ckpt', 
    monitor='val_loss', 
    verbose=1, 
    save_best_only=False,
    save_weights_only=False, 
    mode='auto', 
    save_freq='epoch')

with tf.device('/gpu:0'):
    model.fit(
        train_dataset.shuffle(50000).batch(16), epochs=1, batch_size=16,
        validation_data=val_dataset.shuffle(50000).batch(16),
        callbacks = [es]
    )



## Model Eval

In [21]:
test_text = pd.read_csv("spaced_spam_test_text.csv", encoding = 'utf-8')
test_label = pd.read_csv("spam_test_label.csv", encoding = 'utf-8')

In [10]:
from transformers import TextClassificationPipeline

text_classifier = TextClassificationPipeline(
    tokenizer=tokenizer, 
    model=model, 
    framework='tf',
    return_all_scores=True
)

In [11]:
predicted_label_list = []
predicted_score_list = []

for text in tqdm(test_text['text']):
    # predict
    preds_list = text_classifier(text)[0]
    sorted_preds_list = sorted(preds_list, key=lambda x: x['score'], reverse=True)
    predicted_label_list.append(sorted_preds_list[0]['label']) # label
    predicted_score_list.append(sorted_preds_list[0]['score']) # score

100%|██████████| 9896/9896 [27:04<00:00,  6.09it/s]


In [23]:
# predicted_label_list = predicted_label_list.replace('LABEL_1', 1)
# predicted_label_list= predicted_label_list.replace('LABEL_0', 0)

pred = []
f1_pred = []
for i in predicted_label_list:
    if i == 'LABEL_1':
        pred.append('spam')
        f1_pred.append(1)
    elif i == 'LABEL_0':
        pred.append('ham')
        f1_pred.append(0)

In [24]:
from sklearn.metrics import *

print(classification_report(pred, test_label['label'], digits = 6))

              precision    recall  f1-score   support

         ham   0.965931  0.937652  0.951582      3689
        spam   0.963579  0.980345  0.971889      6207

    accuracy                       0.964430      9896
   macro avg   0.964755  0.958999  0.961736      9896
weighted avg   0.964456  0.964430  0.964319      9896



In [17]:
answers = test_label['label'].replace(['spam','ham'],[1,0])

In [25]:
print(f1_score(f1_pred, answers))

0.9718894745248363


In [None]:
result = pd.DataFrame({'id':range(0,len(pred)),'label':pred})

In [None]:
result.to_csv('late_submission_bert.csv',index = False)

In [26]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  110617344 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 110,618,882
Trainable params: 110,618,882
Non-trainable params: 0
_________________________________________________________________
