In [1]:
!pip install transformers==2.11.0
!pip install tensorflow==2.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==2.11.0
  Downloading transformers-2.11.0-py3-none-any.whl (674 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m674.8/674.8 KB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers==0.7.0
  Downloading tokenizers-0.7.0-cp38-cp38-manylinux1_x86_64.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m103.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing m

In [2]:
import os
import numpy as np
import pandas as pd

import tensorflow as tf
from transformers import *
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

tf.random.set_seed(111)
np.random.seed(111)

BATCH_SIZE = 32
NUM_EPOCHS = 3
VALID_SPLIT = 0.2
MAX_LEN = 128

In [3]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [4]:
df = pd.read_csv('gdrive/My Drive/Colab Notebooks/Solux/news2.csv')
test = pd.read_csv('/content/portal_news.csv')

In [5]:
def precleaning(df):
  df.loc[(df['news'] == '조선일보') | (df['news'] == '중앙일보'), 'news'] = 1
  df.loc[(df['news'] == '한겨레') | (df['news'] == '경향'), 'news'] = 0
  df['news'] = df['news'].astype(int)
  df=df[['article', 'news']]

def test_precleaning(test):
  test['body']=test['body'].str.replace(pat=r'[^\w]',repl=r' ',regex=True)

In [6]:
precleaning(df)
df=df[['article', 'news']]

test_precleaning(test)
test=test['body']

In [7]:
def bert_tokenizer(sentence, MAX_LEN):
  tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
  encoded_dict = tokenizer.encode_plus(
      text = sentence,
      add_special_tokens = True,
      max_length = MAX_LEN, 
      pad_to_max_length = True,
      return_attention_mask = True
  )

  input_id = encoded_dict['input_ids']
  attention_mask = encoded_dict['attention_mask']
  token_type_id = encoded_dict['token_type_ids']

  return input_id, attention_mask, token_type_id

In [8]:
input_ids = []
attention_masks = []
token_type_ids = []
train_data_labels = []

for train_sentence, train_label in zip(df['article'], df['news']):

  try:
    input_id, attention_mask, token_type_id =  bert_tokenizer(train_sentence, MAX_LEN)

    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)
    train_data_labels.append(train_label)
  except Exception as e:
    print(e)
    pass

train_news_input_ids = np.array(input_ids, dtype=int)
train_news_attention_masks = np.array(attention_masks, dtype=int)
train_news_token_type_ids = np.array(token_type_ids, dtype=int)
train_news_inputs = (train_news_input_ids, train_news_attention_masks, train_news_token_type_ids)
train_data_labels = np.asarray(train_data_labels, dtype=np.int32)

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

In [9]:
idx = 5
input_id = train_news_input_ids[idx]
attention_mask = train_news_attention_masks[idx]
token_type_id = train_news_token_type_ids[idx]

In [10]:
class TFBertClassifier(tf.keras.Model):
  def __init__(self, model_name, dir_path, num_class):
    super(TFBertClassifier, self).__init__()

    self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
    self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
    self.classifier = tf.keras.layers.Dense(1, activation="sigmoid",kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range))
    #self.classifier = tf.keras.layers.Dense(num_class,kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),name='classifier') 

  def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
      outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
      pooled_output = outputs[1]
      pooled_output = self.dropout(pooled_output, training-training)
      logits = self.classifier(pooled_output)

      return logits

cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',
                             dir_path='bert_ckpt',
                             num_class=1)

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

In [11]:
optimizer = tf.keras.optimizers.Adam(3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
cls_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[metric])

In [12]:
model_name = "tf2_bert_news"

es_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=2)

checkpoint_path = os.path.join('./', model_name, 'weight.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = ModelCheckpoint(checkpoint_path, monitor='val_accuracy',
                              verbose=1, save_best_only=True, save_weights_only=True)

history = cls_model.fit(train_news_inputs, train_data_labels, 
                        epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, validation_split=VALID_SPLIT,
                        callbacks=[es_callback, cp_callback])

Epoch 1/3
Epoch 00001: val_accuracy improved from -inf to 1.00000, saving model to ./tf2_bert_news/weight.h5
Epoch 2/3
Epoch 00002: val_accuracy did not improve from 1.00000
Epoch 3/3
Epoch 00003: val_accuracy did not improve from 1.00000


In [52]:
def pr(test):
  input_ids = []
  attention_masks = []
  token_type_ids = []
  test_data_labels = []

  for test_sentence in test:

    try:
      input_id, attention_mask, token_type_id =  bert_tokenizer(test_sentence, MAX_LEN)

      input_ids.append(input_id)
      attention_masks.append(attention_mask)
      token_type_ids.append(token_type_id)
    except Exception as e:
      pass

  test_news_input_ids = np.array(input_ids, dtype=int)
  test_news_attention_masks = np.array(attention_masks, dtype=int)
  test_news_token_type_ids = np.array(token_type_ids, dtype=int)
  test_news_inputs = (train_news_input_ids, train_news_attention_masks, train_news_token_type_ids)

  prob=cls_model.predict(test_news_inputs)
  prob=np.squeeze(prob, axis=1)

  return prob

In [53]:
pr(test)

array([0.6227857 , 0.95554554, 0.82902503, 0.98133814, 0.8870925 ,
       0.98203474, 0.82130754, 0.6044521 , 0.94235265, 0.9536841 ,
       0.54970783, 0.9813849 , 0.56523526, 0.9744942 , 0.5074979 ,
       0.9246321 , 0.94221306, 0.5907387 , 0.97655344, 0.953382  ,
       0.98599696, 0.904965  , 0.9834175 , 0.9185616 , 0.70329976,
       0.9324054 , 0.7246337 , 0.96093744, 0.76285374, 0.75055045,
       0.95117724, 0.5640191 , 0.76443666, 0.9710201 , 0.9236755 ,
       0.68899566, 0.8976931 , 0.41906422, 0.93967295, 0.9806212 ,
       0.97902095, 0.9811913 , 0.737169  , 0.8884097 , 0.8330939 ,
       0.9367533 , 0.9539365 , 0.48093253, 0.95783687, 0.9102022 ,
       0.9325628 , 0.826328  , 0.9471356 , 0.9576661 , 0.93448687,
       0.8605542 , 0.86561215, 0.96083367, 0.8776195 , 0.9853219 ,
       0.7684074 , 0.35151872, 0.37371558, 0.9494616 , 0.68988174,
       0.37181962, 0.4005416 , 0.96042144, 0.9876623 , 0.9464644 ,
       0.9891349 , 0.53722334, 0.9671105 , 0.7810215 , 0.97540

In [55]:
def left_right(prob):
  lr = [int(i>0.5) for i in prob]
  lr = ['진보' if num == 1 else '보수' for num in lr]
  return lr

left_right(lr)
#확률 : prob
#진보 보수 이분값 : lr

['진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '보수',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '보수',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '보수',
 '보수',
 '진보',
 '진보',
 '보수',
 '보수',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '보수',
 '진보',
 '진보',
 '보수',
 '보수',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',
 '진보',