In [9]:
import numpy as np
import pandas as pd
import os

# Walk the 'input' directory tree and echo the path of every file found,
# so the available data files are visible before anything is loaded.
for root, _subdirs, names in os.walk('input'):
    for name in names:
        file_path = os.path.join(root, name)
        print(file_path)
        

input\sample_submission.csv.zip
input\test.csv.zip
input\test_labels.csv.zip
input\train.csv.zip


In [6]:
import re 
import nltk
import texthero as hero
import tensorflow as tf

from nltk.corpus import words
from nltk.corpus import stopwords

from transformers import TFAutoModel
from transformers import AutoTokenizer

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout

In [5]:
pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-win_amd64.whl (2.0 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.4.0 sacremoses-0.0.47 tokenizers-0.10.3 transformers-4.15.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
# Read the zipped train and test CSVs into DataFrames (train first, test second).
train_df, test_df = map(pd.read_csv, ("input/train.csv.zip", "input/test.csv.zip"))

In [10]:
# feature_col: one-element Index holding the column at position 1 (the text
# that gets cleaned and tokenised further down).
feature_col = train_df.columns[slice(1, 2)]
# target_col: every column from position 2 onwards (used as the label matrix).
target_col = train_df.columns[slice(2, None)]

In [11]:
def df_preprocess(df, col):
    """Return a copy of ``df`` whose text column ``col`` has been cleaned.

    Cleaning steps, in order: lowercase, strip HTML tags, strip URLs,
    strip digits, strip punctuation.

    HTML tags are stripped *before* URLs and punctuation on purpose: once
    punctuation is removed the angle brackets are gone and the tag remover
    has nothing left to match, and stripping a URL inside an attribute first
    would swallow the tag's closing bracket.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column. It is not modified.
    col : hashable
        Label of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame, identical to ``df`` except for the cleaned ``col``.
    """
    cleaned_text = (
        df[col]
        .pipe(hero.lowercase)
        .pipe(hero.remove_html_tags)
        .pipe(hero.remove_urls)
        .pipe(hero.remove_digits)
        .pipe(hero.remove_punctuation)
    )
    # Work on a copy so the caller's raw frame stays intact and re-running the
    # calling cell does not clean already-cleaned text a second time.
    result = df.copy()
    result[col] = cleaned_text
    return result

In [12]:
# Text cleaning is plain pandas/texthero string work; it runs no TensorFlow
# ops, so it is not wrapped in a tf.device scope.
pre_train_df = df_preprocess(train_df, feature_col[0])

In [13]:
# Apply the same text cleaning to the test split.
pre_test_df = df_preprocess(df=test_df, col=feature_col[0])

In [14]:
def create_tokenizer(model_selected):
    """Load and return the pretrained tokenizer for the checkpoint ``model_selected``."""
    return AutoTokenizer.from_pretrained(model_selected)

def data_tokenization(dataset, col, max_len, tokenizer, chunk_size=1024):
    """Tokenise a text column into fixed-length model inputs.

    The texts are sent to the tokenizer in chunks (one call per chunk instead
    of one call per row) and written into preallocated arrays, so the result
    always keeps its leading batch axis, including for one-row and empty
    frames.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the text column.
    col : hashable
        Label of the text column.
    max_len : int
        Every sequence is truncated / padded to exactly this many tokens.
    tokenizer : callable
        Called as ``tokenizer(list_of_str, return_tensors='np', ...)`` and
        expected to return a mapping with ``'input_ids'`` and
        ``'attention_mask'`` entries of shape ``(len(list_of_str), max_len)``.
    chunk_size : int, default 1024
        Number of texts passed to the tokenizer per call.

    Returns
    -------
    list of numpy.ndarray
        ``[input_ids, attention_mask]``, both int32 with shape
        ``(len(dataset), max_len)``.
    """
    # Missing cells become empty strings so the tokenizer only ever sees str.
    texts = dataset[col].fillna('').astype(str).tolist()
    n_rows = len(texts)
    step = max(1, int(chunk_size))

    input_ids = np.zeros((n_rows, max_len), dtype=np.int32)
    attention_mask = np.zeros((n_rows, max_len), dtype=np.int32)

    for start in range(0, n_rows, step):
        stop = start + step
        encoded = tokenizer(texts[start:stop],
                            return_tensors='np',
                            truncation=True,
                            padding='max_length',
                            max_length=max_len,
                            add_special_tokens=True)
        input_ids[start:stop] = encoded['input_ids']
        attention_mask[start:stop] = encoded['attention_mask']

    return [input_ids, attention_mask]

def bert_model(model_selected, max_len, learning_rate, num_labels=6):
    """Build and compile a multi-label classifier on top of a pretrained encoder.

    Architecture: pretrained encoder -> hidden state at sequence position 0
    -> Dropout(0.1) -> Dense(128, relu) -> Dense(64, relu) -> Dense(32, relu)
    -> Dense(num_labels, sigmoid).

    Parameters
    ----------
    model_selected : str
        Checkpoint name passed to ``TFAutoModel.from_pretrained``.
    max_len : int
        Fixed token length of the ``input_ids`` / ``attention_mask`` inputs.
    learning_rate : float
        Learning rate for the Adam optimizer.
    num_labels : int, default 6
        Number of independent binary labels (width of the output layer).

    Returns
    -------
    tensorflow.keras.Model
        Compiled model taking ``[input_ids, attention_mask]`` and returning
        one probability in [0, 1] per label.
    """
    encoder = TFAutoModel.from_pretrained(model_selected)
    # Fine-tune the whole encoder, not just the classification head.
    for layer in encoder.layers:
        layer.trainable = True

    input_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

    # Pass the mask by keyword: TFAutoModel can resolve to different
    # architectures, so relying on positional argument order is fragile.
    encoded = encoder(input_ids, attention_mask=attention_mask)
    # Element 0 of the encoder output is the last hidden state; position 0
    # along the sequence axis is the leading special token ([CLS] for BERT).
    x = encoded[0][:, 0, :]
    x = tf.keras.layers.Dropout(0.1)(x)
    # Hidden layers need a non-linearity; without one the stack collapses
    # into a single linear projection.
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    # Sigmoid (not relu) on the output: each label is an independent binary
    # decision, and predict() then returns probabilities directly.
    output = tf.keras.layers.Dense(num_labels, activation='sigmoid')(x)

    model = Model(inputs=[input_ids, attention_mask], outputs=[output])

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                         beta_1=0.9,
                                         beta_2=0.999,
                                         epsilon=1e-07,
                                         amsgrad=False,
                                         name='Adam')
    # The output is already a probability, so from_logits must be False.
    # The default reduction is used; Reduction.NONE is meant for custom
    # training loops rather than compile()/fit().
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=False,
                                              name='binary_crossentropy')
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return model

In [15]:
# Checkpoint name and token length shared by the tokenizer and the model.
model_selected = 'bert-base-uncased'
max_len = 256

# Optimisation settings used when fitting the model.
learning_rate = 2e-5
batch_size = 4
epochs = 2

In [16]:
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = create_tokenizer(model_selected=model_selected)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [17]:
# Tokenise the cleaned training text into [input_ids, attention_mask].
x_train = data_tokenization(
    dataset=pre_train_df,
    col=feature_col[0],
    max_len=max_len,
    tokenizer=tokenizer,
)

In [18]:
# Label matrix: one row per training example, one column per target column.
y_train = pre_train_df[target_col].to_numpy()

In [19]:
# Build the compiled classifier and print its layer summary.
bert = bert_model(
    model_selected=model_selected,
    max_len=max_len,
    learning_rate=learning_rate,
)
bert.summary()

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [21]:
# Fine-tune the model on the tokenised training data, pinned to the first GPU.
with tf.device('/GPU:0'):
    bert.fit(
        x=x_train,
        y=y_train,
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
    )

Epoch 1/2
Epoch 2/2


In [22]:
# Tokenise the cleaned test text, and keep the ids for the submission file.
x_test = data_tokenization(
    dataset=pre_test_df,
    col=feature_col[0],
    max_len=max_len,
    tokenizer=tokenizer,
)
test_ids = pre_test_df['id']

In [23]:
# Score the test set and assemble the submission frame.
preds = bert.predict(x_test)
submiss_df = pd.DataFrame(preds, columns = target_col)
# Pair ids with predictions by row position (raw values) rather than by index
# label, so a non-default index on the test frame cannot misalign them or
# introduce NaNs.
submiss_df['id'] = test_ids.values
# Put `id` first, followed by the label columns.
# NOTE(review): presumably this matches sample_submission.csv; confirm.
submiss_df = submiss_df[['id'] + list(target_col)]

In [24]:
# Write the submission file (the file name was previously misspelled
# 'submissioin.csv').
submiss_df.to_csv('submission.csv', index = False, header = True)