# DistilBERT 

In [None]:
# Mount Google Drive inside the Colab runtime; the CSV files read later in this
# notebook live under /content/gdrive. This cell only works on Google Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
! pip install sentencepiece
! pip install transformers
! pip install tensorflow-addons
! pip install tf-models-official
! pip install --upgrade tensorflow-hub

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 6.3MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 6.9MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 23.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers


# DistilBERT ENCODER Helper

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into a fixed-width matrix of token ids.

    Each text is tokenized, truncated so that the two special tokens still fit,
    wrapped as ``[CLS] ... [SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` (returns a list of tokens)
            and ``convert_tokens_to_ids(tokens)`` (returns a list of ints).
        max_len: Width of every output row, special tokens included.
            Must be at least 2 so that ``[CLS]`` and ``[SEP]`` fit.

    Returns:
        Integer ``np.ndarray`` of shape ``(number of texts, max_len)``.
        An empty ``texts`` yields an array of shape ``(0, max_len)``.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        # With less room than the two special tokens the rows would come out
        # longer than max_len (and of unequal length), so fail early instead.
        raise ValueError(f"max_len must be >= 2 to fit [CLS] and [SEP], got {max_len}")

    all_tokens = []

    for text in texts:
        # Reserve two positions for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        # Padding uses id 0 -- presumably the [PAD] id of the BERT-style
        # vocabulary in use; verify before switching to another tokenizer.
        tokens = list(tokenizer.convert_tokens_to_ids(input_sequence)) + [0] * pad_len
        all_tokens.append(tokens)

    if not all_tokens:
        # np.array([]) would be a 1-D float array; keep the 2-D integer contract.
        return np.empty((0, max_len), dtype=int)

    return np.array(all_tokens)

# Building the DistilBERT Model (Helper)

In [None]:
def build_model(transformer, max_len=512, learning_rate=1e-5):
    """Build and compile a binary classifier on top of a transformer encoder.

    The model takes a batch of token-id rows, runs them through ``transformer``,
    takes the hidden vector at sequence position 0 and feeds it to a single
    sigmoid unit.

    Args:
        transformer: Callable Keras-compatible encoder whose output, indexed
            with ``[0]``, is a rank-3 tensor (it is sliced as ``[:, 0, :]``).
        max_len: Number of token ids per input row; must equal the ``max_len``
            used when encoding the inputs.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``Model`` using binary cross-entropy loss and tracking accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # Element 0 of the transformer output is the per-token hidden states.
    sequence_output = transformer(input_word_ids)[0]
    # Position 0 is where bert_encode places the [CLS] token.
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases no longer accept.
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

# Loading Data

In [None]:
# Read the training and test splits from the mounted Google Drive folder.
train_data = pd.read_csv('/content/gdrive/MyDrive/all_data/train_data.csv')
test_data = pd.read_csv('/content/gdrive/MyDrive/all_data/test_data.csv')

# Only the last expression of a cell is rendered, so preview the frame that was
# loaded last (the original previewed train_data here by mistake).
test_data.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,tweets,bot
5000,5000,b3bca488109c80a6f37419fb4933558c,"I learned law so well, the day I graduated I s...",bot
5001,5001,b3bca488109c80a6f37419fb4933558c,Great Review of The Ruins of Gorlan by John Fl...,bot
5002,5002,b3bca488109c80a6f37419fb4933558c,"When Life Gives You Questions, Google has Answ...",bot
5003,5003,b3bca488109c80a6f37419fb4933558c,If you're listening to a rock star in order to...,bot
5004,5004,b3bca488109c80a6f37419fb4933558c,Hilarious What Went Wrong T-shirt - Love It! ...,bot


# Load Pretrained DistilBERT and Build the Classifier

In [None]:
# The encoder weights and the tokenizer are fetched from the same pretrained
# checkpoint, so the checkpoint name is spelled once and shared by both calls.
checkpoint_name = 'distilbert-base-uncased'
transformer_layer = transformers.TFDistilBertModel.from_pretrained(checkpoint_name)
tokenizer = transformers.DistilBertTokenizer.from_pretrained(checkpoint_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
# Wrap the pretrained encoder with the classification head defined in build_model.
# max_len=160 fixes the model's input width, so the inputs must be encoded with
# the same max_len.
model = build_model(transformer_layer, max_len=160)
# Print the layer/parameter table as a quick check of the assembled architecture.
model.summary()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 160)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist TFBaseModelOutput(last_hi 66362880  
_________

# Converting Labels to Binary

In [None]:
def encode_bot_labels(frame):
    """Return the last two columns of ``frame`` with ``bot`` encoded as 1/0.

    The result is an independent copy, so the source frame is never modified
    and no chained assignment on a slice is needed (the original cell triggered
    pandas' SettingWithCopyWarning that way).

    Args:
        frame: DataFrame whose last two columns include a ``bot`` column
            holding the strings ``'bot'`` and ``'human'``.

    Returns:
        A new DataFrame with those two columns, where ``bot`` is an integer
        column: 1 for ``'bot'`` and 0 for ``'human'``.

    Raises:
        ValueError: If ``bot`` contains any value other than the two labels.
    """
    subset = frame[frame.columns[-2:]].copy()
    encoded = subset["bot"].map({"bot": 1, "human": 0})

    unmapped = encoded.isna()
    if unmapped.any():
        # Fail here rather than later, when the labels are cast to float.
        unexpected = subset.loc[unmapped, "bot"].unique().tolist()
        raise ValueError(f"Unexpected values in 'bot' column: {unexpected}")

    subset["bot"] = encoded.astype(int)
    return subset


batch_1 = encode_bot_labels(train_data)
batch_2 = encode_bot_labels(test_data)

batch_2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)


Unnamed: 0,tweets,bot
5000,Yellows building. HRRR did a good job with thi...,0
5001,RT @AWxNYC: #SNOW starting to stick to the roa...,0
5002,@mattlanza I’ll prob sleep thru most of it hah,0
5003,@KevinMyattWx Good ole weather,0
5004,RT @_CTWeather: @MikeTFox5 @dougkammerer @hbwx...,0


# Encode Inputs, Train, and Predict

In [None]:
# Encode the tweets of the training and test frames (in that order) into
# fixed-width token-id matrices; 160 matches the max_len the model was built with.
train_input, test_input = (
    bert_encode(frame.tweets.values, tokenizer, max_len=160)
    for frame in (batch_1, batch_2)
)
# Binary targets for training, taken from the label-encoded training frame.
train_labels = batch_1.bot.values

In [None]:
# Token ids are fed as int32 to match the dtype declared on the model's Input
# layer (the original cast them to float32); labels are cast to float32 for the
# binary cross-entropy loss.
train_history = model.fit(train_input.astype(np.int32), train_labels.astype(np.float32), epochs=1)

# Sigmoid outputs for the held-out tweets, one row per tweet.
test_pred = model.predict(test_input.astype(np.int32), verbose=1)



# Accuracy Score

In [None]:
from sklearn.metrics import accuracy_score

# Round the sigmoid outputs to the nearest integer to get 0/1 class predictions.
target = test_pred.round().astype(int)

# Flatten the per-sample prediction rows into one flat list.
final = [value for row in target for value in row]
# Ground-truth labels, in the same order as the encoded test inputs.
original = list(batch_2.bot.values)

accuracy_score(original, final)