In [None]:
# Mount Google Drive at /content/gdrive so the CSV files read in later cells
# (all under /content/gdrive/...) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
! pip install sentencepiece
! pip install transformers
! pip install tensorflow-addons
! pip install tf-models-official
! pip install --upgrade tensorflow-hub
! pip install lime

Requirement already up-to-date: tensorflow-hub in /usr/local/lib/python3.7/dist-packages (0.11.0)


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from lime.lime_text import LimeTextExplainer


In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Tokenize texts into fixed-length rows of token ids.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings. A single string is treated as ONE text;
            without this guard it would be iterated character by character
            and produce one row per character.
        tokenizer: Object exposing ``tokenize(str) -> list`` and
            ``convert_tokens_to_ids(list) -> list``.
        max_len: Length of every encoded row, special tokens included.

    Returns:
        ``np.ndarray`` with one row of ``max_len`` ids per input text.

    Raises:
        ValueError: If ``max_len`` is smaller than 2 (no room for the two
            special tokens; rows would come out with inconsistent lengths).
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")
    if isinstance(texts, str):
        texts = [texts]

    all_tokens = []
    for text in texts:
        # Reserve two positions for the [CLS] / [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        # Right-pad with id 0 so every row has exactly max_len entries.
        token_ids += [0] * (max_len - len(input_sequence))
        all_tokens.append(token_ids)

    return np.array(all_tokens)

In [None]:
def build_model(transformer, max_len=512):
    """Build and compile a binary classifier on top of a transformer encoder.

    The model takes a ``(max_len,)`` int32 tensor of token ids, runs it
    through ``transformer``, keeps element 0 of the transformer's output,
    slices position 0 along the sequence axis (the ``[CLS]`` slot when the
    ids come from ``bert_encode``) and feeds it to a single sigmoid unit.

    Args:
        transformer: Callable Keras-compatible encoder whose output's first
            element is indexable as ``[:, 0, :]``.
        max_len: Number of token ids per input row.

    Returns:
        A compiled ``tf.keras`` ``Model`` (Adam, learning rate 1e-5,
        binary cross-entropy loss, accuracy metric).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(
        Adam(learning_rate=1e-5),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Load the train and test tweet tables from the mounted Drive folder.
train_data = pd.read_csv('/content/gdrive/My Drive/w266_bot_id_final_project/bot_id_pan/train_data.csv')
test_data = pd.read_csv('/content/gdrive/My Drive/w266_bot_id_final_project/bot_id_pan/test_data.csv')

# Only the last expression of a cell is rendered, so a mid-cell `.head()`
# shows nothing. Preview the frame that was just loaded; inspect
# `train_data.head()` in its own cell if needed.
test_data.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,tweets,bot
0,0,aba047cddb0cac75b2c1cec87491579d,Time Flies: 10 ‘Best-of’ Posts You Missed in S...,bot
1,1,aba047cddb0cac75b2c1cec87491579d,Time Flies: 10 ‘Best-of’ Posts You Missed in S...,bot
2,2,aba047cddb0cac75b2c1cec87491579d,Time Flies: 10 ‘Best-of’ Posts You Missed in S...,bot
3,3,aba047cddb0cac75b2c1cec87491579d,Time Flies: 10 ‘Best-of’ Posts You Missed in S...,bot
4,4,aba047cddb0cac75b2c1cec87491579d,Time Flies: 10 ‘Best-of’ Posts You Missed in S...,bot


In [None]:
# Load the pretrained DistilBERT encoder and its tokenizer; both are created
# from the same 'distilbert-base-cased' checkpoint name.
transformer_layer = transformers.TFDistilBertModel.from_pretrained('distilbert-base-cased')
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-cased')

Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_projector', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
# Wrap the pretrained encoder in the sigmoid classifier, using 160-token
# inputs (the same length later passed to bert_encode), and print the layers.
model = build_model(transformer_layer, max_len=160)
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 160)]             0         
_________________________________________________________________
tf_distil_bert_model_3 (TFDi TFBaseModelOutput(last_hi 65190912  
_________________________________________________________________
tf.__operators__.getitem_3 ( (None, 768)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 769       
Total params: 65,191,681
Trainable params: 65,191,681
Non-trainable params: 0
_________________________________________________________________


In [None]:
def _prepare_labeled_tweets(frame):
    """Return a copy of the last two columns of ``frame`` with labels recoded.

    In the ``bot`` column, 'bot' becomes 1 and 'human' becomes 0; any other
    value is left unchanged. Working on an explicit copy with a single column
    assignment avoids the chained-assignment SettingWithCopyWarning (and the
    silent no-op it turns into under pandas copy-on-write).
    """
    label_to_id = {'bot': 1, 'human': 0}
    subset = frame[frame.columns[-2:]].copy()
    subset['bot'] = subset['bot'].map(lambda label: label_to_id.get(label, label))
    return subset


batch_1 = _prepare_labeled_tweets(train_data)
batch_2 = _prepare_labeled_tweets(test_data)

# Only the last expression of the cell is rendered.
batch_2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)


Unnamed: 0,tweets,bot
0,RT @Colmogorman: Watch to the end. Fairly shoc...,0
1,So I forgot it was my Uncle’s anniversary toda...,0
2,RT @CuteBabyAnimals: This hedgehog got stuck i...,0
3,RT @ubiquitousnjh: I can’t stop laughing https...,0
4,"RT @TarynDeVere: I have been raped, twice.\nI ...",0


In [None]:
# Training labels come straight from the recoded `bot` column.
train_labels = batch_1["bot"].values

# Encode the tweet text of both splits into 160-id rows.
train_input = bert_encode(texts=batch_1["tweets"].values, tokenizer=tokenizer, max_len=160)
test_input = bert_encode(texts=batch_2["tweets"].values, tokenizer=tokenizer, max_len=160)

In [None]:
# Fine-tune the whole model for one epoch on the encoded training tweets.
# NOTE(review): the token ids are cast to float32 here although build_model
# declares an int32 input — confirm the cast is intended.
train_history = model.fit(train_input.astype(np.float32), train_labels.astype(np.float32), epochs=1)


# Model outputs for the encoded test tweets (one sigmoid value per row).
test_pred = model.predict(test_input.astype(np.float32), verbose=1)



In [None]:
from sklearn.metrics import accuracy_score

# Round the sigmoid outputs to hard 0/1 predictions.
target = test_pred.round().astype(int)

# Flatten the prediction array into a plain list (same element order as
# iterating rows, then columns).
final = target.ravel().tolist()

# Keep the true labels as a plain list, as before, so the metric receives
# ordinary Python sequences on both sides.
original = list(batch_2.bot.values)

accuracy_distillbert = accuracy_score(original, final)
print(accuracy_distillbert)


0.845375


In [None]:
#https://www.kaggle.com/xhlulu/disaster-nlp-distilbert-in-tf?select=test.csv

## Run LIME explanations of the DistilBERT model for tweets 150, 741 and 291, all taken from X_test_base

In [None]:
# Every pre-split feature / label CSV is read from this one Drive folder.
_ALL_DATA_DIR = '/content/gdrive/MyDrive/w266_bot_id_final_project/bot_id_pan/all_data'


def _read_split(name):
    """Read ``<name>.csv`` from ``_ALL_DATA_DIR`` into a DataFrame."""
    return pd.read_csv(f'{_ALL_DATA_DIR}/{name}.csv')


X_train_base = _read_split('X_train')
X_dev_base = _read_split('X_dev')
X_test_base = _read_split('X_test')
X_earlybird_base = _read_split('X_earlybird')
y_train_base = _read_split('y_train')
y_dev_base = _read_split('y_dev')
y_test_base = _read_split('y_test')
# The variable is spelled `y_early_bird_base` while the file is y_earlybird.csv;
# the name is kept because later cells refer to it.
y_early_bird_base = _read_split('y_earlybird')

In [None]:
# Early-bird split: keep only the text / label column (these stay pandas Series).
X_earlybird_base = X_earlybird_base['tweets']
y_early_bird_base = y_early_bird_base['bot']

# Train / dev / test splits: keep the text or label column and convert it
# to a NumPy array in one step.
X_train_base = np.array(X_train_base['tweets'])
X_dev_base = np.array(X_dev_base['tweets'])
X_test_base = np.array(X_test_base['tweets'])
y_train_base = np.array(y_train_base['bot'])
y_dev_base = np.array(y_dev_base['bot'])
y_test_base = np.array(y_test_base['bot'])

In [None]:
# Class order matters: index 0 is 'human' and index 1 is 'bot', which is the
# column order bert_predict returns to LIME.
class_names = ['human', 'bot']
# Token length used when encoding text for the model (the model above was
# built with max_len=160).
max_length = 160
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# bert_encode iterates over its first argument, so a single tweet must be
# wrapped in a list; a bare string would be encoded one character per row.
bert_encode([X_test_base[150]], tokenizer, max_length)

In [None]:
def bert_predict(texts):
  """Classifier function for LIME: class probabilities for raw texts.

  Encodes ``texts`` with ``bert_encode`` (using the notebook-level
  ``tokenizer`` and ``max_length``), runs the notebook-level ``model`` and
  returns one ``[P(human), P(bot)]`` row per text, matching the order of
  ``class_names``.

  The sigmoid outputs are returned as probabilities rather than rounded to
  0/1: LIME fits its local surrogate on prediction probabilities, and hard
  labels discard the signal it needs to weight individual words.
  """
  _text_data = bert_encode(texts, tokenizer, max_length)
  # Flatten the model output to one scalar per text before building rows, which
  # also avoids calling float() on 1-element arrays.
  bot_probs = np.asarray(model.predict(_text_data), dtype=float).reshape(-1)
  return np.array([[1.0 - p, p] for p in bot_probs.tolist()])

In [None]:
def make_bert_interpretable(tweet):
  """Explain the model's prediction for one tweet with LIME and render it.

  Uses the notebook-level ``explainer`` with ``bert_predict`` as the
  classifier function, explains only the top predicted label, allows up to
  ``max_length`` features, and returns whatever ``show_in_notebook`` returns.
  """
  explanation = explainer.explain_instance(
      tweet,
      bert_predict,
      top_labels=1,
      num_features=max_length,
  )
  rendered = explanation.show_in_notebook(text=True)
  return rendered

In [None]:
# LIME explanation for the tweet at index 150 of X_test_base.
make_bert_interpretable(X_test_base[150])

In [None]:
# LIME explanation for the tweet at index 741 of X_test_base.
make_bert_interpretable(X_test_base[741])

In [None]:
# LIME explanation for the tweet at index 291 of X_test_base.
make_bert_interpretable(X_test_base[291])

In [None]:
def wrong_prediction_distill_bert(model,X_data, X_data_encoded, Y_data):
  """Return the entries of ``X_data`` whose rounded prediction differs from the label.

  Args:
    model: Object with ``predict(X_data_encoded)`` returning one score per row.
    X_data: Samples (e.g. raw tweets), indexed along the first axis.
    X_data_encoded: Model-ready encoding of ``X_data``, in the same row order.
    Y_data: True labels, one per row of ``X_data``.

  Returns:
    ``X_data`` restricted to the misclassified rows (transposed, as before;
    a no-op for 1-D input).

  Raises:
    ValueError: If samples, predictions and labels differ in length.
  """
  samples = np.asarray(X_data)
  # Flatten both sides to 1-D: comparing (n, 1) predictions with (n,) labels
  # would broadcast to an (n, n) mask instead of an element-wise comparison.
  predicted = np.round(np.asarray(model.predict(X_data_encoded))).reshape(-1)
  actual = np.asarray(Y_data).reshape(-1)
  if not (len(samples) == len(predicted) == len(actual)):
    raise ValueError(
        "X_data, predictions and Y_data must have the same length, got "
        f"{len(samples)}, {len(predicted)} and {len(actual)}"
    )
  wrong_predictions = samples[predicted != actual].T
  return wrong_predictions

In [None]:
# Encode X_test_base itself so the predictions line up row for row with the
# tweets and labels being compared; `test_input` was encoded from `batch_2`
# (test_data.csv), which is not guaranteed to share X_test_base's row order.
wrong_vals_distill_bert = wrong_prediction_distill_bert(
    model,
    X_test_base,
    bert_encode(X_test_base, tokenizer, max_length),
    y_test_base,
)



In [None]:
# NOTE: despite its name, `distillbert_test_predictions` holds the ENCODED
# token ids of X_test_base (the model input), not predictions.
distillbert_test_predictions = bert_encode(X_test_base, tokenizer, max_length)
# Rounded model outputs for the encoded test tweets.
predictions_distillbert_test = np.round(model.predict(distillbert_test_predictions))

In [None]:

# NOTE(review): classification_report is imported but not used in the cells
# visible here.
from sklearn.metrics import classification_report, confusion_matrix
# Confusion matrix of true test labels (first argument) against the rounded
# DistilBERT predictions (second argument).
print(confusion_matrix(y_test_base, predictions_distillbert_test))


In [None]:
# Wrap the misclassified test tweets in a DataFrame and write them to Drive
# as a CSV for later inspection.
wrong_predictions_distill_bert = pd.DataFrame(wrong_vals_distill_bert)
wrong_predictions_distill_bert.to_csv('/content/gdrive/MyDrive/w266_bot_id_final_project/bot_id_pan/wrong_test_distillbert_correct.csv')