In [0]:
!pip install bert-for-tf2

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/35/5c/6439134ecd17b33fe0396fb0b7d6ce3c5a120c42a4516ba0e9a2d6e43b25/bert-for-tf2-0.14.4.tar.gz (40kB)
[K     |████████                        | 10kB 19.5MB/s eta 0:00:01[K     |████████████████▏               | 20kB 3.9MB/s eta 0:00:01[K     |████████████████████████▎       | 30kB 5.0MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 3.1MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [0]:
import tensorflow_hub as hub
import pandas as pd
import seaborn as sns
import numpy as np
import tensorflow as tf
import re
import string
from tqdm import tqdm

import bert
from bert.tokenization.bert_tokenization import FullTokenizer
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from sklearn.metrics import confusion_matrix, classification_report


import os

In [0]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [0]:
def remove_url(text):
    return re.sub(r"https?:\/\/t.co\/[A-Za-z0-9]+","",text)
def remove_html(text):
    return re.sub(r"<.*?>","",text)
def remove_punctuation(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

In [0]:
train_df['text'] = train_df['text'].apply(lambda x:remove_url(x))
test_df['text'] = test_df['text'].apply(lambda x:remove_url(x))
train_df['text'] = train_df['text'].apply(lambda x:remove_html(x))
test_df['text'] = test_df['text'].apply(lambda x:remove_html(x))
train_df['text'] = train_df['text'].apply(lambda x:remove_punctuation(x))
test_df['text'] = test_df['text'].apply(lambda x:remove_punctuation(x))
train_df['text'] = train_df['text'].apply(lambda x:x.lower())
test_df['text'] = test_df['text'].apply(lambda x:x.lower())

# Downloading Bert Model

In [0]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

--2020-05-13 12:46:49--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.76.128, 2a00:1450:400c:c0c::80
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.76.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2020-05-13 12:46:56 (66.6 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [407727028/407727028]



In [0]:
!unzip uncased_L-12_H-768_A-12.zip

Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [0]:
os.makedirs('model', exist_ok = True)

In [0]:
!mv uncased_L-12_H-768_A-12/ model

In [0]:
bert_model_name = "uncased_L-12_H-768_A-12"

bert_ckpt_dir = os.path.join('model',bert_model_name)
bert_ckpt_file = os.path.join(bert_ckpt_dir,'bert_model.ckpt')
bert_config_file = os.path.join(bert_ckpt_dir, 'bert_config.json')

# Pre Processing of the text
  * Tokenizing the text
  * Padding the seqence to max length

In [0]:
class DisasterClassifier:
  DATA_COLUMN = "text"
  LABEL_COLUMN = "target"

  def __init__(self, train, test, max_seq_len, tokenizer:FullTokenizer):
    self.tokenizer = tokenizer
    self.max_seq_len = 0

    self.x_train, self.y_train = self._prepare_train(train)
    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    self.x_train  = self._pad(self.x_train)
    self.x_test = self._prepare_test(test)
    self.x_test = self._pad(self.x_test)


  def _prepare_train(self, data):
    x, y = [] , []
    for _, row in tqdm(data.iterrows()):
      text, label =\
       row[DisasterClassifier.DATA_COLUMN], \
       row[DisasterClassifier.LABEL_COLUMN]
      
      tokens = self.tokenizer.tokenize(text)
      tokens = ["[CLS]"] + tokens + ["[SEP]"]
      token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

      self.max_seq_len = max(self.max_seq_len, len(token_ids))
      x.append(token_ids)
      y.append(label)
    return np.array(x), np.array(y)

  def _prepare_test(self, data):
    x = []
    for _, row in tqdm(data.iterrows()):
      text = row[DisasterClassifier.DATA_COLUMN]
      
      tokens = self.tokenizer.tokenize(text)
      tokens = ["[CLS]"] + tokens + ["[SEP]"]
      token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
      x.append(token_ids)
    return np.array(x)
  
  def _pad(self, ids):
    x = []
    for input_ids in tqdm(ids):
      cut = min(len(input_ids), self.max_seq_len - 2)
      input_ids = input_ids[:cut]
      input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
      x.append(np.array(input_ids))      
    
    return np.array(x)

In [0]:
tokenizer = FullTokenizer(vocab_file= os.path.join(bert_ckpt_dir, "vocab.txt"))

In [0]:
data = DisasterClassifier(train_df, test_df, max_seq_len= 64, tokenizer=tokenizer)

7613it [00:03, 2087.24it/s]
100%|██████████| 7613/7613 [00:00<00:00, 144478.04it/s]
3263it [00:01, 2153.59it/s]
100%|██████████| 3263/3263 [00:00<00:00, 116122.91it/s]


In [0]:
def create_model(max_seq_len, bert_ckpt_dir, bert_config_file):
  with tf.io.gfile.GFile(bert_config_file, "r") as reader:
      bc = StockBertConfig.from_json_string(reader.read())
      bert_params = map_stock_config_to_params(bc)
      bert_params.adapter_size = None
      bert = BertModelLayer.from_params(bert_params, name="bert")

  input_ids = tf.keras.layers.Input(shape = (max_seq_len, ), dtype= tf.int32, name= 'input_ids')
  bert_output = bert(input_ids)

  bert_output = bert_output[:,0,:]

  drop_out = tf.keras.layers.Dropout(0.5)(bert_output)
  d_out = tf.keras.layers.Dense(768, activation='tanh')(drop_out)
  logits = tf.keras.layers.Dropout(0.5)(d_out)
  out = tf.keras.layers.Dense(2, activation='softmax')(logits)

  model = tf.keras.models.Model(inputs = input_ids, outputs = out)
  model.summary()

  load_stock_weights(bert, bert_ckpt_file)

  return model

In [0]:
model = create_model(data.max_seq_len, bert_ckpt_file, bert_config_file)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 55)]              0         
_________________________________________________________________
bert (BertModelLayer)        (None, 55, 768)           108890112 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dropout (Dropout)            (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 768)               590592    
_________________________________________________________________
dropout_1 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1538  

In [0]:
model.compile(
  optimizer=tf.keras.optimizers.Adam(1e-5),
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

In [0]:
data.x_train

array([[  101,  2256, 15616, ...,     0,     0,     0],
       [  101,  3224,  2543, ...,     0,     0,     0],
       [  101,  2035,  3901, ...,     0,     0,     0],
       ...,
       [  101, 23290,  2683, ...,     0,     0,     0],
       [  101,  2610, 11538, ...,     0,     0,     0],
       [  101,  1996,  6745, ...,     0,     0,     0]])

In [0]:
data.y_train.shape

(7613,)

In [0]:
history = model.fit(
  x=data.x_train, 
  y=data.y_train,
  validation_split=0.1,
  batch_size=16,
  shuffle=True,
  epochs=5,
  verbose = 1
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
_, train_acc = model.evaluate(data.x_train, data.y_train)
print("train acc", train_acc)

train acc 0.895442008972168


In [0]:
y_tain_pred = model.predict(data.x_train).argmax(axis=-1)

In [0]:
y_pred = model.predict(data.x_test).argmax(axis=-1)

In [0]:
sample_sub=pd.read_csv('sample_submission.csv')
y_pre=np.round(y_pred).astype(int).reshape(3263)
sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
sub.to_csv('submission.csv',index=False)

In [0]:
model.save("BERT_MODEL_1.h5")

In [0]:
print(classification_report(data.y_train, y_tain_pred))

              precision    recall  f1-score   support

           0       0.87      0.96      0.91      4342
           1       0.94      0.81      0.87      3271

    accuracy                           0.90      7613
   macro avg       0.90      0.88      0.89      7613
weighted avg       0.90      0.90      0.89      7613



In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
!ls /content/gdrive/My\ Drive

 10-ps.JPG
 10_schematics_1.JPG
 10_simulation.JPG
 11089.pdf
'12 Oct '
'18UCC159 (1).pdf'
'18UCC159 (2).pdf'
'18UCC159 (3).pdf'
 18ucc159_company
 18UCC159_Exp1.pdf
 18UCC159_Exp2.pdf
 18UCC159.pdf
 18ucc159_program
'18uec049@lnmiit.ac.in 2018-10-19 14:28'
'1st Semester'
'2nd Year.pdf'
'AI ML WORKSHOp'
 ass1.sas
 ass2.sas
 assembler.py
 assembly
 Assignment_5
 Backup
 Classroom
'Colab Notebooks'
'Course List For Summer 2019.gsheet'
 dandiyanight
 Date_Conversion
'DCS LAB'
 Deepvel
 DMS.pdf
 DMS_SOLUTION.pdf
'Document (1).gdoc'
'Document (2).gdoc'
 Document.gdoc
'ENVIRONMENT ECOLOGY & BIOLOGY.gslides'
'FEDERATED LEARNING.gdoc'
'Getting started.pdf'
 image
 JDBC
 lab8.docx
 lab9_18ucc159
'lab9 sol'
 lab.c
'Microelectronic_Circuits_Sedra_Smith_7th (1).pdf'
 Microelectronic_Circuits_Sedra_Smith_7th.pdf
 MLCOURSE.AI
 mysql-connector-java-8.0.18.jar
 primeno.txt
 schedular.c
 screamshorts
'SEM 1'
 SSC_SOFTWERE
'Study Material'
'Study material (exclusively for ECE students)'
 thread
 Untitle

In [0]:
model_save_name = 'BERT_Diaster_classificatoin.h5'
path = F"/content/gdrive/My Drive/{model_save_name}" 
model.save(path)

In [0]:
def create_model_2(max_seq_len, bert_ckpt_dir, bert_config_file):
  with tf.io.gfile.GFile(bert_config_file, "r") as reader:
      bc = StockBertConfig.from_json_string(reader.read())
      bert_params = map_stock_config_to_params(bc)
      bert_params.adapter_size = None
      bert = BertModelLayer.from_params(bert_params, name="bert")

  input_ids = tf.keras.layers.Input(shape = (max_seq_len, ), dtype= tf.int32, name= 'input_ids')
  bert_output = bert(input_ids)
  d_out = bert_output[:,0,:]
  d_out = tf.keras.layers.Dense(768, activation='relu',
                                activity_regularizer= tf.keras.regularizers.l2())(d_out)
  logits = tf.keras.layers.Dropout(0.3)(d_out)
  d_out = tf.keras.layers.Dense(256, activation='relu', 
                                activity_regularizer= tf.keras.regularizers.l2())(logits)
  logits = tf.keras.layers.Dropout(0.3)(d_out)
  out = tf.keras.layers.Dense(2, activation='softmax')(logits)

  model = tf.keras.models.Model(inputs = input_ids, outputs = out)
  model.summary()

  load_stock_weights(bert, bert_ckpt_file)

  return model

In [0]:
model_2 = create_model_2(data.max_seq_len, bert_ckpt_file, bert_config_file)

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 55)]              0         
_________________________________________________________________
bert (BertModelLayer)        (None, 55, 768)           108890112 
_________________________________________________________________
tf_op_layer_strided_slice_7  [(None, 768)]             0         
_________________________________________________________________
dense_12 (Dense)             (None, 768)               590592    
_________________________________________________________________
dropout_10 (Dropout)         (None, 768)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 256)               196864    
_________________________________________________________________
dropout_11 (Dropout)         (None, 256)               0   

In [0]:
model_2.compile(
  optimizer=tf.keras.optimizers.Adam(1e-5),
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

In [0]:
history = model_2.fit(
  x=data.x_train, 
  y=data.y_train,
  validation_split=0.1,
  batch_size=32,
  shuffle=True,
  epochs= 5,
  verbose = 1
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
%%time
y_pred = model_2.predict(data.x_test).argmax(axis=-1)

CPU times: user 11.6 s, sys: 1.76 s, total: 13.4 s
Wall time: 30.6 s


In [0]:
sample_sub=pd.read_csv('sample_submission.csv')
y_pre=np.round(y_pred).astype(int).reshape(3263)
sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
sub.to_csv('submission2.csv',index=False)

In [0]:
sub.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


* We can do some filtering on the basis of keyword as we see in the EDA notebook 
* First off all we retrive the probability 

In [0]:
y_pred = model.predict(data.x_test).argmax(axis=-1)

In [0]:
sample_sub=pd.read_csv('sample_submission.csv')
y_pre=np.round(y_pred).astype(int).reshape(3263)
sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})

In [0]:
train_df.fillna('None', inplace=True)

In [0]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent this photo from ruby alaska as s...,1


In [0]:
ag = train_df.groupby('keyword').agg({'text':np.size, 'target':np.mean}).rename(columns={'text':'Count', 'target':'Disaster Probability'})
ag.sort_values('Disaster Probability', ascending=False).head(10)

Unnamed: 0_level_0,Count,Disaster Probability
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
wreckage,39,1.0
debris,37,1.0
derailment,39,1.0
outbreak,40,0.975
oil%20spill,38,0.973684
typhoon,38,0.973684
suicide%20bombing,33,0.969697
suicide%20bomber,31,0.967742
bombing,29,0.931034
suicide%20bomb,35,0.914286


In [0]:
keyword_list = list(ag[(ag['Count']>2) & (ag['Disaster Probability']>=0.9)].index)
keyword_list


['bombing',
 'debris',
 'derailment',
 'nuclear%20disaster',
 'oil%20spill',
 'outbreak',
 'rescuers',
 'suicide%20bomb',
 'suicide%20bomber',
 'suicide%20bombing',
 'typhoon',
 'wreckage']

In [0]:
ids = test_df['id'][test_df.keyword.isin(keyword_list)].values
sub['target'][sub['id'].isin(ids)] = 1
sub.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [0]:
sub_id = np.array(sub.target.values)

In [0]:
print(type(sub_id))

<class 'numpy.ndarray'>


In [0]:
print(type(y_pred))

<class 'numpy.ndarray'>


In [0]:
y_pred[:5]

array([1, 1, 1, 1, 1])

In [0]:
count = 0
for i in range(len(sub_id)):
  if sub_id[i] != y_pred[i]:
    count = count + 1
print(count)

8


In [0]:
print(sub_id[0:25])
print(y_pred[0:25])

[1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0]
[1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0]
