In [1]:
# pandas supplies the DataFrame API used to load and inspect the tweet CSV files.
import pandas as pd 

import warnings
# Silence every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by third-party
# libraries — confirm that is intended.
warnings.filterwarnings('ignore')

#### Loading the data 

In [2]:
# Load only the columns used in this notebook: the tweet id and text, plus the
# `target` label, which is read from the training file only.
train_df = pd.read_csv(
    'train.csv',
    usecols=['id', 'text', 'target'],
)
test_df = pd.read_csv(
    'test.csv',
    usecols=['id', 'text'],
)

Show Examples:

In [3]:
# Preview the first rows of the test set to check the columns that were loaded.
test_df.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
# Preview the first rows of the training set, including the `target` label.
train_df.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# (rows, columns) of the training frame.
train_df.shape

(7613, 3)

#### We need to do some text cleaning (optional)
Some signs and special characters can be removed here. Cleaning text data before training is generally good practice, but BERT is a more advanced architecture and is not much affected if we skip this step.
BERT does not need extensive text cleaning because it ships with its own large WordPiece vocabulary (about 30,000 tokens for `bert-base-uncased`), so cleaning is not strictly necessary; removing special characters is still good practice.

In [6]:
%pip install text_hammer 

import text_hammer as th

def text_preprocessing(df,col_name):
    """Return a copy of ``df`` with the text in ``col_name`` normalised.

    Each value is converted to ``str`` and lower-cased, then passed through
    ``th.remove_emails``, ``th.remove_special_chars`` and
    ``th.remove_accented_chars`` in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text. It is not modified.
    col_name : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame in which only ``col_name`` differs from the input.
    """
    def _clean(value):
        # str() first so that non-string cells (e.g. NaN) do not break .lower().
        text = str(value).lower()
        text = th.remove_emails(text)
        text = th.remove_special_chars(text)
        return th.remove_accented_chars(text)

    # Work on a copy so the caller's frame keeps the raw text and the cell can
    # be re-run without cleaning already-cleaned data.
    cleaned = df.copy()
    # One pass over the column applies all four cleaning steps.
    cleaned[col_name] = cleaned[col_name].apply(_clean)
    return cleaned

#train_cleaned_df = text_preprocessing(train_df,'text')
#train_cleaned_df[train_cleaned_df.target == 0]
#train_df = train_cleaned_df.copy()

Note: you may need to restart the kernel to use updated packages.


Here a target of 1 means the tweet is about a real accident or disaster, and 0 means it is just an ordinary tweet that is not about one.

So far we have (optionally) cleaned our text data; now let's load our model.

#### Loading Pretrained BERT Model

In [7]:
from transformers import AutoTokenizer,TFBertModel

# The tokenizer and the encoder are loaded from the same pretrained checkpoint,
# so the name is written once and reused for both.
checkpoint = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert = TFBertModel.from_pretrained(checkpoint)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [8]:
# Sanity-check the tokenizer on a single sentence; the result holds
# `input_ids`, `token_type_ids` and `attention_mask`.
tokenizer('an example of bert fine-tuning in bert')

{'input_ids': [101, 2019, 2742, 1997, 14324, 2986, 1011, 17372, 1999, 14324, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

#### Convert Our text data into BERT input format 

In [9]:
# Length of the longest training tweet, counted in whitespace-separated words
# (this is a word count, not a count of tokenizer tokens).
word_counts = [len(tweet.split()) for tweet in train_df.text]
print("max len of tweets", max(word_counts))

max len of tweets 31


In [10]:
# Encode every training tweet into fixed-width TensorFlow tensors.
x_train = tokenizer(
    text=train_df.text.tolist(),
    # 'max_length' pads every row to exactly `max_length` tokens. `True` would
    # pad only to the longest tweet in this call, so the width would depend on
    # the data instead of always matching the model's 36-token input.
    padding='max_length',
    max_length=36,
    truncation=True,  # cut tweets that tokenize to more than `max_length` tokens
    return_tensors='tf')

print(x_train)


{'input_ids': <tf.Tensor: shape=(7613, 36), dtype=int32, numpy=
array([[  101,  2256, 15616, ...,     0,     0,     0],
       [  101,  3224,  2543, ...,     0,     0,     0],
       [  101,  2035,  3901, ...,     0,     0,     0],
       ...,
       [  101, 23290,  1012, ...,   102,     0,     0],
       [  101,  2610, 11538, ...,     0,     0,     0],
       [  101,  1996,  6745, ...,     0,     0,     0]])>, 'token_type_ids': <tf.Tensor: shape=(7613, 36), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(7613, 36), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}


In [11]:
# Shape check for the encoded token ids.
x_train['input_ids'].shape

TensorShape([7613, 36])

In [12]:
# Shape check for the attention mask; it should match the shape of `input_ids`.
x_train['attention_mask'].shape

TensorShape([7613, 36])

In [13]:
# Labels as a plain NumPy array, in the same row order as the encoded tweets.
y_train = train_df['target'].to_numpy()
y_train

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [14]:
# Number of training tweets per label value, to check the class balance.
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

#### Building the model architecture 

In [15]:
import tensorflow as tf
from keras import layers
from keras.optimizers import Adam

# Sequence length the model accepts; the tokenizer calls must produce this width.
max_length = 36

# Two integer inputs per example: the token ids and the padding mask.
input_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
input_mask = layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

# Use the pooled sentence representation, selected by name rather than by
# tuple position (index 0 is the last hidden state, index 1 the pooler output).
embeddings = bert(input_ids, attention_mask=input_mask).pooler_output

# Small classification head on top of the pooled BERT output.
out = layers.Dropout(0.1)(embeddings)
out = layers.Dense(128, activation='relu')(out)
out = layers.Dropout(0.1)(out)
out = layers.Dense(32, activation='relu')(out)

# Single sigmoid unit: one score in [0, 1] per tweet.
y = layers.Dense(1, activation='sigmoid')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)

# Fine-tune the encoder together with the head. The flag is set on the `bert`
# object itself so it does not depend on the layer's position in `model.layers`.
bert.trainable = True
# Because BERT's weights are being updated, the learning rate must be very small.

In [16]:
# Print the layer table (output shapes and parameter counts) of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 36)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 36)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 36,                                            

In [17]:
# Adam with a very small step size for fine-tuning BERT; the learning rate was
# taken from the Hugging Face website.
# NOTE(review): `decay` looks like the legacy Keras learning-rate decay argument
# (not weight decay), and newer Keras optimizers may reject it — confirm against
# the installed Keras version.
optimizer = Adam(learning_rate=6e-06, epsilon=1e-08, decay=0.01)

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reporting metric.
model.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [18]:
# Inputs are passed as a dict whose keys match the names of the model's two
# Input layers.
train_inputs = {name: x_train[name] for name in ('input_ids', 'attention_mask')}

# Train for two epochs in batches of 32, holding out 10% of the rows for validation.
train_history = model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
)

Epoch 1/2
Epoch 2/2


#### TESTING PHASE
In this phase we use the trained model to make predictions on the test set, which can then be submitted to the Kaggle competition.

In [19]:
# Display the test frame; pandas abbreviates the middle rows in the rendered output.
test_df

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,Storm in RI worse than last hurricane. My city...
3260,10868,Green Line derailment in Chicago http://t.co/U...
3261,10874,MEG issues Hazardous Weather Outlook (HWO) htt...


In [20]:
# Encode the test tweets exactly like the training tweets.
x_test = tokenizer(
    text=test_df.text.tolist(),
    # 'max_length' pads every row to exactly `max_length` tokens. `True` would
    # pad only to the longest test tweet, which can give a width different from
    # the 36 tokens the model's Input layers expect and make `predict` fail.
    padding='max_length',
    max_length=36,
    truncation=True,  # cut tweets that tokenize to more than `max_length` tokens
    return_tensors='tf')


In [21]:
# Inputs are passed as a dict whose keys match the names of the model's two
# Input layers. The model ends in a single sigmoid unit, so each prediction is
# a score between 0 and 1.
test_inputs = {name: x_test[name] for name in ('input_ids', 'attention_mask')}
predicted = model.predict(test_inputs)

