In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
import re
warnings.filterwarnings('ignore')

In [2]:
import nltk

In [3]:
dataset = pd.read_csv('./data/Kaggle/train.csv',delimiter=',',\
                           names=['id','keyword','location', 'text','target'])
dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,id,keyword,location,text,target
1,1,,,Our Deeds are the Reason of this #earthquake M...,1
2,4,,,Forest fire near La Ronge Sask. Canada,1
3,5,,,All residents asked to 'shelter in place' are ...,1
4,6,,,"13,000 people receive #wildfires evacuation or...",1


In [4]:
# Drop Id, Keyword, Location
dataset = dataset.drop(labels=['id', 'keyword','location'], axis=1)
dataset.head()

Unnamed: 0,text,target
0,text,target
1,Our Deeds are the Reason of this #earthquake M...,1
2,Forest fire near La Ronge Sask. Canada,1
3,All residents asked to 'shelter in place' are ...,1
4,"13,000 people receive #wildfires evacuation or...",1


In [5]:
def clean(tweet):
    # Special characters
    tweet = re.sub(r"\x89Û_", "", tweet)
    tweet = re.sub(r"\x89ÛÒ", "", tweet)
    tweet = re.sub(r"\x89ÛÓ", "", tweet)
    tweet = re.sub(r"\x89ÛÏWhen", "When", tweet)
    tweet = re.sub(r"\x89ÛÏ", "", tweet)
    tweet = re.sub(r"China\x89Ûªs", "China's", tweet)
    tweet = re.sub(r"let\x89Ûªs", "let's", tweet)
    tweet = re.sub(r"\x89Û÷", "", tweet)
    tweet = re.sub(r"\x89Ûª", "", tweet)
    tweet = re.sub(r"\x89Û\x9d", "", tweet)
    tweet = re.sub(r"å_", "", tweet)
    tweet = re.sub(r"\x89Û¢", "", tweet)
    tweet = re.sub(r"\x89Û¢åÊ", "", tweet)
    tweet = re.sub(r"fromåÊwounds", "from wounds", tweet)
    tweet = re.sub(r"åÊ", "", tweet)
    tweet = re.sub(r"åÈ", "", tweet)
    tweet = re.sub(r"JapÌ_n", "Japan", tweet)    
    tweet = re.sub(r"Ì©", "e", tweet)
    tweet = re.sub(r"å¨", "", tweet)
    tweet = re.sub(r"SuruÌ¤", "Suruc", tweet)
    tweet = re.sub(r"åÇ", "", tweet)
    tweet = re.sub(r"å£3million", "3 million", tweet)
    tweet = re.sub(r"åÀ", "", tweet)
     
    # Remove http
    tweet = re.sub(r"http[^\s]+","", tweet)
    
    # Remove @abc
    tweet = re.sub(r"@[^\s]+", "", tweet)
    
    return tweet

In [6]:
# Drop first row
dataset = dataset.drop(index=0)
# Clean data
dataset['text_cleaned'] = dataset['text'].apply(lambda s : clean(s))

In [None]:
dataset.head()

In [None]:
from nltk.tokenize import TweetTokenizer
tt = TweetTokenizer()
dataset['tokenized'] = dataset['text_cleaned'].apply(tt.tokenize)
dataset.head()

In [None]:
wordList = sum(dataset['tokenized'].values,[])
wordList

In [None]:
freDist = nltk.FreqDist(wordList)

In [None]:
freDist

In [None]:
freDist.keys()

### Try Tokenized BERT Embeding and Create Model

In [7]:
def pad_token_list(sample):
    # Find the sentence with the max length
    max_len = 0
    for token_list in sample:
        if len(token_list) > max_len:
            max_len = len(token_list)
    # Adjust every sentence to the same length
    padded = np.array([token_list + [0]*(max_len-len(token_list)) for token_list in sample])
    return padded, max_len

In [8]:
def get_embeddings_from_sample(sample, model):
    # Pad sample data:
#     sample = pad_token_list(sample)
    # Define mask from data: - 0 token entry     -> padding, set mask entry to 0
    #                        - non-0 token entry -> valid word, set mask entry to 1
    mask = np.where(sample != 0, 1, 0)
    
    # Create tensor objects from numpy arrays
    input_ids = torch.tensor(sample).long()
    attention_mask = torch.tensor(mask).long()

    # Use BERT model to get embeddings
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)
    # Extract [CLS] embedding for each sample as numpy array to be used for classification task
    features = last_hidden_states[0][:,0,:].numpy()
    return features, mask

In [9]:
sample_size = 4000
random_sample = dataset.sample(n=sample_size)
random_sample.shape

(4000, 3)

In [10]:
val_set = dataset.loc[dataset.index.difference(random_sample.index)]
val_set.shape

(3613, 3)

In [30]:
val_random_sample.shape

(1500, 3)

In [11]:
val_sample_size = 1500
val_random_sample = val_set.sample(n=val_sample_size)
val_random_sample.shape
test_set = val_set.loc[val_set.index.difference(val_random_sample.index)]
test_set.shape

(2113, 3)

In [32]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [14]:
sample_tokenized = random_sample['text_cleaned'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
#sample_tokenized2 = random_sample2['text_cleaned'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
val_random_sample_tokenized = val_random_sample['text_cleaned'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
#test_tokenized = test_set['text_cleaned'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [15]:
sample_padded, sample_len = pad_token_list(sample_tokenized.values)
val_padded, val_len = pad_token_list(val_random_sample_tokenized.values)
#sample_padded2, sample_len2 = pad_token_list(sample_tokenized2.values)
#test_padded, test_len = pad_token_list(test_tokenized.values)

In [16]:
sample_features, mask = get_embeddings_from_sample(sample_padded, model)
val_features, mask = get_embeddings_from_sample(val_padded, model)

In [None]:
sample_features.shape

In [17]:
# Create cuda device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [18]:
train_features_tensor = torch.tensor(np.asarray(sample_features))
train_features_tensor = train_features_tensor.to(device)
train_labels_tensor =  torch.tensor(np.asarray(random_sample['target']).astype(np.int))
train_labels_tensor = train_labels_tensor.to(device)

val_features_tensor = torch.tensor(np.asarray(val_features))
val_features_tensor = val_features_tensor.to(device)
val_labels_tensor =  torch.tensor(np.asarray(val_random_sample['target']).astype(np.int))
val_labels_tensor = val_labels_tensor.to(device)

In [19]:
# Define neural network class to be trained
# Structure:
# input -> fc1 -> sigmoid -> out -> log_softmax
import torch.nn as nn
import torch.nn.functional as F
class Shallow_Network(nn.Module):
    def __init__(self):
        super(Shallow_Network,self).__init__()
        self.fc1 = nn.Linear(768,1000)
        self.out = nn.Linear(1000,2)
    def forward(self,input):
        # Take input, feed through fc1 layer,
        # then apply activation function to it
        x = F.sigmoid(self.fc1(input))
        # Take output of sigmoid, input into out layer,
        # and apply log_softmax function
        return (F.log_softmax(self.out(x),dim=1))

In [43]:
# Create neural network object
net = Shallow_Network()
net = net.to(device)

In [44]:
import torch.optim as optim
#Create an stochastic gradient descent optimizer
adam = optim.Adam(net.parameters(), lr=0.001)
loss_func = nn.NLLLoss()
loss_func = loss_func.to(device)

In [38]:
def accuracy(net,features,labels):
    # Get classification probabilities from hidden state array
    # And apply Softmax
    with torch.no_grad():
        probs = net(features)
        softprobs = F.softmax(probs)
    # Get most likely class and its index for each sample point
    values, indices = torch.max(softprobs,1)
    # Calculate number of sample points where prediction failed
    nums = torch.sum(torch.abs(labels-indices)).detach().cpu().numpy()
    # Number of correct predictions
    numcorrect = len(labels)-(nums+0)
    # Accuracy of prediction
    accuracy = numcorrect/len(labels)
    return(accuracy)

In [45]:
# Train network
cnt = 0
average_losses = []
average_val_losses = []
acc = []
cur_loss = []
min_validation = 10000.0
min_val_epoch = 0
for epoch in range(400):
    net.train()
    #zero the gradient
    adam.zero_grad()
    #Get output of network
    probs = net(train_features_tensor)
    #compute loss
    loss = loss_func(probs,train_labels_tensor)
    #compute the backward gradient and move network in that direction
    loss.backward()
    adam.step()
    #gather loss
    cur_loss.append(loss.detach().cpu().numpy())
    print("epoch ",epoch)
    print("training loss: ", np.mean(cur_loss))
    net.eval()
    probs_val = net(val_features_tensor)
    loss_val = loss_func(probs_val,val_labels_tensor)
    print("validation loss: ", np.mean(loss_val.detach().cpu().numpy()))
    print("validation accuracy: ", accuracy(net,val_features_tensor,val_labels_tensor))
    #Save model if validation is min
    if min_validation > np.mean(loss_val.detach().cpu().numpy()):
        min_validation = np.mean(loss_val.detach().cpu().numpy())
        min_val_epoch = epoch
        torch.save(net.state_dict(), './net_parameters_kaggle.pth')
    

epoch  0
training loss:  0.71944815
validation loss:  0.80657357
validation accuracy:  0.5766666666666667
epoch  1
training loss:  0.77052534
validation loss:  0.68252236
validation accuracy:  0.5766666666666667
epoch  2
training loss:  0.7438679
validation loss:  0.64430773
validation accuracy:  0.7473333333333333
epoch  3
training loss:  0.7186316
validation loss:  0.69184595
validation accuracy:  0.42333333333333334
epoch  4
training loss:  0.71202004
validation loss:  0.6555037
validation accuracy:  0.494
epoch  5
training loss:  0.7017092
validation loss:  0.5925359
validation accuracy:  0.7673333333333333
epoch  6
training loss:  0.685902
validation loss:  0.5764187
validation accuracy:  0.708
epoch  7
training loss:  0.67255175
validation loss:  0.58750844
validation accuracy:  0.666
epoch  8
training loss:  0.6636671
validation loss:  0.578551
validation accuracy:  0.6813333333333333
epoch  9
training loss:  0.6556619
validation loss:  0.5497175
validation accuracy:  0.724
epoc

epoch  85
training loss:  0.46776637
validation loss:  0.4299602
validation accuracy:  0.8113333333333334
epoch  86
training loss:  0.46705955
validation loss:  0.42987663
validation accuracy:  0.8113333333333334
epoch  87
training loss:  0.46636355
validation loss:  0.42979622
validation accuracy:  0.8113333333333334
epoch  88
training loss:  0.4656783
validation loss:  0.42967343
validation accuracy:  0.8113333333333334
epoch  89
training loss:  0.46500346
validation loss:  0.4294975
validation accuracy:  0.8113333333333334
epoch  90
training loss:  0.46433857
validation loss:  0.42930403
validation accuracy:  0.812
epoch  91
training loss:  0.46368346
validation loss:  0.42913148
validation accuracy:  0.8126666666666666
epoch  92
training loss:  0.4630379
validation loss:  0.4289933
validation accuracy:  0.814
epoch  93
training loss:  0.46240166
validation loss:  0.4288878
validation accuracy:  0.8153333333333334
epoch  94
training loss:  0.46177432
validation loss:  0.42880306
val

training loss:  0.43114758
validation loss:  0.4225883
validation accuracy:  0.814
epoch  166
training loss:  0.43083498
validation loss:  0.42255148
validation accuracy:  0.814
epoch  167
training loss:  0.43052438
validation loss:  0.422514
validation accuracy:  0.814
epoch  168
training loss:  0.4302157
validation loss:  0.42247698
validation accuracy:  0.814
epoch  169
training loss:  0.42990902
validation loss:  0.42244247
validation accuracy:  0.814
epoch  170
training loss:  0.42960414
validation loss:  0.42241114
validation accuracy:  0.814
epoch  171
training loss:  0.42930123
validation loss:  0.42238164
validation accuracy:  0.8146666666666667
epoch  172
training loss:  0.42900005
validation loss:  0.42235216
validation accuracy:  0.8153333333333334
epoch  173
training loss:  0.42870077
validation loss:  0.422323
validation accuracy:  0.8153333333333334
epoch  174
training loss:  0.42840323
validation loss:  0.4222957
validation accuracy:  0.8146666666666667
epoch  175
train

epoch  261
training loss:  0.40704128
validation loss:  0.4266993
validation accuracy:  0.812
epoch  262
training loss:  0.40683025
validation loss:  0.42683032
validation accuracy:  0.812
epoch  263
training loss:  0.40661973
validation loss:  0.42696306
validation accuracy:  0.812
epoch  264
training loss:  0.4064098
validation loss:  0.4270972
validation accuracy:  0.812
epoch  265
training loss:  0.40620035
validation loss:  0.42723307
validation accuracy:  0.812
epoch  266
training loss:  0.40599144
validation loss:  0.42737043
validation accuracy:  0.812
epoch  267
training loss:  0.40578303
validation loss:  0.4275093
validation accuracy:  0.812
epoch  268
training loss:  0.4055751
validation loss:  0.42764983
validation accuracy:  0.812
epoch  269
training loss:  0.4053677
validation loss:  0.42779192
validation accuracy:  0.812
epoch  270
training loss:  0.4051608
validation loss:  0.42793533
validation accuracy:  0.8113333333333334
epoch  271
training loss:  0.4049544
validat

validation accuracy:  0.8013333333333333
epoch  374
training loss:  0.3861831
validation loss:  0.44751567
validation accuracy:  0.8006666666666666
epoch  375
training loss:  0.38602448
validation loss:  0.44784835
validation accuracy:  0.8006666666666666
epoch  376
training loss:  0.38586617
validation loss:  0.44787818
validation accuracy:  0.8006666666666666
epoch  377
training loss:  0.38570827
validation loss:  0.44829985
validation accuracy:  0.8006666666666666
epoch  378
training loss:  0.3855508
validation loss:  0.4482333
validation accuracy:  0.8
epoch  379
training loss:  0.38539374
validation loss:  0.44877902
validation accuracy:  0.802
epoch  380
training loss:  0.3852371
validation loss:  0.4485594
validation accuracy:  0.7986666666666666
epoch  381
training loss:  0.38508078
validation loss:  0.4493394
validation accuracy:  0.8
epoch  382
training loss:  0.38492498
validation loss:  0.448832
validation accuracy:  0.7993333333333333
epoch  383
training loss:  0.38476965


In [46]:
min_val_epoch

190

In [33]:
net = Shallow_Network()
checkpoint = torch.load('./net_parameters_kaggle.pth')
net.load_state_dict(checkpoint)
net = net.to(device)
net.eval()

Shallow_Network(
  (fc1): Linear(in_features=768, out_features=1000, bias=True)
  (out): Linear(in_features=1000, out_features=2, bias=True)
)

In [27]:
probs_val = net(val_features_tensor)
loss_val = loss_func(probs_val,val_labels_tensor)
print("validation loss: ", np.mean(loss_val.detach().cpu().numpy()))

validation loss:  0.42253873


In [40]:
print(accuracy(net,val_features_tensor,val_labels_tensor))

0.816


In [35]:
test_set = val_set.loc[val_set.index.difference(val_random_sample.index)]
test_random_sample_tokenized = test_set['text_cleaned'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
test_padded, test_len = pad_token_list(test_random_sample_tokenized.values)
test_features, mask = get_embeddings_from_sample(test_padded, model)
test_features_tensor = torch.tensor(np.asarray(test_features))
test_features_tensor = test_features_tensor.to(device)
test_labels_tensor =  torch.tensor(np.asarray(test_set['target']).astype(np.int))
test_labels_tensor = test_labels_tensor.to(device)

In [39]:
print(accuracy(net,test_features_tensor,test_labels_tensor))

0.8187411263606247


Baseline only html tag and special character cleaning yielded 81.8% 