In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import evaluate
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
import nltk
from tqdm import tqdm

np.random.seed(8021)

In [2]:
input_data = pd.read_csv('HC3.csv')
input_data = input_data.dropna()
input_data = input_data.sample(frac=1)

manual_data = pd.read_excel('test_set.xlsx')

input_data['question'] = input_data['question'].str.lower()
input_data['answers'] = input_data['answers'].str.lower()
manual_data['question'] = manual_data['question'].str.lower()
manual_data['answers'] = manual_data['answers'].str.lower()

In [3]:
input_data['q'] = input_data['question'].map(lambda x : nltk.tokenize.word_tokenize(x))
input_data['a'] = input_data['answers'].map(lambda x : nltk.tokenize.word_tokenize(x))

manual_data['q'] = manual_data['question'].map(lambda x : nltk.tokenize.word_tokenize(x))
manual_data['a'] = manual_data['answers'].map(lambda x : nltk.tokenize.word_tokenize(x))

In [4]:
# init Word2Vec model

word2vec_vector_size = 25

all_words1 = input_data['q'] + input_data['a']
all_words2 = manual_data['q'] + manual_data['a']
all_words = pd.concat([all_words1, all_words2], axis=0)

word2vec_model = Word2Vec(sentences=all_words, vector_size=word2vec_vector_size, window=5, min_count=1, workers=4)
word2vec_model.save("word2vec_lstm.model")

In [5]:
# constants
question_max_len = 100
answer_max_len = 400
text_max_len = question_max_len + answer_max_len
# include question flag
include_question = False
sentence_max_len = question_max_len + answer_max_len if include_question else answer_max_len
    
def get_embedding(row):
    # embedding for q
    if include_question:
        q = row['q']
        if len(q) > question_max_len:
            q = q[0:question_max_len]
        embedding_q = word2vec_model.wv[q]
        if len(q) < question_max_len:
            embedding_q = np.concatenate([embedding_q, np.zeros((question_max_len - len(q), word2vec_vector_size))])

    # embedding for a
    a = row['a']
    if len(a) > answer_max_len:
        a = a[0:answer_max_len]
    embedding_a = word2vec_model.wv[a]
    
    if len(a) < answer_max_len:
        embedding_a = np.concatenate([embedding_a, np.zeros((answer_max_len - len(a), word2vec_vector_size))])

    if include_question:
        embedding = np.concatenate([embedding_q, embedding_a])
    else:
        embedding = embedding_a

    embedding = np.array(embedding, dtype='float')

    return embedding

# add column for embedding
input_data['embedding'] = np.nan

embeddings = list()
for index, row in input_data.iterrows():
    try:
        embeddings.append(get_embedding(row))
    except Exception as err:
        pass
        print(err)

# put embedding into dataframe
input_data['embedding'] = embeddings

manual_data['embedding'] = np.nan
embeddings_manual = list()
for index, row in manual_data.iterrows():
    try:
        embeddings_manual.append(get_embedding(row))
    except Exception as err:
        pass
        print(err)

# put embedding into dataframe
manual_data['embedding'] = embeddings_manual

### Display input_data

In [6]:
len(input_data)

23865

In [7]:
# Split the data into train and test sets with a 80:20 split
train_data, test_data = train_test_split(input_data, test_size=0.2)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [8]:
train_data.head()

Unnamed: 0,question,source,labels,answers,q,a,embedding
0,intentions of deductible amount for small busi...,finance,0,if your sole proprietorship losses exceed all ...,"[intentions, of, deductible, amount, for, smal...","[if, your, sole, proprietorship, losses, excee...","[[3.9413859844207764, 2.8509163856506348, 0.35..."
1,high leverage inflation hedges for personal in...,finance,0,"i assume you're looking for advice, not an act...","[high, leverage, inflation, hedges, for, perso...","[i, assume, you, 're, looking, for, advice, ,,...","[[1.693681240081787, -3.0884225368499756, -0.2..."
2,are there tax liabilities (in the us) for havi...,finance,1,if you are a us citizen or a resident alien an...,"[are, there, tax, liabilities, (, in, the, us,...","[if, you, are, a, us, citizen, or, a, resident...","[[3.9413859844207764, 2.8509163856506348, 0.35..."
3,"how is it possible for humans to "" lose "" or d...",reddit_eli5,0,he means the us does n't have a ready - to - g...,"[how, is, it, possible, for, humans, to, ``, l...","[he, means, the, us, does, n't, have, a, ready...","[[-0.5366077423095703, -3.5822958946228027, 0...."
4,why does a computer need to be cooled ? why ca...,reddit_eli5,1,computers generate heat because they have elec...,"[why, does, a, computer, need, to, be, cooled,...","[computers, generate, heat, because, they, hav...","[[-2.278885841369629, -0.0902240127325058, 1.6..."


In [9]:
test_data.head()

Unnamed: 0,question,source,labels,answers,q,a,embedding
0,why do people see proprietary software as bad ...,reddit_eli5,0,there are those that believe if everything was...,"[why, do, people, see, proprietary, software, ...","[there, are, those, that, believe, if, everyth...","[[4.366678714752197, -2.834803342819214, 3.286..."
1,"please explain what is ""cyber defence""",wiki_csai,0,proactive cyber defence means acting in antici...,"[please, explain, what, is, ``, cyber, defence...","[proactive, cyber, defence, means, acting, in,...","[[0.11999265849590302, -0.47481417655944824, 0..."
2,how are digestion processes different for suga...,reddit_eli5,0,the body is fueled by monosacchirides such as ...,"[how, are, digestion, processes, different, fo...","[the, body, is, fueled, by, monosacchirides, s...","[[1.0252605676651, -0.29854241013526917, -0.70..."
3,what is object oriented code ? i 've heard it ...,reddit_eli5,0,object oriented code is a way of building a pr...,"[what, is, object, oriented, code, ?, i, 've, ...","[object, oriented, code, is, a, way, of, build...","[[-0.35408398509025574, 4.469900608062744, -2...."
4,should i use a bank or a credit union for my s...,finance,0,"in practical terms, these days, a credit union...","[should, i, use, a, bank, or, a, credit, union...","[in, practical, terms, ,, these, days, ,, a, c...","[[1.08537757396698, 1.732524037361145, 3.03414..."


In [10]:
class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, embedding, labels):
        'Initialization'
        self.labels = labels
        self.embedding = embedding

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.labels)

    def __getitem__(self, index):
        'Generates one sample of data'    
        X = self.embedding[index]
        y = self.labels[index]
        return X, y

training_set = Dataset(torch.tensor(train_data['embedding'], dtype=torch.float32)
                         , torch.tensor(train_data['labels'].to_numpy(), dtype=torch.float32))

validation_set = Dataset(torch.tensor(test_data['embedding'], dtype=torch.float32)
                         , torch.tensor(test_data['labels'].to_numpy(), dtype=torch.float32))

manual_set = Dataset(torch.tensor(manual_data['embedding'], dtype=torch.float32)
                         , torch.tensor(manual_data['labels'].to_numpy(), dtype=torch.float32))

  training_set = Dataset(torch.tensor(train_data['embedding'], dtype=torch.float32)


In [11]:
batch_size = 64

train_dataloader = DataLoader(training_set, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(validation_set, batch_size=batch_size, shuffle=True)
manual_dataloader = DataLoader(manual_set, batch_size=batch_size, shuffle=True)

### Model

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
print(device)

cuda


In [13]:
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, num_classes=1):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers, batch_first=True, bidirectional=True
        )
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        # replace softmax by sigmoid for binary classification
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)

        out, (final_hidden_state, final_cell_state) = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        out = self.sigmoid(out)
        return out.float()


In [14]:
criterion = nn.BCELoss()
input_dim = word2vec_vector_size
model = BiLSTM(input_dim, 200).to(device)

# attempt to use 0.1, bad, 0.01 a bit fast, 0.0001 do not converge
learning_rate = 0.005
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
num_epochs = 20
training_lost = dict()

# Train Bi-LSTM
total_steps = len(train_dataloader)
for epoch in range(num_epochs):
    total_loss = 0
    model.train()
    for index, (embedding, labels) in tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc=f"Epoch {epoch:02d}", leave=False):

        embedding = embedding.to(device)
        labels = labels.to(device)
        outputs = model(embedding)        
        outputs = torch.squeeze(outputs).float()

        labels = torch.squeeze(labels).float()
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
        
    # Testing
    test_preds = []
    test_labels = []

    with torch.no_grad():
        model.eval()
        for batch in val_dataloader:

            embedding, labels = batch
            embedding = embedding.to(device)
            labels = labels.to(device)
            outputs = model(embedding)

            test_preds.append(outputs.cpu().numpy())
            test_labels.append(labels.cpu().numpy())

        test_preds = np.concatenate(test_preds)
        test_preds = (test_preds > 0.5).astype(int)
        test_labels = np.concatenate(test_labels)
        
        test_accuracy = accuracy_score(test_labels, test_preds)
    
    print(f'Epoch [{epoch+1:02d}/{num_epochs}], Training Loss: {avg_loss:.4f}, Testing accuracy: {test_accuracy:.4f}') 



                                                           

Epoch [01/20], Training Loss: 0.6781, Testing accuracy: 0.5248


                                                           

Epoch [02/20], Training Loss: 0.6736, Testing accuracy: 0.5307


                                                           

Epoch [03/20], Training Loss: 0.6415, Testing accuracy: 0.5081


                                                           

Epoch [04/20], Training Loss: 0.3900, Testing accuracy: 0.8864


                                                           

Epoch [05/20], Training Loss: 0.1783, Testing accuracy: 0.9434


                                                           

Epoch [06/20], Training Loss: 0.1165, Testing accuracy: 0.9560


                                                           

Epoch [07/20], Training Loss: 0.0919, Testing accuracy: 0.9581


                                                           

Epoch [08/20], Training Loss: 0.0693, Testing accuracy: 0.9688


                                                           

Epoch [09/20], Training Loss: 0.0644, Testing accuracy: 0.9679


                                                           

Epoch [10/20], Training Loss: 0.0553, Testing accuracy: 0.9629


                                                           

Epoch [11/20], Training Loss: 0.0519, Testing accuracy: 0.9583


                                                           

Epoch [12/20], Training Loss: 0.0448, Testing accuracy: 0.9728


                                                           

Epoch [13/20], Training Loss: 0.0332, Testing accuracy: 0.9709


                                                           

Epoch [14/20], Training Loss: 0.0367, Testing accuracy: 0.9751


                                                           

Epoch [15/20], Training Loss: 0.0306, Testing accuracy: 0.9728


                                                           

Epoch [16/20], Training Loss: 0.0421, Testing accuracy: 0.9571


                                                           

Epoch [17/20], Training Loss: 0.0322, Testing accuracy: 0.9728


                                                           

Epoch [18/20], Training Loss: 0.0247, Testing accuracy: 0.9738


                                                           

Epoch [19/20], Training Loss: 0.0235, Testing accuracy: 0.9721


                                                           

Epoch [20/20], Training Loss: 0.0274, Testing accuracy: 0.9753


In [15]:
torch.save(model, "word2vec_lstm.pt")

### Performance

In [16]:
model.eval()

train_preds = []
train_labels = []
test_preds = []
test_labels = []
manual_preds = []
manual_labels = []

with torch.no_grad():
    model.eval()
    for batch in train_dataloader:
        embedding, labels = batch
        embedding = embedding.to(device)
        labels = labels.to(device)

        outputs = model(embedding)

        train_preds.append(outputs.cpu().numpy())
        train_labels.append(labels.cpu().numpy())

    for batch in val_dataloader:
        embedding, labels = batch
        embedding = embedding.to(device)
        labels = labels.to(device)

        outputs = model(embedding)

        test_preds.append(outputs.cpu().numpy())
        test_labels.append(labels.cpu().numpy())

    for batch in manual_dataloader:
        embedding, labels = batch
        embedding = embedding.to(device)
        labels = labels.to(device)

        outputs = model(embedding)

        manual_preds.append(outputs.cpu().numpy())
        manual_labels.append(labels.cpu().numpy())

train_preds = np.concatenate(train_preds)
train_labels = np.concatenate(train_labels)
test_preds = np.concatenate(test_preds)
test_labels = np.concatenate(test_labels)
manual_preds = np.concatenate(manual_preds)
manual_labels = np.concatenate(manual_labels)

train_preds = (train_preds > 0.5).astype(int)
test_preds = (test_preds > 0.5).astype(int)
manual_preds = (manual_preds > 0.5).astype(int)


In [17]:
accuracy = accuracy_score(train_labels, train_preds)
precision, recall, f_score, _ = precision_recall_fscore_support(
        train_labels, train_preds, average="macro")
        
print('Training Set')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F-score: {f_score:.4f}')
print('===')

accuracy = accuracy_score(test_labels, test_preds)
precision, recall, f_score, _ = precision_recall_fscore_support(
        test_labels, test_preds, average="macro")
        
print('Testing Set')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F-score: {f_score:.4f}')
print('===')

accuracy = accuracy_score(manual_labels, manual_preds)
precision, recall, f_score, _ = precision_recall_fscore_support(
        manual_labels, manual_preds, average="macro")
        
print('Manual Set')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F-score: {f_score:.4f}')

Training Set
Accuracy: 0.9942
Precision: 0.9942
Recall: 0.9942
F-score: 0.9942
===
Testing Set
Accuracy: 0.9753
Precision: 0.9756
Recall: 0.9751
F-score: 0.9753
===
Manual Set
Accuracy: 0.9750
Precision: 0.9762
Recall: 0.9750
F-score: 0.9750


In [18]:
manual_preds = manual_preds.squeeze()
idxs = [i for i in range(40) if (manual_labels != manual_preds).tolist()[i]]

display(manual_data.iloc[idxs])

for idx in idxs:
    print(f'[idx {idx}]')
    print(manual_data.iloc[idx]['labels'])
    print(manual_data.iloc[idx]['question'])
    print(manual_data.iloc[idx]['answers'])

Unnamed: 0,question,source,labels,answers,q,a,embedding
38,"can animals really predict natural disasters, ...",science,1,it is a common belief that some animals can pr...,"[can, animals, really, predict, natural, disas...","[it, is, a, common, belief, that, some, animal...","[[1.512847900390625, 1.3490185737609863, -1.43..."


[idx 38]
1
can animals really predict natural disasters, and if so, how?
it is a common belief that some animals can predict natural disasters, such as earthquakes or tsunamis, but the extent to which they can do so remains unclear and controversial.

there is some scientific evidence to suggest that animals may be able to detect the subtle changes in the environment that precede natural disasters. for example, some animals are sensitive to changes in atmospheric pressure, electromagnetic fields, or seismic vibrations, which can occur before an earthquake or other natural disaster.

some studies have reported unusual animal behavior prior to natural disasters, such as birds leaving their nests or fish swimming to the surface of the water. however, these observations are often difficult to verify and may be subject to other explanations.


In [20]:
! jupyter nbconvert --to html word2vec_lstm.ipynb

[NbConvertApp] Converting notebook word2vec_lstm.ipynb to html
[NbConvertApp] Writing 665017 bytes to word2vec_lstm.html
