In [1]:
## Build models using word2vec and neural network
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import evaluate
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import nltk
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

np.random.seed(8021)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
input_data = pd.read_csv('HC3.csv')
input_data = input_data.dropna()
input_data = input_data.sample(frac=1)

manual_data = pd.read_excel('test_set.xlsx')

input_data['question'] = input_data['question'].str.lower()
input_data['answers'] = input_data['answers'].str.lower()
manual_data['question'] = manual_data['question'].str.lower()
manual_data['answers'] = manual_data['answers'].str.lower()

In [3]:
input_data['q'] = input_data['question'].map(lambda x : nltk.tokenize.word_tokenize(x))
input_data['a'] = input_data['answers'].map(lambda x : nltk.tokenize.word_tokenize(x))

manual_data['q'] = manual_data['question'].map(lambda x : nltk.tokenize.word_tokenize(x))
manual_data['a'] = manual_data['answers'].map(lambda x : nltk.tokenize.word_tokenize(x))

In [4]:
manual_data.head()

Unnamed: 0,question,source,labels,answers,q,a
0,should i buy more tsla stock or should i hold ...,Finance,1,"as an ai language model, i cannot provide fina...","[should, i, buy, more, tsla, stock, or, should...","[as, an, ai, language, model, ,, i, can, not, ..."
1,should i buy more tsla stock or should i hold ...,Finance,0,imo tesla is overvalued. not a bad company but...,"[should, i, buy, more, tsla, stock, or, should...","[imo, tesla, is, overvalued, ., not, a, bad, c..."
2,hi i'm from singapore and i recently sent an u...,Finance,1,"first, it's important to understand that finan...","[hi, i, 'm, from, singapore, and, i, recently,...","[first, ,, it, 's, important, to, understand, ..."
3,hi i'm from singapore and i recently sent an u...,Finance,0,trade the wheel as this is a fully covered str...,"[hi, i, 'm, from, singapore, and, i, recently,...","[trade, the, wheel, as, this, is, a, fully, co..."
4,is it possible to do call/ put options on the ...,Finance,0,"hello. no.\n\nthe closest to options are ""warr...","[is, it, possible, to, do, call/, put, options...","[hello, ., no, ., the, closest, to, options, a..."


In [5]:
# init Word2Vec model

word2vec_vector_size = 25

all_words1 = input_data['q'] + input_data['a']
all_words2 = manual_data['q'] + manual_data['a']
all_words = pd.concat([all_words1, all_words2], axis=0)

word2vec_model = Word2Vec(sentences=all_words, vector_size=word2vec_vector_size, window=5, min_count=1, workers=4)
word2vec_model.save("word2vec.model")

In [6]:
# constants
question_max_len = 100
answer_max_len = 400
text_max_len = question_max_len + answer_max_len
# include question flag
include_question = False
sentence_max_len = question_max_len + answer_max_len if include_question else answer_max_len
    
def get_embedding(row):
    # embedding for q
    if include_question:
        q = row['q']
        if len(q) > question_max_len:
            q = q[0:question_max_len]
        embedding_q = word2vec_model.wv[q]
        embedding_q = embedding_q.flatten()
        embedding_q = np.pad(embedding_q, (0,(question_max_len - len(q)) * word2vec_vector_size), mode='constant', constant_values=0)

    # embedding for a
    a = row['a']
    if len(a) > answer_max_len:
        a = a[0:answer_max_len]
    embedding_a = word2vec_model.wv[a]
    embedding_a = embedding_a.flatten()
    embedding_a = np.pad(embedding_a, (0,(answer_max_len - len(a)) * word2vec_vector_size), mode='constant', constant_values=0)

    if include_question:
        embedding = np.concatenate([embedding_q, embedding_a])
    else:
        embedding = embedding_a

    embedding = np.array(embedding, dtype='float')

    return embedding

# add column for embedding
input_data['embedding'] = np.nan

embeddings = list()
for index, row in input_data.iterrows():
    try:
        embeddings.append(get_embedding(row))
    except Exception as err:
        pass
        print(err)

# put embedding into dataframe
input_data['embedding'] = embeddings


manual_data['embedding'] = np.nan
embeddings_manual = list()
for index, row in manual_data.iterrows():
    try:
        embeddings_manual.append(get_embedding(row))
    except Exception as err:
        pass
        print(err)

# put embedding into dataframe
manual_data['embedding'] = embeddings_manual

## Display input_data

In [7]:
len(input_data)

23865

In [8]:
# Split the data into train and test sets with a 80:20 split
train_data, test_data = train_test_split(input_data, test_size=0.2)

In [9]:
train_data.head()

Unnamed: 0,question,source,labels,answers,q,a,embedding
3120,intentions of deductible amount for small busi...,finance,0,if your sole proprietorship losses exceed all ...,"[intentions, of, deductible, amount, for, smal...","[if, your, sole, proprietorship, losses, excee...","[1.2712088823318481, -0.7088891267776489, 0.35..."
2949,high leverage inflation hedges for personal in...,finance,0,"i assume you're looking for advice, not an act...","[high, leverage, inflation, hedges, for, perso...","[i, assume, you, 're, looking, for, advice, ,,...","[2.1779658794403076, -3.5017971992492676, -0.5..."
1747,are there tax liabilities (in the us) for havi...,finance,1,if you are a us citizen or a resident alien an...,"[are, there, tax, liabilities, (, in, the, us,...","[if, you, are, a, us, citizen, or, a, resident...","[1.2712088823318481, -0.7088891267776489, 0.35..."
20996,"how is it possible for humans to "" lose "" or d...",reddit_eli5,0,he means the us does n't have a ready - to - g...,"[how, is, it, possible, for, humans, to, ``, l...","[he, means, the, us, does, n't, have, a, ready...","[-1.014471411705017, -5.9781951904296875, 1.40..."
7471,why does a computer need to be cooled ? why ca...,reddit_eli5,1,computers generate heat because they have elec...,"[why, does, a, computer, need, to, be, cooled,...","[computers, generate, heat, because, they, hav...","[-1.5659393072128296, -1.2250750064849854, 2.6..."


In [10]:
test_data.head()

Unnamed: 0,question,source,labels,answers,q,a,embedding
13499,why do people see proprietary software as bad ...,reddit_eli5,0,there are those that believe if everything was...,"[why, do, people, see, proprietary, software, ...","[there, are, those, that, believe, if, everyth...","[2.9102416038513184, -4.839542388916016, 3.150..."
23237,"please explain what is ""cyber defence""",wiki_csai,0,proactive cyber defence means acting in antici...,"[please, explain, what, is, ``, cyber, defence...","[proactive, cyber, defence, means, acting, in,...","[0.0045166015625, -0.43950194120407104, 0.3010..."
21864,how are digestion processes different for suga...,reddit_eli5,0,the body is fueled by monosacchirides such as ...,"[how, are, digestion, processes, different, fo...","[the, body, is, fueled, by, monosacchirides, s...","[2.3667237758636475, -1.0097883939743042, -0.5..."
19165,what is object oriented code ? i 've heard it ...,reddit_eli5,0,object oriented code is a way of building a pr...,"[what, is, object, oriented, code, ?, i, 've, ...","[object, oriented, code, is, a, way, of, build...","[-1.3146284818649292, 1.5390233993530273, 1.08..."
2135,should i use a bank or a credit union for my s...,finance,0,"in practical terms, these days, a credit union...","[should, i, use, a, bank, or, a, credit, union...","[in, practical, terms, ,, these, days, ,, a, c...","[0.344411700963974, 0.6015273332595825, 3.1311..."


In [11]:
  
def get_embedding_tensor(data):
    embedding_array = np.array([np.array(embedding) for embedding in data], dtype=np.float32)
    embedding_tensor = torch.tensor(embedding_array)
    return embedding_tensor

# Convert train_data['embedding'] to a 2D NumPy array of floats
embedding_array = np.array([np.array(embedding) for embedding in train_data['embedding']], dtype=np.float32)

# Convert the NumPy array to a PyTorch tensor
embedding_tensor = torch.tensor(embedding_array)

x_train = get_embedding_tensor(train_data['embedding'])
x_test = get_embedding_tensor(test_data['embedding'])
y_train = get_embedding_tensor(train_data['labels'])
y_test = get_embedding_tensor(test_data['labels'])

In [12]:
print(f'Training set random guess rate={y_train.sum().item() / len(y_train)}')

print(f'Testing set random guess rate={y_test.sum().item() / len(y_test)}')

# check the gpt/human response rate is roughly the same or not

Training set random guess rate=0.5020951183741882
Testing set random guess rate=0.4910957469097004


In [13]:
torch.manual_seed(8021)
learning_rate = 0.005

input_dim = word2vec_vector_size * sentence_max_len
hidden_dim = 200
hidden_dim2 = 10
batch_size = 128

loss_fn = nn.BCELoss()
model = nn.Sequential(
    nn.Linear(input_dim, hidden_dim),
    nn.BatchNorm1d(hidden_dim),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(hidden_dim, 1),
    nn.Sigmoid()
)

# model = nn.Sequential(
#     nn.Linear(input_dim, hidden_dim),
#     nn.BatchNorm1d(hidden_dim),
#     nn.ReLU(),
#     nn.Linear(hidden_dim, hidden_dim2),
#     nn.BatchNorm1d(hidden_dim2),
#     nn.ReLU(),
#     nn.Linear(hidden_dim2, 1),
#     nn.Sigmoid()
# )

opt = torch.optim.Adam(model.parameters(), lr=learning_rate)

loss_values = []
total_step = 0
epochs = 5
num_samples = len(x_train)

for epoch in range(epochs):
    running_loss = 0.0
    running_nsample = 0.0
    loss_fn = nn.BCELoss()

    cur_idxs = torch.randperm(num_samples)

    model.train()
    
    for step in tqdm(range(num_samples // batch_size + 1), desc=f"Epoch {epoch}", leave=False):
        x_train = x_train[cur_idxs]
        y_train = y_train[cur_idxs]
        left = step * batch_size
        right = min((step+1) * batch_size, num_samples)
        
        batch_x = x_train[left:right]
        batch_y = y_train[left:right]

        # ------------------
        y_pred = model(batch_x).squeeze(1)
        loss = loss_fn(y_pred, batch_y)
        opt.zero_grad()
        loss.backward()
        opt.step()

        # ------------------
        running_loss += loss.item() * (right - left)
        running_nsample += (right - left)

        total_step += 1

    epoch_loss = running_loss/running_nsample
    
    with torch.no_grad():
        model.eval()
        test_preds = model(x_test).squeeze(1)
        test_preds = torch.from_numpy(np.array([1 if pred.item() > 0.5 else 0 for pred in test_preds]))

        accuracy = accuracy_score(test_preds.numpy(), y_test.numpy())
    
    print(f'Epoch {epoch} with training loss {epoch_loss}, accuracy = {accuracy}')
    loss_values.append(epoch_loss)

                                                          

Epoch 0 with training loss 0.2226749590962732, accuracy = 0.905300649486696


                                                          

Epoch 1 with training loss 0.10797302660616394, accuracy = 0.9226901319924575


                                                          

Epoch 2 with training loss 0.07236178222128163, accuracy = 0.9243662266918081


                                                          

Epoch 3 with training loss 0.06273388786746001, accuracy = 0.9241567148543893


                                                          

Epoch 4 with training loss 0.041219248721466206, accuracy = 0.9239472030169704


In [14]:
# testing accuracy
with torch.no_grad():
    model.eval()
    print('Training Set')
    train_preds = model(x_train).squeeze(1)
    train_preds = torch.from_numpy(np.array([1 if pred.item() > 0.5 else 0 for pred in train_preds]))
    accuracy = accuracy_score(y_train, train_preds)
    precision, recall, f_score, _ = precision_recall_fscore_support(
        y_train, train_preds, average="macro")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F-score: {f_score:.4f}")

    print('Testing Set')
    test_preds = model(x_test).squeeze(1)
    test_preds = torch.from_numpy(np.array([1 if pred.item() > 0.5 else 0 for pred in test_preds]))
    accuracy = accuracy_score(y_test, test_preds)
    precision, recall, f_score, _ = precision_recall_fscore_support(
        y_test, test_preds, average="macro")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F-score: {f_score:.4f}")

Training Set
Accuracy: 0.9946
Precision: 0.9946
Recall: 0.9946
F-score: 0.9946
Testing Set
Accuracy: 0.9239
Precision: 0.9239
Recall: 0.9240
F-score: 0.9239


In [15]:
torch.save(model, "word2vec_model.pt")

### New Data set

In [16]:
display(manual_data)

Unnamed: 0,question,source,labels,answers,q,a,embedding
0,should i buy more tsla stock or should i hold ...,Finance,1,"as an ai language model, i cannot provide fina...","[should, i, buy, more, tsla, stock, or, should...","[as, an, ai, language, model, ,, i, can, not, ...","[-1.183118224143982, 1.110776662826538, 4.2539..."
1,should i buy more tsla stock or should i hold ...,Finance,0,imo tesla is overvalued. not a bad company but...,"[should, i, buy, more, tsla, stock, or, should...","[imo, tesla, is, overvalued, ., not, a, bad, c...","[0.06931285560131073, -0.4087589681148529, 0.1..."
2,hi i'm from singapore and i recently sent an u...,Finance,1,"first, it's important to understand that finan...","[hi, i, 'm, from, singapore, and, i, recently,...","[first, ,, it, 's, important, to, understand, ...","[0.5811381340026855, -0.9786335229873657, -1.1..."
3,hi i'm from singapore and i recently sent an u...,Finance,0,trade the wheel as this is a fully covered str...,"[hi, i, 'm, from, singapore, and, i, recently,...","[trade, the, wheel, as, this, is, a, fully, co...","[2.069535493850708, -1.0942184925079346, 2.511..."
4,is it possible to do call/ put options on the ...,Finance,0,"hello. no.\n\nthe closest to options are ""warr...","[is, it, possible, to, do, call/, put, options...","[hello, ., no, ., the, closest, to, options, a...","[1.166224718093872, -3.007317304611206, -3.783..."
5,is it possible to do call/ put options on the ...,Finance,1,options trading involves buying or selling con...,"[is, it, possible, to, do, call/, put, options...","[options, trading, involves, buying, or, selli...","[5.058363437652588, -2.5446996688842773, 2.604..."
6,what do you miss about the covid lockdowns?,medicine,0,no traffic. my life didnt change at all during...,"[what, do, you, miss, about, the, covid, lockd...","[no, traffic, ., my, life, didnt, change, at, ...","[-0.46490922570228577, -0.053871188312768936, ..."
7,what do you miss about the covid lockdowns?,medicine,1,while lockdowns were implemented to slow the s...,"[what, do, you, miss, about, the, covid, lockd...","[while, lockdowns, were, implemented, to, slow...","[-1.7857741117477417, -1.6701786518096924, 1.4..."
8,how to stop winter asthma cough?,medicine,0,what meds have they tried? sounds like they sh...,"[how, to, stop, winter, asthma, cough, ?]","[what, meds, have, they, tried, ?, sounds, lik...","[6.735128402709961, -0.14866364002227783, -2.2..."
9,how to stop winter asthma cough?,medicine,1,winter asthma cough can be triggered by a vari...,"[how, to, stop, winter, asthma, cough, ?]","[winter, asthma, cough, can, be, triggered, by...","[-1.1037442684173584, -0.7236596941947937, -1...."


In [17]:
# Model class must be defined somewhere
model = torch.load("word2vec_model.pt")
model.eval()

Sequential(
  (0): Linear(in_features=10000, out_features=200, bias=True)
  (1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
  (3): Dropout(p=0.2, inplace=False)
  (4): Linear(in_features=200, out_features=1, bias=True)
  (5): Sigmoid()
)

In [18]:
x_manual = get_embedding_tensor(manual_data['embedding'])
y_manual = get_embedding_tensor(manual_data['labels'])

In [19]:
with torch.no_grad():
    model.eval()
    preds_manual = model(x_manual).squeeze(1)
    preds_manual = torch.from_numpy(np.array([1 if pred.item() > 0.5 else 0 for pred in preds_manual]))

accuracy = accuracy_score(y_manual, preds_manual)
precision, recall, f_score, _ = precision_recall_fscore_support(
    y_manual, preds_manual, average="macro")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F-score: {f_score:.4f}")

Accuracy: 0.8000
Precision: 0.8125
Recall: 0.8000
F-score: 0.7980


In [20]:
idxs = [i for i in range(40) if (y_manual != preds_manual).tolist()[i]]

manual_data.iloc[idxs]

Unnamed: 0,question,source,labels,answers,q,a,embedding
13,is it normal to feel guilty for hiring a cleaner?,open_qa,1,it is not uncommon to feel guilty for hiring a...,"[is, it, normal, to, feel, guilty, for, hiring...","[it, is, not, uncommon, to, feel, guilty, for,...","[0.6197357773780823, -1.6820577383041382, 0.90..."
18,how to handle not wanting to go on a second da...,open_qa,0,"just tell him, that you are not interested in ...","[how, to, handle, not, wanting, to, go, on, a,...","[just, tell, him, ,, that, you, are, not, inte...","[-0.7063497304916382, -0.17221549153327942, -1..."
21,anyone else giving up on finding someone?,open_qa,1,it is not uncommon to feel discouraged or frus...,"[anyone, else, giving, up, on, finding, someon...","[it, is, not, uncommon, to, feel, discouraged,...","[0.6197357773780823, -1.6820577383041382, 0.90..."
23,what is a derivative?,Finance,1,a derivative is a financial contract that deri...,"[what, is, a, derivative, ?]","[a, derivative, is, a, financial, contract, th...","[-1.5230835676193237, -4.1018242835998535, -0...."
24,what is philosophy?,open_qa,0,"quite literally, the term ""philosophy"" means, ...","[what, is, philosophy, ?]","[quite, literally, ,, the, term, ``, philosoph...","[-1.736244559288025, -1.4428322315216064, -0.3..."
28,what kind of people will not succeed in life?,open_qa,1,it is not appropriate to generalize and make a...,"[what, kind, of, people, will, not, succeed, i...","[it, is, not, appropriate, to, generalize, and...","[0.6197357773780823, -1.6820577383041382, 0.90..."
36,what is the largest single structure discovere...,science,1,the largest single structure discovered in the...,"[what, is, the, largest, single, structure, di...","[the, largest, single, structure, discovered, ...","[2.3667237758636475, -1.0097883939743042, -0.5..."
39,can tornadoes form on venus?,science,1,"no, tornadoes cannot form on venus as the atmo...","[can, tornadoes, form, on, venus, ?]","[no, ,, tornadoes, can, not, form, on, venus, ...","[-0.46490922570228577, -0.053871188312768936, ..."


In [21]:
for idx in idxs:
    print(f'[idx {idx}]')
    print(manual_data.iloc[idx]['labels'])
    print(manual_data.iloc[idx]['question'])
    print(manual_data.iloc[idx]['answers'])

[idx 13]
1
is it normal to feel guilty for hiring a cleaner?
it is not uncommon to feel guilty for hiring a cleaner, as there are many cultural and social stigmas attached to domestic work and outsourcing household tasks. however, it is important to recognize that hiring a cleaner is a personal choice that can provide significant benefits for both you and the cleaner.
[idx 18]
0
how to handle not wanting to go on a second date? ghost, lie, or full honesty?
just tell him, that you are not interested in seeing him again and you should be fine
[idx 21]
1
anyone else giving up on finding someone?
it is not uncommon to feel discouraged or frustrated when it comes to finding a romantic partner, especially if you have been searching for a while without success. however, it's important to remember that everyone's journey towards finding love is different, and it's never too late to find a fulfilling and happy relationship
[idx 23]
1
what is a derivative?
a derivative is a financial contract th

In [2]:
! jupyter nbconvert --to html word2vec_nn.ipynb

[NbConvertApp] Converting notebook word2vec_nn.ipynb to html
[NbConvertApp] Writing 672821 bytes to word2vec_nn.html
