In [20]:
import pandas as pd
import numpy as np
import re
from collections import Counter

import torch
import wandb
from torch import nn

In [2]:
# Load the two source tables; both are expected to carry a 'txt' column.
lau = pd.read_csv("Cleaning/lau.csv")
legge = pd.read_csv("Cleaning/legge.csv")

# Apply the same normalisation to each frame: replace every character that is
# neither a word character nor whitespace with a space, lowercase the text,
# then split on whitespace into a new 'seq_words' column.
for frame in (lau, legge):
    frame['txt'] = frame['txt'].map(lambda text: re.sub(r'[^\w\s]', ' ', text))
    frame['txt'] = frame['txt'].apply(lambda text: text.lower())
    frame['seq_words'] = frame['txt'].apply(lambda text: text.split())

set1 = lau["seq_words"]
set2 = legge["seq_words"]

train_tokens, train_label = [], []
test_tokens, test_label = [], []

# Rows 0-69 go to the training split and rows 70-80 to the test split.
# Within each split the two sources are interleaved row by row:
# lau rows are labelled 0 and legge rows are labelled 1.
splits = (
    (range(0, 70), train_tokens, train_label),
    (range(70, 81), test_tokens, test_label),
)
for rows, tokens_out, labels_out in splits:
    for i in rows:
        tokens_out.extend((set1[i], set2[i]))
        labels_out.extend((0, 1))

train_data = pd.DataFrame({'label': train_label, 'seq_words': train_tokens})
test_data = pd.DataFrame({'label': test_label, 'seq_words': test_tokens})

In [3]:
# Preview the first rows of the interleaved training frame (label 0 = lau, label 1 = legge).
train_data.head()

Unnamed: 0,label,seq_words
0,0,"[the, way, that, can, be, spoken, of, is, not,..."
1,1,"[the, tao, that, can, be, trodden, is, not, th..."
2,0,"[the, whole, world, recognizes, the, beautiful..."
3,1,"[all, in, the, world, know, the, beauty, of, t..."
4,0,"[not, to, honor, men, of, worth, will, keep, t..."


In [4]:
# Flatten every training sequence into one long stream of tokens.
words = train_data['seq_words'].tolist()
tokens_list = [token for sequence in words for token in sequence]

# Rank all distinct tokens by frequency, most frequent first, and show a sample.
count_tokens = Counter(tokens_list)
sorted_tokens = count_tokens.most_common()
print(sorted_tokens[10:30])

# Keep at most the 1000 most frequent tokens. Indices 0 and 1 are reserved for
# the special tokens below, so real tokens are numbered from 2 upwards.
tokens_top = sorted_tokens[:1000]
tokens2index = {token: index for index, (token, _) in enumerate(tokens_top, start=2)}
tokens2index['<pad>'] = 0
tokens2index['<unk>'] = 1

[('be', 198), ('he', 192), ('one', 147), ('who', 146), ('are', 141), ('no', 135), ('his', 132), ('will', 122), ('as', 119), ('its', 116), ('them', 114), ('all', 113), ('when', 112), ('i', 94), ('by', 93), ('from', 92), ('their', 91), ('way', 90), ('there', 88), ('with', 86)]


In [5]:
def encode_word2index(x, tokens2index):
    """Translate a sequence of tokens into their vocabulary indices.

    Args:
        x: iterable of tokens.
        tokens2index: mapping from token to integer index.

    Returns:
        A list with one index per token of ``x``; tokens missing from
        ``tokens2index`` are encoded as 1.
    """
    unknown_index = 1
    encoded = []
    for token in x:
        encoded.append(tokens2index.get(token, unknown_index))
    return encoded

# Encode the tokenised text of both splits with the same vocabulary mapping.
for frame in (train_data, test_data):
    frame['input_x'] = frame['seq_words'].apply(encode_word2index, args=(tokens2index,))

In [6]:
# Display the training frame, which now also holds the 'input_x' column of vocabulary indices.
train_data

Unnamed: 0,label,seq_words,input_x
0,0,"[the, way, that, can, be, spoken, of, is, not,...","[2, 29, 11, 46, 12, 770, 6, 3, 9, 2, 143, 29, ..."
1,1,"[the, tao, that, can, be, trodden, is, not, th...","[2, 35, 11, 46, 12, 1, 3, 9, 2, 282, 5, 190, 3..."
2,0,"[the, whole, world, recognizes, the, beautiful...","[2, 232, 75, 780, 2, 284, 20, 2, 284, 42, 34, ..."
3,1,"[all, in, the, world, know, the, beauty, of, t...","[23, 8, 2, 75, 61, 2, 784, 6, 2, 284, 5, 8, 11..."
4,0,"[not, to, honor, men, of, worth, will, keep, t...","[9, 4, 792, 56, 6, 460, 19, 85, 2, 45, 27, 793..."
...,...,...,...
135,1,"[he, who, in, tao, s, wars, has, skill, assume...","[13, 15, 8, 35, 65, 1, 43, 212, 1, 17, 1, 1, 1..."
136,0,"[the, strategists, have, a, saying, i, dare, n...","[2, 1, 37, 10, 268, 25, 236, 9, 1, 2, 423, 38,..."
137,1,"[a, master, of, the, art, of, war, has, said, ...","[10, 370, 6, 2, 744, 6, 227, 43, 204, 25, 47, ..."
138,0,"[my, words, are, very, easy, to, understand, a...","[171, 145, 16, 250, 114, 4, 584, 5, 250, 114, ..."


In [7]:
seq_len = 100
def _pad_truncate_seq(x,seq_len):
    if len(x) >= seq_len:
        return x[:seq_len]
    else:
        return x+[0]*(seq_len-len(x))

# Bring the encoded sequences of both splits to the common length `seq_len`.
for frame in (train_data, test_data):
    frame['input_x'] = frame['input_x'].apply(_pad_truncate_seq, args=(seq_len,))

In [8]:
# Preview the training frame again after 'input_x' was padded/truncated to seq_len entries.
train_data.head()

Unnamed: 0,label,seq_words,input_x
0,0,"[the, way, that, can, be, spoken, of, is, not,...","[2, 29, 11, 46, 12, 770, 6, 3, 9, 2, 143, 29, ..."
1,1,"[the, tao, that, can, be, trodden, is, not, th...","[2, 35, 11, 46, 12, 1, 3, 9, 2, 282, 5, 190, 3..."
2,0,"[the, whole, world, recognizes, the, beautiful...","[2, 232, 75, 780, 2, 284, 20, 2, 284, 42, 34, ..."
3,1,"[all, in, the, world, know, the, beauty, of, t...","[23, 8, 2, 75, 61, 2, 784, 6, 2, 284, 5, 8, 11..."
4,0,"[not, to, honor, men, of, worth, will, keep, t...","[9, 4, 792, 56, 6, 460, 19, 85, 2, 45, 27, 793..."


In [9]:
# Write both splits to CSV (without the index column) for the Dataset class to read back.
for frame, csv_path in ((train_data, 'training_data.csv'), (test_data, 'test_data.csv')):
    frame.to_csv(csv_path, index=False)

In [10]:
from torch.utils.data import Dataset
from ast import literal_eval
import torch.autograd as autograd

In [11]:
class LoadData(Dataset):
    """Dataset over a CSV file that has an 'input_x' and a 'label' column.

    The 'input_x' cells are parsed with ``ast.literal_eval`` while the file is
    read, so each one becomes a Python object (a list of indices when written
    by this notebook) instead of staying a string.
    """

    def __init__(self, filename):
        """Read ``filename`` into ``self.df``, parsing the 'input_x' column."""
        parsers = {'input_x': literal_eval}
        self.df = pd.read_csv(filename, converters=parsers)

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(features, label)`` tensors for the row labelled ``index``.

        The features keep the dtype torch infers from the stored values; the
        label is always converted to ``torch.float``.
        """
        features = torch.tensor(self.df.loc[index, 'input_x'])
        target = torch.tensor(self.df.loc[index, 'label'], dtype=torch.float)
        return features, target
    

# Datasets backed by the CSV files written earlier in the notebook.
training_set = LoadData('training_data.csv')
test_set = LoadData('test_data.csv')

# Both loaders share the same settings: batches of 70, shuffled, one worker process.
loader_options = {'batch_size': 70, 'shuffle': True, 'num_workers': 1}
training_generator = torch.utils.data.DataLoader(training_set, **loader_options)
test_generator = torch.utils.data.DataLoader(test_set, **loader_options)

In [12]:
# Inspect the training dataset object (shown with its default repr).
training_set

<__main__.LoadData at 0x7fe51bba0a90>

In [13]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# The vocabulary size counts every entry of the mapping, special tokens included.
vocab_size = len(tokens2index)

In [14]:
class LSTMModel(nn.Module):
    """Sequence classifier: embedding -> stacked LSTM -> max-pool over time -> linear -> sigmoid.

    LSTM algorithm built with help from Professor Huajie Shao's lecture slides.

    Args:
        vocab_size: number of rows in the embedding table.
        output_size: number of output units of the final linear layer
            (``forward`` only returns the first one).
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of the LSTM.
        n_layers: number of stacked LSTM layers.
        input_len: kernel size of the max-pool over the time axis; inputs are
            expected to have exactly this sequence length, otherwise the
            pooled tensor no longer matches the linear layer.
    """

    def __init__(self, vocab_size, output_size, embedding_dim,
                 hidden_dim, n_layers, input_len):
        super().__init__()

        self.output_size = output_size  # y_out size = 1
        self.n_layers = n_layers        # layers of LSTM
        self.hidden_dim = hidden_dim    # hidden dim of LSTM
        self.input_len = input_len      # length of each input sequence
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.init_weights()

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.pool = nn.MaxPool1d(self.input_len)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

    def init_weights(self):
        """Re-initialise the embedding table uniformly in [-0.1, 0.1]."""
        self.embedding.weight.data.uniform_(-0.1, 0.1)

    def _init_hidden(self, batch_size):
        """Build the initial ``(h_0, c_0)`` pair for the LSTM.

        While training, the states are drawn from a standard normal
        distribution. In eval mode they are zeros, so repeated inference on
        the same input returns the same prediction. The tensors are created
        on the device and with the dtype of the model's own parameters, so
        the model does not depend on a module-level ``device`` variable.

        Returns:
            Tuple of two tensors, each of shape
            ``(n_layers, batch_size, hidden_dim)``.
        """
        reference = self.embedding.weight
        shape = (self.n_layers, batch_size, self.hidden_dim)
        make_state = torch.randn if self.training else torch.zeros
        return (
            make_state(*shape, device=reference.device, dtype=reference.dtype),
            make_state(*shape, device=reference.device, dtype=reference.dtype),
        )

    def forward(self, x):
        """Score a batch of index sequences.

        Args:
            x: integer tensor of token indices, batch first
                (``(batch, input_len)`` is what the pooling layer expects).

        Returns:
            Tensor of shape ``(batch,)`` holding the sigmoid of the first
            output unit for every sequence.
        """
        batch_size = x.size(0)
        hidden_cell = self._init_hidden(batch_size)

        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds, hidden_cell)
        # (batch, seq, hidden) -> (batch, hidden, seq) so MaxPool1d pools over time.
        lstm_out = lstm_out.permute(0, 2, 1)
        out = self.pool(lstm_out)
        out = out.view(out.size(0), -1)

        out = self.dropout(out)

        out = self.fc(out)
        out = self.sigmoid(out)
        # Keep only the first output unit so the result is one value per sequence.
        out = out[:, 0]

        return out

In [15]:
# --- model architecture ---
embedding_dim = 100   # size of each token embedding
hidden_dim = 50       # hidden size of the LSTM
n_layers = 2          # number of stacked LSTM layers
output_size = 1       # number of output units
input_len = 100       # sequence length the model pools over

# --- optimisation ---
learning_rate = 0.003
num_epoches = 40      # full passes over the training loader
clip = 5              # maximum gradient norm passed to clip_grad_norm_

# --- run control ---
mode = 'train'

In [16]:
# Build the classifier from the hyperparameters above, then move it to the selected device.
model = LSTMModel(
    vocab_size=vocab_size,
    output_size=output_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    input_len=input_len,
)
model.to(device)

LSTMModel(
  (embedding): Embedding(1002, 100)
  (lstm): LSTM(100, 50, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (pool): MaxPool1d(kernel_size=100, stride=100, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=50, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [17]:
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
loss_fun = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [18]:
def _compute_accuracy(y_pred, y_batch, batch_size):
    accy = ((y_pred==y_batch).sum().item())/batch_size
    return accy

In [21]:
def main():
    """Train the global ``model`` and report its accuracy on the test loader.

    Uses the notebook-level ``mode``, ``model``, ``optimizer``, ``loss_fun``,
    ``training_generator``, ``test_generator``, ``device``, ``num_epoches``
    and ``clip``. A wandb run is always started. Training and the evaluation
    that follows it only run when ``mode == 'train'``.

    Side effects: updates the model parameters, prints and logs the loss to
    wandb on every second optimisation step, and prints the test accuracy.
    """
    global_step = 0
    print("Started")
    wandb.init(project='daodejing', name='LSTM')
    if mode == 'train':
        model.train()
        for _ in range(num_epoches):
            for x_batch, y_labels in training_generator:
                global_step += 1
                x_batch, y_labels = x_batch.to(device), y_labels.to(device)

                y_out = model(x_batch)

                loss = loss_fun(y_out, y_labels)

                optimizer.zero_grad()
                loss.backward()
                ## clip_grad_norm helps prevent the exploding gradient problem in LSTMs.
                nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()
                if global_step % 2 == 0:
                    print('step: {} loss: {}'.format(global_step, loss.item()))
                    wandb.log({'step': global_step, 'loss': loss.item()})

        total = 0
        accy_count = 0

        model.eval()
        with torch.no_grad():
            for x_batch, y_labels in test_generator:
                x_batch, y_labels = x_batch.to(device), y_labels.to(device)
                y_out = model(x_batch)
                # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
                y_pred = torch.round(y_out)

                total += len(y_labels)
                # Count the matches as an exact integer instead of dividing by
                # and re-multiplying with an unrelated constant.
                accy_count += (y_pred == y_labels).sum().item()

        # An empty test loader yields NaN rather than a ZeroDivisionError.
        accy = accy_count / total if total else float('nan')
        print("testing accy: ", accy)

# Entry point: start the run when this code executes as the main module.
if __name__ == '__main__':
    main()

Started


[34m[1mwandb[0m: Currently logged in as: [33mbingrui-ben-li[0m ([33mbenbingruili[0m). Use [1m`wandb login --relogin`[0m to force relogin


step: 2 loss: 0.697630763053894
step: 4 loss: 0.6973266005516052
step: 6 loss: 0.6953089833259583
step: 8 loss: 0.7024322748184204
step: 10 loss: 0.6936585307121277
step: 12 loss: 0.6784517765045166
step: 14 loss: 0.6844329833984375
step: 16 loss: 0.6999346017837524
step: 18 loss: 0.674450159072876
step: 20 loss: 0.6917302012443542
step: 22 loss: 0.6929349899291992
step: 24 loss: 0.6831426024436951
step: 26 loss: 0.6816210150718689
step: 28 loss: 0.6780406832695007
step: 30 loss: 0.6762481927871704
step: 32 loss: 0.6636157035827637
step: 34 loss: 0.6251184940338135
step: 36 loss: 0.5898539423942566
step: 38 loss: 0.5021918416023254
step: 40 loss: 0.3811611831188202
step: 42 loss: 0.29215574264526367
step: 44 loss: 0.28130656480789185
step: 46 loss: 0.19154000282287598
step: 48 loss: 0.16102786362171173
step: 50 loss: 0.1250331550836563
step: 52 loss: 0.13284893333911896
step: 54 loss: 0.08262196183204651
step: 56 loss: 0.11330393701791763
step: 58 loss: 0.13333888351917267
step: 60 los