In [2]:
import argparse
import os
# from pickletools import optimize
import random
import string
import time
from math import log
import numpy as np
import scipy.sparse as sp
from nltk.corpus import stopwords
from stanfordcorenlp import StanfordCoreNLP
from torch import Tensor, nn
from tqdm import tqdm
import torch
from torch.optim import AdamW
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, DataLoader
import pickle as pkl
import json

In [3]:
# Fix the seed of every RNG used in this notebook (Python, NumPy, PyTorch).
seed = 148
print(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

148


device(type='cpu')

In [4]:
def gen_corpus(dataset):
    """Load a dataset from ``data/`` and build its vocabulary and label index.

    Two files are read:

    * ``data/<dataset>.txt`` -- one tab-separated metadata line per document.
      Field 1 decides the split (it must contain ``train`` or ``test``) and
      field 2 is the label.
    * ``data/<dataset>.clean.txt`` -- one whitespace-tokenised document per
      line, aligned with the metadata file by line number.

    Training documents and test documents are shuffled independently with the
    global ``random`` module and then concatenated (train first, test last).

    Vocabulary and label ids are assigned in sorted order.  Iterating a ``set``
    of strings depends on per-process hash randomisation, which would give
    different ids on every kernel restart and make a previously saved model
    (whose embedding rows and output units are tied to those ids) meaningless.

    Args:
        dataset: Base name of the dataset files inside ``data/``.

    Returns:
        A 10-tuple ``(shuffle_doc_name_list, shuffle_doc_words_list, train_ids,
        test_ids, word_doc_freq, word_id_map, id_word_map, vocab, labels,
        label_list)`` where

        * ``shuffle_doc_name_list`` / ``shuffle_doc_words_list`` are the
          metadata lines / document texts in shuffled order,
        * ``train_ids`` / ``test_ids`` are the shuffled original line indices,
        * ``word_doc_freq`` maps a word to the number of documents containing it,
        * ``word_id_map`` / ``id_word_map`` map between words and vocab indices,
        * ``vocab`` is the sorted list of distinct words,
        * ``labels`` holds, per shuffled document, its index into ``label_list``,
        * ``label_list`` is the sorted list of distinct labels.

    Raises:
        IndexError: If a metadata line has fewer than three tab-separated fields
            or the cleaned-text file has fewer lines than the metadata file.
    """
    base_path = os.sep.join(['data', dataset])

    # Record ids by line position; looking lines up with list.index() is
    # quadratic and maps duplicated metadata lines to the same id.
    doc_name_list = []
    train_ids = []
    test_ids = []
    with open(base_path + '.txt', 'r', encoding='latin1') as meta_file:
        for doc_id, line in enumerate(meta_file):
            doc_name_list.append(line.strip())
            split_field = line.split("\t")[1]
            if 'test' in split_field:
                test_ids.append(doc_id)
            elif 'train' in split_field:
                train_ids.append(doc_id)

    # NOTE(review): no explicit encoding here, so the platform default is used
    # (the metadata file above is read as latin1) -- confirm this is intended.
    with open(base_path + '.clean.txt', 'r') as text_file:
        doc_content_list = [line.strip() for line in text_file]

    # Shuffle each split on its own, train first, so the RNG is consumed in a
    # fixed order.
    random.shuffle(train_ids)
    random.shuffle(test_ids)
    ids = train_ids + test_ids

    shuffle_doc_name_list = [doc_name_list[doc_id] for doc_id in ids]
    shuffle_doc_words_list = [doc_content_list[doc_id] for doc_id in ids]

    # The label is the third tab-separated field; use it for both the label
    # inventory and the per-document label so the two can never disagree.
    doc_labels = [meta.split('\t')[2] for meta in shuffle_doc_name_list]
    label_list = sorted(set(doc_labels))
    label_index = {label: idx for idx, label in enumerate(label_list)}
    labels = [label_index[label] for label in doc_labels]

    # Document frequency: count each word at most once per document.
    word_doc_freq = {}
    for doc_words in shuffle_doc_words_list:
        for word in dict.fromkeys(doc_words.split()):
            word_doc_freq[word] = word_doc_freq.get(word, 0) + 1

    vocab = sorted(word_doc_freq)
    word_id_map = {word: idx for idx, word in enumerate(vocab)}
    id_word_map = dict(enumerate(vocab))

    return shuffle_doc_name_list, shuffle_doc_words_list, train_ids, test_ids, word_doc_freq, word_id_map, id_word_map, vocab, labels, label_list


In [5]:
def gen_tfidf(corpus, word_id_map, word_doc_freq, vocab, train_size):
    """Build the document-word TF-IDF entries as three parallel lists.

    For every distinct word of every document one entry is emitted, in order
    of the word's first occurrence in that document:

    * row    -- the document index, shifted by ``len(vocab)`` for documents at
                position ``train_size`` or later,
    * column -- ``train_size`` plus the word id,
    * weight -- (occurrences of the word id in the document) *
                ``log(len(corpus) / word_doc_freq[word])``.

    Args:
        corpus: List of whitespace-tokenised document strings.
        word_id_map: Mapping from word to integer id.
        word_doc_freq: Mapping from word to number of documents containing it.
        vocab: List of words indexed by word id.
        train_size: Number of leading documents whose rows are not shifted.

    Returns:
        ``(row, col, weight_tfidf)`` -- three lists of equal length.
    """
    n_docs = len(corpus)
    n_words = len(vocab)
    rows, cols, weights = [], [], []

    for doc_idx, text in enumerate(corpus):
        tokens = text.split()

        # Term frequency inside this document, keyed by word id.
        id_counts = {}
        for token in tokens:
            wid = word_id_map[token]
            id_counts[wid] = id_counts.get(wid, 0) + 1

        # Documents past the training block are placed after the word rows.
        row_idx = doc_idx if doc_idx < train_size else doc_idx + n_words

        # dict.fromkeys keeps each distinct token once, in first-seen order.
        for token in dict.fromkeys(tokens):
            wid = word_id_map[token]
            idf = log(1.0 * n_docs / word_doc_freq[vocab[wid]])
            rows.append(row_idx)
            cols.append(train_size + wid)
            weights.append(id_counts[wid] * idf)

    return rows, cols, weights


In [6]:
# load corpus
(
    name,
    corpus,
    train_ids,
    test_ids,
    word_doc_freq,
    word_id_map,
    id_word_map,
    vocab,
    labels,
    label_list,
) = gen_corpus("20ng")
data = [train_ids, test_ids, corpus, labels, vocab, word_id_map, id_word_map, label_list]

num_labels = len(label_list)

# TF-IDF entries; every training document counts towards train_size here.
row_tfidf, col_tfidf, weight_tfidf = gen_tfidf(
    corpus=corpus,
    word_id_map=word_id_map,
    word_doc_freq=word_doc_freq,
    vocab=vocab,
    train_size=len(train_ids),
)

In [7]:
# Hold out 10% of the training documents for validation; the rest is trained on.
valid_size = int(len(train_ids) * 0.1)
train_size = len(train_ids) - valid_size

In [13]:
# --- graph / preprocessing settings ---
thres = 0.05        # NOTE(review): not used in the cells shown here -- confirm its role
window_size = 7     # NOTE(review): not used in the cells shown here -- confirm its role
max_len = 512       # documents are truncated / zero-padded to this many tokens

# --- LSTM model ---
embed_size = hidden_size = 200
dropout = 0

# --- optimisation ---
batch_size = 32
lr = 0.001
weight_decay = 0.000001
epochs = 0          # 0 makes train_lstm load saved weights instead of training


In [9]:
def trans_corpus_to_ids(corpus, word_id_map, max_len):
    """Encode documents as a fixed-width matrix of word ids.

    Each document is split on whitespace, cut to at most ``max_len`` tokens,
    mapped through ``word_id_map`` with every id shifted up by one (so that 0
    is free to act as padding), and right-padded with zeros to ``max_len``.

    Args:
        corpus: Iterable of whitespace-tokenised document strings.
        word_id_map: Mapping from word to integer id.
        max_len: Width of the returned matrix.

    Returns:
        ``np.ndarray`` of dtype ``int32`` with one row per document.

    Raises:
        KeyError: If a token is missing from ``word_id_map``.
    """
    encoded_rows = []
    for text in corpus:
        tokens = text.split()[:max_len]
        ids = [word_id_map[token] + 1 for token in tokens]  # shift: 0 means padding
        ids.extend([0] * (max_len - len(ids)))
        encoded_rows.append(ids)
    return np.asarray(encoded_rows, dtype=np.int32)

def lstm_eval(model, dataloader, device):
    """Run ``model`` over ``dataloader`` without gradients and score it.

    The model is switched to eval mode for the pass and back to train mode
    before returning.

    Args:
        model: Callable module returning ``(output, pred)`` for a batch of
            inputs; ``pred`` is arg-maxed over its last dimension.
        dataloader: Iterable yielding ``(x, y)`` batches of tensors.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(acc, all_outs)`` -- the fraction of predictions equal to the labels,
        and the first model output of every batch concatenated along axis 0 as
        a NumPy array.
    """
    model.eval()
    predicted, gold, output_chunks = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            x, y = (tensor.to(device) for tensor in batch)
            output, pred = model(x)
            output_chunks.append(output.cpu().numpy())
            predicted.extend(pred.argmax(dim=-1).tolist())
            gold.extend(y.tolist())

    acc = np.mean(np.asarray(predicted) == np.asarray(gold))
    all_outs = np.concatenate(output_chunks, axis=0)

    # Leave the model ready for further training.
    model.train()
    return acc, all_outs

class LSTM_classifier(nn.Module):
    """Embedding -> single-layer unidirectional LSTM -> mean-pool -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Embedding dimension (LSTM input size).
        hidden_size: LSTM hidden size (classifier input size).
        num_labels: Number of output classes.
        dropout: Forwarded to ``nn.LSTM`` as its ``dropout`` argument.
    """

    def __init__(self, vocab_size, emb_size, hidden_size, num_labels, dropout) -> None:
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.lstm = nn.LSTM(
            emb_size,
            hidden_size,
            num_layers=1,
            batch_first=True,
            dropout=dropout,
            bidirectional=False,
        )
        self.classifier = nn.Linear(in_features=hidden_size, out_features=num_labels)

    def forward(self, inputs):
        """Return ``(per-step LSTM outputs, class scores)`` for a batch of id sequences.

        The class scores come from the LSTM outputs averaged over dimension 1
        (the sequence axis, since the LSTM is ``batch_first``).
        """
        step_outputs, _ = self.lstm(self.embedding(inputs))
        pooled = step_outputs.mean(dim=1)
        return step_outputs, self.classifier(pooled)

def train_lstm(corpus, word_id_map, train_size, valid_size, labels, emb_size, hidden_size, dropout, batch_size, epochs, lr, weight_decay, num_labels,device,max_len):
    """Train (or load) the LSTM classifier and extract its outputs for every document.

    The encoded corpus is split by position into train
    (``[:train_size]``), dev (``[train_size:train_size+valid_size]``) and test
    (the remainder).

    * ``epochs > 0``: the model is trained with AdamW and cross-entropy loss.
      After each epoch it is scored on the dev split, and whenever the dev
      accuracy improves the weights are written to ``lstm1.bin``.  When
      training ends the best saved weights are loaded back, so the returned
      model and features correspond to the best dev accuracy rather than to
      whatever the last epoch produced.
    * otherwise: weights are loaded from ``lstm.bin``.

    Checkpoints are loaded with ``map_location=device`` so that a file saved on
    a GPU can be used on a CPU-only machine.

    NOTE(review): ``train_lstm`` is defined in two cells of this notebook;
    whichever cell is executed last is the one in effect -- keep them in sync
    or delete one.

    Args:
        corpus: List of whitespace-tokenised documents ordered train, dev, test.
        word_id_map: Mapping from word to integer id.
        train_size: Number of leading documents used for training.
        valid_size: Number of documents after the training block used as dev set.
        labels: Per-document integer labels, aligned with ``corpus``.
        emb_size, hidden_size, dropout: Model hyper-parameters.
        batch_size: Batch size for all data loaders.
        epochs: Number of training epochs; ``0`` (or less) loads ``lstm.bin``.
        lr, weight_decay: AdamW settings.
        num_labels: Number of output classes.
        device: Device the model and batches are placed on.
        max_len: Token length every document is truncated / padded to.

    Returns:
        ``(model, all_outs, corpus_ids)`` -- the model, the LSTM outputs of all
        documents concatenated in train, dev, test order, and the encoded corpus.
    """
    vocab_size = len(word_id_map) + 1  # ids are shifted by one; 0 is padding
    corpus_ids = trans_corpus_to_ids(corpus, word_id_map, max_len)
    model = LSTM_classifier(vocab_size, emb_size, hidden_size, num_labels, dropout)
    model.to(device)

    def _make_dataset(start, stop):
        # Slice inputs and labels with the same bounds so they stay aligned.
        x = torch.tensor(corpus_ids[start:stop, :], dtype=torch.long)
        y = torch.tensor(labels[start:stop], dtype=torch.long)
        return TensorDataset(x, y)

    dev_end = train_size + valid_size
    train_dataset = _make_dataset(0, train_size)
    dev_dataset = _make_dataset(train_size, dev_end)
    test_dataset = _make_dataset(dev_end, None)

    # Shuffled loader for optimisation; sequential loaders for feature
    # extraction so the outputs keep the corpus order.
    train_dataloader = DataLoader(train_dataset, batch_size, sampler=RandomSampler(train_dataset))
    train_dev_dataloader = DataLoader(train_dataset, batch_size, sampler=SequentialSampler(train_dataset))
    dev_dataloader = DataLoader(dev_dataset, batch_size, sampler=SequentialSampler(dev_dataset))
    test_dataloader = DataLoader(test_dataset, batch_size, sampler=SequentialSampler(test_dataset))

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    model.train()
    loss_func = nn.CrossEntropyLoss(reduction='mean')
    best_acc = 0.0

    if epochs > 0:
        saved_best = False
        for _ in range(epochs):
            for batch in tqdm(train_dataloader):
                x, y = [one.to(device) for one in batch]
                _, pred = model(x)
                loss = loss_func(pred, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            acc, _ = lstm_eval(model, dev_dataloader, device)
            if acc > best_acc:
                best_acc = acc
                print("Saving semantic model into lstm1.bin")
                torch.save(model.state_dict(), 'lstm1.bin')
                saved_best = True
                print("current best acc={:.4f}".format(acc))
        if saved_best:
            # Use the best-on-dev weights, not the last epoch's, for the features.
            model.load_state_dict(torch.load('lstm1.bin', map_location=device))
    else:
        print("loading lstm model")
        model.load_state_dict(torch.load('lstm.bin', map_location=device))

    # Only the extracted outputs are needed here; the accuracies are discarded.
    _, all_outs_train = lstm_eval(model, train_dev_dataloader, device)
    _, all_outs_dev = lstm_eval(model, dev_dataloader, device)
    _, all_outs_test = lstm_eval(model, test_dataloader, device)
    all_outs = np.concatenate([all_outs_train, all_outs_dev, all_outs_test], axis=0)
    return model, all_outs, corpus_ids

In [12]:
def train_lstm(corpus, word_id_map, train_size, valid_size, labels, emb_size, hidden_size, dropout, batch_size, epochs, lr, weight_decay, num_labels,device,max_len):
    """Train (or load) the LSTM classifier and extract its outputs for every document.

    The encoded corpus is split by position into train
    (``[:train_size]``), dev (``[train_size:train_size+valid_size]``) and test
    (the remainder).

    * ``epochs > 0``: the model is trained with AdamW and cross-entropy loss.
      After each epoch it is scored on the dev split, and whenever the dev
      accuracy improves the weights are written to ``lstm1.bin``.  When
      training ends the best saved weights are loaded back, so the returned
      model and features correspond to the best dev accuracy rather than to
      whatever the last epoch produced.
    * otherwise: weights are loaded from ``lstm.bin``.

    Checkpoints are loaded with ``map_location=device`` so that a file saved on
    a GPU can be used on a CPU-only machine.

    NOTE(review): ``train_lstm`` is defined in two cells of this notebook;
    whichever cell is executed last is the one in effect -- keep them in sync
    or delete one.

    Args:
        corpus: List of whitespace-tokenised documents ordered train, dev, test.
        word_id_map: Mapping from word to integer id.
        train_size: Number of leading documents used for training.
        valid_size: Number of documents after the training block used as dev set.
        labels: Per-document integer labels, aligned with ``corpus``.
        emb_size, hidden_size, dropout: Model hyper-parameters.
        batch_size: Batch size for all data loaders.
        epochs: Number of training epochs; ``0`` (or less) loads ``lstm.bin``.
        lr, weight_decay: AdamW settings.
        num_labels: Number of output classes.
        device: Device the model and batches are placed on.
        max_len: Token length every document is truncated / padded to.

    Returns:
        ``(model, all_outs, corpus_ids)`` -- the model, the LSTM outputs of all
        documents concatenated in train, dev, test order, and the encoded corpus.
    """
    vocab_size = len(word_id_map) + 1  # ids are shifted by one; 0 is padding
    corpus_ids = trans_corpus_to_ids(corpus, word_id_map, max_len)
    model = LSTM_classifier(vocab_size, emb_size, hidden_size, num_labels, dropout)
    model.to(device)

    def _make_dataset(start, stop):
        # Slice inputs and labels with the same bounds so they stay aligned.
        x = torch.tensor(corpus_ids[start:stop, :], dtype=torch.long)
        y = torch.tensor(labels[start:stop], dtype=torch.long)
        return TensorDataset(x, y)

    dev_end = train_size + valid_size
    train_dataset = _make_dataset(0, train_size)
    dev_dataset = _make_dataset(train_size, dev_end)
    test_dataset = _make_dataset(dev_end, None)

    # Shuffled loader for optimisation; sequential loaders for feature
    # extraction so the outputs keep the corpus order.
    train_dataloader = DataLoader(train_dataset, batch_size, sampler=RandomSampler(train_dataset))
    train_dev_dataloader = DataLoader(train_dataset, batch_size, sampler=SequentialSampler(train_dataset))
    dev_dataloader = DataLoader(dev_dataset, batch_size, sampler=SequentialSampler(dev_dataset))
    test_dataloader = DataLoader(test_dataset, batch_size, sampler=SequentialSampler(test_dataset))

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    model.train()
    loss_func = nn.CrossEntropyLoss(reduction='mean')
    best_acc = 0.0

    if epochs > 0:
        saved_best = False
        for _ in range(epochs):
            for batch in tqdm(train_dataloader):
                x, y = [one.to(device) for one in batch]
                _, pred = model(x)
                loss = loss_func(pred, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            acc, _ = lstm_eval(model, dev_dataloader, device)
            if acc > best_acc:
                best_acc = acc
                print("Saving semantic model into lstm1.bin")
                torch.save(model.state_dict(), 'lstm1.bin')
                saved_best = True
                print("current best acc={:.4f}".format(acc))
        if saved_best:
            # Use the best-on-dev weights, not the last epoch's, for the features.
            model.load_state_dict(torch.load('lstm1.bin', map_location=device))
    else:
        print("loading lstm model")
        model.load_state_dict(torch.load('lstm.bin', map_location=device))

    # Only the extracted outputs are needed here; the accuracies are discarded.
    _, all_outs_train = lstm_eval(model, train_dev_dataloader, device)
    _, all_outs_dev = lstm_eval(model, dev_dataloader, device)
    _, all_outs_test = lstm_eval(model, test_dataloader, device)
    all_outs = np.concatenate([all_outs_train, all_outs_dev, all_outs_test], axis=0)
    return model, all_outs, corpus_ids

In [11]:
# One extra vocabulary slot: id 0 is reserved for padding by trans_corpus_to_ids.
vocab_size = 1 + len(word_id_map)
corpus_ids = trans_corpus_to_ids(corpus=corpus, word_id_map=word_id_map, max_len=max_len)

In [14]:
# Train the LSTM (or load saved weights when epochs == 0) and collect its outputs.
model, all_outs, corpus_ids = train_lstm(
    corpus=corpus,
    word_id_map=word_id_map,
    train_size=train_size,
    valid_size=valid_size,
    labels=labels,
    emb_size=embed_size,
    hidden_size=hidden_size,
    dropout=dropout,
    batch_size=batch_size,
    epochs=epochs,
    lr=lr,
    weight_decay=weight_decay,
    num_labels=num_labels,
    device=device,
    max_len=max_len,
)

loading lstm model
