In [1]:
import torch
# Pick the compute device. `torch.cuda.is_available` has to be *called*: the bare
# function object is always truthy, so the original check never tested for CUDA.
# `torch.backends.cudnn.is_available()` replaces the deprecated `torch.has_cudnn`
# attribute while keeping the original "CUDA and cuDNN" requirement.
if torch.cuda.is_available() and torch.backends.cudnn.is_available():
    device = torch.device('cuda')
else:
    device = torch.device("cpu")
print(device)


cpu


In [2]:
import re
import numpy as np
import pandas as pd
import pickle as pkl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter

# Load the tokens from pickle files
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load "loc_tokens.p" if it comes from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open("loc_tokens.p", "rb") as token_file:
    loc_tokens = pkl.load(token_file)

# Labels are read from the 'Y' column of the tab-separated location.csv.
loc_label = pd.read_csv('location.csv', sep='\t')['Y']
print(len(loc_tokens))
print(len(loc_label))

28056
28056


In [3]:
import math
# Keep only the examples that have at least one token and a non-negative label.
data = loc_tokens
label = loc_label.tolist()
x = []
y = []
for idx in range(len(data)):
    tokens = data[idx]
    if len(tokens) == 0:
        # nothing to learn from an empty token list
        continue
    raw_label = label[idx]
    # `not (... >= 0)` also rejects NaN labels, exactly like the positive test would
    if not float(raw_label) >= 0:
        continue
    x.append(tokens)
    y.append(int(raw_label))
from collections import Counter
from sklearn.model_selection import train_test_split
# Hold out 25% of the examples for validation; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=99)
max_vocab_size = 50000
# Reserve index 0 for padding and index 1 for unknown tokens.
PAD_IDX = 0
UNK_IDX = 1


# Flatten the training examples into a single token stream; the vocabulary is
# counted from the training split only.
all_tokens = [token for tokens in x_train for token in tokens]

def build_vocab(all_tokens, max_size=None):
    """Build the token <-> index lookup tables from a flat iterable of tokens.

    Args:
        all_tokens: iterable of hashable tokens (may be empty).
        max_size: maximum number of distinct tokens to keep, most frequent
            first. Defaults to the module-level ``max_vocab_size``.

    Returns:
        (token2id, id2token), in that order:
        token2id: dict mapping each kept token to its index; '<pad>' maps to
            PAD_IDX and '<unk>' maps to UNK_IDX.
        id2token: list where id2token[i] is the token with index i; the first
            two entries are '<pad>' and '<unk>', real tokens start at index 2.
    """
    if max_size is None:
        max_size = max_vocab_size
    token_counter = Counter(all_tokens)
    # Iterate over most_common() directly instead of unpacking zip(*...):
    # the unpacking raised ValueError when there were no tokens at all.
    vocab = [token for token, _ in token_counter.most_common(max_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    # Indices 0 and 1 are taken by the special tokens, so real tokens start at 2.
    token2id = dict(zip(vocab, range(2, 2 + len(vocab))))
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

def token2index_dataset(tokens_data):
    """Encode tokenised examples as lists of vocabulary indices.

    Each token is looked up in the module-level ``token2id`` mapping; tokens
    that are not in the mapping are replaced by ``UNK_IDX``.

    Args:
        tokens_data: iterable of token sequences.

    Returns:
        A list with one list of integer indices per input sequence.
    """
    lookup = token2id.get
    return [[lookup(tok, UNK_IDX) for tok in sentence] for sentence in tokens_data]

# Build the lookup tables from the training tokens, then encode both splits with them.
token2id, id2token = build_vocab(all_tokens)

train_data_indices, val_data_indices = (
    token2index_dataset(split) for split in (x_train, x_test)
)
# Class distribution of the training labels (shown as the cell output).
Counter(y_train)

Counter({0: 11882, 1: 6988, 2: 2168})

In [4]:
import json
# Serialise the token -> index mapping to token2id.json.
with open('token2id.json', 'w') as fp:
    fp.write(json.dumps(token2id))

In [5]:
# Write the vocabulary one entry per line, in index order.
# UTF-8 is requested explicitly so that non-ASCII tokens do not depend on the
# platform's default encoding (which can raise UnicodeEncodeError).
with open('id2token.txt', 'w', encoding='utf-8') as filehandle:
    for listitem in id2token:
        filehandle.write('%s\n' % listitem)

In [11]:
# Maximum number of token indices kept per example: LocDataset.__getitem__
# truncates to this length and loc_collate_func pads every sequence up to it.
MAX_SENTENCE_LENGTH = 200

import numpy as np
import torch
from torch.utils.data import Dataset

class LocDataset(Dataset):
    """Dataset pairing index-encoded token sequences with their labels."""

    def __init__(self, data_list, target_list):
        """Store the sequences and labels.

        Args:
            data_list: sequence of token-index lists.
            target_list: sequence of labels, same length as ``data_list``.
        """
        self.data_list = data_list
        self.target_list = target_list
        # every example needs exactly one label
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        """Number of examples."""
        return len(self.data_list)

    def __getitem__(self, key):
        """Return ``[token_idx, length, label]`` for the example at ``key``.

        The token list is cut to at most MAX_SENTENCE_LENGTH entries and
        ``length`` is the size of the truncated list.
        """
        truncated = self.data_list[key][:MAX_SENTENCE_LENGTH]
        return [truncated, len(truncated), self.target_list[key]]

def loc_collate_func(batch):
    """Collate ``[token_idx, length, label]`` items into padded batch tensors.

    Every sequence is right-padded with PAD_IDX up to MAX_SENTENCE_LENGTH.
    The integer dtype is fixed to int64 instead of being inferred, so an empty
    sequence (which numpy would infer as float64) can no longer change the
    dtype of the whole batch.

    Args:
        batch: iterable of ``[token_idx, length, label]`` items as produced by
            ``LocDataset.__getitem__``; ``length`` must equal
            ``len(token_idx)`` and must not exceed MAX_SENTENCE_LENGTH.

    Returns:
        ``[data, lengths, labels]``: three tensors moved to the module-level
        ``device``. ``data`` holds the padded token indices, one row per item.
    """
    label_list = [datum[2] for datum in batch]
    length_list = [datum[1] for datum in batch]
    # padding
    data_list = [
        np.pad(np.array(datum[0], dtype=np.int64),
               pad_width=(0, MAX_SENTENCE_LENGTH - datum[1]),
               mode="constant", constant_values=PAD_IDX)
        for datum in batch
    ]

    return [torch.from_numpy(np.array(data_list, dtype=np.int64)).to(device),
            torch.LongTensor(length_list).to(device),
            torch.LongTensor(label_list).to(device)]

BATCH_SIZE = 16
# Training batches are reshuffled every epoch.
train_dataset = LocDataset(train_data_indices, y_train)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=loc_collate_func,
                                           shuffle=True)

# Accuracy does not depend on batch order, so the validation loader iterates
# in a fixed order instead of drawing a new random permutation on every
# evaluation pass.
val_dataset = LocDataset(val_data_indices, y_test)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=BATCH_SIZE,
                                         collate_fn=loc_collate_func,
                                         shuffle=False)


In [15]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class BagOfWords(nn.Module):
    """Bag-of-words classifier: mean token embedding -> ReLU -> linear layer."""

    def __init__(self, vocab_size, emb_dim, num_classes=3, padding_idx=0):
        """
        Args:
            vocab_size: number of rows in the embedding table.
            emb_dim: size of each embedding vector.
            num_classes: number of output logits (default 3, the value that
                was previously hard-coded).
            padding_idx: index whose embedding is kept at zero so padded
                positions do not contribute to the sum (default 0). It should
                match the value used to pad the batches.
        """
        super(BagOfWords, self).__init__()
        # pay attention to padding_idx
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(emb_dim, num_classes)

    def forward(self, data, length):
        """Return unnormalised class scores (logits).

        Args:
            data: integer tensor of token indices, one row per example
                (embedded and then summed over dim 1).
            length: tensor with one entry per example giving the number of
                real tokens; used as the divisor for the mean.

        Returns:
            Tensor of logits with one row per example and ``num_classes``
            columns.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        # Broadcasting divides every embedding dimension by the example's
        # length; the explicit expand_as and the repeated .float() casts on
        # the activations were redundant for the default float32 model.
        out = out / length.view(-1, 1).float()
        out = self.relu(out)
        # return logits
        return self.linear(out)

emb_dim = 100
# One embedding row per vocabulary entry (including '<pad>' and '<unk>').
# nn.Module.to() returns the module itself, so construction and device
# placement can be chained; the bare name displays the model summary.
model = BagOfWords(len(id2token), emb_dim).to(device)
model

BagOfWords(
  (embed): Embedding(39750, 100, padding_idx=0)
  (relu): ReLU()
  (linear): Linear(in_features=100, out_features=3, bias=True)
)

In [16]:
import numpy as np
# Training hyper-parameters
num_epochs = 3  # number of passes over the training data
learning_rate = 0.001
ngram = 1  # not referenced anywhere else in the visible notebook
# Optimizer and loss function
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Function for testing the model
def test_model(loader, model):
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        data_batch = data_batch.long()
        out = model(data_batch, length_batch)
        outputs = F.softmax(out, dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
#         print(Counter(predicted.to('cpu').numpy().ravel().tolist()))
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

for epoch in range(num_epochs):
    for step, batch in enumerate(train_loader):
        tokens, seq_lengths, targets = batch
        # test_model() leaves the model in eval mode, so re-enable training
        # mode before every update
        model.train()
        optimizer.zero_grad()
        logits = model(tokens.long(), seq_lengths)
        loss = criterion(logits, targets.long())
        loss.backward()
        optimizer.step()
        # validate every 100 iterations (never on the very first step)
        if step == 0 or step % 100 != 0:
            continue
        val_acc = test_model(val_loader, model)
        print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
            epoch + 1, num_epochs, step + 1, len(train_loader), val_acc))
            

Epoch: [1/3], Step: [101/1315], Validation Acc: 56.83730215314416
Epoch: [1/3], Step: [201/1315], Validation Acc: 56.83730215314416
Epoch: [1/3], Step: [301/1315], Validation Acc: 56.83730215314416
Epoch: [1/3], Step: [401/1315], Validation Acc: 56.82304292029089
Epoch: [1/3], Step: [501/1315], Validation Acc: 56.83730215314416
Epoch: [1/3], Step: [601/1315], Validation Acc: 56.865820618850705
Epoch: [1/3], Step: [701/1315], Validation Acc: 57.293597604448884
Epoch: [1/3], Step: [801/1315], Validation Acc: 57.62155996007415
Epoch: [1/3], Step: [901/1315], Validation Acc: 57.50748609724797
Epoch: [1/3], Step: [1001/1315], Validation Acc: 57.7498930557536
Epoch: [1/3], Step: [1101/1315], Validation Acc: 58.99044631398831
Epoch: [1/3], Step: [1201/1315], Validation Acc: 59.018964779694855
Epoch: [1/3], Step: [1301/1315], Validation Acc: 59.38970483387994
Epoch: [2/3], Step: [101/1315], Validation Acc: 60.25951803792956
Epoch: [2/3], Step: [201/1315], Validation Acc: 60.188221873663196
Epo

In [17]:
# Dump the learned embedding table as plain text, one row per vocabulary index.
embedding_matrix = model.embed.weight.detach().cpu().numpy()
np.savetxt('embedding.txt', embedding_matrix)