In [1]:
import sys
import os
import csv
import random

import pickle
import scipy
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.init as init
from torch import optim

import common
from utils.io_utils import IOUtils
from utils.nlp_utils import NLPUtils

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold

# Single seed shared by every random number generator the notebook touches,
# so that a fresh "Restart & Run All" reproduces the same shuffles and
# parameter initialisations.
seed = 10

np.random.seed(seed)
random.seed(seed)
torch.cuda.manual_seed_all(seed)
torch.manual_seed(seed)

In [2]:
# Run-level constants. PERMISSION_TYPE and MODEL_TYPE are interpolated into
# the report, checkpoint and predicted-review file names built in Opts.
PERMISSION_TYPE = "READ_CALENDAR"
MODEL_TYPE = "HAN"
OUTPUT_DIR = "output/reports/{}".format(MODEL_TYPE)
# Root folder holding the pickled data and model checkpoints. The default is
# the original author's machine-specific location; set the HAN_PARAMETERS_DIR
# environment variable to run the notebook elsewhere without editing it.
PARAMETERS_DIR = os.environ.get(
    "HAN_PARAMETERS_DIR", "/home/huseyinalecakir/HAN/data/saved-parameters"
)

In [3]:
class Opts:
    """Plain namespace of run options; every value is a class attribute.

    File locations are derived from the module-level PARAMETERS_DIR,
    OUTPUT_DIR, MODEL_TYPE and PERMISSION_TYPE constants at class-definition
    time. `device` is a placeholder that is assigned after instantiation.
    """

    permission_type = PERMISSION_TYPE
    useful_reviews = 5  # not read by the code in this section

    # --- file locations ---------------------------------------------------
    # NOTE: "emdeddings" is the spelling of the file name as stored on disk.
    saved_data = "/".join(
        [PARAMETERS_DIR, "saved-data", "emdeddings-documents-w2i.pickle"]
    )
    saved_reviews = "/".join([PARAMETERS_DIR, "saved-data", "reviews.pickle"])
    saved_predicted_reviews = "/".join(
        [PARAMETERS_DIR, "saved-data", "predicted-" + PERMISSION_TYPE + "-reviews.pickle"]
    )
    model_checkpoint = "/".join(
        [PARAMETERS_DIR, "saved-models", MODEL_TYPE + "-" + PERMISSION_TYPE + ".pt"]
    )
    outdir = "/".join([OUTPUT_DIR, MODEL_TYPE + "-" + PERMISSION_TYPE + ".out"])

    # --- preprocessing / runtime -------------------------------------------
    stemmer = "porter"  # not read by the code in this section
    disable_cuda = False  # True forces CPU even when CUDA is available

    # --- network -----------------------------------------------------------
    hidden_size = 128  # embedding width and GRU / attention / classifier size
    init_weight = 0.08  # parameters are drawn from U(-init_weight, init_weight)
    output_size = 1  # width of the classifier's linear output

    # --- optimisation ------------------------------------------------------
    grad_clip = 5  # bound passed to clip_grad_value_
    dropout = 0  # classifier applies dropout only when this is > 0
    dropoutrec = 0  # not read by the code in this section
    learning_rate_decay = 0.985  # multiplicative factor used by Model.rate_decay
    learning_rate_decay_after = 2  # not read by the code in this section
    print_every = 100  # not read by the code in this section

    device = None  # set to a torch.device once an instance is created

In [4]:
# Instantiate the options, then choose the compute device: CUDA only when it
# is not disabled in the options and is actually available, otherwise CPU.
args = Opts()

args.device = torch.device(
    "cuda" if not args.disable_cuda and torch.cuda.is_available() else "cpu"
)

In [5]:
class Data:
    """Holds the corpus, vocabulary, embeddings and review collections.

    Attributes:
        w2i: vocabulary mapping loaded from the pickle (its length sizes the
            embedding table and it is indexed by word).
        entries: collection of documents; each document exposes an
            `index_tensors` list of tensors.
        train_entries / test_entries: splits assigned by the notebook.
        ext_embeddings: external embeddings loaded together with `entries`.
        reviews / predicted_reviews: mappings from an id to a list of review
            objects, each exposing an `index_tensor`.
    """

    def __init__(self):
        self.w2i = None
        self.entries = None
        self.train_entries = None
        self.test_entries = None
        # `load` stores the embeddings under `ext_embeddings`; declare that
        # name here so it exists before the first load.
        self.ext_embeddings = None
        # Legacy spelling kept so existing readers of `ext_embedding` still work.
        self.ext_embedding = None
        self.reviews = None
        self.predicted_reviews = None

    @staticmethod
    def _move_reviews(reviews_by_id, device):
        """Move the `index_tensor` of every review in the mapping to `device`."""
        for review_list in reviews_by_id.values():
            for review in review_list:
                review.index_tensor = review.index_tensor.to(device=device)

    def to(self, device):
        """Move every stored index tensor to `device`, updating objects in place.

        Collections that have not been loaded yet (still None) are skipped.
        """
        # Compare against None rather than relying on truthiness: the notebook
        # turns `entries` into a numpy array, whose truth value is ambiguous.
        if self.entries is not None:
            for document in self.entries:
                for position, tensor in enumerate(document.index_tensors):
                    document.index_tensors[position] = tensor.to(device=device)
        if self.reviews is not None:
            self._move_reviews(self.reviews, device)
        if self.predicted_reviews is not None:
            self._move_reviews(self.predicted_reviews, device)

    def load(self, infile):
        """Load `(ext_embeddings, entries, w2i)` from the pickle at `infile`."""
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use it on files produced by this project.
        with open(infile, "rb") as target:
            self.ext_embeddings, self.entries, self.w2i = pickle.load(target)

    def save_data(self, infile):
        """Write `(ext_embeddings, entries, w2i)` to `infile` as a pickle.

        The tuple layout matches what `load` expects. The parameter keeps its
        historical name `infile` even though it is the output path.
        """
        with open(infile, "wb") as target:
            pickle.dump((self.ext_embeddings, self.entries, self.w2i), target)

    def load_predicted_reviews(self, infile):
        """Load predicted reviews and sort each list by descending prediction."""
        # NOTE(review): same pickle trust caveat as in `load`.
        with open(infile, "rb") as target:
            self.predicted_reviews = pickle.load(target)
        for review_list in self.predicted_reviews.values():
            review_list.sort(
                key=lambda review: review.prediction_result.item(), reverse=True
            )

    def load_reviews(self, infile):
        """Load the reviews mapping from the pickle at `infile`."""
        # NOTE(review): same pickle trust caveat as in `load`.
        with open(infile, "rb") as target:
            self.reviews = pickle.load(target)

In [14]:
class Encoder(nn.Module):
    """Word-level encoder: embedding lookup followed by a bidirectional GRU.

    Args:
        opt: options object; `hidden_size` sets both the embedding width and
            the GRU hidden size, `init_weight` bounds the uniform init.
        w2i: word -> row-index mapping; its length sizes the embedding table.
    """

    def __init__(self, opt, w2i):
        super(Encoder, self).__init__()
        self.opt = opt
        self.w2i = w2i

        self.gru = nn.GRU(
            self.opt.hidden_size,
            self.opt.hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.embedding = nn.Embedding(len(self.w2i), self.opt.hidden_size)
        self.__initParameters()

    def __initParameters(self):
        """Re-draw every trainable parameter from U(-init_weight, init_weight)."""
        bound = self.opt.init_weight
        for param in self.parameters():
            if param.requires_grad:
                init.uniform_(param, -bound, bound)

    def initalizedPretrainedEmbeddings(self, embeddings):
        """Overwrite the embedding table with pretrained vectors.

        Args:
            embeddings: mapping indexed by word that yields a vector of
                length `hidden_size` for every word in `w2i` (a missing word
                raises KeyError, as before).

        The values are copied into the existing weight in place rather than
        swapping in a new Parameter, so the weight stays on the module's
        current device and remains the same object an already constructed
        optimizer refers to.
        """
        weights_matrix = np.zeros((len(self.w2i), self.opt.hidden_size))
        for word, row in self.w2i.items():
            weights_matrix[row] = embeddings[word]
        with torch.no_grad():
            # copy_ converts dtype (float64 -> parameter dtype) and device.
            self.embedding.weight.copy_(torch.from_numpy(weights_matrix))

    def forward(self, input_src):
        """Encode a batch of index sequences.

        Args:
            input_src: integer tensor of word indices, batch first.

        Returns:
            `(outputs, (h_forward, h_backward))` where `outputs` is the GRU
            output for every time step and the pair holds the final hidden
            state of the forward and backward direction. A GRU has no cell
            state; the second element is a pair only because the stacked
            final state of a one-layer bidirectional GRU has exactly two
            slices along its first dimension.
        """
        src_emb = self.embedding(input_src)  # batch_size x src_length x emb_size
        outputs, final_hidden = self.gru(src_emb)
        h_forward, h_backward = final_hidden
        return outputs, (h_forward, h_backward)


class Classifier(nn.Module):
    """Linear layer followed by a sigmoid, with optional input dropout.

    Args:
        opt: options object providing `hidden_size` (input width),
            `output_size` (output width), `init_weight` (uniform init bound)
            and `dropout` (dropout probability; 0 disables it).
    """

    def __init__(self, opt):
        super(Classifier, self).__init__()
        self.opt = opt
        self.hidden_size = opt.hidden_size
        self.linear = nn.Linear(self.hidden_size, opt.output_size)

        self.sigmoid = nn.Sigmoid()
        self.__initParameters()

    def __initParameters(self):
        """Re-draw every trainable parameter from U(-init_weight, init_weight)."""
        bound = self.opt.init_weight
        for param in self.parameters():
            if param.requires_grad:
                init.uniform_(param, -bound, bound)

    def forward(self, prev_h):
        """Map `prev_h` (last dimension `hidden_size`) to sigmoid outputs."""
        if self.opt.dropout > 0:
            # The module never defined a `dropout` attribute, so this branch
            # used to raise AttributeError. The functional form reads the
            # probability from `opt` at call time and is active only in
            # training mode.
            prev_h = nn.functional.dropout(
                prev_h, p=self.opt.dropout, training=self.training
            )
        h2y = self.linear(prev_h)
        pred = self.sigmoid(h2y)
        return pred
    
class Attention(nn.Module):
    """Attention pooling of a sequence of bidirectional encoder outputs.

    Each time step is projected from `2 * hidden_size` to `hidden_size`,
    scored by a dot product with a learned context vector, and the softmax of
    those scores weights a sum of the original time-step vectors.

    Args:
        opt: options object providing `hidden_size` and `init_weight`.

    Note:
        `context` is registered as an `nn.Parameter`, so it follows the
        module in `.to(device)`, appears in `parameters()` and `state_dict()`
        and receives the same uniform initialisation as the other weights.
        State dicts saved while it was a plain tensor do not contain a
        `context` entry.
    """

    def __init__(self, opt):
        super(Attention, self).__init__()
        self.opt = opt
        self.hidden_size = opt.hidden_size
        self.linear = nn.Linear(2 * self.hidden_size, opt.hidden_size)
        self.context = nn.Parameter(torch.rand(self.hidden_size))
        self.__initParameters()

    def __initParameters(self):
        """Re-draw every trainable parameter from U(-init_weight, init_weight)."""
        bound = self.opt.init_weight
        for param in self.parameters():
            if param.requires_grad:
                init.uniform_(param, -bound, bound)

    def forward(self, inputs):
        """Pool one sequence into a single vector.

        Args:
            inputs: tensor whose second dimension is the sequence length; the
                reshape below flattens everything else, so it yields the
                expected (length, 2 * hidden_size) matrix only for a batch of
                exactly one sequence.

        Returns:
            1-D tensor of size `2 * hidden_size`: the attention-weighted sum
            of the time steps.
        """
        inputs = inputs.view(inputs.shape[1], -1)
        hidden_repr = self.linear(inputs)
        scores = torch.mv(hidden_repr, self.context)
        # softmax == exp(scores) / sum(exp(scores)), but does not overflow
        # to inf/NaN when the scores are large.
        normalized_weight = torch.softmax(scores, dim=0)
        normalized_hidden_repr = torch.mv(inputs.t(), normalized_weight)
        return normalized_hidden_repr

In [23]:
class Model:
    """Bundles the network modules with their optimizer and loss.

    Attributes:
        opt: options object passed to `create` (or restored by `load`).
        encoders: name -> module; "sentenceL1" is the word-level Encoder and
            "sentenceL2" a GRUCell taking `2 * hidden_size` inputs.
        attentions: name -> Attention module ("word_attention").
        review_encoder: placeholder, not assigned anywhere in this class.
        classifier: the Classifier module.
        optimizer: RMSprop over the parameters of all modules above.
        criterion: binary cross-entropy loss.
    """

    def __init__(self):
        self.opt = None
        self.encoders = {}
        self.attentions = {}
        self.review_encoder = None
        self.classifier = None
        self.optimizer = None
        self.criterion = None

    def _all_modules(self):
        """Return the encoders, the attentions and (once built) the classifier."""
        modules = list(self.encoders.values()) + list(self.attentions.values())
        if self.classifier is not None:
            modules.append(self.classifier)
        return modules

    def create(self, opt, data):
        """Build all modules on `opt.device` plus the optimizer and loss.

        Args:
            opt: options object (reads `device` and `hidden_size` here; the
                sub-modules read further fields).
            data: object exposing the vocabulary mapping as `data.w2i`.
        """
        self.opt = opt
        device = self.opt.device
        self.encoders["sentenceL1"] = Encoder(self.opt, data.w2i).to(device=device)
        self.attentions["word_attention"] = Attention(self.opt).to(device=device)
        # Moved to the target device like every other module; it used to be
        # left on the CPU.
        self.encoders["sentenceL2"] = nn.GRUCell(
            2 * self.opt.hidden_size, self.opt.hidden_size
        ).to(device=device)
        self.classifier = Classifier(self.opt).to(device=device)

        # Every module's parameters are optimised, including the attention
        # layers, which used to be left out and therefore never trained.
        params = []
        for module in self._all_modules():
            params += list(module.parameters())
        self.optimizer = optim.RMSprop(params)
        self.criterion = nn.BCELoss()

    def train(self):
        """Switch every module to training mode."""
        for module in self._all_modules():
            module.train()

    def eval(self):
        """Switch every module to evaluation mode."""
        for module in self._all_modules():
            module.eval()

    def step(self):
        """Apply one optimizer update."""
        self.optimizer.step()

    def zero_grad(self):
        """Clear the gradients of all optimised parameters."""
        self.optimizer.zero_grad()

    def rate_decay(self):
        """Multiply every parameter group's learning rate by `learning_rate_decay`."""
        for param_group in self.optimizer.param_groups:
            param_group["lr"] = param_group["lr"] * self.opt.learning_rate_decay

    def grad_clip(self):
        """Clamp every gradient element to [-opt.grad_clip, opt.grad_clip]."""
        for module in self._all_modules():
            torch.nn.utils.clip_grad_value_(module.parameters(), self.opt.grad_clip)

    def save(self, filename):
        """Save options, module state dicts and optimizer state to `filename`."""
        checkpoint = {"opt": self.opt}
        for name, encoder in self.encoders.items():
            checkpoint[name] = encoder.state_dict()
        for name, attention in self.attentions.items():
            checkpoint[name] = attention.state_dict()
        checkpoint["classifier"] = self.classifier.state_dict()
        checkpoint["optimizer"] = self.optimizer.state_dict()
        torch.save(checkpoint, filename)

    def load(self, filename, data):
        """Rebuild the model from a checkpoint written by `save`.

        Args:
            filename: path of the checkpoint.
            data: object exposing `w2i`, forwarded to `create`.
        """
        # NOTE(review): the checkpoint stores the pickled options object, so
        # this must only be used on trusted files; recent PyTorch releases
        # may need an explicit `weights_only=False` here - confirm against
        # the installed version.
        checkpoint = torch.load(filename)
        opt = checkpoint["opt"]
        self.create(opt, data)
        for name, encoder in self.encoders.items():
            encoder.load_state_dict(checkpoint[name])
        for name, attention in self.attentions.items():
            attention.load_state_dict(checkpoint[name])
        self.classifier.load_state_dict(checkpoint["classifier"])
        try:
            self.optimizer.load_state_dict(checkpoint["optimizer"])
        except ValueError:
            # A checkpoint written before the attention parameters were part
            # of the optimizer has a smaller parameter group; keep the fresh
            # optimizer state instead of failing the whole load.
            import warnings

            warnings.warn(
                "optimizer state in checkpoint does not match the current "
                "parameter set; continuing with a freshly initialised optimizer"
            )


def write_file(filename, string):
    """Append `string` plus a trailing newline to the file at `filename`.

    The file is opened in append mode, so it is created when missing and
    earlier content is preserved.
    """
    with open(filename, mode="a") as handle:
        line = "{}\n".format(string)
        handle.write(line)
        handle.flush()


def train_item(args, model, document):
    """Run the word-level half of the forward pass for one document.

    Gradients are cleared, then every non-empty sentence is encoded with the
    "sentenceL1" encoder and pooled by the "word_attention" module into one
    vector per sentence.

    Args:
        args: run options (not read here yet).
        model: object exposing `zero_grad()`, `encoders["sentenceL1"]` and
            `attentions["word_attention"]`.
        document: object exposing `index_tensors`, an iterable of tensors
            whose second dimension is the sentence length.

    Returns:
        None. The function is still a work in progress: the sentence vectors
        are computed but not yet consumed.
    """
    model.zero_grad()
    word_encoder = model.encoders["sentenceL1"]
    word_attention = model.attentions["word_attention"]

    sentence_encodings = []
    for sentence_index_tensor in document.index_tensors:
        if sentence_index_tensor.shape[1] == 0:
            continue  # nothing to encode for an empty sentence
        outputs_s, _final_states = word_encoder(sentence_index_tensor)
        sentence_encodings.append(word_attention(outputs_s))

    if not sentence_encodings:
        # Every sentence was empty; indexing the first encoding here used to
        # raise IndexError.
        return None

    # TODO: feed `sentence_encodings` through the sentence-level encoder
    # ("sentenceL2") and the classifier, then compute the loss and
    # back-propagate. The discarded torch.zeros(...) allocation that stood
    # here had no effect and was removed.
    return None

In [24]:
# Load the pickled corpus and move its tensors to the selected device.
data = Data()
data.load(args.saved_data)
data.to(args.device)

# Shuffle while the entries are still a plain Python list: random.shuffle
# swaps elements in place, which is only safe on an ndarray when it is 1-D.
# For a 1-D array the permutation under a given seed is the same as before.
data.entries = list(data.entries)
random.shuffle(data.entries)
data.entries = np.array(data.entries)

# Hold out the first tenth (rounded down) as the test split.
num_test_entries = len(data.entries) // 10
data.test_entries = data.entries[:num_test_entries]
data.train_entries = data.entries[num_test_entries:]

In [25]:
# Build a freshly initialised model from the run options and the loaded
# vocabulary (create reads data.w2i).
model = Model()
model.create(args, data)
# Keep the first training document as a sample input for a single-item run.
document = data.train_entries[0]

In [26]:
# Run train_item once on the sample document as a quick check.
train_item(args, model, document)

torch.Size([1, 6, 256])
torch.Size([1, 7, 256])
torch.Size([1, 7, 256])
torch.Size([1, 3, 256])
torch.Size([1, 18, 256])
torch.Size([1, 19, 256])
torch.Size([1, 8, 256])
torch.Size([1, 13, 256])
torch.Size([1, 4, 256])
torch.Size([1, 9, 256])
torch.Size([1, 6, 256])
torch.Size([1, 13, 256])
torch.Size([1, 15, 256])
torch.Size([1, 9, 256])
torch.Size([1, 10, 256])
torch.Size([1, 6, 256])
torch.Size([1, 10, 256])
torch.Size([1, 9, 256])
torch.Size([1, 9, 256])
torch.Size([1, 14, 256])
