In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

torch.manual_seed(1)

import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from nltk.corpus import wordnet
import json

  (fname, cnt))
  (fname, cnt))


In [2]:
import gensim
# Load word vectors from a binary word2vec-format file into a KeyedVectors object.
gensim_model = gensim.models.KeyedVectors.load_word2vec_format('./word2vec/data.bin', binary=True)

In [3]:
# Raw embedding matrix of the loaded model. `KeyedVectors.vectors` is the
# supported spelling; the `.wv` and `.syn0` aliases used previously both emit
# deprecation warnings in gensim 3.x and are scheduled for removal in 4.0.
weights = gensim_model.vectors
# Width of a single word vector (number of columns of `weights`).
embedding_dim = weights.shape[1]

  if __name__ == '__main__':
  if __name__ == '__main__':


In [4]:
# Append one all-zero row at the end of the embedding matrix to act as the
# vector for unknown (out-of-vocabulary) words.
# - The matrix is rebuilt from the model's own vectors, so re-running this
#   cell does not keep stacking additional rows.
# - The zero row uses the embeddings' dtype; a default float64 row would make
#   NumPy upcast (and double the memory of) the whole matrix.
weights = np.append(
    gensim_model.vectors,
    np.zeros((1, embedding_dim), dtype=gensim_model.vectors.dtype),
    axis=0,
)

In [5]:
# Number of rows in the embedding matrix; used as the embedding table size
# and (minus one) as the index of the unknown-word row.
vocab_size = weights.shape[0]

In [6]:
# Global flag: True when CUDA is available. Later cells consult it to decide
# whether tensors and the model are moved to the GPU.
cuda = torch.cuda.is_available()

In [7]:
# Number of values the model outputs per token (Valence, Arousal, Dominance).
out_size = 3

In [8]:
class LSTMTagger(nn.Module):
    """Single-layer LSTM over word embeddings with a tanh-squashed linear head.

    A sentence (1-D tensor of token indices) is embedded, run through the LSTM
    with batch size 1, and every time step is projected to ``tagset_size``
    values in ``[-1, 1]``.

    The recurrent state lives on ``self.hidden`` and is carried over between
    ``forward`` calls; assign ``model.hidden = model.init_hidden()`` to reset
    it before scoring a new sentence.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Create the embedding table, the LSTM and the output projection.

        Args:
            embedding_dim: Width of each word vector.
            hidden_dim: Size of the LSTM hidden and cell state.
            vocab_size: Number of rows in the embedding table.
            tagset_size: Number of output values per token.
        """
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.out = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh all-zero ``(h, c)`` LSTM state on the model's device.

        The axes semantics are (num_layers, minibatch_size, hidden_dim).
        """
        # Follow the parameters' device instead of a global CUDA flag, so the
        # state matches the model even when a GPU is present but the model
        # has been left on the CPU.
        device = next(self.parameters()).device
        h = torch.zeros(1, 1, self.hidden_dim, device=device)
        c = torch.zeros(1, 1, self.hidden_dim, device=device)
        return (h, c)

    def forward(self, sentence):
        """Score every token of ``sentence``.

        Args:
            sentence: 1-D LongTensor of embedding indices.

        Returns:
            Tensor of shape ``(len(sentence), tagset_size)`` with values in
            ``[-1, 1]``. ``self.hidden`` is replaced by the LSTM's final state.
        """
        seq_len = len(sentence)
        embeds = self.word_embeddings(sentence)

        # The model may have been moved (e.g. ``model.cuda()``) after the
        # state was created; ``.to`` is a no-op when devices already agree.
        hidden = tuple(state.to(embeds.device) for state in self.hidden)

        lstm_output, self.hidden = self.lstm(
            embeds.view(seq_len, 1, -1), hidden)

        output = self.out(lstm_output.view(seq_len, -1))
        # torch.tanh replaces the deprecated F.tanh.
        return torch.tanh(output)

In [9]:
import re
import nltk
from nltk import word_tokenize

In [10]:
def prepare_sequence(seq):
    """Convert a sequence of tokens into a 1-D LongTensor of embedding indices.

    Tokens found in the word2vec vocabulary map to their vocabulary index;
    any other token maps to ``vocab_size - 1``. The tensor is moved to the
    GPU when the module-level ``cuda`` flag is set.

    Args:
        seq: Iterable of token strings.

    Returns:
        ``torch.long`` tensor with one index per token.
    """
    # KeyedVectors exposes the vocabulary directly; going through the `.wv`
    # alias only triggers a deprecation warning.
    vocab = gensim_model.vocab
    unknown_idx = vocab_size - 1

    idxs = []
    for w in seq:
        # One dictionary lookup per token instead of `in` followed by `[]`.
        entry = vocab.get(w)
        idxs.append(unknown_idx if entry is None else entry.index)

    res = torch.tensor(idxs, dtype=torch.long)
    if cuda:
        res = res.cuda()
    return res

In [39]:
def sentence2vec(sentence,debug=False):
    """Tokenise a raw sentence and convert it to a tensor of embedding indices.

    Apostrophes, periods, commas and double quotes are replaced by spaces,
    the text is split with ``word_tokenize``, and each token is replaced by
    its lower-cased ``wordnet.morphy`` result when there is one, otherwise by
    the lower-cased token itself.

    Args:
        sentence: Text to encode.
        debug: When true, print the normalised token list.

    Returns:
        The result of ``prepare_sequence`` on the normalised tokens.
    """
    for mark in ("'", ".", ",", "\""):
        sentence = sentence.replace(mark, " ")

    w_list = []
    for w in word_tokenize(sentence):
        # Query WordNet once per token and reuse the answer.
        base_form = wordnet.morphy(w)
        w_list.append(base_form.lower() if base_form is not None else w.lower())

    if debug:
        print(w_list)
    res_seq = prepare_sequence(w_list)
    return res_seq

In [12]:
def load_model(hidden_dim,model_name):
    """Build an ``LSTMTagger`` and restore its trained weights from disk.

    The embedding table is not taken from the checkpoint: it is overwritten
    with the module-level ``weights`` matrix and then frozen.

    Args:
        hidden_dim: LSTM hidden size to build the model with; it must match
            the checkpoint for ``load_state_dict`` to succeed.
        model_name: Path to a state dict readable by ``torch.load``.

    Returns:
        The restored ``LSTMTagger``.
    """
    torch.manual_seed(1)
    model = LSTMTagger(embedding_dim, hidden_dim, vocab_size, out_size)
    # Map every stored tensor to the CPU so that a checkpoint written on a GPU
    # machine still loads when CUDA is unavailable; callers move the model to
    # the GPU afterwards if they want to.
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    model_state_dict = torch.load(model_name, map_location='cpu')
    # Replace the stored embedding table with the word2vec matrix.
    model_state_dict['word_embeddings.weight'] = torch.from_numpy(weights).float()
    model.load_state_dict(model_state_dict)
    # Keep the pretrained embeddings fixed.
    model.word_embeddings.weight.requires_grad = False
    return model

In [13]:
def make_prediction(hidden_dim,model_path,csv_name,data_path='./data_cut.csv'):
    """Score every sentence of a CSV file and write the predictions to disk.

    The model is loaded with ``load_model``, each value of the ``sentence``
    column is encoded with ``sentence2vec``, and the model output at the last
    time step is kept. Outputs are rescaled with ``x*2 + 3`` (the tanh range
    [-1, 1] becomes [1, 5]) and stored in the new columns ``Valence_pred``,
    ``Arousal_pred`` and ``Dominance_pred``.

    Args:
        hidden_dim: LSTM hidden size of the saved model.
        model_path: Path of the saved state dict.
        csv_name: Destination file; written tab-separated in UTF-16.
        data_path: UTF-16 CSV with a ``sentence`` column to score. Defaults
            to the previously hard-coded ``'./data_cut.csv'``.
    """
    model = load_model(hidden_dim, model_path)
    if cuda:
        model.cuda()

    data_cut = pd.read_csv(data_path,encoding='utf-16')
    pred_list = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for sentence in data_cut['sentence']:
            # Fresh recurrent state so sentences do not influence each other.
            model.hidden = model.init_hidden()
            sentence_in = sentence2vec(sentence)
            # Keep the output of the last time step only.
            y = model(sentence_in)[-1,:]
            pred_list.append(y.detach().cpu().numpy())

    # reshape keeps the array two-dimensional even when the input file has no
    # rows, so the column slicing below cannot fail on an empty input.
    pred_list = np.array(pred_list).reshape(-1, out_size)

    pred_list = pred_list*2 + 3

    pred_data = data_cut.assign(
        Valence_pred = pd.Series(pred_list[:,0], index=data_cut.index),
        Arousal_pred = pd.Series(pred_list[:,1], index=data_cut.index),
        Dominance_pred = pd.Series(pred_list[:,2], index=data_cut.index)
    )
    pred_data.to_csv(csv_name,encoding='utf-16',sep="\t")

In [35]:
# Batch run: load the saved model (built with hidden size 6) and write its
# predictions to csv_name.
hidden_dim = 6
model_path = './dat/model_data_6_epoch_59'
csv_name = './dat/pred_6_epoch_59.csv'
make_prediction(hidden_dim,model_path,csv_name)

  from ipykernel import kernelapp as app


In [15]:
# Display the embedding matrix shape as a sanity check.
weights.shape

(3000001, 300)

In [14]:
# Load the saved model into the global `model` for the interactive cells below.
hidden_dim = 6
model_path = './dat/model_data_6_epoch_59'
model = load_model(hidden_dim, model_path)

In [17]:
# Move the model to the GPU when CUDA is available.
if cuda:
    model.cuda()

In [43]:
# Sentences to score; one prediction is appended to pred_list per entry.
in_dat = [
    "I feel so sorry about that"
]
pred_list = []

for i in in_dat:
    model.zero_grad()
    # Reset the recurrent state before each sentence.
    model.hidden = model.init_hidden()
    sentence_in = sentence2vec(i)
    # Keep only the model output at the last time step.
    y = model(sentence_in)[-1,:]
    if cuda:
        y = y.cpu()
    pred_list.append(y.detach().numpy())

  from ipykernel import kernelapp as app


In [44]:
# Stack the per-sentence outputs into one array and apply the map x*2 + 3.
# NOTE(review): pred_list is overwritten with its own rescaled value, so
# running this cell a second time applies the rescaling twice.
pred_list = np.array(pred_list)
pred_list = pred_list*2 + 3

In [45]:
# Print a header, then one "<scores>  <sentence>" line per input sentence.
pred_list = list(pred_list)
print("[Valence  ,Arousal  ,Dominance]")
for sentence, vad in zip(in_dat, pred_list):
    print("  ".join((str(vad), sentence)))

[Valence  ,Arousal  ,Dominance]
[2.5193458 3.1973207 2.9886205]  I feel so sorry about that
