In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

torch.manual_seed(1)

import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from nltk.corpus import wordnet
import json

  (fname, cnt))
  (fname, cnt))


In [2]:
import gensim

# Pretrained word2vec vectors, read from the binary word2vec file format.
gensim_model = gensim.models.KeyedVectors.load_word2vec_format(
    './word2vec/data.bin',
    binary=True,
)

In [3]:
# Embedding matrix of the pretrained vectors, one row per vocabulary entry.
# `.vectors` is the supported attribute; the former `.wv.syn0` access goes
# through two deprecated aliases (announced for removal in gensim 4.0) and
# emits a DeprecationWarning for each of them.
weights = gensim_model.vectors
# Number of components per word vector (second axis of the matrix).
embedding_dim = weights.shape[1]

  if __name__ == '__main__':
  if __name__ == '__main__':


In [4]:
# Append an all-zero row at the end of the matrix to act as the embedding of
# unknown words. The row is created with the matrix's own dtype so that the
# concatenation does not promote the whole (large) array to np.zeros' default
# float64.
# NOTE: this cell is not idempotent -- every re-run appends one more row.
weights = np.append(weights, np.zeros((1, embedding_dim), dtype=weights.dtype), axis=0)

In [5]:
# Number of rows in the embedding matrix (pretrained entries plus the appended unknown-word row).
vocab_size = len(weights)

In [6]:
# Module-level flag: True when torch reports a usable CUDA device. The model
# and helper functions below read it to decide whether to move tensors to GPU.
cuda = torch.cuda.is_available()

In [7]:
# Width of the model output; load_model passes it to LSTMTagger as tagset_size.
out_size = 1

In [8]:
class LSTMTagger(nn.Module):
    """Embedding -> single-layer LSTM -> linear -> tanh, applied to one sentence.

    The module processes a single sequence at a time (batch size 1) and keeps
    the recurrent state in ``self.hidden`` between calls; callers reset it with
    ``model.hidden = model.init_hidden()`` before scoring a new sentence.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Build the layers.

        Args:
            embedding_dim: size of each word embedding vector.
            hidden_dim: size of the LSTM hidden state.
            vocab_size: number of rows in the embedding table.
            tagset_size: number of output values produced per time step.
        """
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to output space.
        self.out = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh, all-zero ``(h, c)`` LSTM state.

        The axes semantics are (num_layers, minibatch_size, hidden_dim). The
        tensors are created on the device that currently holds the model's
        parameters, so the state follows the model instead of depending on a
        module-level CUDA flag.
        """
        device = self.out.weight.device
        h = torch.zeros(1, 1, self.hidden_dim, device=device)
        c = torch.zeros(1, 1, self.hidden_dim, device=device)
        return (h, c)

    def forward(self, sentence):
        """Score every position of one sentence.

        Args:
            sentence: 1-D LongTensor of embedding indices.

        Returns:
            Tensor of shape (len(sentence), tagset_size) with values squashed
            into [-1, 1] by tanh. ``self.hidden`` is updated as a side effect.
        """
        seq_len = len(sentence)
        embeds = self.word_embeddings(sentence)

        # The stored state may predate a model.cuda()/model.cpu() call; moving
        # it is a no-op when it already lives on the right device.
        hidden = tuple(state.to(embeds.device) for state in self.hidden)

        lstm_output, self.hidden = self.lstm(
            embeds.view(seq_len, 1, -1), hidden)

        output = self.out(lstm_output.view(seq_len, -1))
        # torch.tanh replaces the deprecated torch.nn.functional.tanh.
        return torch.tanh(output)

In [9]:
import re
import nltk
from nltk import word_tokenize

In [11]:
def prepare_sequence(seq):
    """Convert a list of tokens into a 1-D LongTensor of embedding indices.

    Tokens missing from the pretrained vocabulary are mapped to
    ``vocab_size - 1``, i.e. the last row of the embedding matrix.

    Args:
        seq: iterable of token strings.

    Returns:
        torch.LongTensor with one index per token, moved to the GPU when the
        module-level ``cuda`` flag is set.
    """
    unknown_idx = vocab_size - 1

    # gensim >= 4 exposes a plain word -> index dict; gensim 3.x exposes
    # `vocab`, whose values carry the row number in `.index`. Both are read
    # directly from the KeyedVectors object, avoiding the deprecated `.wv` alias.
    key_to_index = getattr(gensim_model, 'key_to_index', None)
    if key_to_index is not None:
        idxs = [key_to_index.get(w, unknown_idx) for w in seq]
    else:
        vocab = gensim_model.vocab
        idxs = [vocab[w].index if w in vocab else unknown_idx for w in seq]

    res = torch.tensor(idxs, dtype=torch.long)
    if cuda:
        res = res.cuda()
    return res

In [12]:
def sentence2vec(sentence,debug=False):
    """Tokenise a raw sentence and convert it to embedding indices.

    Apostrophes, periods, commas and double quotes are replaced by spaces, the
    text is tokenised with ``word_tokenize``, and each token is replaced by its
    WordNet base form when ``wordnet.morphy`` finds one. All tokens are
    lower-cased before the index lookup.

    Args:
        sentence: raw sentence text.
        debug: when True, print the normalised token list.

    Returns:
        The tensor produced by ``prepare_sequence`` for the normalised tokens.
    """
    for ch in ("'", ".", ",", "\""):
        sentence = sentence.replace(ch, " ")

    w_list = []
    for w in word_tokenize(sentence):
        # Look the base form up once per token (the lookup is comparatively
        # expensive) and fall back to the token itself when none is found.
        lemma = wordnet.morphy(w)
        w_list.append((lemma if lemma is not None else w).lower())

    if debug:
        print(w_list)
    return prepare_sequence(w_list)

In [13]:
def load_model(hidden_dim,model_name):
    """Rebuild an LSTMTagger and restore its trained parameters from disk.

    The embedding table is not taken from the checkpoint: it is overwritten
    with the module-level ``weights`` matrix and frozen.

    Args:
        hidden_dim: LSTM hidden size the checkpoint was trained with.
        model_name: path of the state-dict file written by ``torch.save``.

    Returns:
        The restored model, with all parameters on the CPU. Callers move it to
        the GPU themselves.
    """
    torch.manual_seed(1)
    model = LSTMTagger(embedding_dim, hidden_dim, vocab_size, out_size)
    # map_location='cpu' lets checkpoints saved from a GPU load on machines
    # without CUDA; the parameters are copied into a CPU model either way.
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    model_state_dict = torch.load(model_name, map_location='cpu')
    model_state_dict['word_embeddings.weight'] = torch.from_numpy(weights).float()
    model.load_state_dict(model_state_dict)
    # Keep the pretrained embeddings fixed.
    model.word_embeddings.weight.requires_grad = False
    return model

In [19]:
def make_prediction(hidden_dim,model_path,dat_csv):
    """Score every sentence of a CSV file with a saved model.

    Only rows whose ``words`` column is at least 2 are scored, so the returned
    array can have fewer rows than the file; callers must align it with the
    filtered rows, not with the full file.

    Args:
        hidden_dim: LSTM hidden size of the checkpoint.
        model_path: path of the checkpoint passed to ``load_model``.
        dat_csv: path of a UTF-16 CSV with ``sentence`` and ``words`` columns.

    Returns:
        numpy array with one row per scored sentence, holding the model output
        at the last token rescaled by ``* 2 + 3`` (tanh range [-1, 1] -> [1, 5]).
    """
    model = load_model(hidden_dim, model_path)
    if cuda:
        model.cuda()
    model.eval()

    data_cut = pd.read_csv(dat_csv,encoding='utf-16')
    data_cut = data_cut[data_cut['words']>=2]

    pred_list = []
    # Inference only: no_grad() stops autograd from recording a graph for each
    # sentence, which the original loop built and then discarded.
    with torch.no_grad():
        for sentence in data_cut['sentence']:
            # Fresh recurrent state so sentences are scored independently.
            model.hidden = model.init_hidden()
            sentence_in = sentence2vec(sentence)
            # Keep only the output at the final time step.
            y = model(sentence_in)[-1,:]
            pred_list.append(y.detach().cpu().numpy())

    pred_list = np.array(pred_list)

    return pred_list*2 + 3

In [20]:
# LSTM hidden size used for the checkpoints loaded in the following cells
# (their file names contain "hidden_dim_8").
hidden_dim = 8

In [21]:
# Score the Valence dataset with the Valence checkpoint.
model_v_path = './dat_to1/model_Valence_hidden_dim_8_epoch_50'
csv_v_path = './data_preprocessed_Valence.csv'
v_ary = make_prediction(hidden_dim,model_v_path,csv_v_path)

  from ipykernel import kernelapp as app


In [22]:
data_v = pd.read_csv('./data_preprocessed_Valence.csv',encoding='utf-16')
data_v.head()
v_to_name = './pred_v_2.csv'
# make_prediction only scores rows with words >= 2. Index the predictions by
# exactly those rows so assign() aligns each value with its own sentence; a
# default 0..n-1 index would shift the values as soon as one row was skipped.
# Rows that were not scored get NaN.
pred_data = data_v.assign(
                Valence_pred = pd.Series(v_ary.reshape(-1),
                                         index=data_v[data_v['words']>=2].index)
            )
pred_data.to_csv(v_to_name,encoding='utf-16',sep="\t")

Unnamed: 0.1,Unnamed: 0,id,sentence,Arousal,Dominance,Valence,sd.Arousal,sd.Dominance,sd.Valence,freq,reg,words,data_type,Valence_reg,Arousal_reg,Dominance_reg
0,2568,cable_spool_fort_5107_5146,Several more boys spewed out after him.,3.2,2.6,2.6,0.748331,0.489898,0.489898,5,several more boys spewed out after him,7,train,-0.2,0.1,-0.2
1,2548,cable_spool_fort_3784_3808,Chad just hung his head.,3.2,2.6,2.4,0.4,0.8,0.8,5,chad just hung his head,5,train,-0.3,0.1,-0.2
2,3883,hotel-california_28834_28854,"“I don't know much,”",2.6,2.6,2.6,0.489898,0.489898,0.489898,5,i don t know much,5,train,-0.2,-0.2,-0.2
3,370,detroit_10463_10549,They lack the means to build her back to her f...,3.4,3.0,2.6,0.489898,0.0,0.489898,5,they lack the means to build her back to her f...,18,train,-0.2,0.2,0.0
4,9790,SemEval_1143,Steelers' Roethlisberger has concussion,2.4,2.4,1.8,0.8,0.8,0.748331,5,steelers roethlisberger has concussion,4,train,-0.6,-0.3,-0.3


In [27]:
# Score the Arousal dataset with the Arousal checkpoint.
model_a_path = './dat_to1/model_Arousal_hidden_dim_8_epoch_50'
csv_a_path = './data_preprocessed_Arousal.csv'
a_ary = make_prediction(hidden_dim,model_a_path,csv_a_path)

  from ipykernel import kernelapp as app


In [28]:
data_a = pd.read_csv('./data_preprocessed_Arousal.csv',encoding='utf-16')
a_to_name = './pred_a_2.csv'
# make_prediction only scores rows with words >= 2. Index the predictions by
# exactly those rows so assign() aligns each value with its own sentence; a
# default 0..n-1 index would shift the values as soon as one row was skipped.
# Rows that were not scored get NaN.
pred_data = data_a.assign(
                Arousal_pred = pd.Series(a_ary.reshape(-1),
                                         index=data_a[data_a['words']>=2].index)
            )
pred_data.to_csv(a_to_name,encoding='utf-16',sep="\t")

In [29]:
# Score the Dominance dataset with the Dominance checkpoint.
model_d_path = './dat_to1/model_Dominance_hidden_dim_8_epoch_50'
csv_d_path = './data_preprocessed_Dominance.csv'
d_ary = make_prediction(hidden_dim,model_d_path,csv_d_path)

  from ipykernel import kernelapp as app


In [30]:
data_d = pd.read_csv('./data_preprocessed_Dominance.csv',encoding='utf-16')
d_to_name = './pred_d_2.csv'
# make_prediction only scores rows with words >= 2. Index the predictions by
# exactly those rows so assign() aligns each value with its own sentence; a
# default 0..n-1 index would shift the values as soon as one row was skipped.
# Rows that were not scored get NaN.
pred_data = data_d.assign(
                Dominance_pred = pd.Series(d_ary.reshape(-1),
                                           index=data_d[data_d['words']>=2].index)
            )
pred_data.to_csv(d_to_name,encoding='utf-16',sep="\t")

In [22]:
hidden_dim = 8
model_v_path = './dat_to1/model_Valence_hidden_dim_8_epoch_50'
model_a_path = './dat_to1/model_Arousal_hidden_dim_8_epoch_50'
model_d_path = './dat_to1/model_Dominance_hidden_dim_8_epoch_50'
# make_prediction takes the CSV to score as a required third argument; the
# former two-argument calls raise TypeError against that definition.
# './data_cut.csv' is used because the cell that saves these three arrays
# attaches them to that file.
# NOTE(review): confirm this is the intended input file.
csv_vad_path = './data_cut.csv'
v_ary = make_prediction(hidden_dim,model_v_path,csv_vad_path)
a_ary = make_prediction(hidden_dim,model_a_path,csv_vad_path)
d_ary = make_prediction(hidden_dim,model_d_path,csv_vad_path)

  from ipykernel import kernelapp as app


In [24]:
# Sanity check of the prediction array: (number of scored sentences, model output width).
a_ary.shape

(6917, 1)

In [25]:
# Attach the three prediction arrays to the rows of data_cut.csv and write the
# result as a tab-separated UTF-16 file.
# NOTE(review): each pd.Series gets a fresh 0..n-1 index and assign() aligns on
# the index. make_prediction drops rows with words < 2, so if the scored file
# contained any such row the predictions are attached to the wrong rows --
# verify that the arrays have exactly one entry per row of data_cut.
data_cut = pd.read_csv('./data_cut.csv',encoding='utf-16')
csv_name = './pred_vad_1.csv'
pred_data = data_cut.assign(
                Valence_pred = pd.Series(v_ary.reshape(-1)),
                Arousal_pred = pd.Series(a_ary.reshape(-1)),
                Dominance_pred = pd.Series(d_ary.reshape(-1))
            )
pred_data.to_csv(csv_name,encoding='utf-16',sep="\t")

In [15]:
# Shape of the embedding matrix: (rows including the appended unknown-word row, embedding dimension).
weights.shape

(3000001, 300)

In [14]:
# Load a separate checkpoint (hidden size 6) for the ad-hoc sentence demo below.
# NOTE(review): the demo output lists three values per sentence, while
# load_model builds the network with the module-level out_size (assigned 1
# earlier in this notebook) -- confirm out_size matches this checkpoint before
# running, otherwise load_state_dict presumably fails on the output layer shape.
hidden_dim = 6
model_path = './dat/model_data_6_epoch_59'
model = load_model(hidden_dim, model_path)

In [17]:
# Move the demo model to the GPU when one is available (module-level cuda flag).
if cuda:
    model.cuda()

In [43]:
# Sentences to score with the demo model.
in_dat = [
    "I feel so sorry about that"
]
pred_list = []

# Inference only: no_grad() stops autograd from recording a graph per sentence.
with torch.no_grad():
    for text in in_dat:
        # Fresh recurrent state so each sentence is scored independently.
        model.hidden = model.init_hidden()
        sentence_in = sentence2vec(text)
        # Keep only the model output at the final token.
        y = model(sentence_in)[-1,:]
        pred_list.append(y.detach().cpu().numpy())

  from ipykernel import kernelapp as app


In [44]:
# Stack the per-sentence outputs and rescale them with the same "* 2 + 3"
# mapping used by make_prediction.
# NOTE: re-running this cell rescales the already rescaled values again.
pred_list = np.array(pred_list) * 2 + 3

In [45]:
# Print one line per input sentence: the prediction vector followed by the text.
pred_list = list(pred_list)
print("[Valence  ,Arousal  ,Dominance]")
for sentence, vad in zip(in_dat, pred_list):
    print(str(vad), sentence, sep="  ")

[Valence  ,Arousal  ,Dominance]
[2.5193458 3.1973207 2.9886205]  I feel so sorry about that
