In [1]:
import sys
import os
sys.path.append(os.path.join(os.path.dirname(''), '..'))

import numpy
import pandas
import custom
import pickle
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from IPython.display import display


In [2]:
def word_vectorize(sentence : "str | list", vec_dict : dict, word_len : "int | None" = None) :
    """Convert a sentence into a left-padded list of word vectors.

    Args:
        sentence: Either a whitespace-separated string or an already
            tokenized sequence of words.
        vec_dict: Mapping from word to its vector. It must contain "<unk>"
            if any word is missing from the mapping, and "<pad>" if padding
            is actually required.
        word_len: Target number of vectors. Defaults to the number of words
            (no padding). Sentences longer than ``word_len`` are NOT
            truncated; the result then has one entry per word.

    Returns:
        A list made of ``max(word_len - len(words), 0)`` copies of
        ``vec_dict["<pad>"]`` followed by one vector per word, where unknown
        words map to ``vec_dict["<unk>"]``.

    Raises:
        KeyError: If "<pad>" or "<unk>" is needed but absent from vec_dict.
    """
    # isinstance (not an exact type comparison) so that str subclasses are
    # split into words instead of being iterated character by character.
    words = sentence.split() if isinstance(sentence, str) else sentence
    if word_len is None :
        word_len = len(words)

    # Padding goes in front, so the real words sit at the end of the sequence.
    # Both special-token lookups stay lazy: they only run when they are needed.
    padding = [vec_dict["<pad>"] for _ in range(word_len - len(words))]
    vectors = [vec_dict[w] if w in vec_dict else vec_dict["<unk>"] for w in words]

    return padding + vectors

In [3]:
## Load data
# Review table; later cells read its "quote", "scoreSentiment" and "word_len" columns.
df = pandas.read_csv("data/simple_movie_data.csv", encoding="UTF8")
display(df)

# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# "data/vector.pkl" must come from a trusted source.
with open("data/vector.pkl", mode="rb") as f:
    # `vec` is later passed to word_vectorize as its word -> vector lookup.
    vec = pickle.load(f)

# Number of entries in the loaded object.
print(len(vec))

Unnamed: 0,quote,scoreSentiment,word_len
0,if you have not seen it do,1,7
1,it s benchmark cinema a highpoint of movie his...,1,9
2,this movie is all about brando,1,6
3,this is sean connery at his best,1,7
4,there are so many questions that the script ne...,0,10
...,...,...,...
101047,technically a little bit crude but it delivers,1,8
101048,a sturdy N entertaining late night watch N,1,8
101049,what on earth happened here N,0,6
101050,is it tuesday yet,0,4


124833


In [4]:
## Data preprocessing

# Review texts and their labels, pulled out of the frame as plain lists.
sentences = df["quote"].values.tolist()
t = df["scoreSentiment"].values.tolist()
# Largest value of the "word_len" column; used as the padded sequence length.
word_len = df["word_len"].max()

# Preprocess every sentence, vectorize it with left padding and stack the results.
x = numpy.array([
    word_vectorize(custom.text_preprocess(s), vec, word_len)
    for s in sentences
])

print(x.shape)
print(x[0])

(101052, 10, 300)
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [-0.07179341  0.17243269 -0.00432201 ... -0.14865407  0.16357985
   0.0580472 ]
 [ 0.021268    0.26896062  0.07436031 ...  0.00719601 -0.12248817
   0.25796288]
 [-0.50632536 -0.17561522 -0.24907936 ... -0.4505613  -0.02780188
  -0.14853011]]


In [5]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

## Build the DataLoaders
tensor_x = torch.tensor(x, dtype = torch.float, device = device)
tensor_t = torch.tensor(t, dtype = torch.long, device = device)
# TensorDataset pairs inputs and labels by index, without materialising one
# Python tuple per sample the way list(zip(...)) does.
dataset = torch.utils.data.TensorDataset(tensor_x, tensor_t)
zip_list = dataset  # previous name, kept so existing references keep working
dataloader = DataLoader(dataset, batch_size= 200, shuffle= True)
# NOTE(review): this loader iterates over the SAME samples as `dataloader`, so
# any metric computed from it describes the training data, not held-out data.
test_dataloader = DataLoader(dataset, batch_size= 10000, shuffle= False)

print(len(dataloader))
print(len(test_dataloader))

506
11


In [6]:
## Build the model
# Model definition
class NN(nn.Module) :
    """Single-layer RNN classifier over sequences of input vectors.

    The RNN output at the last time step is passed through a linear layer
    followed by a ReLU to produce one score per class.

    Args:
        input_size: Size of each input vector (default 300).
        hidden_size: Size of the RNN hidden state (default 10).
        num_classes: Number of output scores per sample (default 2).
    """
    def __init__(self, input_size = 300, hidden_size = 10, num_classes = 2) :
        super().__init__()
        # Both layers are created on the notebook-level `device`. Previously only
        # the RNN was, which left the Linear layer on the CPU and made forward()
        # fail with a device mismatch whenever `device` was "cuda".
        self.rnn = nn.RNN(input_size, hidden_size, batch_first = True, device = device)
        self.f = nn.Sequential(
            nn.Linear(hidden_size, num_classes, device = device),
            # NOTE(review): this ReLU clamps negative scores to 0 before they reach
            # the loss. Kept to preserve the existing model; consider removing it.
            nn.ReLU()
        )
    def forward(self, x) :
        """Return scores of shape (batch, num_classes) for x of shape (batch, seq, input_size)."""
        # batch_first=True -> the RNN output is (batch, seq, hidden_size).
        outputs, _ = self.rnn(x)
        # Only the output of the final time step feeds the classifier head.
        last_step = outputs[:, -1, :]
        return self.f(last_step)
# Instantiate the model; the cells below train, evaluate and save this `F` object.
F = NN()


In [7]:
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(F.parameters(), lr = 0.25)
epoch = 50

for e in range(epoch) :
    # Running sum of per-batch losses as a plain float. Accumulating the loss
    # tensors themselves would keep every batch's autograd history attached.
    loss_sum = 0.0
    # Distinct loop names so the dataset-level `x` / `t` built in the
    # preprocessing cell are not overwritten (which would break re-running
    # the DataLoader cell).
    for batch_x, batch_t in dataloader :
        # Forward pass
        y = F(batch_x)
        # Loss
        loss = loss_function(y, batch_t)
        loss_sum += loss.item()
        # Optimisation step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Mean loss over the batches of this epoch.
    print(f"epoch {e+1} || loss {loss_sum / len(dataloader)}")

epoch 1 || loss 0.596303403377533
epoch 2 || loss 0.5107028484344482
epoch 3 || loss 0.47933876514434814
epoch 4 || loss 0.4684526026248932
epoch 5 || loss 0.4587562680244446
epoch 6 || loss 0.4527088701725006
epoch 7 || loss 0.4466838240623474
epoch 8 || loss 0.44319742918014526
epoch 9 || loss 0.43767809867858887
epoch 10 || loss 0.4350571632385254
epoch 11 || loss 0.4314829409122467
epoch 12 || loss 0.4273010492324829
epoch 13 || loss 0.4247073829174042
epoch 14 || loss 0.42251724004745483
epoch 15 || loss 0.4201330542564392
epoch 16 || loss 0.4168042838573456
epoch 17 || loss 0.4149315357208252
epoch 18 || loss 0.41475990414619446
epoch 19 || loss 0.41113534569740295
epoch 20 || loss 0.4091576337814331
epoch 21 || loss 0.40798184275627136
epoch 22 || loss 0.40692946314811707
epoch 23 || loss 0.40368297696113586
epoch 24 || loss 0.40443670749664307
epoch 25 || loss 0.4046676754951477
epoch 26 || loss 0.40219399333000183
epoch 27 || loss 0.402404248714447
epoch 28 || loss 0.400829017

In [8]:
cnt = 0
total = 0

# Evaluation only needs forward passes, so autograd bookkeeping is disabled.
with torch.no_grad() :
    # Distinct loop names so the dataset-level `x` / `t` are not overwritten.
    for batch_x, batch_t in test_dataloader :
        y = F(batch_x)
        # Count samples whose arg-max score matches the label.
        cnt += (y.argmax(dim=1) == batch_t).sum().item()
        total += len(batch_x)

# Fraction of correctly classified samples.
print("possiblity : %f" %(cnt / total))

possiblity : 0.820538


In [9]:
# Move the model to the CPU and serialize the whole module object to disk.
# NOTE(review): F.to("cpu") moves F itself, so F stays on the CPU after this cell runs.
torch.save(F.to("cpu"), "simple_movie_RNN.pt")