In [1]:
import sys
import os
sys.path.append(os.path.join(os.path.dirname(''), '..'))

import numpy
import pandas
import custom
import pickle
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from IPython.display import display


In [2]:
def word_vectorize(sentence : str | list, vec_dict : dict, word_len : int | None = None) :
    temp = []
    
    if type(sentence) == str : 
        words = str(sentence).split()
    else :
        words = sentence
    if word_len is None :
        word_len = len(words)
        
    for i in range(word_len - len(words)) :
        temp.append(vec_dict["<pad>"])
    for i in range(len(words)) :
        if words[i] not in vec_dict :
            temp.append(vec_dict["<unk>"])
            continue
        temp.append(vec_dict[words[i]])

    return temp

In [3]:
##데이터 불러오기
df = pandas.read_csv("data/simple_movie_data.csv", encoding="UTF8")
display(df)

with open("data/vector.pkl", mode="rb") as f:
    vec = pickle.load(f)

print(len(vec))

Unnamed: 0,quote,scoreSentiment,word_len
0,if you have not seen it do,1,7
1,it s benchmark cinema a highpoint of movie his...,1,9
2,this movie is all about brando,1,6
3,this is sean connery at his best,1,7
4,there are so many questions that the script ne...,0,10
...,...,...,...
101047,technically a little bit crude but it delivers,1,8
101048,a sturdy N entertaining late night watch N,1,8
101049,what on earth happened here N,0,6
101050,is it tuesday yet,0,4


124833


In [4]:
##데이터 전처리

sentences = df["quote"].values.tolist()
t = df["scoreSentiment"].values.tolist()
word_len = df["word_len"].max()

x = []
for s in sentences :
    s = custom.text_preprocess(s)
    x.append(word_vectorize(s, vec, word_len))
x = numpy.array(x)

print(x.shape)
print(x[0])

(101052, 10, 300)
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [-0.07179341  0.17243269 -0.00432201 ... -0.14865407  0.16357985
   0.0580472 ]
 [ 0.021268    0.26896062  0.07436031 ...  0.00719601 -0.12248817
   0.25796288]
 [-0.50632536 -0.17561522 -0.24907936 ... -0.4505613  -0.02780188
  -0.14853011]]


In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

##DataLoader 생성
tensor_x = torch.tensor(x, dtype = torch.float, device = device)
tensor_t = torch.tensor(t, dtype = torch.long, device = device)
zip_list = list(zip(tensor_x, tensor_t))
dataloader = DataLoader(zip_list, batch_size= 200, shuffle= True)
test_dataloader = DataLoader(zip_list, batch_size= 10000, shuffle= False)

print(len(dataloader))
print(len(test_dataloader))

506
11


In [6]:
##AI 만들기
#함수 선언
class NN(nn.Module) :
    def __init__(self) :
        super().__init__()
        self.rnn = nn.RNN(300,10,batch_first = True,device = device)
        self.f = nn.Sequential(
            nn.Linear(10,2),
            nn.ReLU()
        )
    def forward(self, x) :
        x, h = self.rnn(x)
        x = x[:,-1,:]
        x = self.f(x)
        return x
F = NN()

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(F.parameters(), lr = 0.25)
epoch = 50

for e in range(epoch) :
    loss_sum = 0
    for x, t in dataloader :
    # x, t 입력
    # y = F(x)
        y = F(x)
    # 손실함수
        loss = loss_function(y,t)
        loss_sum += loss
    # 최적화함수
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"epoch {e+1} || loss {loss_sum / len(dataloader)}")

epoch 1 || loss 0.5792392492294312
epoch 2 || loss 0.5006054043769836
epoch 3 || loss 0.47771063446998596
epoch 4 || loss 0.4689435362815857
epoch 5 || loss 0.46088308095932007
epoch 6 || loss 0.4542107582092285
epoch 7 || loss 0.44985082745552063
epoch 8 || loss 0.4437525272369385
epoch 9 || loss 0.4386419951915741
epoch 10 || loss 0.43544629216194153
epoch 11 || loss 0.432691365480423
epoch 12 || loss 0.4285111129283905
epoch 13 || loss 0.4247748553752899
epoch 14 || loss 0.4221464693546295
epoch 15 || loss 0.4197808504104614
epoch 16 || loss 0.41818469762802124
epoch 17 || loss 0.4162460267543793
epoch 18 || loss 0.4137991964817047
epoch 19 || loss 0.413222998380661
epoch 20 || loss 0.40997496247291565
epoch 21 || loss 0.40853893756866455
epoch 22 || loss 0.40850117802619934
epoch 23 || loss 0.4060201942920685
epoch 24 || loss 0.4052460491657257
epoch 25 || loss 0.4044886827468872
epoch 26 || loss 0.40313398838043213
epoch 27 || loss 0.401256799697876
epoch 28 || loss 0.399768382310

In [7]:
cnt = 0
total = 0

for x, t in test_dataloader :
    y = F(x)
    cnt += (y.argmax(dim=1) == t).sum().item()
    total += len(x)

print("possiblity : %f" %(cnt / total))

possiblity : 0.816233


In [8]:
torch.save(F.to("cpu"), "simple_movie_RNN.pt")

In [10]:
##LSTM AI 만들기
#함수 선언
class LSTM_NN(nn.Module) :
    def __init__(self) :
        super().__init__()
        self.rnn = nn.LSTM(300,10,batch_first = True,device = device)
        self.f = nn.Sequential(
            nn.Linear(10,2),
            nn.ReLU()
        )
    def forward(self, x) :
        x, h = self.rnn(x)
        x = x[:,-1,:]
        x = self.f(x)
        return x
F = LSTM_NN()

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(F.parameters(), lr = 1)
epoch = 50

for e in range(epoch) :
    loss_sum = 0
    for x, t in dataloader :
    # x, t 입력
    # y = F(x)
        y = F(x)
    # 손실함수
        loss = loss_function(y,t)
        loss_sum += loss
    # 최적화함수
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"epoch {e+1} || loss {loss_sum / len(dataloader)}")

epoch 1 || loss 0.5696309208869934
epoch 2 || loss 0.48556819558143616
epoch 3 || loss 0.43722105026245117
epoch 4 || loss 0.4118592441082001
epoch 5 || loss 0.39580902457237244
epoch 6 || loss 0.3842879831790924
epoch 7 || loss 0.374743789434433
epoch 8 || loss 0.3676908016204834
epoch 9 || loss 0.36101606488227844
epoch 10 || loss 0.35526785254478455
epoch 11 || loss 0.35095351934432983
epoch 12 || loss 0.3463340401649475
epoch 13 || loss 0.3425554633140564
epoch 14 || loss 0.33879193663597107
epoch 15 || loss 0.3339422047138214
epoch 16 || loss 0.33173590898513794
epoch 17 || loss 0.32785964012145996
epoch 18 || loss 0.3251921236515045
epoch 19 || loss 0.32290953397750854
epoch 20 || loss 0.31977587938308716
epoch 21 || loss 0.3180411159992218
epoch 22 || loss 0.31519341468811035
epoch 23 || loss 0.31223776936531067
epoch 24 || loss 0.3103339970111847
epoch 25 || loss 0.3085728883743286
epoch 26 || loss 0.3061473071575165
epoch 27 || loss 0.3035643696784973
epoch 28 || loss 0.302002

In [11]:
cnt = 0
total = 0

for x, t in test_dataloader :
    y = F(x)
    cnt += (y.argmax(dim=1) == t).sum().item()
    total += len(x)

print("possiblity : %f" %(cnt / total))

possiblity : 0.884841


In [12]:
torch.save(F.to("cpu"), "simple_movie_LSTM.pt")