In [None]:
# NumPy version originally in use: 1.26.4 (the version supported by the latest TensorFlow release).
# With that version SciPy emits the following warning:
# UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.4)
#   warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"

In [None]:
import sys
# Print the version string of the Python interpreter running this notebook.
print(sys.version)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
import urllib.request
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Download the finance sentiment corpus CSV from GitHub to finance_data.csv
# (path is relative to the current working directory).
corpus_url = "https://raw.githubusercontent.com/ukairia777/finance_sentiment_corpus/main/finance_data.csv"
urllib.request.urlretrieve(corpus_url, filename="finance_data.csv")

In [None]:
# Load the downloaded corpus and report how many rows it contains.
data = pd.read_csv('finance_data.csv')
print('총 샘플의 수 :', data.shape[0])

In [None]:
# Display the frame as loaded from finance_data.csv.
data

In [None]:
# Encode the string sentiment labels as integer class ids.
# An explicit lookup + astype is used instead of Series.replace because:
#   * replace() relied on implicit object -> int downcasting, which pandas 2.2 deprecates
#     (FutureWarning); without that downcast the column would stay `object`, and the later
#     torch.from_numpy(y_train.values) call cannot convert object arrays;
#   * already-encoded ids pass through the lookup unchanged, so re-running the cell is a no-op;
#   * a missing or unmapped string label now fails loudly here (astype raises) instead of
#     surfacing much later in the pipeline.
label_to_id = {'neutral': 0, 'positive': 1, 'negative': 2}
data['labels'] = data['labels'].map(lambda label: label_to_id.get(label, label)).astype('int64')
data[:5]

In [None]:
# Drop the `kor_sentence` column; later cells only read `sentence` and `labels`.
# errors='ignore' keeps the cell re-runnable: a bare `del` raises KeyError once the
# column is already gone.
data.drop(columns=['kor_sentence'], inplace=True, errors='ignore')

In [None]:
# Preview the first five rows of the frame.
data.head(5)

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
data.info()

In [None]:
# Report whether any cell anywhere in the frame is missing.
has_missing = data.isna().to_numpy().any()
print('결측값 여부 :', has_missing)

In [None]:
# Count the distinct values in the `sentence` column.
unique_sentence_count = data['sentence'].nunique()
print('sentence 열의 유니크한 값 :', unique_sentence_count)

In [None]:
# Rows that are identical, in every column, to a row appearing earlier in the frame.
duplicate = data.loc[data.duplicated()]

In [None]:
# Display the duplicated rows found above.
duplicate

In [None]:
# Remove duplicates: keep only the first row for each distinct sentence.
data.drop_duplicates(subset='sentence', keep='first', inplace=True)
print('총 샘플의 수 :', data.shape[0])

In [None]:
# Bar chart of the number of samples per label id.
data['labels'].value_counts().plot.bar()

In [None]:
# Tabulate how many samples carry each label id.
print('레이블의 분포')
label_counts = data.groupby('labels').size().reset_index(name='count')
print(label_counts)

In [None]:
# Percentage of samples per label id (0 = neutral, 1 = positive, 2 = negative),
# computed once and indexed by label id.
label_share = data["labels"].value_counts() / len(data) * 100
print(f'중립의 비율 = {round(label_share[0],3)}%')
print(f'긍정의 비율 = {round(label_share[1],3)}%')
print(f'부정의 비율 = {round(label_share[2],3)}%')

In [None]:
# Display the frame after label encoding and duplicate removal.
data

In [None]:
# Separate the model inputs (sentences) from the targets (integer labels).
X_data, y_data = data['sentence'], data['labels']
print('본문의 개수: {}'.format(X_data.shape[0]))
print('레이블의 개수: {}'.format(y_data.shape[0]))

In [None]:
# Hold out 20% of the samples as a test split. Stratifying on the labels keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=0,
)

In [None]:
# Display the training sentences.
X_train

In [None]:
# Display the training labels.
y_train

In [None]:
# Report the size of each split.
print('훈련 샘플의 개수 :', X_train.shape[0])
print('테스트 샘플의 개수 :', X_test.shape[0])

In [None]:
# Label percentages within the training split (0 = neutral, 1 = positive, 2 = negative).
print('--------훈련 데이터의 비율-----------')
train_share = y_train.value_counts() / len(y_train) * 100
print(f'중립 = {round(train_share[0],3)}%')
print(f'긍정 = {round(train_share[1],3)}%')
print(f'부정 = {round(train_share[2],3)}%')

In [None]:
# Label percentages within the test split (0 = neutral, 1 = positive, 2 = negative).
print('--------테스트 데이터의 비율-----------')
test_share = y_test.value_counts() / len(y_test) * 100
print(f'중립 = {round(test_share[0],3)}%')
print(f'긍정 = {round(test_share[1],3)}%')
print(f'부정 = {round(test_share[2],3)}%')

In [None]:
# Fit the word index on the training sentences only, then turn both splits into
# lists of integer token ids and preview the first five of each.
# NOTE(review): no oov_token is configured, so words unseen during fitting are presumably
# dropped from the encoded test sentences — confirm this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_encoded, X_test_encoded = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)
for encoded_split in (X_train_encoded, X_test_encoded):
    print(encoded_split[:5])

In [None]:
# Display the fitted word -> integer index mapping (the full vocabulary).
tokenizer.word_index

In [None]:
# Embedding table size: one row per known word plus one extra row for id 0,
# which is the value pad_sequences fills with.
word_to_index = tokenizer.word_index
vocab_size = 1 + len(word_to_index)
print(vocab_size)

In [None]:
# Sentence-length statistics measured in TOKENS.
# These numbers are used to choose the padding length for pad_sequences, which operates on the
# tokenized sequences; len() of the raw strings would report character counts and greatly
# overstate the length that is actually needed.
# The sequences are re-derived from the tokenizer (rather than read from X_train_encoded) so the
# statistics stay correct even if this cell is re-run after X_train_encoded has been padded.
train_token_lengths = [len(seq) for seq in tokenizer.texts_to_sequences(X_train)]
print('본문의 최대 길이 :', max(train_token_lengths))
print('본문의 평균 길이 :', sum(train_token_lengths) / len(train_token_lengths))
plt.hist(train_token_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Length that every encoded sentence is padded/truncated to by pad_sequences.
# NOTE(review): pad_sequences works on token sequences, so this only needs to cover the longest
# tokenized training sentence — TODO confirm 302 was derived from token counts, not character counts.
max_len = 302

In [None]:
# Display the encoded training sentences.
X_train_encoded

In [None]:
# Bring every encoded sentence to exactly max_len ids; padding and truncation both happen
# at the front of the sequence (the library defaults, spelled out here).
X_train_encoded = pad_sequences(X_train_encoded, maxlen=max_len, padding='pre', truncating='pre')
X_test_encoded = pad_sequences(X_test_encoded, maxlen=max_len, padding='pre', truncating='pre')

In [None]:
# Shape of the padded training array.
X_train_encoded.shape

In [None]:
# First padded training sequence.
X_train_encoded[0]

In [None]:
# Shape of the padded test array.
X_test_encoded.shape

In [None]:
# from tensorflow.keras.layers import Embedding, Dense, LSTM
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.models import load_model
# from tensorflow.keras.utils import to_categorical
# from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# y_train = to_categorical(y_train)
# y_test = to_categorical(y_test)

In [None]:
# embedding_dim = 64
# hidden_units = 64
# num_classes = 3

# model = Sequential()
# model.add(Embedding(vocab_size, embedding_dim))
# model.add(LSTM(hidden_units))
# model.add(Dense(num_classes, activation='softmax'))

# es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
# mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
# history = model.fit(X_train_encoded, y_train, epochs=15, callbacks=[es, mc], batch_size=32, validation_split=0.2)

In [None]:
# loaded_model = load_model('best_model.h5')
# print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test_encoded, y_test)[1]))

In [None]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
gpu_found = torch.cuda.is_available()
device = torch.device("cuda" if gpu_found else "cpu")
print("GPU is available" if gpu_found else "GPU not available, CPU used")

In [None]:
# Shape of the padded training array that will be wrapped in a tensor.
X_train_encoded.shape

In [None]:
# Shape of the training label Series.
y_train.shape

In [None]:
# Mini-batch size shared by both loaders.
batch_size = 32

# Wrap the padded id arrays and the integer labels as (inputs, labels) tensor datasets.
# NOTE(review): the "validation" set is built from the held-out test split (X_test / y_test) —
# confirm that using it for per-epoch validation is intended.
train_data = TensorDataset(torch.from_numpy(X_train_encoded), torch.from_numpy(y_train.to_numpy()))
valid_data = TensorDataset(torch.from_numpy(X_test_encoded), torch.from_numpy(y_test.to_numpy()))

# Both loaders reshuffle their samples on every pass.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)

In [None]:
# Number of batches the training loader yields per epoch.
len(iter(train_loader))

In [None]:
# Shape of the input tensor in one batch drawn from train_loader.
next(iter(train_loader))[0].shape

In [None]:
# Shape of the label tensor in one batch drawn from train_loader.
next(iter(train_loader))[1].shape

In [None]:
# Display the validation DataLoader object.
valid_loader

In [None]:
# Model hyperparameters, passed positionally to myModel when the model is constructed.
input_size = 64  # used as the embedding dimension, i.e. the LSTM input feature size
hidden_size = 64  # size of the LSTM hidden state
num_layers = 2  # number of stacked LSTM layers
num_classes = 3  # one output per label id (0, 1, 2)

In [None]:
# 설정값
# data_dim = 5
# hidden_dim = 10 
# output_dim = 1 
# learning_rate = 0.01
# nb_epochs = 100

class myModel(nn.Module):
    """Embedding -> stacked LSTM -> linear classifier for integer-encoded sentences.

    Args:
        vocab_size: Number of rows in the embedding table; every token id fed to the
            model must be smaller than this.
        embedding_dim: Size of each token embedding, i.e. the LSTM input feature size.
        hidden_units: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes (length of the logit vector).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_units, num_layers, num_classes):
        super().__init__()
        self.hidden_units = hidden_units
        self.num_layers = num_layers
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_units, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_units, num_classes)

    def forward(self, x):
        """Classify a batch of token-id sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len), on the same device as the
                model's parameters.

        Returns:
            A tuple ``(logits, h_n)`` where ``logits`` has shape (batch, num_classes)
            and ``h_n`` is the LSTM's final hidden state of shape
            (num_layers, batch, hidden_units).
        """
        embedded = self.embedding(x)
        # No explicit (h0, c0): nn.LSTM starts from all-zero states by default and creates
        # them on the input's device, so no global `device` lookup is needed here.
        out, (h_n, _) = self.lstm(embedded)
        # The output at the last time step summarises the whole sequence.
        logits = self.fc(out[:, -1, :])
        # Return the final hidden state; the zero-filled initial state carries no information.
        return logits, h_n

In [None]:
# # embedding 있음
# import torch
# import torch.nn as nn

# class StockNewsLSTM(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, hidden_units, num_layers, num_classes):
#         super(StockNewsLSTM, self).__init__()
#         self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
#         self.hidden_units = hidden_units
#         self.num_layers = num_layers
#         self.lstm = nn.LSTM(embedding_dim, hidden_units, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_units, num_classes)
    
#     def forward(self, x):
#         embedded = self.embedding(x)
#         h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_units).to(x.device)
#         c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_units).to(x.device)
#         out, _ = self.lstm(embedded, (h0, c0))
#         out = self.fc(out[:, -1, :])
#         return out

In [None]:
# Instantiate the classifier; `input_size` is the embedding dimension and `hidden_size`
# the LSTM hidden-state size.
model = myModel(
    vocab_size=vocab_size,
    embedding_dim=input_size,
    hidden_units=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
)

In [None]:
# Display the model's layer summary.
model

In [None]:
# Multi-class cross-entropy on the model's raw logits, optimised with Adam (lr = 1e-3).
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
def acc(pred, label):
    """Count how many predictions in a batch match their labels.

    Args:
        pred: Either per-class scores of shape (batch, num_classes) with
            num_classes > 1 (the predicted class is the arg-max over the last
            dimension), or a single score per sample of shape (batch,) or
            (batch, 1), which is rounded to the nearest integer.
        label: Ground-truth labels, one per sample.

    Returns:
        The number of correct predictions as a Python int.
    """
    if pred.dim() >= 2 and pred.size(-1) > 1:
        # Multi-class scores: rounding each raw score is meaningless (and does not even
        # broadcast against 1-D labels), so pick the highest-scoring class instead.
        predicted = pred.argmax(dim=-1)
        target = label.reshape(predicted.shape)
    else:
        # One score per sample: round it to the nearest integer label.
        predicted = torch.round(pred.squeeze())
        target = label.squeeze()
    return torch.sum(predicted == target).item()

In [None]:
# Train for a fixed number of epochs, evaluate after every epoch, and checkpoint the model
# whenever the mean validation loss does not get worse.
epochs = 15
valid_loss_min = float('inf')  # np.Inf no longer exists in NumPy 2.0

# Per-epoch metrics, averaged over the batches of the respective loader.
train_loss = torch.zeros(epochs)
valid_loss = torch.zeros(epochs)

train_acc = torch.zeros(epochs)
valid_acc = torch.zeros(epochs)

# Run every batch on whichever device the model's parameters live on, so inputs, labels and
# weights can never end up on different devices.
# NOTE(review): none of the cells shown moves the model to `device`, so training runs on the CPU
# even when a GPU is detected; move the model before the optimizer is built to use the GPU.
model_device = next(model.parameters()).device

for e in tqdm(range(0, epochs)):
    # ---- training pass ----
    model.train()
    for inputs, labels in train_loader:
        inputs = inputs.to(model_device).long()  # embedding lookups take integer ids
        labels = labels.to(model_device).long()  # CrossEntropyLoss expects int64 class indices

        optimizer.zero_grad()
        output, _ = model(inputs)
        # Logits are already (batch, num_classes); squeezing would break a batch of size 1.
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        train_loss[e] += loss.item()
        train_acc[e] += (output.argmax(dim=1) == labels).float().mean().item()

    train_loss[e] /= len(train_loader)
    train_acc[e] /= len(train_loader)

    # ---- validation pass (no gradients needed) ----
    model.eval()
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs = inputs.to(model_device).long()
            labels = labels.to(model_device).long()

            output, _ = model(inputs)
            valid_loss[e] += criterion(output, labels).item()
            valid_acc[e] += (output.argmax(dim=1) == labels).float().mean().item()

    valid_loss[e] /= len(valid_loader)
    valid_acc[e] /= len(valid_loader)

    print(f'Epoch {e+1}') 
    print(f'train_loss : {train_loss[e]}, val_loss : {valid_loss[e]}')
    print(f'train_accuracy : {train_acc[e]*100}, val_accuracy : {valid_acc[e]*100}')

    # Compare this epoch's scalar loss (not the whole per-epoch tensor) with the best so far.
    epoch_valid_loss = valid_loss[e].item()
    if epoch_valid_loss <= valid_loss_min:
        torch.save(model, 'model.pth')
        torch.save(model.state_dict(), 'model_state_dict.pt')
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,epoch_valid_loss))
        valid_loss_min = epoch_valid_loss
    print(25*'==')