In [1]:
!pip install transformers
!pip install datasets



In [2]:
from datasets import load_dataset

import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

import torch
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, BertConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

In [3]:
!wget https://raw.githubusercontent.com/ukairia777/finance_sentiment_corpus/main/finance_data.csv

--2025-08-20 04:55:13--  https://raw.githubusercontent.com/ukairia777/finance_sentiment_corpus/main/finance_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1319001 (1.3M) [text/plain]
Saving to: ‘finance_data.csv’


2025-08-20 04:55:13 (35.0 MB/s) - ‘finance_data.csv’ saved [1319001/1319001]



In [4]:
df = pd.read_csv('finance_data.csv')
print('샘플의 개수 :',len(df))

샘플의 개수 : 4846


In [None]:
df.head()

In [6]:
df['labels'].value_counts()

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
neutral,2879
positive,1363
negative,604


In [None]:
df['labels'] = df['labels'].replace(['neutral','positive','negative'],[0, 1, 2])
df.head()

In [8]:
df.to_csv('finance_data.csv',index=False, encoding='utf-8-sig')

In [9]:
all_data = load_dataset("csv", data_files={"train":"finance_data.csv"})

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
cs = all_data['train'].train_test_split(0.2,seed=777)
train_cs=cs["train"]
test_cs=cs["test"]

In [11]:
train_cs

Dataset({
    features: ['labels', 'sentence', 'kor_sentence'],
    num_rows: 3876
})

In [12]:
test_cs

Dataset({
    features: ['labels', 'sentence', 'kor_sentence'],
    num_rows: 970
})

In [13]:
cs=train_cs.train_test_split(0.2,seed=777)
train_cs=cs["train"]
valid_cs=cs["test"]

In [14]:
train_sentences = list(map(lambda x: '[CLS] '+str(x)+'[SEP] ',train_cs['kor_sentence']))
val_sentences = list(map(lambda x: '[CLS] '+str(x)+'[SEP] ',valid_cs['kor_sentence']))
test_sentences = list(map(lambda x: '[CLS] '+str(x)+'[SEP] ',test_cs['kor_sentence']))

In [15]:
train_labels = train_cs['labels']
validation_labels = valid_cs['labels']
test_labels = test_cs['labels']

In [None]:
train_sentences [10:15]

In [17]:
train_labels[10:15]

[1, 0, 0, 2, 0]

In [18]:
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

In [None]:
tokenized_text= tokenizer.tokenize('[CLS] 우리는 시스템의 구현 모델을 개발하기 위해 장기적인 투자를 해왔습니다. [SEP] ')
input_id = tokenizer.convert_tokens_to_ids(tokenized_text)

print(tokenized_text)
print(input_id)

In [20]:
max_len = 128

def data_to_tensor(sentences, labels, max_len):
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

    input_ids = pad_sequences(
        input_ids, maxlen=max_len, dtype="long", truncating="post", padding="post"
    )

    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    tensor_input = torch.tensor(input_ids)
    tensor_labels = torch.tensor(labels)
    tensor_masks = torch.tensor(attention_masks)

    return tensor_input, tensor_labels, tensor_masks


In [21]:
train_inputs, train_labels, train_masks = data_to_tensor(train_sentences, train_labels, max_len)
validation_inputs, validation_labels, validation_masks = data_to_tensor(val_sentences, validation_labels, max_len)
test_inputs, test_labels, test_masks = data_to_tensor(test_sentences, test_labels, max_len)

In [22]:
batch_size = 32
train_data= TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [23]:
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [24]:
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [25]:
if torch.cuda.is_available:
  device=torch.device("cuda")
else:
  device=torch.device("cpu")
print(device)

cuda


In [26]:
num_labels = 3

model=BertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=num_labels)
model.cuda()

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [27]:
epochs = 5
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  )

In [28]:
def metrics(predictions, labels):
  y_pred = predictions
  y_true = labels
  accuracy = accuracy_score(y_true, y_pred)
  f1_macro_average = f1_score(y_true=y_true, y_pred=y_pred, average='macro', zero_division=0)
  f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro', zero_division=0)
  f1_weighted_average = f1_score(y_true=y_true, y_pred=y_pred, average='weighted', zero_division=0)

  metrics = {
      'accuracy': accuracy,
      'f1_macro_average': f1_macro_average,
      'f1_micro_average': f1_micro_average,
      'f1_weighted_average': f1_weighted_average
  }
  return metrics

In [29]:
def train_epoch(model, train_dataloader,optimzer, device):
  model.train()
  total_loss = 0
  for step, batch in tqdm(enumerate(train_dataloader),desc="Training Batch"):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  avg_train_loss = total_loss / len(train_dataloader)
  return avg_train_loss


In [32]:
def evaluate(model, validation_dataloader, device):
    model.eval()
    total_eval_loss = 0
    predictions, true_labels = [], []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_masks, b_labels = batch

        with torch.no_grad():
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_masks,
                labels=b_labels
            )
        if outputs.loss is not None:
            loss = outputs.loss
            total_eval_loss += loss.item()

        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to("cpu").numpy()

        predictions.extend(np.argmax(logits, axis=1).flatten())
        true_labels.extend(label_ids.flatten())

    eval_metrics = metrics(predictions, true_labels)

    return total_eval_loss/len(validation_dataloader), eval_metrics


In [33]:
min_val_loss= float('inf')

for epoch in range(epochs):
  print("Epoch {0}/{1}".format(epoch+1,epochs))
  train_epoch(model, train_dataloader, optimizer, device)
  val_loss, val_metrics = evaluate(model, validation_dataloader, device)

  print("Validation Loss: {0:.2f}".format(val_loss))
  print("Validation Accuracy: {0:.2f}".format(val_metrics['accuracy']))
  print("Validation F1-Score (Weighted): {0:.2f}".format(val_metrics['f1_weighted_average']))

  if val_loss < min_val_loss:
    print("Validation loss decreased ({0:.2f} --> {1:.2f}). Saving model...".format(min_val_loss, val_loss))
    torch.save(model.state_dict(),'model_checkpoint.pt')
    min_val_loss = val_loss

Epoch 1/5


Training Batch: 97it [00:59,  1.62it/s]


Validation Loss: 0.47
Validation Accuracy: 0.81
Validation F1-Score (Weighted): 0.82
Validation loss decreased (inf --> 0.47). Saving model...
Epoch 2/5


Training Batch: 97it [01:01,  1.58it/s]


Validation Loss: 0.40
Validation Accuracy: 0.83
Validation F1-Score (Weighted): 0.83
Validation loss decreased (0.47 --> 0.40). Saving model...
Epoch 3/5


Training Batch: 97it [01:01,  1.57it/s]


Validation Loss: 0.49
Validation Accuracy: 0.80
Validation F1-Score (Weighted): 0.80
Epoch 4/5


Training Batch: 97it [01:00,  1.59it/s]


Validation Loss: 0.51
Validation Accuracy: 0.83
Validation F1-Score (Weighted): 0.83
Epoch 5/5


Training Batch: 97it [01:00,  1.59it/s]


Validation Loss: 0.58
Validation Accuracy: 0.82
Validation F1-Score (Weighted): 0.83


In [34]:
from transformers import pipeline

pipe = pipeline("text-classification",model=model.cuda(),tokenizer=tokenizer,device=0,max_length=512,return_all_scores=True, function_to_apply='softmax')

Device set to use cuda:0


In [35]:
result = pipe("SK하이닉스가 매출이 급성장하였다")
print(result)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[[{'label': 'LABEL_0', 'score': 0.0016561920056119561}, {'label': 'LABEL_1', 'score': 0.9974818825721741}, {'label': 'LABEL_2', 'score': 0.0008619696600362659}]]


In [36]:
pipe = pipeline("text-classification",model=model.cuda(),tokenizer=tokenizer,device=0,max_length=512, function_to_apply='softmax')
result = pipe("SK하이닉스가 매출이 급성장하였다")
print(result)

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.9974818825721741}]


In [37]:
label_dict = {'LABEL_0': '중립', 'LABEL_1': '긍정', 'LABEL_2': '부정'}

def prediction(text):
  result =pipe(text)

  return [label_dict[result[0]['label']]]

In [38]:
prediction('네이버가 매출이 급성장하였다')

['긍정']

In [39]:
prediction('언어모델의 발달로 인공지능 스타트업들은 위기다')

['부정']

In [40]:
prediction('인공지능 기술의 벌전으로 누군가는 기회를 얻을 것이고, 누군가는 얻지 못 할 것이다')

['중립']