In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the data, code and checkpoint
# paths used throughout this notebook all live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import sys

# Directory on the mounted Drive that is added to the import path
# (presumably where the project modules data.py, model.py and data_aug.py
# imported below live — confirm against the Drive layout).
CODE_DIR = '/content/drive/MyDrive/FinGPT/code/'

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

In [3]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Thu Dec 14 14:28:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Install the extra dependencies into the running kernel's environment.
# %pip (rather than !pip) targets the kernel's interpreter; the versions are
# pinned to the ones this notebook was last run with so results stay reproducible.
%pip install -q nlpaug==1.1.11 sacremoses==0.1.1

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses, nlpaug
Successfully installed nlpaug-1.1.11 sacremoses-0.1.1


In [5]:
import torch
from data import read_data, MyDataset
from torch.utils.data import DataLoader
from model import MyModel
from torch.optim import AdamW
import torch.nn as nn
from sklearn.metrics import accuracy_score
import time
from data_aug import DataAugmentation

In [6]:
# Experiment configuration shared by the dataset, model and training code.
args = {
    # Forwarded to read_data(..., us=...) together with the CSV paths below.
    'us': True,
    'train_file': '/content/drive/MyDrive/FinGPT/data/us/sent_train.csv',
    'train_aug_file': '/content/drive/MyDrive/FinGPT/data/us/sent_train_aug.csv',
    'valid_file': '/content/drive/MyDrive/FinGPT/data/us/sent_valid.csv',
    # Class id -> sentiment name.
    'sentiments': {0: "Bearish", 1: "Bullish", 2: "Neutral"},
    # Encoder / optimisation settings.
    'bert_model': 'bert-base-uncased',
    'max_len': 128,  # overwritten in the training cell by the value read_data returns
    'batch_size': 32,
    'epochs': 20,
    'learning_rate': 1e-5,
    'num_filters': 768,
    # Checkpoints: best validation accuracy so far, and the weights after the final epoch.
    'save_model_best': '/content/drive/MyDrive/FinGPT/model/best_model_us.pth',
    'save_model_last': '/content/drive/MyDrive/FinGPT/model/last_model_us.pth',
}

In [None]:
# Optional one-off step: build an augmented version of the training CSV and
# write it to args['train_aug_file'] (the path read by the commented-out
# "augmented data" lines in the training cell).
# NOTE(review): DataAugmentation is a project helper; presumably it returns a
# DataFrame-like object (it exposes .to_csv) — confirm in data_aug.py.
train_text_aug = DataAugmentation(args['train_file'])
train_text_aug.to_csv(args['train_aug_file'])

In [7]:
# Fine-tune the classifier, evaluate on the validation set after every epoch,
# and keep both the best-by-validation-accuracy and the final-epoch weights.
start = time.time()
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# read_data returns (texts, labels, max_len); only the training call's max_len is used.
train_text, train_label, max_len = read_data(args['train_file'], us=args['us'])

# if use augmented data:
# train_aug_text, train_aug_label, _ = read_data(args['train_aug_file'], us=args['us'])
# train_text = train_text + train_aug_text
# train_label = train_label + train_aug_label

valid_text, valid_label, _ = read_data(args['valid_file'], us=args['us'])
# Replace the configured default with the value reported for the training data.
args['max_len'] = max_len

train_dataset = MyDataset(train_text, train_label, args)
train_dataloader = DataLoader(train_dataset, batch_size=args['batch_size'], shuffle=True)

valid_dataset = MyDataset(valid_text, valid_label, args)
valid_dataloader = DataLoader(valid_dataset, batch_size=args['batch_size'], shuffle=False)

model = MyModel(args).to(device)
opt = AdamW(model.parameters(), lr=args['learning_rate'])
loss_fn = nn.CrossEntropyLoss()

# Print the running mean of the training loss every `log_every` batches and
# once more at the end of each epoch for the remaining batches.
log_every = 1000
num_batches = len(train_dataloader)
log_msg = "[{0}/{1:5d}]\tTrain_Loss:{2:.4f}"

acc_max = float("-inf")
for epoch in range(args['epochs']):
    loss_sum, count = 0.0, 0
    model.train()
    for batch_index, (batch_text, batch_label) in enumerate(train_dataloader):
        batch_label = batch_label.to(device)
        # batch_text is passed to the model as produced by the DataLoader
        # (presumably MyModel tokenizes/moves it itself — confirm in model.py).
        pred = model(batch_text)

        loss = loss_fn(pred, batch_label)
        opt.zero_grad()
        loss.backward()
        opt.step()
        # .item() yields a plain float; summing the tensor itself would keep
        # autograd history attached to the running total.
        loss_sum += loss.item()
        count += 1

        done = batch_index + 1
        if done % log_every == 0 or done == num_batches:
            print(log_msg.format(epoch + 1, done, loss_sum / count))
            loss_sum, count = 0.0, 0

    model.eval()
    all_pred, all_true = [], []
    with torch.no_grad():
        for batch_text, batch_label in valid_dataloader:
            pred = model(batch_text)
            all_pred.extend(torch.argmax(pred, dim=1).cpu().tolist())
            # Labels are only compared on the host, so no device round-trip is needed.
            all_true.extend(batch_label.tolist())

    # scikit-learn's signature is accuracy_score(y_true, y_pred).
    acc = accuracy_score(all_true, all_pred)
    print(f"Valid acc:{acc:.4f}")
    if acc > acc_max:
        print(acc, acc_max)
        acc_max = acc
        torch.save(model.state_dict(), args['save_model_best'])
        print("Saving the best model.")

torch.save(model.state_dict(), args['save_model_last'])

end = time.time()
# Report total elapsed minutes; the previous "% 60" wrapped around for runs
# longer than one hour and under-reported the duration.
print(f"Running time: {(end - start) / 60:.4f} min")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[1/  299]	Train_Loss:0.6762
Valid acc:0.8132
0.8132328308207705 -inf
Saving the best model.
[2/  299]	Train_Loss:0.4176
Valid acc:0.8572
0.8572026800670016 0.8132328308207705
Saving the best model.
[3/  299]	Train_Loss:0.2980
Valid acc:0.8530
[4/  299]	Train_Loss:0.2091
Valid acc:0.8559
[5/  299]	Train_Loss:0.1430
Valid acc:0.8639
0.8639028475711893 0.8572026800670016
Saving the best model.
[6/  299]	Train_Loss:0.0987
Valid acc:0.8677
0.8676716917922948 0.8639028475711893
Saving the best model.
[7/  299]	Train_Loss:0.0730
Valid acc:0.8685
0.8685092127303182 0.8676716917922948
Saving the best model.
[8/  299]	Train_Loss:0.0594
Valid acc:0.8668
[9/  299]	Train_Loss:0.0531
Valid acc:0.8710
0.8710217755443886 0.8685092127303182
Saving the best model.
[10/  299]	Train_Loss:0.0424
Valid acc:0.8585
[11/  299]	Train_Loss:0.0355
Valid acc:0.8668
[12/  299]	Train_Loss:0.0291
Valid acc:0.8731
0.8731155778894473 0.8710217755443886
Saving the best model.
[13/  299]	Train_Loss:0.0251
Valid acc:0.871

In [None]:
# Recorded accuracy for each experiment:
# original training data: 0.8802
# with augmented training data: 0.8771
# FinBERT (yiyanghkust/finbert-tone) baseline: 0.7328
# Chinese data: 0.4902

In [None]:
# test demo
# Inference-only settings. Note that this rebinds the global `args` used by the
# training cell, so re-run the configuration cell before training again.
args = {
    'sentiments': {0: "Bearish", 1: "Bullish", 2: "Neutral"},
    'bert_model': 'bert-base-uncased',
    'max_len': 128,
    'batch_size': 32,
    'epochs': 10,
    'learn_rate': 1e-5,
    'num_filters': 768,
    'save_model': '/content/drive/MyDrive/FinGPT/model/best_model_us.pth',
    # 'save_model_last' : '/content/drive/MyDrive/FinGPT/model/last_model_cn.pth',
}

if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Rebuild the network on the chosen device, then restore the saved weights,
# remapping them onto that device so a GPU checkpoint also loads on CPU.
model = MyModel(args)
model = model.to(device)
model.load_state_dict(torch.load(args['save_model'], map_location=torch.device(device)))

# English sample sentences used to smoke-test the restored classifier; they are
# passed to transfer() below.
sentences = [
    "FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing its growth strategy by increasingly focusing on technologically more demanding HDI printed circuit boards PCBs .",
    "According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",
    "A tinyurl link takes users to a scamming site promising that users can earn thousands of dollars by becoming a Google ( NASDAQ : GOOG ) Cash advertiser .",
    "Barclays cuts to Equal Weight",
]
# sentences = [
#     '东吴证券某日发布研报，维持公司买入评级',
#     '某日公司融资净偿还57.95万元，融资余额8.33亿元',
#     '某日，公司集团在京发布了Q-GPT安全机器人和大模型卫士。Q-GPT安全机器人是基于公司大模型的虚拟安全专家\
#     可以全天候工作，一台机器人等于60多位安全专家，可产生约2000万元的运营效益，极大提升了生产力。大模型卫士集安全风险发现、大模型访问控制、数据泄露管控、违法违规行为溯源、大模型应用分析等于一体，帮助企业更安全地向大模型要生产力',
#     '这只股票一定能涨，一定要大量买入'
# ]

def transfer(Inputs):
    """Predict a sentiment name for every sentence in ``Inputs``.

    Uses the module-level ``model`` and ``args``: ``args['batch_size']`` sets
    the inference batch size and ``args['sentiments']`` maps each predicted
    class id to its name.

    Args:
        Inputs: sequence of sentences to classify.

    Returns:
        A list with one sentiment name per input sentence, in input order.
        An empty input yields an empty list.
    """
    if len(Inputs) == 0:
        return []

    # MyDataset requires labels; they are unused at inference time, so pass zeros.
    test_dataset = MyDataset(Inputs, [0] * len(Inputs), args)
    test_dataloader = DataLoader(test_dataset, batch_size=args['batch_size'], shuffle=False)

    # Switch layers such as dropout to inference behaviour and skip gradient tracking.
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch_text, _ in test_dataloader:
            logits = model(batch_text)
            # Collect every batch; keeping only the last one would drop all
            # inputs beyond the final batch.
            predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
    return [args['sentiments'][i] for i in predictions]
# Classify the sample sentences; the bare `res` on the last line makes the
# notebook display the returned list of sentiment names.
res = transfer(sentences)
res

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

['Bearish', 'Neutral', 'Bullish', 'Bullish']

In [None]:
# FinBert for contrast
# Baseline: run the pretrained 'yiyanghkust/finbert-tone' classifier so its
# predictions can be compared with the model trained in this notebook.
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the pretrained 3-class checkpoint and its matching tokenizer.
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# Wrap model + tokenizer so raw strings can be classified directly.
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

# NOTE: this rebinds the global `sentences` (and, in the loop below, `res`)
# that the demo cell above also defines.
sentences = ["there is a shortage of capital, and we need extra financing",
             "growth is strong and we have plenty of liquidity",
             "there are doubts about our finances",
             "profits are flat",
             "The company is earning money"]
results = nlp(sentences)
# LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative
# Each result is a dict with a 'label' name and a 'score'.
for res in results:
    print(res)

# Calculate acc for validation set
# Map FinBERT's label names onto the integer class ids used by the notebook's
# `sentiments` mapping (0 = Bearish, 1 = Bullish, 2 = Neutral).
label_dict = {
    'Negative': 0,
    'Positive': 1,
    'Neutral': 2,
}

# NOTE(review): `us` is left at read_data's default here, while the training
# cell reads this same file with us=True — confirm the default matches.
text, label, _ = read_data('/content/drive/MyDrive/FinGPT/data/us/sent_valid.csv')
# truncation=True keeps an unusually long text from exceeding the model's
# maximum input length and aborting the whole evaluation.
results = nlp(text, truncation=True)
results = [label_dict[res['label']] for res in results]
# scikit-learn's signature is accuracy_score(y_true, y_pred).
acc = accuracy_score(label, results)
print(acc)

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

{'label': 'Negative', 'score': 0.9966173768043518}
{'label': 'Positive', 'score': 1.0}
{'label': 'Negative', 'score': 0.9999710321426392}
{'label': 'Neutral', 'score': 0.9889441728591919}
{'label': 'Neutral', 'score': 0.9967959523200989}
0.7328308207705193
