In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Get to the folder we are at
FOLDERNAME = 'Colab\ Notebooks/Project/SC201_Project'
%cd drive/MyDrive/$FOLDERNAME/

/content/drive/MyDrive/Colab Notebooks/Project/SC201_Project


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd

In [4]:
# Seed for same output
torch.manual_seed(42)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [8]:
# Reading in our file
raw_data = pd.read_csv('train_data2015-2022.csv')

pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', None) 
pd.set_option('display.expand_frame_repr', False)  
print(raw_data.head(5))  
print(raw_data.tail(5)) 
print(raw_data.shape)  

   code                                            content        time  label
0  2884  玉山金12月份營收計     2,891,268千元, 比去年同期成長       28.9...  2015-03-20      1
1  2884  玉山金12月份營收計     2,891,268千元, 比去年同期成長       28.9...  2015-03-23      1
2  2884  玉山金12月份營收計     2,891,268千元, 比去年同期成長       28.9...  2015-03-24      1
3  2884  玉山金12月份營收計     2,891,268千元, 比去年同期成長       28.9...  2015-03-25      1
4  2884  玉山金12月份營收計     2,891,268千元, 比去年同期成長       28.9...  2015-03-26      1
        code                                            content        time  label
107651  1402  遠東新 9月份合併營收計    23,089,717千元, 比去年同期成長       15...  2022-12-26      1
107652  1402  遠東新 9月份合併營收計    23,089,717千元, 比去年同期成長       15...  2022-12-27      1
107653  1402  遠東新 9月份合併營收計    23,089,717千元, 比去年同期成長       15...  2022-12-28      1
107654  1402  遠東新 9月份合併營收計    23,089,717千元, 比去年同期成長       15...  2022-12-29      1
107655  1402  遠東新 9月份合併營收計    23,089,717千元, 比去年同期成長       15...  2022-12-30      1
(107656, 4)


In [6]:
# Get data & labels
raw_data['content'] = raw_data['content'].fillna('').astype(str)
reviews = raw_data['content'].values
labels = raw_data['label'].values

NameError: name 'raw_data' is not defined

In [6]:
patterns = ['<br />', '--','。','，', '.', ',', '!', '?', ')', '(', ';', ':', '*', '~', '_', "'", '"', ' ', '\n', 'nan']
replacements = [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', '', '', '']

In [7]:
def preprocessing(reviews, patterns, replacements):
  lst = []
  for i in range(len(reviews)):
    review = reviews[i].lower()
    for pattern, replacement in zip(patterns, replacements):
      review = review.replace(pattern, replacement)
    lst.append(review)
  return lst

In [12]:
reviews = preprocessing(reviews, patterns, replacements)
# print(reviews)

In [8]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
bert_model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese").to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
from itertools import count
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset

# Tokenize reviews
def tokenize_reviews(reviews, tokenizer, max_length):
    all_input_ids = []
    all_token_type_ids = []
    all_attention_masks = []

    for review in reviews:
        split_review = review.split('@')

        avg_input_ids = torch.zeros(1, max_length, dtype=torch.long)  
        avg_token_type_ids = torch.zeros(1, max_length, dtype=torch.long)  
        avg_attention_mask = torch.zeros(1, max_length, dtype=torch.long) 

        count = 0

        for sentence in split_review:
            encodings = tokenizer(sentence, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

            input_ids = encodings['input_ids']
            token_type_ids = encodings['token_type_ids']
            attention_mask = encodings['attention_mask']

            pad_len = max_length - input_ids.size(1)

            part1 = input_ids[:, :-1]  
            part2 = input_ids[:, -1:]  
            padded_input_ids = F.pad(part1, (0, pad_len), value=0)
            padded_input_ids = torch.cat((padded_input_ids, part2), dim=1)
            avg_input_ids += padded_input_ids

            attention_mask = F.pad(attention_mask, (0, pad_len), value=1)
            avg_attention_mask += attention_mask

            token_type_ids = F.pad(token_type_ids, (0, pad_len), value=0)
            avg_token_type_ids += token_type_ids

            count += 1

        avg_input_ids = avg_input_ids / count
        avg_token_type_ids = avg_token_type_ids / count
        avg_attention_mask = avg_attention_mask / count

        all_input_ids.append(avg_input_ids)
        all_token_type_ids.append(avg_token_type_ids)
        all_attention_masks.append(avg_attention_mask)

    all_input_ids = torch.cat(all_input_ids, dim=0)
    all_token_type_ids = torch.cat(all_token_type_ids, dim=0)
    all_attention_masks = torch.cat(all_attention_masks, dim=0)

    encodings = {'input_ids': all_input_ids.long(),'token_type_ids': all_token_type_ids, 'attention_mask': all_attention_masks.long()}

    return encodings


In [13]:
# # Tokenize reviews
# def tokenize_reviews(reviews, tokenizer, max_length):
#     encodings = tokenizer(reviews, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
#     print(encodings)
#     return encodings

In [15]:
max_length = 300
tokenized_reviews = tokenize_reviews(reviews, tokenizer, max_length)
input_ids = tokenized_reviews['input_ids']
attention_masks = tokenized_reviews['attention_mask']

# Create tensor datasets
dataset = TensorDataset(input_ids, attention_masks, torch.tensor(labels))


# print(f"Dataset length: {len(dataset)}")

assert len(dataset) == 107656, "Dataset length does not match the expected total of train_size and val_size."

# print(f"Dataset length: {len(dataset)}")

# Create tensor datasets
dataset = TensorDataset(input_ids, attention_masks, torch.tensor(labels))

# Split dataset into training and validation
train_size = 97656
val_size = 10000
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create data loaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, sequence_len, embedding_dim, device):
        super().__init__()
        self.sequence_len = sequence_len
        self.embedding_dim = embedding_dim
        self.device = device

    def forward(self, x):
        pe = torch.arange(0, self.embedding_dim // 2).reshape(1, self.embedding_dim // 2).to(self.device)
        pe = pe / (self.embedding_dim // 2)
        pe = torch.pow(10000, pe)
        positions = torch.arange(0, self.sequence_len).reshape(self.sequence_len, 1).to(self.device)
        pe_sin = torch.sin(positions / pe)
        pe_cos = torch.cos(positions / pe)
        stacked = torch.stack((pe_sin, pe_cos), dim=2)
        out = torch.flatten(stacked, 1)
        return out


In [11]:
class InputEncoding(nn.Module):
    def __init__(self, sequence_len, vocab_size, embedding_dim, device):
        super().__init__()
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(sequence_len, embedding_dim, device)
        self.device = device

    def forward(self, x):
        emb1 = self.word_embedding(x)
        emb2 = self.positional_encoding(x)
        return emb1 + emb2

In [12]:
# neural network
class FeedForwardLayer(nn.Module):
    def __init__(self, emb_size, d_out):
        super().__init__()
        self.linear1 = nn.Linear(emb_size, d_out)
        self.linear2 = nn.Linear(d_out, emb_size)

    def forward(self, x):
        # 1 torch.Size([64, 1000, 300])
        x = nn.functional.relu(self.linear1(x))
        x = nn.functional.dropout(x, p=0.1)

        # 2 torch.Size([64, 1000, 10])
        x = nn.functional.relu(self.linear1(x))
        x = nn.functional.dropout(x, p=0.1)

        # 3 torch.Size([64, 1000, 300])
        return self.linear2(x)


In [13]:
# MultiHeadSelfAttention with corrected implementation
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embedding_dim, qkv_dim, heads):
        super().__init__()
        self.to_q = nn.Linear(embedding_dim, qkv_dim)
        self.to_k = nn.Linear(embedding_dim, qkv_dim)
        self.to_v = nn.Linear(embedding_dim, qkv_dim)
        self.to_out = nn.Linear(qkv_dim, embedding_dim)

    def forward(self, x):
        N, sequence_len, embedding_dim = x.shape
        query, key, value = self.to_q(x), self.to_k(x), self.to_v(x)
        similarity = torch.einsum("NQE,NKE->NQK", [query, key])
        scaling = embedding_dim**0.5
        out = torch.softmax(similarity/scaling, dim=2)
        out = torch.einsum('NQK,NVE->NKE', [out, value])
        # torch.Size([64, 1000, 200])
        return self.to_out(out)

In [14]:
# Residual Block with corrected implementation
class ResidualBlock(nn.Module):
    def __init__(self, sub_layer, embedding_dim):
        super().__init__()
        self.sub_layer = sub_layer
        self.norm = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        # Ensure input x is a tensor
        assert isinstance(x, torch.Tensor), "Input x must be a tensor"
        sub_layer_output = self.sub_layer(x)
        # Ensure sub_layer output is a tensor
        assert isinstance(sub_layer_output, torch.Tensor), "Sub layer output must be a tensor"

        # torch.Size([64, 1000, 300])
        x = x + nn.functional.dropout(sub_layer_output, p=0.1)

        # torch.Size([64, 1000, 300])
        x = self.norm(x)
        return x

In [15]:
class Encoder(nn.Module):
    def __init__(self, attention: MultiHeadSelfAttention, feed_forward: FeedForwardLayer, sequence_len, vocab_size, emb_size, device):
        super().__init__()
        self.encoding = InputEncoding(sequence_len, vocab_size, emb_size, device)
        self.attention = attention
        self.feed_forward = feed_forward
        self.residual1 = ResidualBlock(lambda x: self.attention(x), emb_size)
        self.residual2 = ResidualBlock(self.feed_forward, emb_size)
        self.norm = nn.LayerNorm(emb_size)
        self.multi_out = nn.Linear(emb_size * sequence_len, 10)
        self.out = nn.Linear(10, 2)


    def forward(self, x):
        # 1 torch.Size([64, 1000])
        x = self.encoding(x)
        # 2 torch.Size([64, 1000, 300])
        x = self.residual1(x)
        # 3 torch.Size([64, 1000, 300])
        x = self.residual2(x)
        # 4 torch.Size([64, 1000, 300])
        x = self.norm(x)
        # 5 torch.Size([64, 1000, 300])
        x = torch.flatten(x, 1)
        # 6 torch.Size([64, 300000])
        x = self.multi_out(x)
        # 7 torch.Size([64, 10])
        return self.out(x)
        # 8 torch.Size([64, 2])


    def get_multi_forward(self, x):
        # 1 torch.Size([64, 1000])
        x = self.encoding(x)
        # 2 torch.Size([64, 1000, 300])
        x = self.residual1(x)
        # 3 torch.Size([64, 1000, 300])
        x = self.residual2(x)
        # 4 torch.Size([64, 1000, 300])
        x = self.norm(x)
        # 5 torch.Size([64, 1000, 300])
        x = torch.flatten(x, 1)
        # 6 torch.Size([64, 300000])
        return self.multi_out(x)
        # 7 torch.Size([64, 10])




In [22]:
# Hyperparameters and model initialization
vocab_size = tokenizer.vocab_size
embedding_dim = 300 # or 300?
sequence_len = max_length
output_dim = 2
print_every = 200
batch_size = 64
qkv_dim = 200
heads = 8

model = Encoder(MultiHeadSelfAttention(embedding_dim, qkv_dim, heads), FeedForwardLayer(embedding_dim, embedding_dim), sequence_len, vocab_size, embedding_dim, device).to(device)


NameError: name 'max_length' is not defined

In [16]:
def train(num_epoch, model, train_loader, val_loader, device, loss_function, optimizer):
    for epoch in range(num_epoch):
        model.train()
        num_iters = 0
        for batch in train_loader:
            x, attention_mask, y = batch
            x, y = x.to(device), y.to(device)
            scores = model(x)
            loss = loss_function(scores, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if num_iters % print_every == 0:
                evaluate_predictor(model, epoch, val_loader, device)
            num_iters += 1


In [17]:
def evaluate_predictor(model, epoch, val_loader, device):
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for batch in val_loader:
            x, attention_mask, y = batch
            x, y = x.to(device), y.to(device)
            scores = model(x)
            val_loss += loss_function(scores, y).item()
            pred = scores.argmax(dim=1, keepdim=True)
            correct += pred.eq(y.view_as(pred)).sum().item()

    val_loss /= len(val_loader.dataset)
    accuracy = 100. * correct / len(val_loader.dataset)

    print(f'Epoch: {epoch}, Validation loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%')


In [25]:
# Define loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Start training
train(15, model, train_loader, val_loader, device, loss_function, optimizer)


Epoch: 0, Validation loss: 0.7389, Accuracy: 48.64%
Epoch: 0, Validation loss: 0.0125, Accuracy: 54.89%
Epoch: 0, Validation loss: 0.0109, Accuracy: 56.25%
Epoch: 0, Validation loss: 0.0104, Accuracy: 59.40%
Epoch: 0, Validation loss: 0.0100, Accuracy: 63.55%
Epoch: 0, Validation loss: 0.0098, Accuracy: 65.29%
Epoch: 0, Validation loss: 0.0101, Accuracy: 63.28%
Epoch: 0, Validation loss: 0.0096, Accuracy: 66.85%
Epoch: 1, Validation loss: 0.0094, Accuracy: 68.39%
Epoch: 1, Validation loss: 0.0098, Accuracy: 67.93%
Epoch: 1, Validation loss: 0.0094, Accuracy: 68.39%
Epoch: 1, Validation loss: 0.0094, Accuracy: 68.31%
Epoch: 1, Validation loss: 0.0094, Accuracy: 69.46%
Epoch: 1, Validation loss: 0.0092, Accuracy: 70.09%
Epoch: 1, Validation loss: 0.0090, Accuracy: 71.03%
Epoch: 1, Validation loss: 0.0089, Accuracy: 71.58%
Epoch: 2, Validation loss: 0.0088, Accuracy: 72.02%
Epoch: 2, Validation loss: 0.0102, Accuracy: 69.15%
Epoch: 2, Validation loss: 0.0096, Accuracy: 70.80%
Epoch: 2, Va

＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊
＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊
已完成訓練

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:

torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/Project/SC201_Project//model_weights.pt')


In [84]:
import torch
import pandas as pd

state_dict = torch.load('/content/drive/MyDrive/Colab Notebooks/Project/SC201_Project//model_weights.pt')

weights_dict = {}

for key, value in state_dict.items():
    weights_dict[key] = value.cpu().numpy().flatten()

weights_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in weights_dict.items()]))

weights_df.to_csv('/content/drive/MyDrive/Colab Notebooks/Project/SC201_Project/model_weights.csv', index=False)

print("模型权重已保存为 .csv 文件")


模型权重已保存为 .csv 文件


In [18]:
vocab_size = tokenizer.vocab_size
embedding_dim = 300  
sequence_len = 300  
output_dim = 2
qkv_dim = 200
heads = 8
max_length = 300
model = Encoder(MultiHeadSelfAttention(embedding_dim, qkv_dim, heads), FeedForwardLayer(embedding_dim, embedding_dim), sequence_len, vocab_size, embedding_dim, device).to(device)

model.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/Project/SC201_Project//model_weights.pt'))

<All keys matched successfully>

In [19]:
# Get data & labels
test_data = pd.read_csv('train_data2023-2024.csv')

test_data['content'] = test_data['content'].fillna('').astype(str)
test_reviews = test_data['content'].values
test_labels = test_data['label'].values
print(test_data.shape)

(28848, 4)


In [20]:
test_reviews = preprocessing(test_reviews, patterns, replacements)

In [21]:
# Tokenize test reviews
test_tokenized_reviews = tokenize_reviews(test_reviews, tokenizer, max_length)
test_input_ids = test_tokenized_reviews['input_ids']
test_attention_masks = test_tokenized_reviews['attention_mask']

test_dataset = TensorDataset(test_input_ids, test_attention_masks, torch.tensor(test_labels))

#test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [22]:
test_loader = DataLoader(test_dataset, batch_size=1)

In [23]:
import numpy as np

def predict_and_evaluate(model, test_loader, device):
    model.eval()
    all_predictions = []
    all_true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            x, attention_mask, y_true = batch
            x = x.to(device)
            y_true = y_true.to(device)
            scores = model(x)
            preds = scores.argmax(dim=1, keepdim=True).cpu().numpy()
            all_predictions.extend(preds.flatten())
            all_true_labels.extend(y_true.cpu().numpy())

    all_predictions = np.array(all_predictions)
    all_true_labels = np.array(all_true_labels)

    # Calculate accuracy
    accuracy = np.mean(all_true_labels == all_predictions) * 100
    return accuracy




In [24]:

accuracy = predict_and_evaluate(model, test_loader, device)
print(f'Test Accuracy: {accuracy:.2f}%')

Test Accuracy: 64.92%


In [26]:
def predict(model, test_loader, device):
    model.eval()
    all_multi_x = []
    predictions = []
    with torch.no_grad():
        for batch in test_loader:
            x, attention_mask, y_true  = batch
            x = x.to(device)
            scores = model(x)
            pred = scores.argmax(dim=1, keepdim=True)
            predictions.extend(pred.cpu().numpy())
            multi_x = model.get_multi_forward(x)
            all_multi_x.extend(multi_x.cpu().numpy())
    print(len(all_multi_x))
    # all_multi_x = np.array(all_multi_x)
    # print(all_multi_x.shape)
    return predictions,  all_multi_x


In [None]:

test_predictions, all_multi_x = predict(model, test_loader, device)
multi_x_data = pd.DataFrame()

test_predictions = np.array(test_predictions)


test_data['predicted_label'] = test_predictions

multi_x_data['stock'] = test_data['code']
multi_x_data['date'] = test_data['time']

multi_x_str = [','.join(map(str, vec)) for vec in all_multi_x]
multi_x_data['vector'] = multi_x_str

output_path = '/content/drive/MyDrive/Colab Notebooks/Project/SC201_Project/test_predictions.csv'
test_data.to_csv(output_path, index=False)
test_data.to_csv('test_predictions.csv', index=False)
print(test_data.head(5))

multi_x_data.to_csv(output_path)
multi_x_data.to_csv('multi_x_2023-2024.csv')
print(multi_x_data.head(5))

28848


In [None]:
print(multi_x_data.tail(5))
print(multi_x_data.shape)