In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [40]:
# Get to the folder we are at
FOLDERNAME = 'Colab\ Notebooks/Project/stock_price_forcasting/multimodel20240729/news_processing'
MODEL_FOLDERNAME = '/content/drive/MyDrive/Colab Notebooks/Project/stock_price_forcasting/multimodel20240729/news_processing/'
INPUT_FOLDERNAME = '/content/drive/My Drive/Colab Notebooks/Project/stock_price_forcasting/multimodel20240729/news_processing/input/'
OUTPUT_FOLDERNAME = '/content/drive/My Drive/Colab Notebooks/Project/stock_price_forcasting/multimodel20240729/news_processing/output/'

%cd drive/MyDrive/$FOLDERNAME/

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd

In [4]:
# Seed for same output
torch.manual_seed(42)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## /-**********-Don't perform while testing processing-**********-/

In [6]:
# Reading in our file
raw_data = pd.read_csv(INPUT_FOLDERNAME+'train_data2014-2021.csv')

# Set the display options of pandas to force all columns to be displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.expand_frame_repr', False)
print(raw_data.head(5))
print(raw_data.tail(5))
print(raw_data.shape)

   code        time                                            content  label
0  2884  2013-12-31  見證創投史 陳木在回母校講古 - Yahoo奇摩新聞\nnan\n@開發金控併購金鼎證券 辜...      0
1  2884  2014-01-02  見證創投史 陳木在回母校講古 - Yahoo奇摩新聞\nnan\n@開發金控併購金鼎證券 辜...      0
2  2884  2014-01-03  見證創投史 陳木在回母校講古 - Yahoo奇摩新聞\nnan\n@開發金控併購金鼎證券 辜...      0
3  2884  2014-01-06  見證創投史 陳木在回母校講古 - Yahoo奇摩新聞\nnan\n@開發金控併購金鼎證券 辜...      0
4  2884  2014-01-07  見證創投史 陳木在回母校講古 - Yahoo奇摩新聞\nnan\n@開發金控併購金鼎證券 辜...      0
       code        time                                            content  label
91232  1402  2021-12-23  遠東新 9月份合併營收計    20,059,024千元, 比去年同期成長       19...      1
91233  1402  2021-12-24  遠東新 9月份合併營收計    20,059,024千元, 比去年同期成長       19...      1
91234  1402  2021-12-27  遠東新 9月份合併營收計    20,059,024千元, 比去年同期成長       19...      1
91235  1402  2021-12-28  遠東新 9月份合併營收計    20,059,024千元, 比去年同期成長       19...      1
91236  1402  2021-12-29  遠東新 9月份合併營收計    20,059,024千元, 比去年同期成長       19...      0
(91237, 4)


## /-**********-Don't perform while testing processing-**********-/

In [7]:
# Get data & labels
raw_data['content'] = raw_data['content'].fillna('').astype(str)
reviews = raw_data['content'].values
labels = raw_data['label'].values

In [8]:
patterns = ['<br />', '--','。','，', '.', ',', '!', '?', ')', '(', ';', ':', '*', '~', '_', "'", '"', ' ', '\n', 'nan',
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
            '臺灣水泥', '亞洲水泥', '統一企業', '臺塑企業', '南亞塑膠', '臺灣化學纖維', '遠東新世紀', '中國鋼鐵', '正新橡膠', '裕隆', '和泰汽車', '裕日車', '光寶科技', '聯華電子', '臺達電子', '鴻海精密', '國巨',
            '臺積', '鴻準', '華碩', '廣達電腦', '南亞科技', '友達光電', '中華電信', '可成科技', '聯發科技', '陽明海運', '萬海航運', '臺灣高鐵', '彰化銀行', '中壽', '華南金融控股', '富邦金融控股',
            '國泰金融控股', '中華開發金融控股 ', '玉山金融控股', '元大金融控股', '兆豐金融控股', '臺新金融控股', '新光金融控股', '永豐金融控股', '中國信託金融控股', '第一金融控股', '統一超商', '大立光電', '臺灣大哥大',
            '群創光電', '晨星', '日月光半導體', '遠傳電信', '和碩', '中租控股', '合作金庫金融控股', '矽力傑', '臺塑石化', '緯穎科技', '力積電',
            '富邦媒體科技', '寶成', '豐泰企業',
            'Yahoo', '奇摩', '新聞']
replacements = [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', '', '', '',
            '', '', '', '', '', '', '', '', '', '',
            '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
            '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
            '', '', '', '', '', '', '', '', '', '', '', '', '',
            '', '', '', '', '', '', '', '', '', '', '',
            '', '', '',
            '', '', '']

In [9]:
def preprocessing(reviews, patterns, replacements):
  lst = []
  for i in range(len(reviews)):
    review = reviews[i].lower()
    for pattern, replacement in zip(patterns, replacements):
      review = review.replace(pattern, replacement)
    lst.append(review)
  return lst

## /-**********-Don't perform while testing processing-**********-/

In [10]:
reviews = preprocessing(reviews, patterns, replacements)
# print(reviews)

## /-**********-Don't perform while testing processing-**********-/

In [11]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
bert_model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese").to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
from itertools import count
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset

# Tokenize reviews
def tokenize_reviews(reviews, tokenizer, max_length):
    all_input_ids = []
    all_token_type_ids = []
    all_attention_masks = []

    for review in reviews:
        split_review = review.split('@')

        avg_input_ids = torch.zeros(1, max_length, dtype=torch.long)
        avg_token_type_ids = torch.zeros(1, max_length, dtype=torch.long)
        avg_attention_mask = torch.zeros(1, max_length, dtype=torch.long)

        count = 0

        for sentence in split_review:
            encodings = tokenizer(sentence, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

            input_ids = encodings['input_ids']
            token_type_ids = encodings['token_type_ids']
            attention_mask = encodings['attention_mask']

            # Calculate the number of zeros that need to be padded
            pad_len = max_length - input_ids.size(1)

            # Fill the second element from the bottom with 0
            part1 = input_ids[:, :-1]  # Remove last element
            part2 = input_ids[:, -1:]  # Keep noly the last element
            padded_input_ids = F.pad(part1, (0, pad_len), value=0)
            padded_input_ids = torch.cat((padded_input_ids, part2), dim=1)
            avg_input_ids += padded_input_ids

            attention_mask = F.pad(attention_mask, (0, pad_len), value=1)
            avg_attention_mask += attention_mask

            token_type_ids = F.pad(token_type_ids, (0, pad_len), value=0)
            avg_token_type_ids += token_type_ids

            count += 1

        avg_input_ids = avg_input_ids / count
        avg_token_type_ids = avg_token_type_ids / count
        avg_attention_mask = avg_attention_mask / count

        all_input_ids.append(avg_input_ids)
        all_token_type_ids.append(avg_token_type_ids)
        all_attention_masks.append(avg_attention_mask)

    all_input_ids = torch.cat(all_input_ids, dim=0)
    all_token_type_ids = torch.cat(all_token_type_ids, dim=0)
    all_attention_masks = torch.cat(all_attention_masks, dim=0)

    encodings = {'input_ids': all_input_ids.long(),'token_type_ids': all_token_type_ids, 'attention_mask': all_attention_masks.long()}

    return encodings


In [14]:
max_length = 300

## /-**********-Don't perform while testing processing-**********-/
  This algorithm is used to average sentence vecctors and takes a very long time.

In [15]:

tokenized_reviews = tokenize_reviews(reviews, tokenizer, max_length)
input_ids = tokenized_reviews['input_ids']
attention_masks = tokenized_reviews['attention_mask']

# Create tensor datasets
dataset = TensorDataset(input_ids, attention_masks, torch.tensor(labels))



In [18]:
# Verify that the dataset length is correct
assert len(dataset) == 91237, "Dataset length does not match the expected total of train_size and val_size."

# Create tensor datasets
dataset = TensorDataset(input_ids, attention_masks, torch.tensor(labels))

# Split dataset into training and validation
train_size = 90000
val_size = 1237
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create data loaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

## /-**********-Don't perform while testing processing-**********-/
 This algorithm is used to average sentence vecctors and takes a very long time.

In [19]:
class PositionalEncoding(nn.Module):
    def __init__(self, sequence_len, embedding_dim, device):
        super().__init__()
        self.sequence_len = sequence_len
        self.embedding_dim = embedding_dim
        self.device = device

    def forward(self, x):
        pe = torch.arange(0, self.embedding_dim // 2).reshape(1, self.embedding_dim // 2).to(self.device)
        pe = pe / (self.embedding_dim // 2)
        pe = torch.pow(10000, pe)
        positions = torch.arange(0, self.sequence_len).reshape(self.sequence_len, 1).to(self.device)
        pe_sin = torch.sin(positions / pe)
        pe_cos = torch.cos(positions / pe)
        stacked = torch.stack((pe_sin, pe_cos), dim=2)
        out = torch.flatten(stacked, 1)
        return out


In [20]:
class InputEncoding(nn.Module):
    def __init__(self, sequence_len, vocab_size, embedding_dim, device):
        super().__init__()
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(sequence_len, embedding_dim, device)
        self.device = device

    def forward(self, x):
        emb1 = self.word_embedding(x)
        emb2 = self.positional_encoding(x)
        return emb1 + emb2

In [21]:
# neural network
class FeedForwardLayer(nn.Module):
    def __init__(self, emb_size, d_out):
        super().__init__()
        self.linear1 = nn.Linear(emb_size, d_out)
        self.linear2 = nn.Linear(d_out, emb_size)

    def forward(self, x):
        # 1 torch.Size([64, 1000, 300])
        x = nn.functional.relu(self.linear1(x))
        x = nn.functional.dropout(x, p=0.1)

        # 2 torch.Size([64, 1000, 10])
        x = nn.functional.relu(self.linear1(x))
        x = nn.functional.dropout(x, p=0.1)

        # 3 torch.Size([64, 1000, 300])
        return self.linear2(x)


In [22]:
# MultiHeadSelfAttention with corrected implementation
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embedding_dim, qkv_dim, heads):
        super().__init__()
        self.to_q = nn.Linear(embedding_dim, qkv_dim)
        self.to_k = nn.Linear(embedding_dim, qkv_dim)
        self.to_v = nn.Linear(embedding_dim, qkv_dim)
        self.to_out = nn.Linear(qkv_dim, embedding_dim)

    def forward(self, x):
        N, sequence_len, embedding_dim = x.shape
        query, key, value = self.to_q(x), self.to_k(x), self.to_v(x)
        similarity = torch.einsum("NQE,NKE->NQK", [query, key])
        scaling = embedding_dim**0.5
        out = torch.softmax(similarity/scaling, dim=2)
        out = torch.einsum('NQK,NVE->NKE', [out, value])
        # torch.Size([64, 1000, 200])
        return self.to_out(out)

In [23]:
# Residual Block with corrected implementation
class ResidualBlock(nn.Module):
    def __init__(self, sub_layer, embedding_dim):
        super().__init__()
        self.sub_layer = sub_layer
        self.norm = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        # Ensure input x is a tensor
        assert isinstance(x, torch.Tensor), "Input x must be a tensor"
        sub_layer_output = self.sub_layer(x)
        # Ensure sub_layer output is a tensor
        assert isinstance(sub_layer_output, torch.Tensor), "Sub layer output must be a tensor"

        # torch.Size([64, 1000, 300])
        x = x + nn.functional.dropout(sub_layer_output, p=0.1)

        # torch.Size([64, 1000, 300])
        x = self.norm(x)
        return x

In [24]:
class Encoder(nn.Module):
    def __init__(self, attention: MultiHeadSelfAttention, feed_forward: FeedForwardLayer, sequence_len, vocab_size, emb_size, device):
        super().__init__()
        self.encoding = InputEncoding(sequence_len, vocab_size, emb_size, device)
        self.attention = attention
        self.feed_forward = feed_forward
        self.residual1 = ResidualBlock(lambda x: self.attention(x), emb_size)
        self.residual2 = ResidualBlock(self.feed_forward, emb_size)
        self.norm = nn.LayerNorm(emb_size)
        self.multi_out = nn.Linear(emb_size * sequence_len, 10)
        self.out = nn.Linear(10, 2)


    def forward(self, x):
        # 1 torch.Size([64, 1000])
        x = self.encoding(x)
        # 2 torch.Size([64, 1000, 300])
        x = self.residual1(x)
        # 3 torch.Size([64, 1000, 300])
        x = self.residual2(x)
        # 4 torch.Size([64, 1000, 300])
        x = self.norm(x)
        # 5 torch.Size([64, 1000, 300])
        x = torch.flatten(x, 1)
        # 6 torch.Size([64, 300000])
        x = self.multi_out(x)
        # 7 torch.Size([64, 10])
        return self.out(x)
        # 8 torch.Size([64, 2])


    def get_multi_forward(self, x):
        # 1 torch.Size([64, 1000])
        x = self.encoding(x)
        # 2 torch.Size([64, 1000, 300])
        x = self.residual1(x)
        # 3 torch.Size([64, 1000, 300])
        x = self.residual2(x)
        # 4 torch.Size([64, 1000, 300])
        x = self.norm(x)
        # 5 torch.Size([64, 1000, 300])
        x = torch.flatten(x, 1)
        # 6 torch.Size([64, 300000])
        return self.multi_out(x)
        # 7 torch.Size([64, 10])




In [25]:
# Hyperparameters and model initialization
vocab_size = tokenizer.vocab_size
embedding_dim = 300 # or 300?
sequence_len = max_length
output_dim = 2
print_every = 200
batch_size = 64
qkv_dim = 200
heads = 8

model = Encoder(MultiHeadSelfAttention(embedding_dim, qkv_dim, heads), FeedForwardLayer(embedding_dim, embedding_dim), sequence_len, vocab_size, embedding_dim, device).to(device)


In [26]:
def train(num_epoch, model, train_loader, val_loader, device, loss_function, optimizer):
    for epoch in range(num_epoch):
        model.train()
        num_iters = 0
        for batch in train_loader:
            x, attention_mask, y = batch
            x, y = x.to(device), y.to(device)
            scores = model(x)
            loss = loss_function(scores, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if num_iters % print_every == 0:
                evaluate_predictor(model, epoch, val_loader, device)
            num_iters += 1


In [27]:
def evaluate_predictor(model, epoch, val_loader, device):
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for batch in val_loader:
            x, attention_mask, y = batch
            x, y = x.to(device), y.to(device)
            scores = model(x)
            val_loss += loss_function(scores, y).item()
            pred = scores.argmax(dim=1, keepdim=True)
            correct += pred.eq(y.view_as(pred)).sum().item()

    val_loss /= len(val_loader.dataset)
    accuracy = 100. * correct / len(val_loader.dataset)

    print(f'Epoch: {epoch}, Validation loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%')


In [28]:
# Define loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Start training
train(15, model, train_loader, val_loader, device, loss_function, optimizer)


Epoch: 0, Validation loss: 0.9196, Accuracy: 50.53%
Epoch: 0, Validation loss: 0.0182, Accuracy: 50.53%
Epoch: 0, Validation loss: 0.0145, Accuracy: 49.47%
Epoch: 0, Validation loss: 0.0113, Accuracy: 51.50%
Epoch: 0, Validation loss: 0.0135, Accuracy: 50.44%
Epoch: 0, Validation loss: 0.0112, Accuracy: 54.41%
Epoch: 0, Validation loss: 0.0108, Accuracy: 58.93%
Epoch: 0, Validation loss: 0.0106, Accuracy: 61.20%
Epoch: 1, Validation loss: 0.0104, Accuracy: 63.38%
Epoch: 1, Validation loss: 0.0102, Accuracy: 64.19%
Epoch: 1, Validation loss: 0.0102, Accuracy: 64.35%
Epoch: 1, Validation loss: 0.0097, Accuracy: 67.66%
Epoch: 1, Validation loss: 0.0097, Accuracy: 68.31%
Epoch: 1, Validation loss: 0.0094, Accuracy: 69.93%
Epoch: 1, Validation loss: 0.0091, Accuracy: 71.22%
Epoch: 1, Validation loss: 0.0090, Accuracy: 71.14%
Epoch: 2, Validation loss: 0.0089, Accuracy: 72.84%
Epoch: 2, Validation loss: 0.0089, Accuracy: 72.35%
Epoch: 2, Validation loss: 0.0094, Accuracy: 71.30%
Epoch: 2, Va

＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊
＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊
已完成訓練

In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
# Save model weights to .pt file
torch.save(model.state_dict(), MODEL_FOLDERNAME+'model_weights.pt')


In [42]:
import torch
import pandas as pd

# Load model weights
state_dict = torch.load(MODEL_FOLDERNAME+'model_weights.pt')

# Convert weights to Dataframe and save as .csv file
weights_dict = {}

for key, value in state_dict.items():
    weights_dict[key] = value.cpu().numpy().flatten()

weights_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in weights_dict.items()]))

# Save as .csv file
weights_df.to_csv(MODEL_FOLDERNAME+'model_weights.csv', index=False)

print("Model weights saved as .csv file")


Model weights saved as .csv file


In [None]:
# Initialize model
vocab_size = tokenizer.vocab_size
embedding_dim = 300  
sequence_len = 300  
output_dim = 2
qkv_dim = 200
heads = 8
max_length = 300
model = Encoder(MultiHeadSelfAttention(embedding_dim, qkv_dim, heads), FeedForwardLayer(embedding_dim, embedding_dim), sequence_len, vocab_size, embedding_dim, device).to(device)

# 加载模型权重
model.load_state_dict(torch.load(FOLDERNAME+'model_weights.pt'))

<All keys matched successfully>

In [None]:
# Get data & labels
# time_range = '2022-2024'
time_range = '2014-2021'
test_data = pd.read_csv('train_data'+time_range+'.csv')

test_data['content'] = test_data['content'].fillna('').astype(str)
test_reviews = test_data['content'].values
test_labels = test_data['label'].values
print(test_data.shape)

(91207, 4)


In [None]:
test_reviews = preprocessing(test_reviews, patterns, replacements)

In [None]:
# Tokenize test reviews
test_tokenized_reviews = tokenize_reviews(test_reviews, tokenizer, max_length)
test_input_ids = test_tokenized_reviews['input_ids']
test_attention_masks = test_tokenized_reviews['attention_mask']


test_dataset = TensorDataset(test_input_ids, test_attention_masks, torch.tensor(test_labels))

#test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [None]:
test_loader = DataLoader(test_dataset, batch_size=1)

In [None]:
import numpy as np

def predict_and_evaluate(model, test_loader, device):
    model.eval()
    all_predictions = []
    all_true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            x, attention_mask, y_true = batch
            x = x.to(device)
            y_true = y_true.to(device)
            scores = model(x)
            preds = scores.argmax(dim=1, keepdim=True).cpu().numpy()
            all_predictions.extend(preds.flatten())
            all_true_labels.extend(y_true.cpu().numpy())

    all_predictions = np.array(all_predictions)
    all_true_labels = np.array(all_true_labels)

    # Calculate accuracy
    accuracy = np.mean(all_true_labels == all_predictions) * 100
    return accuracy




In [None]:

accuracy = predict_and_evaluate(model, test_loader, device)
print(f'Test Accuracy: {accuracy:.2f}%')

Test Accuracy: 86.71%


In [None]:
def predict(model, test_loader, device):
    model.eval()
    all_multi_x = []
    predictions = []
    with torch.no_grad():
        for batch in test_loader:
            x, attention_mask, y_true  = batch
            x = x.to(device)
            scores = model(x)
            pred = scores.argmax(dim=1, keepdim=True)
            predictions.extend(pred.cpu().numpy())
            multi_x = model.get_multi_forward(x)
            all_multi_x.extend(multi_x.cpu().numpy())
    print(len(all_multi_x))
    # all_multi_x = np.array(all_multi_x)
    # print(all_multi_x.shape)
    return predictions,  all_multi_x


In [None]:

test_predictions, all_multi_x = predict(model, test_loader, device)
multi_x_data = pd.DataFrame()

test_predictions = np.array(test_predictions)

test_data['predicted_label'] = test_predictions

multi_x_data['stock'] = test_data['code']
multi_x_data['date'] = test_data['time']
multi_x_str = [','.join(map(str, vec)) for vec in all_multi_x]
multi_x_data['vector'] = multi_x_str
multi_x_data['label'] = test_data['label']

# /content/drive/MyDrive/Colab Notebooks/Project/SC201_Project
output_path = '/content/drive/MyDrive/Colab Notebooks/Project/SC201_Project/test_predictions.csv'
test_data.to_csv(output_path, index=False)
test_data.to_csv('test_predictions'+time_range+'.csv', index=False)
print(test_data.head(5))

multi_x_data.to_csv(output_path)
multi_x_data.to_csv('multi_x_'+time_range+'.csv')
print(multi_x_data.head(5))

91207
   code        time                                            content  label  predicted_label
0  2884  2013-12-31  見證創投史 陳木在回母校講古 - Yahoo奇摩新聞\nnan\n@開發金控併購金鼎證券 辜...      0                0
1  2884  2014-01-02  見證創投史 陳木在回母校講古 - Yahoo奇摩新聞\nnan\n@開發金控併購金鼎證券 辜...      0                0
2  2884  2014-01-03  見證創投史 陳木在回母校講古 - Yahoo奇摩新聞\nnan\n@開發金控併購金鼎證券 辜...      0                0
3  2884  2014-01-06  見證創投史 陳木在回母校講古 - Yahoo奇摩新聞\nnan\n@開發金控併購金鼎證券 辜...      0                0
4  2884  2014-01-07  見證創投史 陳木在回母校講古 - Yahoo奇摩新聞\nnan\n@開發金控併購金鼎證券 辜...      0                0
   stock        date                                             vector  label
0   2884  2013-12-31  13.776186,-13.32921,0.9579774,-7.7574215,-12.3...      0
1   2884  2014-01-02  15.618971,-14.433574,-1.2816042,-9.932479,-14....      0
2   2884  2014-01-03  15.056732,-14.499186,0.49779883,-8.599947,-12....      0
3   2884  2014-01-06  14.165905,-13.715194,-0.7969837,-8.842483,-13....      0
4   2884  2014-01-07  15.4538

In [None]:
print(multi_x_data.tail(5))
print(multi_x_data.shape)