In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
# Read the Top-2000 metadata table; the first CSV column becomes the row index.
metadata = pd.read_csv(
    'data/Top 2000/t2k_metadata.csv',
    index_col=0,
)
# Select the row labelled 'Description' as a Series.
# NOTE(review): presumably one entry per ticker column (e.g. 'AAPL') — confirm against the CSV.
descriptions = metadata.loc['Description']

In [3]:
# Dump all descriptions into a single UTF-8 text file, separated by newlines
# and terminated with a trailing newline.
with open('data/Top 2000/t2k_descriptions.txt', mode='w', encoding='utf-8') as corpus_file:
    corpus_text = '\n'.join(descriptions)
    corpus_file.write(corpus_text + '\n')

In [4]:
from tokenizers import ByteLevelBPETokenizer
# Train a byte-level BPE tokenizer on the descriptions text file.
# The path uses the same lower-case 'data/' directory the descriptions file was
# written to; the previous 'Data/' spelling only resolved on case-insensitive
# filesystems and broke a fresh run elsewhere.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files='data/Top 2000/t2k_descriptions.txt',
    vocab_size=16000,
    min_frequency=2,
    # Order fixes the ids of the special tokens: <s>=0, <pad>=1, </s>=2, <unk>=3, <mask>=4.
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'],
)

In [5]:
# Write the trained tokenizer's vocab.json and merges.txt into the a2v directory.
# The path uses the lower-case 'data/' directory that the metadata is loaded
# from, so the cell also works on case-sensitive filesystems.
# NOTE(review): later cells load the tokenizer from 'tokenizer/', not from this
# directory — confirm how the saved files get there.
tokenizer.save_model('data/Top 2000/a2v')

['Data/Top 2000/a2v\\vocab.json', 'Data/Top 2000/a2v\\merges.txt']

In [10]:
# Load a RobertaTokenizer from the local 'tokenizer/' directory.
# This rebinds the global `tokenizer`, so every later cell that calls
# `tokenizer(...)` uses this object.
# NOTE(review): presumably 'tokenizer/' holds the vocab/merges files produced by
# the BPE training step — confirm, since that step saves to a different directory.
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('tokenizer/')

In [7]:
# Sanity check: tokenize the AAPL description and map the resulting ids back to
# token strings so the segmentation can be inspected in the cell output.
tokens = tokenizer(metadata.loc['Description', 'AAPL'])
tokenizer.convert_ids_to_tokens(tokens['input_ids'])

['<s>',
 'Apple',
 ',',
 'ĠInc',
 '.',
 'Ġengages',
 'Ġin',
 'Ġthe',
 'Ġdesign',
 ',',
 'Ġmanufacture',
 ',',
 'Ġand',
 'Ġsale',
 'Ġof',
 'Ġsmartphones',
 ',',
 'Ġpersonal',
 'Ġcomputers',
 ',',
 'Ġtablets',
 ',',
 'Ġwearables',
 'Ġand',
 'Ġaccessories',
 ',',
 'Ġand',
 'Ġother',
 'Ġvarieties',
 'Ġof',
 'Ġrelated',
 'Ġservices',
 '.',
 'ĠIt',
 'Ġoperates',
 'Ġthrough',
 'Ġthe',
 'Ġfollowing',
 'Ġgeographical',
 'Ġsegments',
 ':',
 'ĠAmericas',
 ',',
 'ĠEurope',
 ',',
 'ĠGreater',
 'ĠChina',
 ',',
 'ĠJapan',
 ',',
 'Ġand',
 'ĠRest',
 'Ġof',
 'ĠAsia',
 'ĠPacific',
 '.',
 'ĠThe',
 'ĠAmericas',
 'Ġsegment',
 'Ġincludes',
 'ĠNorth',
 'Ġand',
 'ĠSouth',
 'ĠAmerica',
 '.',
 'ĠThe',
 'ĠEurope',
 'Ġsegment',
 'Ġconsists',
 'Ġof',
 'ĠEuropean',
 'Ġcountries',
 ',',
 'Ġas',
 'Ġwell',
 'Ġas',
 'ĠIndia',
 ',',
 'Ġthe',
 'ĠMiddle',
 'ĠEast',
 ',',
 'Ġand',
 'ĠAfrica',
 '.',
 'ĠThe',
 'ĠGreater',
 'ĠChina',
 'Ġsegment',
 'Ġcomprises',
 'ĠChina',
 ',',
 'ĠHong',
 'ĠKong',
 ',',
 'Ġand',
 'ĠTaiwan',
 '

In [8]:
# Tokenize every description and keep only its list of input ids.
tokens = [tokenizer(text)['input_ids'] for text in descriptions]
# Number of ids produced for each description, in the same order.
lengths = list(map(len, tokens))

In [9]:
# Length, in token ids, of the longest tokenized description.
max(lengths)

455

In [11]:
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
# RoBERTa configuration; any argument not listed here keeps the library default.
# NOTE(review): vocab_size=15193 is presumably the size of the vocabulary stored
# in 'tokenizer/' — confirm, since it is hard-coded rather than read from the tokenizer.
roberta_config = RobertaConfig(
    vocab_size=15193,
    hidden_size=384,
    num_hidden_layers=6,
    intermediate_size=1536,
)
# Tokenizer loaded from the local 'tokenizer/' directory.
TOKENIZER = RobertaTokenizer.from_pretrained('tokenizer/')
# Model constructed from the config alone; no pretrained checkpoint is loaded here.
LM = RobertaModel(config=roberta_config)

In [15]:
# Tokenize the AAPL description again, this time requesting PyTorch tensors
# (return_tensors='pt') so the result can be passed straight to the model.
# NOTE(review): this uses the lower-case `tokenizer` global rather than
# `TOKENIZER`; both are loaded from 'tokenizer/' in this notebook.
tokens = tokenizer(metadata.loc['Description', 'AAPL'], return_tensors='pt')

In [21]:
# Run the model on the tokenized description, take the first element of its
# output (presumably the last hidden state — confirm), and slice position 0
# along the second axis, i.e. the leading '<s>' token. Only the shape of that
# slice is displayed.
LM(**tokens)[0][:, 0, :].shape

torch.Size([1, 384])