### Import Packages

In [1]:
import pandas as pd
import os
import time
import requests 
import tiktoken
import ray
import numpy as np

from math import ceil

# For Ray to work, uninstall pydantic and reinstall a 1.x release: pip install "pydantic<2"
from openai import OpenAI
from class_model.model import Model
from utils.system import *

import warnings
warnings.filterwarnings('ignore')



### Data

In [42]:
# Load the five WSJ article shards (wsj_art_1 .. wsj_art_5) and stack them
# row-wise. Each shard keeps its own index values; nothing is renumbered.
collect = [
    pd.read_parquet(get_format_data() / 'art' / f'wsj_art_{shard}.parquet.brotli')
    for shard in range(1, 6)
]
wsj_multiple = pd.concat(collect, axis=0)

### Parallelized: Get number of tokens (per article)

In [47]:
@ray.remote
def get_token_count(article_text, encoding_param):
    """Ray task: return the number of tokens ``article_text`` encodes to.

    Args:
        article_text: Text to tokenize.
        encoding_param: Encoding name passed to ``tiktoken.get_encoding``.

    Returns:
        Length of the token sequence produced by the encoding.
    """
    tokens = tiktoken.get_encoding(encoding_param).encode(article_text)
    return len(tokens)

def process_tokens_in_batches(df, column_name, encoding_param, batch_size):
    """Count tokens for every row of ``df[column_name]`` with Ray, batch by batch.

    Each batch submits one ``get_token_count`` task per row and blocks on
    ``ray.get`` until the whole batch has finished before starting the next.

    Args:
        df: DataFrame holding the texts. It is modified in place: an
            ``n_tokens`` column is added (or overwritten).
        column_name: Name of the column containing the text to tokenize.
        encoding_param: Encoding name forwarded to ``get_token_count``.
        batch_size: Number of rows submitted to Ray per batch; must be a
            positive integer.

    Returns:
        The same ``df`` object, now carrying the ``n_tokens`` column.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    texts = df[column_name]
    total_rows = len(texts)
    # Integer ceiling division; avoids the float round-trip of np.ceil.
    num_batches = -(-total_rows // batch_size)
    all_token_counts = []
    print(f"Number of batches: {num_batches}")

    for batch_number, start_index in enumerate(range(0, total_rows, batch_size), start=1):
        print(f"Processing batch: {batch_number}/{num_batches}")
        # .iloc makes the slice explicitly positional, independent of the
        # index labels (which may repeat after concatenating shards).
        batch = texts.iloc[start_index:start_index + batch_size]

        # Start asynchronous tasks for the batch, then wait for all of them
        futures = [get_token_count.remote(text, encoding_param) for text in batch]
        all_token_counts.extend(ray.get(futures))

    # Assign the token counts back to the DataFrame (positional, row for row)
    df['n_tokens'] = all_token_counts
    return df

In [44]:
# OpenAI embedding model name (the embedding run further down sets the same
# value again as `model_name`).
embedding_model = "text-embedding-ada-002"
# Encoding name handed to tiktoken.get_encoding when counting tokens.
embedding_encoding = "cl100k_base" 
# Upper bound on n_tokens; articles above it are removed by the filter cell.
max_tokens = 8000

In [48]:
batch_size = 5000

# Process articles in batches
ray.init(num_cpus=16, ignore_reinit_error=True)
try:
    start_time = time.time()
    wsj_multiple = process_tokens_in_batches(wsj_multiple, 'body_txt', embedding_encoding, batch_size)
    elapsed_time = time.time() - start_time
    print(f"Total time to get all tokens: {round(elapsed_time)} seconds")
finally:
    # Shutdown Ray even when tokenization raises, so the local Ray instance
    # started above is not left running after a failed cell.
    ray.shutdown()

2023-12-17 19:15:59,079	INFO worker.py:1673 -- Started a local Ray instance.


Number of batches: 167
Processing batch: 1/167
Processing batch: 2/167
Processing batch: 3/167
Processing batch: 4/167
Processing batch: 5/167
Processing batch: 6/167
Processing batch: 7/167
Processing batch: 8/167
Processing batch: 9/167
Processing batch: 10/167
Processing batch: 11/167
Processing batch: 12/167
Processing batch: 13/167
Processing batch: 14/167
Processing batch: 15/167
Processing batch: 16/167
Processing batch: 17/167
Processing batch: 18/167
Processing batch: 19/167
Processing batch: 20/167
Processing batch: 21/167
Processing batch: 22/167
Processing batch: 23/167
Processing batch: 24/167
Processing batch: 25/167
Processing batch: 26/167
Processing batch: 27/167
Processing batch: 28/167
Processing batch: 29/167
Processing batch: 30/167
Processing batch: 31/167
Processing batch: 32/167
Processing batch: 33/167
Processing batch: 34/167
Processing batch: 35/167
Processing batch: 36/167
Processing batch: 37/167
Processing batch: 38/167
Processing batch: 39/167
Processing 

In [50]:
# Filter: keep only articles whose token count is at most max_tokens,
# printing the row count before and after.
print(f"Length before: {len(wsj_multiple)}")
within_limit = wsj_multiple['n_tokens'] <= max_tokens
wsj_multiple = wsj_multiple[within_limit]
print(f"Length after: {len(wsj_multiple)}")

Length before: 831199
Length after: 831077


In [51]:
# Export Data
# Split the frame into 8 contiguous, near-equal row ranges with positional
# slices. The sizes match np.array_split (the first `len % 8` chunks get one
# extra row), but this avoids calling np.array_split on a DataFrame, which
# goes through DataFrame.swapaxes (deprecated since pandas 2.1; the warning is
# hidden here because warnings are globally ignored).
n_chunks = 8
base_rows, extra_rows = divmod(len(wsj_multiple), n_chunks)
chunks = []
chunk_start = 0
for chunk_number in range(n_chunks):
    chunk_stop = chunk_start + base_rows + (1 if chunk_number < extra_rows else 0)
    chunks.append(wsj_multiple.iloc[chunk_start:chunk_stop])
    chunk_start = chunk_stop

# Write each chunk to its own brotli-compressed parquet file (1-based names).
for i, df in enumerate(chunks, 1):
    print(i)
    df.to_parquet(get_format_data() / 'token' / f'wsj_tokens_{i}.parquet.brotli', compression='brotli')

1
2
3
4
5
6
7
8


### Parallelized: Get embeddings (per article)

In [25]:
# Read in token dataset: load the eight token-count shards
# (wsj_tokens_1 .. wsj_tokens_8) and stack them row-wise.
collect = [
    pd.read_parquet(get_format_data() / 'token' / f'wsj_tokens_{shard}.parquet.brotli')
    for shard in range(1, 9)
]
wsj_multiple = pd.concat(collect, axis=0)

In [26]:
# Independent deep copy of the token dataset (deep copying is DataFrame.copy's default).
wsj_multiple_token = wsj_multiple.copy()

In [38]:
@ray.remote
def get_embedding_article(article_text, model):
    """Ray task: request an OpenAI embedding for a single article.

    On every call the API key is read from ``api.json`` (key
    ``openai_api_key``) under the directory returned by ``get_config()`` and a
    new ``OpenAI`` client is created. Newlines in the text are replaced with
    spaces before the request is sent.

    Args:
        article_text: Article body to embed.
        model: Embedding model name passed to ``client.embeddings.create``.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # Imported locally so the task does not rely on `json` being exposed by a
    # wildcard import elsewhere in the notebook.
    import json

    # The context manager closes the config file right away; the previous
    # json.load(open(...)) form left the handle open.
    with open(get_config() / 'api.json') as config_file:
        api_key = json.load(config_file)['openai_api_key']

    client = OpenAI(api_key=api_key)
    cleaned_text = article_text.replace("\n", " ")
    response = client.embeddings.create(input=[cleaned_text], model=model)
    return response.data[0].embedding

def process_articles_in_batches(df, column_name, model, batch_size, delay_per_batch):
    """Embed every row of ``df[column_name]`` with Ray, saving each batch to parquet.

    For each batch, one ``get_embedding_article`` task is submitted per row and
    the function blocks on ``ray.get`` until all of them return. The batch is
    then written to ``wsj_emb_openai_{i}.parquet.brotli`` (``i`` is the
    zero-based batch number) inside the ``openai`` folder under
    ``get_format_data()``. Each file has a single ``ada_embedding`` column and
    uses the matching slice of ``df.index`` as its index.

    Args:
        df: DataFrame holding the texts; it is not modified.
        column_name: Name of the column containing the text to embed.
        model: Embedding model name forwarded to ``get_embedding_article``.
        batch_size: Number of rows per batch/file; must be a positive integer.
        delay_per_batch: Kept for interface compatibility but currently unused;
            no pause is inserted between batches.
            NOTE(review): confirm whether a rate-limit sleep was intended here.

    Returns:
        None. Results are only written to disk.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    texts = df[column_name]
    total_rows = len(df)
    # Integer ceiling division; avoids the float round-trip of np.ceil.
    num_batches = -(-total_rows // batch_size)
    print(f"Number of batches: {num_batches}")

    for i, start_index in enumerate(range(0, total_rows, batch_size)):
        print(f"Processing batch: {i + 1}/{num_batches}")
        end_index = min(start_index + batch_size, total_rows)
        # .iloc makes the slice explicitly positional, independent of index labels.
        batch = texts.iloc[start_index:end_index]

        # Start asynchronous tasks for the batch and wait for all of them
        futures = [get_embedding_article.remote(text, model) for text in batch]
        embeddings = ray.get(futures)

        # Save Batch: each batch goes to its own file, so nothing needs to be
        # accumulated across iterations.
        save_path = get_format_data() / 'openai' / f'wsj_emb_openai_{i}.parquet.brotli'
        print(f"Saving progress to {save_path}...")
        batch_index = list(df.index[start_index:end_index])
        batch_df = pd.DataFrame({'ada_embedding': embeddings}, index=batch_index)
        batch_df.to_parquet(save_path, compression='brotli')
        print("Progress saved")

    return None

In [39]:
# Parameters
model_name = 'text-embedding-ada-002'
batch_size = 1000
delay_per_batch = 60

# Process articles in batches
ray.init(num_cpus=16, ignore_reinit_error=True)
try:
    start_time = time.time()
    process_articles_in_batches(wsj_multiple_token, 'body_txt', model_name, batch_size, delay_per_batch)
    elapsed_time = time.time() - start_time
    print(f"Total time to get all embeddings: {round(elapsed_time)} seconds")
finally:
    # Shutdown Ray even when an embedding request or a save fails, so the
    # Ray instance is not left running after a failed cell.
    ray.shutdown()

2023-12-16 11:13:18,995	INFO worker.py:1507 -- Calling ray.init() again after it has already been called.


Number of batches: 832
Processing batch: 1/832
Saving progress to C:\Jonathan\QuantResearch\AlgoTradingModels\narrativezoo\narrative_zoo\data\format\openai\wsj_emb_openai_0.parquet.brotli...
Progress saved
Processing batch: 2/832
Saving progress to C:\Jonathan\QuantResearch\AlgoTradingModels\narrativezoo\narrative_zoo\data\format\openai\wsj_emb_openai_1.parquet.brotli...
Progress saved
Processing batch: 3/832
Saving progress to C:\Jonathan\QuantResearch\AlgoTradingModels\narrativezoo\narrative_zoo\data\format\openai\wsj_emb_openai_2.parquet.brotli...
Progress saved
Processing batch: 4/832
Saving progress to C:\Jonathan\QuantResearch\AlgoTradingModels\narrativezoo\narrative_zoo\data\format\openai\wsj_emb_openai_3.parquet.brotli...
Progress saved
Processing batch: 5/832
Saving progress to C:\Jonathan\QuantResearch\AlgoTradingModels\narrativezoo\narrative_zoo\data\format\openai\wsj_emb_openai_4.parquet.brotli...
Progress saved
Processing batch: 6/832
Saving progress to C:\Jonathan\QuantRe