In [1]:
import os
os.chdir('../')
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from evaluator.gpt_evaluator import FinancialDataProcessor
import time
import ast

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
def wait_for_completion(job_id, processor, poll_interval=100):
    status = processor.check_status(job_id)
    while status.status not in ["completed", "failed"]:
        print(f"Current status: {status}. Waiting for {poll_interval} seconds...")
        time.sleep(poll_interval)
        status = processor.check_status(job_id)
    return status.status

# Process the financial Data

In [3]:
def process_data(file_name, input_dir, output_dir):
    # Load the data
    df = pd.read_csv(os.path.join(input_dir, file_name))

    # Step 1: Filter out rows where 'text' starts with "Access to this page has been denied"
    df_filtered = df[~df['text'].str.startswith('Access to this page has been denied', na=False)].copy()

    # Step 2: Ensure the timestamp column is in datetime format
    df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])

    # Step 3: Convert the timestamp to just the date part
    df_filtered['date'] = df_filtered['timestamp'].dt.date

    # Step 4: Group by date and sample 5 entries from each group (or fewer if less than 5)
    sampled_data = df_filtered.groupby('date', group_keys=False).apply(
        lambda x: x.sample(min(len(x), 5))
    ).reset_index(drop=True)

    # Step 5: Sort the sampled data by date
    sampled_data = sampled_data.sort_values(by='date')

    # Step 6: Save the resulting DataFrame to a new file
    output_file = os.path.join(output_dir, file_name)
    sampled_data.to_csv(output_file, index=False)

# Financial Data

In [4]:
def get_gpt_results(output_dir, results_dir, file_name):
    financial_data_processor = FinancialDataProcessor()

    data = pd.read_csv(os.path.join(output_dir, file_name))

    jsonl_path = os.path.join(results_dir, "batch.jsonl")
    output_path = os.path.join(results_dir, "data.txt")

    batch_object_id = financial_data_processor.create_and_run_batch_job(data, jsonl_path, ticker_column="ticker", date_column="timestamp", input_text_column="text")

    job_status = wait_for_completion(batch_object_id, financial_data_processor)

    if job_status == "completed":
        print("Batch job completed successfully!")
        gpt_outputs = financial_data_processor.check_status_and_parse(batch_object_id , output_path)
        dict_list = [ast.literal_eval(item) for item in gpt_outputs]
        df = pd.DataFrame(dict_list)
        df_combined = df.groupby(['ticker', 'date'], as_index=False).agg({'text': ' '.join})
        df_combined.to_csv(os.path.join(results_dir, file_name), index=False)
    


In [8]:
input_dir = "/home/ubuntu/multimodal/Data/financial-raw/"
output_dir = "/home/ubuntu/multimodal/Data/financial-processed/"
results_dir = f"/home/ubuntu/multimodal/Data/financial-gpt/"

file_name = "NFLX.csv"
process_data(file_name, input_dir, output_dir)
get_gpt_results(output_dir, results_dir, file_name)

  sampled_data = df_filtered.groupby('date', group_keys=False).apply(


batch job created with batch_object_id 
 batch_CwflTdAa2qAlR0ZsqinUQ3PC
Current status: Batch(id='batch_CwflTdAa2qAlR0ZsqinUQ3PC', completion_window='24h', created_at=1725331282, endpoint='/v1/chat/completions', input_file_id='file-bES3FFLJPfNqGPCoJNDeikPV', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1725417682, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Multimodal Forecasting'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0)). Waiting for 100 seconds...
Current status: Batch(id='batch_CwflTdAa2qAlR0ZsqinUQ3PC', completion_window='24h', created_at=1725331282, endpoint='/v1/chat/completions', input_file_id='file-bES3FFLJPfNqGPCoJNDeikPV', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1725

In [2]:
import pandas as pd
import glob

# Step 1: Retrieve all CSV file paths in the specified directory
file_paths = glob.glob('/home/ubuntu/multimodal/Data/financial-gpt/*.csv')

# Step 2: Read all CSV files and concatenate them into a single DataFrame
all_data = pd.concat((pd.read_csv(file) for file in file_paths))

# Step 3: Ensure the 'date' column is in datetime format for proper grouping
all_data['date'] = pd.to_datetime(all_data['date'])
all_data['text'] = all_data['text'].astype(str)

# Step 4: Group by the 'date' column and concatenate the 'text' column for each group
grouped_data = all_data.groupby('date').agg({'text': ' '.join})


# Step 5: Reset the index to flatten the DataFrame
grouped_data = grouped_data.reset_index()

# Step 6: Save the aggregated data to a CSV file
grouped_data.to_csv('results_grouped_by_date.csv', index=False)
print(grouped_data.shape)

print("Data grouped by date and saved to 'results_grouped_by_date.csv'")

(791, 2)
Data grouped by date and saved to 'results_grouped_by_date.csv'
