Upload data to huggingface dataset

In [2]:
import pandas as pd
import os
import random
import numpy as np

In [8]:
dir = '../Dataset/financial_llama3_70B_summary/formatted_v0.1'
dataframe_summary_train = []
dataframe_numerical_train = []
dataframe_summary_test = []
dataframe_numerical_test = []
dataframe_summary_val = []
dataframe_numerical_val = []


for filename in os.listdir(dir):
    if filename.startswith('train') or filename.startswith('test') or filename.startswith('val'):
        split = filename.split('_')[0]
        path = os.path.join(dir, filename)
        summaries = pd.read_csv(path)
        # print(summaries.shape)
        summaries.rename(columns={'summary': 'input'}, inplace=True)
        summaries.rename(columns={'price': 'last_price'}, inplace=True)
        summaries['output'] = summaries['input'].shift(-1)
        summaries['current_price'] = summaries['last_price'].shift(-1)

        for index, row in summaries.iterrows():
            random_number_pair = [random.randint(1, 7), random.randint(1, 7)]
            random_number_pair[0] = np.min([random_number_pair[0], index+1])
            random_number_pair[1] = np.min([random_number_pair[1], len(summaries) - index-1])
            # Generate string of summaries for the last 5 days
            input_summaries = ", ".join(
                [f"day {i+1}: {summaries.at[index-i, 'input']}" for i in range(random_number_pair[0])])
            output_summaries = ", ".join(
                [f"day {i+1}: {summaries.at[index+i, 'output']}" for i in range(random_number_pair[1])])
            
            input_prices = ", ".join(
                [f"day {i+1}: {summaries.at[index-i, 'last_price']}" for i in range(random_number_pair[0])])
            output_prices = ", ".join(
                [f"day {i+1}: {summaries.at[index+i, 'current_price']}" for i in range(random_number_pair[1])])

            # Set the training input and output
            if random_number_pair[1] == 1:
                output_day_text = "the current trading day"
            else:
                output_day_text = f"the current trading day and the next {random_number_pair[1] - 1} trading days"
            
            if random_number_pair[0] == 1:
                input_day_text = "last trading day"
            else:
                input_day_text = f"last {random_number_pair[0]} trading days"
            
            

            summaries.at[index,
                        'summary_input'] = f"The financial summaries for the {input_day_text} are {input_summaries}, and the stock prices of the last {random_number_pair[0]} days are {input_prices}"
            summaries.at[index,
                        'summary_output'] = f"The financial summaries for {output_day_text} are {output_summaries}"
            summaries.at[index,
                        'summary_instruction'] = f"Given the financial summaries and stock prices from the {input_day_text}, generate financial summaries for {output_day_text}"
            
            summaries.at[index, 'numerical_output'] = f"The stock prices for {output_day_text} are {output_prices}"
            summaries.at[index, 'numerical_instruction'] = f"Given the financial summaries and stock prices from the {input_day_text}, generate stock prices for {output_day_text}"
        
        summary_df = summaries[['summary_input', 'summary_output', 'summary_instruction']].rename(
                columns={'summary_input': 'input', 'summary_output': 'output', 'summary_instruction': 'instruction'})
        numerical_df = summaries[['summary_input', 'numerical_output', 'numerical_instruction']].rename(
                columns={'summary_input': 'input', 'numerical_output': 'output', 'numerical_instruction': 'instruction'})
    
    if filename.startswith('test'):
        dataframe_summary_test.append(summary_df)
        dataframe_numerical_test.append(numerical_df)

    if filename.startswith('val'):
        dataframe_summary_val.append(summary_df)
        dataframe_numerical_val.append(numerical_df)
    if filename.startswith('train'):
        dataframe_summary_train.append(summary_df)
        dataframe_numerical_train.append(numerical_df)

Train dataframe shape:  (3897, 3)
Val dataframe shape:  (488, 3)
Test dataframe shape:  (492, 3)


In [None]:
completion_df_summary_train = pd.concat(dataframe_summary_train)
completion_df_numerical_train = pd.concat(dataframe_numerical_train)
train_summary_path = f'../parquet_dir/train_finance_summary.parquet'
train_numerical_path = f'../parquet_dir/train_finance_numerical.parquet'
completion_df_summary_train.to_parquet(train_summary_path, engine='pyarrow')
completion_df_numerical_train.to_parquet(train_numerical_path, engine='pyarrow')
print("Train dataframe shape: ", completion_df_summary_train.shape)

completion_df_summary_val = pd.concat(dataframe_summary_val)
completion_df_numerical_val = pd.concat(dataframe_numerical_val)
val_summary_path = f'../parquet_dir/val_finance_summary.parquet'
val_numerical_path = f'../parquet_dir/val_finance_numerical.parquet'
completion_df_summary_val.to_parquet(val_summary_path, engine='pyarrow')
completion_df_numerical_val.to_parquet(val_numerical_path, engine='pyarrow')
print("Val dataframe shape: ", completion_df_summary_val.shape)

completion_df_summary_test = pd.concat(dataframe_summary_test)
completion_df_numerical_test = pd.concat(dataframe_numerical_test)
test_summary_path = f'../parquet_dir/test_finance_summary.parquet'
test_numerical_path = f'../parquet_dir/test_finance_numerical.parquet'
completion_df_summary_test.to_parquet(test_summary_path, engine='pyarrow')
completion_df_numerical_test.to_parquet(test_numerical_path, engine='pyarrow')
print("Test dataframe shape: ", completion_df_summary_test.shape)

In [11]:
from datasets import load_dataset, DatasetDict
from huggingface_hub import HfApi
token = os.getenv("HF_TOKEN")
print(token)
# Load the dataset
train_dataset = load_dataset('parquet', data_files=train_summary_path, split='train')
test_dataset = load_dataset('parquet', data_files=test_summary_path, split='train')
val_dataset = load_dataset('parquet', data_files=val_summary_path, split = 'train')
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})
# Push the dataset to the Hugging Face Hub
dataset_dict.push_to_hub("Howard881010/financial-dataset-summary", token=token)

train_dataset = load_dataset('parquet', data_files=train_numerical_path, split='train')
test_dataset = load_dataset('parquet', data_files=test_numerical_path, split='train')
val_dataset = load_dataset('parquet', data_files=val_numerical_path, split = 'train')
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})
# Push the dataset to the Hugging Face Hub
dataset_dict.push_to_hub("Howard881010/financial-dataset-numerical", token=token)

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


Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 32.35ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  3.06it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 103.02ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  4.90it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 66.93ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  4.80it/s]
Generating train split: 3897 examples [00:00, 20922.34 examples/s]
Generating train split: 492 examples [00:00, 5074.29 examples/s]
Generating train split: 488 examples [00:00, 4499.66 examples/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 42.07ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.16s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 31.74ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.57it/s]
Creating par

CommitInfo(commit_url='https://huggingface.co/datasets/Howard881010/financial-dataset-numerical/commit/4a67e9677ddaa61106c8a173e4aebcfa47263202', commit_message='Upload dataset', commit_description='', oid='4a67e9677ddaa61106c8a173e4aebcfa47263202', pr_url=None, pr_revision=None, pr_num=None)