Upload data to a Hugging Face dataset

In [1]:
import pandas as pd
import os
import random
import numpy as np

In [2]:
# --- Configuration -----------------------------------------------------------
# NOTE(review): `dir` shadows the builtin of the same name; the name is kept
# only because other cells may reference it.
dir = '../Dataset/financial_llama3_70B_summary/formatted_v0.1'
days = 5  # window length: rows looked back for the input and rows emitted in the output
output_dir = f'../Dataset/financial_{days}days'  # not used in this cell

# Per-split accumulators: one DataFrame per source CSV is appended to each list.
dataframe_summary_train = []
dataframe_numerical_train = []
dataframe_summary_test = []
dataframe_numerical_test = []
dataframe_summary_val = []
dataframe_numerical_val = []

# Filename prefix -> (summary accumulator, numerical accumulator).
split_buckets = {
    'train': (dataframe_summary_train, dataframe_numerical_train),
    'test': (dataframe_summary_test, dataframe_numerical_test),
    'val': (dataframe_summary_val, dataframe_numerical_val),
}


def _joined_window(frame, column, row_labels, with_day_label=True):
    """Join ``frame.at[label, column]`` for each label into one ", "-separated string.

    When ``with_day_label`` is true each value is prefixed with ``"day N: "``,
    where N counts from 1 in the order the labels are given.
    """
    parts = []
    for day_number, label in enumerate(row_labels, start=1):
        value = frame.at[label, column]
        parts.append(f"day {day_number}: {value}" if with_day_label else f"{value}")
    return ", ".join(parts)


def _build_window_frames(summaries, days):
    """Build the (summary_df, numerical_df) prompt frames for one CSV.

    ``summaries`` must have the default 0..n-1 index and the columns
    'input', 'last_price', 'output' and 'current_price'.

    Only rows ``days .. len(summaries) - days - 1`` are emitted, so every
    look-back (``index - i``) and look-ahead (``index + i``) stays inside the
    frame.  A frame too short to contain such a row yields two empty frames
    instead of raising.

    Both returned frames have the columns 'input', 'output', 'instruction'
    and keep the original row labels as their index.
    """
    kept = range(days, max(days, len(summaries) - days))

    inputs = []
    summary_outputs = []
    numerical_outputs = []
    for index in kept:
        # "day 1" of the input is the row itself; later days walk backwards.
        past = [index - i for i in range(days)]
        # "day 1" of the output is the row itself; later days walk forwards.
        future = [index + i for i in range(days)]

        input_summaries = _joined_window(summaries, 'input', past)
        input_prices = _joined_window(summaries, 'last_price', past)
        output_summaries = _joined_window(summaries, 'output', future)
        output_prices = _joined_window(summaries, 'current_price', future, with_day_label=False)

        inputs.append(
            f"The financial summaries for the last trading {days} days are {input_summaries}, and the stock prices of the last trading {days} days are {input_prices}")
        summary_outputs.append(
            f"The financial summaries for the current day and next trading {days-1} days are {output_summaries}")
        numerical_outputs.append(
            f"The stock prices for current trading day and next trading {days-1} trading days are {output_prices}")

    # The instruction text depends only on `days`, so it is the same for every row.
    summary_instruction = f"Given the financial summaries and stock prices from the last trading {days} days, generate financial summaries for current day and next trading {days-1} days"
    numerical_instruction = f"Given the financial summaries and stock prices from the last trading {days} days, generate stock prices for current day and next trading {days-1} days"

    frame_index = pd.RangeIndex(kept.start, kept.stop)
    summary_df = pd.DataFrame(
        {'input': inputs,
         'output': summary_outputs,
         'instruction': [summary_instruction] * len(inputs)},
        index=frame_index)
    numerical_df = pd.DataFrame(
        {'input': inputs,
         'output': numerical_outputs,
         'instruction': [numerical_instruction] * len(inputs)},
        index=frame_index)
    return summary_df, numerical_df


# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the resulting datasets reproducible across machines and runs.
for filename in sorted(os.listdir(dir)):
    split = next((prefix for prefix in split_buckets if filename.startswith(prefix)), None)
    if split is None:
        # Not a train/test/val file.
        continue

    path = os.path.join(dir, filename)
    # Each CSV is expected to contain 'summary' and 'price' columns.
    summaries = pd.read_csv(path).rename(
        columns={'summary': 'input', 'price': 'last_price'})
    # 'output' / 'current_price' at row r hold 'input' / 'last_price' of row r + 1.
    summaries['output'] = summaries['input'].shift(-1)
    summaries['current_price'] = summaries['last_price'].shift(-1)

    summary_df, numerical_df = _build_window_frames(summaries, days)

    summary_bucket, numerical_bucket = split_buckets[split]
    summary_bucket.append(summary_df)
    numerical_bucket.append(numerical_df)

In [3]:
def _save_split(frames, path):
    """Concatenate the per-file frames of one split, write them to ``path``, return the result.

    The parquet file is written without the pandas index: after ``pd.concat``
    the index is no longer a plain 0..n-1 range, and pandas would otherwise
    store it as an extra ``__index_level_0__`` column that then shows up as a
    spurious feature when the file is loaded as a dataset.

    ``pd.concat`` raises ``ValueError`` if ``frames`` is empty (no file was
    found for the split).
    """
    combined = pd.concat(frames)
    combined.to_parquet(path, engine='pyarrow', index=False)
    return combined


# to_parquet does not create missing parent directories.
os.makedirs('../parquet_dir', exist_ok=True)

# --- train ---
train_summary_path = '../parquet_dir/train_finance_summary.parquet'
train_numerical_path = '../parquet_dir/train_finance_numerical.parquet'
completion_df_summary_train = _save_split(dataframe_summary_train, train_summary_path)
completion_df_numerical_train = _save_split(dataframe_numerical_train, train_numerical_path)
print("Train dataframe shape: ", completion_df_summary_train.shape)

# --- validation ---
val_summary_path = '../parquet_dir/val_finance_summary.parquet'
val_numerical_path = '../parquet_dir/val_finance_numerical.parquet'
completion_df_summary_val = _save_split(dataframe_summary_val, val_summary_path)
completion_df_numerical_val = _save_split(dataframe_numerical_val, val_numerical_path)
print("Val dataframe shape: ", completion_df_summary_val.shape)

# --- test ---
test_summary_path = '../parquet_dir/test_finance_summary.parquet'
test_numerical_path = '../parquet_dir/test_finance_numerical.parquet'
completion_df_summary_test = _save_split(dataframe_summary_test, test_summary_path)
completion_df_numerical_test = _save_split(dataframe_numerical_test, test_numerical_path)
print("Test dataframe shape: ", completion_df_summary_test.shape)

Train dataframe shape:  (3777, 3)
Val dataframe shape:  (368, 3)
Test dataframe shape:  (372, 3)


In [4]:
from datasets import load_dataset, DatasetDict
from huggingface_hub import HfApi
# Read the Hugging Face access token from the environment.  It is deliberately
# never printed: cell output is saved inside the notebook file and would leak
# the credential to anyone who can read it.  If HF_TOKEN is unset, `token` is
# None and is passed to push_to_hub as such.
token = os.getenv("HF_TOKEN")


def _load_splits(train_path, test_path, val_path):
    """Load three parquet files and bundle them as train/validation/test splits.

    Each file is loaded with ``split='train'`` because a bare ``data_files``
    path is exposed by ``load_dataset`` under the 'train' split name.
    """
    train_dataset = load_dataset('parquet', data_files=train_path, split='train')
    test_dataset = load_dataset('parquet', data_files=test_path, split='train')
    val_dataset = load_dataset('parquet', data_files=val_path, split='train')
    return DatasetDict({
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset
    })


# Push the summary dataset to the Hugging Face Hub
dataset_dict = _load_splits(train_summary_path, test_summary_path, val_summary_path)
dataset_dict.push_to_hub(f"Howard881010/financial-dataset-summary-{days}days", token=token)

# Push the numerical dataset to the Hugging Face Hub
dataset_dict = _load_splits(train_numerical_path, test_numerical_path, val_numerical_path)
dataset_dict.push_to_hub(f"Howard881010/financial-dataset-numerical-{days}days", token=token)

  from .autonotebook import tqdm as notebook_tqdm


[REDACTED: Hugging Face access token was printed here — revoke and rotate this credential]


Generating train split: 3777 examples [00:00, 9835.07 examples/s] 
Generating train split: 372 examples [00:00, 3364.85 examples/s]
Generating train split: 368 examples [00:00, 3578.04 examples/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00,  5.75ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.55s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 25.73ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 26.79ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.05it/s]
Generating train split: 3777 examples [00:00, 11256.26 examples/s]
Generating train split: 372 examples [00:00, 2857.70 examples/s]
Generating train split: 368 examples [00:00, 4026.59 examples/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 22.59ba/s]
Uploading the dataset shards: 100%|██████████| 1/1

CommitInfo(commit_url='https://huggingface.co/datasets/Howard881010/financial-dataset-numerical-5days/commit/d9c6cd7820344e925dd3a05a4ac917d421bf0af0', commit_message='Upload dataset', commit_description='', oid='d9c6cd7820344e925dd3a05a4ac917d421bf0af0', pr_url=None, pr_revision=None, pr_num=None)