Upload data to huggingface dataset

In [1]:
import pandas as pd
import os
import random
import numpy as np

In [2]:
dir = '../Dataset/financial_llama3_70B_summary/formatted_v0.1'
dataframe_summary = []
dataframe_numerical = []


for filename in os.listdir(dir):
    if filename.startswith('train'):
        path = os.path.join(dir, filename)
        summaries = pd.read_csv(path)
        # print(summaries.shape)
        summaries.rename(columns={'summary': 'input'}, inplace=True)
        summaries.rename(columns={'price': 'last_price'}, inplace=True)
        summaries['output'] = summaries['input'].shift(-1)
        summaries['current_price'] = summaries['last_price'].shift(-1)

        for index, row in summaries.iterrows():
            random_number_pair = [random.randint(1, 7), random.randint(1, 7)]
            random_number_pair[0] = np.min([random_number_pair[0], index+1])
            random_number_pair[1] = np.min([random_number_pair[1], len(summaries) - index-1])
            # Generate string of summaries for the last 5 days
            input_summaries = ", ".join(
                [f"day {i+1}: {summaries.at[index-i, 'input']}" for i in range(random_number_pair[0])])
            output_summaries = ", ".join(
                [f"day {i+1}: {summaries.at[index+i, 'output']}" for i in range(random_number_pair[1])])
            
            input_prices = ", ".join(
                [f"day {i+1}: {summaries.at[index-i, 'last_price']}" for i in range(random_number_pair[0])])
            output_prices = ", ".join(
                [f"day {i+1}: {summaries.at[index+i, 'current_price']}" for i in range(random_number_pair[1])])

            # Set the training input and output
            if random_number_pair[1] == 1:
                output_day_text = "the current trading day"
            else:
                output_day_text = f"the current trading day and the next {random_number_pair[1] - 1} trading days"
            
            if random_number_pair[0] == 1:
                input_day_text = "last trading day"
            else:
                input_day_text = f"last {random_number_pair[0]} trading days"
            
            

            summaries.at[index,
                        'summary_input'] = f"The financial summaries for the {input_day_text} are {input_summaries}, and the stock prices of the last {random_number_pair[0]} days are {input_prices}"
            summaries.at[index,
                        'summary_output'] = f"The financial summaries for {output_day_text} are {output_summaries}"
            summaries.at[index,
                        'summary_instruction'] = f"Given the financial summaries and stock prices from the {input_day_text}, generate financial summaries for {output_day_text}"
            
            summaries.at[index, 'numerical_output'] = f"The stock prices for {output_day_text} are {output_prices}"
            summaries.at[index, 'numerical_instruction'] = f"Given the financial summaries and stock prices from the {input_day_text}, generate stock prices for {output_day_text}"
        
        summary_df = summaries[['summary_input', 'summary_output', 'summary_instruction']].rename(
                columns={'summary_input': 'input', 'summary_output': 'output', 'summary_instruction': 'instruction'})
        numerical_df = summaries[['summary_input', 'numerical_output', 'numerical_instruction']].rename(
                columns={'summary_input': 'input', 'numerical_output': 'output', 'numerical_instruction': 'instruction'})
        
        dataframe_summary.append(summary_df)
        dataframe_numerical.append(numerical_df)

completion_df_summary = pd.concat(dataframe_summary)
completion_df_numerical = pd.concat(dataframe_numerical)
print(completion_df_summary.shape)
print(completion_df_numerical.shape)

summary_path = '../parquet_dir/finance_summary.parquet'
numerical_path = '../parquet_dir/finance_numerical.parquet'
completion_df_summary.to_parquet(summary_path, engine='pyarrow')
completion_df_numerical.to_parquet(numerical_path, engine='pyarrow')

(3897, 3)
(3897, 3)


In [3]:
from datasets import load_dataset
from huggingface_hub import HfApi

# Load the dataset
dataset = load_dataset('parquet', data_files=summary_path)
token = os.getenv("HF_TOKEN")
print(token)

# Push the dataset to the Hugging Face Hub
dataset.push_to_hub("Howard881010/financial-dataset-summary", token=token)

dataset = load_dataset('parquet', data_files=numerical_path)
# Push the dataset to the Hugging Face Hub
dataset.push_to_hub("howard881010/financial-dataset-numerical", token=token)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 3897 examples [00:00, 12386.28 examples/s]


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


Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00,  9.44ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.56s/it]
Generating train split: 3897 examples [00:00, 16391.08 examples/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 35.82ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Howard881010/financial-dataset-numerical/commit/fecf3960b073efb8b5d6aea70d2701de8e53c0f8', commit_message='Upload dataset', commit_description='', oid='fecf3960b073efb8b5d6aea70d2701de8e53c0f8', pr_url=None, pr_revision=None, pr_num=None)