Upload data to a Hugging Face dataset

In [1]:
import pandas as pd
import os
import random
import numpy as np
import json
from datasets import DatasetDict, Dataset, Features, Value
from glob import glob
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from datasets import load_dataset, DatasetDict
from huggingface_hub import HfApi

  from .autonotebook import tqdm as notebook_tqdm


Convert the JSON files to CSV format

In [2]:
# Build one CSV per company from its daily JSON reports.
# Each CSV row pairs day i (input) with day i+1 (output).
dir = '../Financial_Alpha_Vantage/Summary_with_price_v0.2/'
output_dir = '../Dataset/Finance/v0.2'
data_dict = {}

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# Iterate through each company folder in the directory
for company in os.listdir(dir):
    json_dir = os.path.join(dir, company)

    # Stray files (e.g. .DS_Store) are not company folders; skipping them avoids
    # writing an empty, column-less CSV that pd.read_csv cannot parse later.
    if not os.path.isdir(json_dir):
        continue

    # Get list of JSON files, sorted by filename (assuming they are named in chronological order)
    json_files = sorted(glob(os.path.join(json_dir, '*.json')))

    # At least two days are needed to form one (current day -> next day) pair.
    if len(json_files) < 2:
        print(f"Skipping {company}: need at least 2 JSON files, found {len(json_files)}")
        continue

    # Parse every report once instead of re-reading each file as both "current" and "next".
    daily_reports = []
    for json_path in json_files:
        with open(json_path, 'r', encoding='utf-8') as report_file:
            daily_reports.append(json.load(report_file))

    # List to store input and output data
    data = []

    # Walk over consecutive (current day, next day) pairs
    for current_data, next_data in zip(daily_reports, daily_reports[1:]):
        # The model input is the full current-day report; the price-only variant keeps just share_price.
        input_price = {"share_price": current_data.get("share_price", '')}
        # Targets come from the next day's report.
        output_summary_price = {
            "summary": next_data.get("summary", ''),
            "share_price": next_data.get("share_price", ''),
        }
        output_price = {"share_price": next_data.get("share_price", '')}

        # Every field is serialised to a JSON string so it fits in a single CSV cell.
        data.append({
            'input': json.dumps(current_data),
            'input_price': json.dumps(input_price),
            'output_summary_price': json.dumps(output_summary_price),
            'output_price': json.dumps(output_price)
        })

    # Convert data list to DataFrame
    df = pd.DataFrame(data)

    # Save DataFrame to CSV
    output_csv_path = os.path.join(output_dir, f"{company}.csv")
    df.to_csv(output_csv_path, index=False)
    

In [3]:
def split_data(file_path, train_ratio=0.8, validation_ratio=0.1, test_ratio=0.1):
    """Split one CSV into train/validation/test CSVs, preserving row order.

    The rows are not shuffled: the first block of rows becomes the train set,
    the following block the validation set, and the final block the test set.
    The three files are written next to ``file_path`` as ``train_<name>``,
    ``val_<name>`` and ``test_<name>``.

    Args:
        file_path: Path of the CSV file to split.
        train_ratio: Fraction of rows kept for training.
        validation_ratio: Relative weight of the validation set within the
            held-out rows.
        test_ratio: Relative weight of the test set within the held-out rows.

    Raises:
        ZeroDivisionError: If ``validation_ratio + test_ratio`` is zero.
    """
    # Read the CSV file
    data = pd.read_csv(file_path)

    # train_test_split's test_size sizes the SECOND returned part, so it must be
    # the test share of the held-out rows (not the validation share).
    test_share = test_ratio / (test_ratio + validation_ratio)

    # Split the data into train and held-out rows (random_state is irrelevant without shuffling)
    train_data, temp_data = train_test_split(data, test_size=(1 - train_ratio), shuffle=False)

    # Split the held-out rows into validation (first part) and test (second part)
    validation_data, test_data = train_test_split(temp_data, test_size=test_share, shuffle=False)

    # Save the datasets beside the source file. os.path keeps a bare filename
    # relative to the working directory instead of producing '/train_<name>'.
    out_dir, file_name = os.path.split(file_path)
    train_data.to_csv(os.path.join(out_dir, f'train_{file_name}'), index=False)
    validation_data.to_csv(os.path.join(out_dir, f'val_{file_name}'), index=False)
    test_data.to_csv(os.path.join(out_dir, f'test_{file_name}'), index=False)

# Folder that holds the per-company CSV files
dir = '../Dataset/Finance/v0.2'

# File-name prefixes identifying outputs that split_data has already written
split_prefixes = ("train", "test", "val")

# Split every file in the folder that is not itself a split output
for filename in os.listdir(dir):
    if filename.startswith(split_prefixes):
        continue
    path = os.path.join(dir, filename)
    split_data(path)

In [4]:
def convert_to_parquet(dataframe_test, dataframe_train, dataframe_val):
    """Concatenate per-file frames into parquet files and load them as a DatasetDict.

    Note the argument order: test frames first, then train, then validation.

    Args:
        dataframe_test: List of DataFrames making up the test split.
        dataframe_train: List of DataFrames making up the train split.
        dataframe_val: List of DataFrames making up the validation split.

    Returns:
        A ``DatasetDict`` with ``train``, ``validation`` and ``test`` entries,
        each loaded back from the parquet file written for that split.

    Raises:
        ValueError: From ``pd.concat`` if one of the lists is empty.
    """
    parquet_dir = '../parquet_dir'
    # to_parquet does not create missing directories.
    os.makedirs(parquet_dir, exist_ok=True)

    # split name -> (file stem, frames); dict order fixes the DatasetDict key order.
    split_sources = {
        'train': ('train', dataframe_train),
        'validation': ('val', dataframe_val),
        'test': ('test', dataframe_test),
    }

    loaded_splits = {}
    for split_name, (file_stem, frames) in split_sources.items():
        parquet_path = f'{parquet_dir}/{file_stem}_finance.parquet'

        # Concatenating frames keeps each frame's own 0..n index; without
        # ignore_index/index=False that index is written to parquet and shows up
        # as an extra "__index_level_0__" column in the loaded dataset.
        combined = pd.concat(frames, ignore_index=True)
        combined.to_parquet(parquet_path, engine='pyarrow', index=False)

        # A single data file is always exposed by load_dataset under the 'train' split.
        loaded_splits[split_name] = load_dataset('parquet', data_files=parquet_path, split='train')

    dataset_dict = DatasetDict(loaded_splits)

    return dataset_dict

In [5]:
def load_from_huggingface(dataset_path, case):
    """Download a dataset with ``load_dataset`` and save each split as a CSV file.

    Args:
        dataset_path: Dataset identifier passed to ``load_dataset``.
        case: Name of the sub-folder under ``../Data/Finance/1_day/`` that
            receives the CSV files.

    Writes ``train_all.csv``, ``validation_all.csv`` and ``test_all.csv``.
    """
    dataset = load_dataset(dataset_path)

    output_dir = f"../Data/Finance/1_day/{case}"
    # to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    # Export every split, not just train
    for split in ['train', 'validation', 'test']:
        # Convert the split to a Pandas DataFrame
        df = dataset[split].to_pandas()

        # Save the DataFrame to a CSV file
        df.to_csv(f"{output_dir}/{split}_all.csv", index=False)

In [6]:
# Accumulators shared by the case cells below: each case appends its per-split
# frames here so the combined dataset can be built at the end.
train_all, test_all, val_all = [], [], []

# Case 1: number -> number


In [7]:
dir = '../Dataset/Finance/v0.2/'
# Case 1: number -> number
dataframe_train = []
dataframe_test = []
dataframe_val = []

# Single-quoted so the JSON example keeps its double quotes: writing ""share_price""
# inside a double-quoted literal is implicit string concatenation and drops them.
instruction = 'Given the share price for the current day, please predict the shared price in json format for next day. The output should be like {"share_price":  <value>}'

# For each split prefix: this case's own list and the notebook-wide accumulator.
split_targets = {
    'train': (dataframe_train, train_all),
    'test': (dataframe_test, test_all),
    'val': (dataframe_val, val_all),
}

for filename in os.listdir(dir):
    # Only the train_/test_/val_ files written by split_data are used here.
    split = next((prefix for prefix in split_targets if filename.startswith(prefix)), None)
    if split is None:
        continue

    path = os.path.join(dir, filename)
    summaries = pd.read_csv(path)

    # Price-only input and price-only target
    df = summaries[['input_price', 'output_price']].rename(columns={'input_price': 'input', 'output_price': 'output'})
    df['instruction'] = instruction

    # Append inside the guarded path so a frame is never reused from an earlier iteration.
    for collected in split_targets[split]:
        collected.append(df)

# dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)

# token = os.getenv("HF_TOKEN")
# # Push the dataset to the Hugging Face Hub
# dataset_dict.push_to_hub(f"Howard881010/finance-numerical", token=token)


# # Load the dataset from Hugging Face Hub
# load_from_huggingface("Howard881010/finance-numerical", "numerical")

# Case 2: number + text -> number

In [8]:
dir = '../Dataset/Finance/v0.2/'
# Case 2: number + text -> number
dataframe_train = []
dataframe_test = []
dataframe_val = []

# Single-quoted so the JSON example keeps its double quotes: writing ""share_price""
# inside a double-quoted literal is implicit string concatenation and drops them.
instruction = 'Given the financial report and the share price for the current day, please predict the shared price in json format for next day. The output should be like {"share_price":  <value>}'

# For each split prefix: this case's own list and the notebook-wide accumulator.
split_targets = {
    'train': (dataframe_train, train_all),
    'test': (dataframe_test, test_all),
    'val': (dataframe_val, val_all),
}

for filename in os.listdir(dir):
    # Only the train_/test_/val_ files written by split_data are used here.
    split = next((prefix for prefix in split_targets if filename.startswith(prefix)), None)
    if split is None:
        continue

    path = os.path.join(dir, filename)
    summaries = pd.read_csv(path)

    # Full current-day report as input, price-only target
    df = summaries[['input', 'output_price']].rename(columns={'output_price': 'output'})
    df['instruction'] = instruction

    # Append inside the guarded path so a frame is never reused from an earlier iteration.
    for collected in split_targets[split]:
        collected.append(df)

# dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
# token = os.getenv("HF_TOKEN")
# # Push the dataset to the Hugging Face Hub
# dataset_dict.push_to_hub(f"Howard881010/finance-mixed-numerical", token=token)


# # Load the dataset from Hugging Face Hub
# load_from_huggingface("Howard881010/finance-mixed-numerical", "mixed-numerical")

# Case 3: number + text -> number + summary

In [11]:
dir = '../Dataset/Finance/v0.2/'
# Case 3: number + text -> number + summary
dataframe_train = []
dataframe_test = []
dataframe_val = []

# Single-quoted so the JSON example keeps its double quotes: writing ""summary""
# inside a double-quoted literal is implicit string concatenation and drops them.
instruction = 'Given the financial report and share price for the current day, please predict the summary and shared price part in json format for next day. The output should be like {"summary": <value>, "share_price":  <value>}'

# For each split prefix: this case's own list and the notebook-wide accumulator.
split_targets = {
    'train': (dataframe_train, train_all),
    'test': (dataframe_test, test_all),
    'val': (dataframe_val, val_all),
}

for filename in os.listdir(dir):
    # Only the train_/test_/val_ files written by split_data are used here.
    split = next((prefix for prefix in split_targets if filename.startswith(prefix)), None)
    if split is None:
        continue

    path = os.path.join(dir, filename)
    summaries = pd.read_csv(path)

    # Full current-day report as input, next-day summary + price as target
    df = summaries[['input', 'output_summary_price']].rename(columns={'output_summary_price': 'output'})
    df['instruction'] = instruction

    # Append inside the guarded path so a frame is never reused from an earlier iteration.
    for collected in split_targets[split]:
        collected.append(df)

dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
token = os.getenv("HF_TOKEN")
# Push the dataset to the Hugging Face Hub
dataset_dict.push_to_hub("Howard881010/finance-mixed-summary", token=token)

# Load the dataset from Hugging Face Hub
load_from_huggingface("Howard881010/finance-mixed-summary", "mixed-summary")

Generating train split: 128 examples [00:00, 1349.19 examples/s]
Generating train split: 17 examples [00:00, 215.32 examples/s]
Generating train split: 16 examples [00:00, 218.34 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 159.34ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.47s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 861.08ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.71it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 573.93ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.75it/s]
Downloading readme: 100%|██████████| 554/554 [00:00<00:00, 769kB/s]
Downloading data: 100%|██████████| 251k/251k [00:00<00:00, 777kB/s]
Downloading data: 100%|██████████| 55.3k/55.3k [00:00<00:00, 236kB/s]
Downloading data: 100%|██████████| 46.0k/46.0k [00:00<00:00, 191kB/s]
Generating train split: 100%|██████████| 128/128 [00:00<

In [None]:
# Build one DatasetDict from the frames accumulated in the *_all lists
dataset_dict = convert_to_parquet(
    dataframe_test=test_all,
    dataframe_train=train_all,
    dataframe_val=val_all,
)

# Publish the combined dataset to the Hugging Face Hub, reading the token from the environment
token = os.environ.get("HF_TOKEN")
dataset_dict.push_to_hub("Howard881010/finance-all", token=token)

# Fetch it back from the Hub and export the splits under the "all" case
load_from_huggingface("Howard881010/finance-all", "all")

In [13]:
from datasets import load_dataset

# Fetch the climate dataset and show the 'input' field of its first training example
dataset = load_dataset("Howard881010/climate-1_day-mixed-mixed")

first_train_example = dataset['train'][0]
print(first_train_example['input'])


