Upload data to a Hugging Face dataset

In [1]:
import pandas as pd
import os
import json
from datasets import DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from datasets import load_dataset, DatasetDict

In [2]:
file_path = "/home/ubuntu/multimodal/Dataset/Gas-raw/West_Coast_fact.csv"
raw_price_col = 'Weekly West Coast All Grades All Formulations Retail Gasoline Prices  (Dollars per Gallon)'

df = pd.read_csv(file_path)

# Shifted companion columns. `fut_*` columns hold the next row's value
# (shift(-1)), so they are NaN on the last row; `summary` holds the previous
# row's text (shift(1)), so it is NaN on the first row.
# NOTE(review): `summary` is therefore one row behind `price`/`time_period`
# -- presumably intentional; confirm against how `text` was produced.
df['fut_price'] = df[raw_price_col].shift(-1)
df['fut_summary'] = df['text']
df['summary'] = df['text'].shift(1)
df['time_period'] = df.apply(lambda row: row['start_date'] + " - " + row['end_date'], axis=1)
df['fut_time_period'] = df['time_period'].shift(-1)
df = df.rename(columns={raw_price_col: 'price'})


def _row_to_json(row, period_col, fields):
    """Serialise one row as JSON: the time period first, then each (key, column) pair in order."""
    payload = {"time-period": row[period_col]}
    for key, column in fields:
        payload[key] = row[column]
    return json.dumps(payload)


# (target column, time-period column, [(json key, source column), ...]).
# Order matters: the combined *_price columns must be built before the raw
# price/summary columns are overwritten with their own JSON encoding.
json_columns = [
    ('summary_price', 'time_period', [("summary", 'summary'), ("gas_price", 'price')]),
    ('fut_summary_price', 'fut_time_period', [("summary", 'fut_summary'), ("gas_price", 'fut_price')]),
    ('price', 'time_period', [("gas_price", 'price')]),
    ('fut_price', 'fut_time_period', [("gas_price", 'fut_price')]),
    ('summary', 'time_period', [("summary", 'summary')]),
    ('fut_summary', 'fut_time_period', [("summary", 'fut_summary')]),
]
for target_col, period_col, fields in json_columns:
    df[target_col] = df.apply(lambda row: _row_to_json(row, period_col, fields), axis=1)

df.to_csv('/home/ubuntu/multimodal/Dataset/Gas/West_Coast_fact.csv', index=False)


In [3]:
def split_data(file_path, train_ratio=0.8, validation_ratio=0.1, test_ratio=0.1):
    """Split one CSV into train/validation/test CSV files without shuffling.

    Rows keep their file order: the leading rows form the training split,
    the following rows the validation split and the trailing rows the test
    split. The three parts are written next to the input file as
    ``train_<name>``, ``val_<name>`` and ``test_<name>``.

    Args:
        file_path: Path of the CSV file to split.
        train_ratio: Fraction of rows assigned to the training split.
        validation_ratio: Relative weight of the validation split.
        test_ratio: Relative weight of the test split.

    Raises:
        ValueError: If ``validation_ratio + test_ratio`` is not positive.
    """
    held_out_ratio = validation_ratio + test_ratio
    if held_out_ratio <= 0:
        raise ValueError("validation_ratio + test_ratio must be positive")

    data = pd.read_csv(file_path)

    # Share of the held-out rows that goes to the *test* split. The second
    # train_test_split call takes this as `test_size`; passing the validation
    # share instead would swap the two sizes whenever the ratios differ.
    test_share = test_ratio / held_out_ratio

    # shuffle=False keeps the original row order, so no random_state is needed.
    train_data, temp_data = train_test_split(data, test_size=(1 - train_ratio), shuffle=False)
    validation_data, test_data = train_test_split(temp_data, test_size=test_share, shuffle=False)

    # os.path handles a bare filename (empty directory part) correctly, where
    # manual '/'-joining would produce a path at the filesystem root.
    out_dir, file_name = os.path.split(file_path)
    train_data.to_csv(os.path.join(out_dir, f'train_{file_name}'), index=False)
    validation_data.to_csv(os.path.join(out_dir, f'val_{file_name}'), index=False)
    test_data.to_csv(os.path.join(out_dir, f'test_{file_name}'), index=False)

# Directory containing the CSV files to split
dir = '/home/ubuntu/multimodal/Dataset/Gas/fact_only'

# Split every source file; files produced by an earlier run (train*/test*/val*)
# are skipped so they are not split again.
for filename in os.listdir(dir):
    if filename.startswith(("train", "test", "val")):
        continue
    path = os.path.join(dir, filename)
    split_data(path)

In [4]:
def convert_to_parquet(dataframe_test, dataframe_train, dataframe_val):
    """Concatenate per-split frames, write them to parquet and load a DatasetDict.

    Each argument is a list of DataFrames for one split. The frames of a split
    are concatenated, written to ``../parquet_dir/<split>_finance.parquet``
    and loaded back with ``load_dataset``.

    Args:
        dataframe_test: List of DataFrames forming the test split.
        dataframe_train: List of DataFrames forming the train split.
        dataframe_val: List of DataFrames forming the validation split.

    Returns:
        DatasetDict with the keys ``train``, ``validation`` and ``test``.

    Raises:
        ValueError: From ``pd.concat`` if any of the lists is empty.
    """
    parquet_dir = '../parquet_dir'
    # to_parquet does not create missing parent directories.
    os.makedirs(parquet_dir, exist_ok=True)

    # (DatasetDict key, file-name prefix, frames) in the original key order.
    split_specs = [
        ('train', 'train', dataframe_train),
        ('validation', 'val', dataframe_val),
        ('test', 'test', dataframe_test),
    ]

    splits = {}
    for split_name, file_prefix, frames in split_specs:
        # ignore_index + index=False: concatenating several frames yields a
        # non-Range index, which pandas would otherwise store in the parquet
        # file as an extra `__index_level_0__` column of the dataset.
        combined = pd.concat(frames, ignore_index=True)
        parquet_path = f'{parquet_dir}/{file_prefix}_finance.parquet'
        combined.to_parquet(parquet_path, engine='pyarrow', index=False)
        # A bare data_files path is always exposed under the 'train' split name.
        splits[split_name] = load_dataset('parquet', data_files=parquet_path, split='train')

    return DatasetDict(splits)

In [5]:
def load_from_huggingface(dataset_path, case, units):
    """Load a dataset with ``load_dataset`` and dump each split to a CSV file.

    The train, validation and test splits are written to
    ``../Data/Gas/<units>/<case>/<split>_all.csv``; the directory is created
    when it does not exist yet.

    Args:
        dataset_path: Identifier passed to ``load_dataset``.
        case: Sub-directory name for this dataset variant.
        units: Parent sub-directory name (e.g. the window unit).
    """
    dataset = load_dataset(dataset_path)

    out_dir = f"../Data/Gas/{units}/{case}"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    for split in ('train', 'validation', 'test'):
        # Dataset -> pandas -> CSV, one file per split
        split_frame = dataset[split].to_pandas()
        split_frame.to_csv(f"{out_dir}/{split}_all.csv", index=False)

In [6]:
def combine_window(df, window_size, unit):
    """Build sliding-window examples from per-row JSON input/output strings.

    For every start row ``s`` the result contains one record whose ``input``
    bundles the parsed ``input`` cells of rows ``s .. s+window_size-1`` under
    the keys ``<unit>_1 .. <unit>_<window_size>`` and whose ``output`` bundles
    the parsed ``output`` cells of rows ``s+window_size-1 .. s+2*window_size-2``
    under the keys ``<unit>_<window_size+1> .. <unit>_<2*window_size>``.
    The ``instruction`` is taken from row ``s``.

    Args:
        df: DataFrame with ``input``, ``output`` (JSON strings) and
            ``instruction`` columns.
        window_size: Number of rows bundled on each side.
        unit: Prefix used for the generated keys.

    Returns:
        DataFrame with ``input``, ``output`` and ``instruction`` columns; empty
        when ``df`` has fewer than ``2 * window_size`` rows.
    """
    n_windows = len(df) - 2 * window_size + 1
    records = []

    for start in range(n_windows):
        history = {
            f"{unit}_{step + 1}": json.loads(df['input'].iloc[start + step])
            for step in range(window_size)
        }
        # Output rows start at the last input row of the window.
        first_output_row = start + window_size - 1
        horizon = {
            f"{unit}_{step + window_size + 1}": json.loads(df['output'].iloc[first_output_row + step])
            for step in range(window_size)
        }
        records.append({
            "input": json.dumps(history),
            "output": json.dumps(horizon),
            "instruction": df['instruction'].iloc[start],
        })

    return pd.DataFrame(records)

# Case 3 text + number  => text + number

In [8]:
# Case 3: (summary + gas price) -> (summary + gas price), one fixed window size.
data_dir = '../Dataset/Gas/fact_only'
dataframe_train = []
dataframe_test = []
dataframe_val = []
window_size = 3

# The prompt depends only on window_size, so it is built once, not per file.
# NOTE(review): the prompt text (including the "ouput" typo) is kept verbatim
# because it is part of the published dataset content.
window = "1 week" if window_size == 1 else f"{window_size} weeks"
example_output = json.dumps({
    f"week_{i+1+window_size}": {"time-period": "<time-period>", "summary": "<summary>", "gas_price": "<gas_price>"}
    for i in range(window_size)
})
instruction = f"Given the summary and the gas price for {window}, please predict the next {window}'s gas price and summary in json format. And ouput only the json data, the response should only contain {example_output}"

# sorted() makes the processing order independent of the filesystem.
for filename in sorted(os.listdir(data_dir)):
    if filename.startswith('train'):
        bucket = dataframe_train
    elif filename.startswith('test'):
        bucket = dataframe_test
    elif filename.startswith('val'):
        bucket = dataframe_val
    else:
        # Not a split file (e.g. the unsplit source CSV): skip it entirely so
        # combine_window never runs on a stale `df` from an earlier iteration.
        continue

    summaries = pd.read_csv(os.path.join(data_dir, filename))
    df = (
        summaries[['summary_price', 'fut_summary_price']]
        .rename(columns={'summary_price': 'input', 'fut_summary_price': 'output'})
        .assign(instruction=instruction)
    )
    bucket.append(combine_window(df, window_size, "week"))

dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
token = os.getenv("HF_TOKEN")
# Push the dataset to the Hugging Face Hub
repo_id = f"Howard881010/gas-{window_size}_week-mixed-mixed-fact"
dataset_dict.push_to_hub(repo_id, token=token)

# Load the dataset back from the Hub and export each split to CSV
load_from_huggingface(repo_id, "mixed-mixed-fact", f"{window_size}_week")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/561 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/561 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/290k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1031 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/124 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/125 [00:00<?, ? examples/s]

# Case 4 text => text


In [10]:
# Case 4: summary -> summary (text only), one fixed window size.
data_dir = '../Dataset/Gas/fact_only'
dataframe_train = []
dataframe_test = []
dataframe_val = []
window_size = 3

# The prompt depends only on window_size, so it is built once, not per file.
# NOTE(review): the prompt text (including the "ouput" typo and the missing
# space after the comma) is kept verbatim because it is part of the published
# dataset content.
window = "1 week" if window_size == 1 else f"{window_size} weeks"
example_output = json.dumps({
    f"week_{i+1+window_size}": {"time-period": "<time-period>", "summary": "<summary>"}
    for i in range(window_size)
})
instruction = f"Given the summary and time period for {window},please predict next {window} summary in json format. And ouput only the json data, the response should only contain {example_output}"

# sorted() makes the processing order independent of the filesystem.
for filename in sorted(os.listdir(data_dir)):
    if filename.startswith('train'):
        bucket = dataframe_train
    elif filename.startswith('test'):
        bucket = dataframe_test
    elif filename.startswith('val'):
        bucket = dataframe_val
    else:
        # Not a split file (e.g. the unsplit source CSV): skip it entirely so
        # combine_window never runs on a stale `df` from an earlier iteration.
        continue

    summaries = pd.read_csv(os.path.join(data_dir, filename))
    df = (
        summaries[['summary', 'fut_summary']]
        .rename(columns={'summary': 'input', 'fut_summary': 'output'})
        .assign(instruction=instruction)
    )
    bucket.append(combine_window(df, window_size, "week"))

dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
token = os.getenv("HF_TOKEN")
# Push the dataset to the Hugging Face Hub
repo_id = f"Howard881010/gas-{window_size}_week-text-text-fact"
dataset_dict.push_to_hub(repo_id, token=token)

# Load the dataset back from the Hub and export each split to CSV
load_from_huggingface(repo_id, "text-text-fact", f"{window_size}_week")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/561 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/561 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/270k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1031 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/124 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/125 [00:00<?, ? examples/s]

In [21]:
# Text -> text, pooling every window size from 1 to window_size into one dataset.
data_dir = '../Dataset/Gas/fact_only'
dataframe_train = []
dataframe_test = []
dataframe_val = []
window_size = 3

# sorted() makes the processing order independent of the filesystem.
for filename in sorted(os.listdir(data_dir)):
    if filename.startswith('train'):
        bucket = dataframe_train
    elif filename.startswith('test'):
        bucket = dataframe_test
    elif filename.startswith('val'):
        bucket = dataframe_val
    else:
        # Not a split file (e.g. the unsplit source CSV): skip it entirely so
        # combine_window never runs on a stale `df` from an earlier iteration.
        continue

    # Read each file once; only the prompt and the windowing depend on `size`.
    summaries = pd.read_csv(os.path.join(data_dir, filename))
    pairs = summaries[['summary', 'fut_summary']].rename(columns={'summary': 'input', 'fut_summary': 'output'})

    for size in range(1, window_size + 1):
        window = "1 week" if size == 1 else f"{size} weeks"
        example_output = json.dumps({
            f"week_{i+1+size}": {"time-period": "<time-period>", "summary": "<summary>"}
            for i in range(size)
        })
        # NOTE(review): prompt text (including the "ouput" typo) is kept
        # verbatim because it is part of the published dataset content.
        instruction = f"Given the summary and time period for {window},please predict next {window} summary in json format. And ouput only the json data, the response should only contain {example_output}"

        df = pairs.assign(instruction=instruction)
        bucket.append(combine_window(df, size, "week"))

dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
token = os.getenv("HF_TOKEN")
# Push the dataset to the Hugging Face Hub
dataset_dict.push_to_hub("Howard881010/gas-week-text-text-fact", token=token)

# Load the dataset back from the Hub and export each split to CSV
load_from_huggingface("Howard881010/gas-week-text-text-fact", "text-text-fact", "week")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading readme:   0%|          | 0.00/606 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/670k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3099 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/378 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/381 [00:00<?, ? examples/s]

In [19]:
# (Summary + gas price) -> (summary + gas price), pooling every window size
# from 1 to window_size into one dataset.
data_dir = '../Dataset/Gas/fact_only'
dataframe_train = []
dataframe_test = []
dataframe_val = []
window_size = 3

# sorted() makes the processing order independent of the filesystem.
for filename in sorted(os.listdir(data_dir)):
    if filename.startswith('train'):
        bucket = dataframe_train
    elif filename.startswith('test'):
        bucket = dataframe_test
    elif filename.startswith('val'):
        bucket = dataframe_val
    else:
        # Not a split file (e.g. the unsplit source CSV): skip it entirely so
        # combine_window never runs on a stale `df` from an earlier iteration.
        continue

    # Read each file once; only the prompt and the windowing depend on `size`.
    summaries = pd.read_csv(os.path.join(data_dir, filename))
    pairs = summaries[['summary_price', 'fut_summary_price']].rename(columns={'summary_price': 'input', 'fut_summary_price': 'output'})

    for size in range(1, window_size + 1):
        window = "1 week" if size == 1 else f"{size} weeks"
        example_output = json.dumps({
            f"week_{i+1+size}": {"time-period": "<time-period>", "summary": "<summary>", "gas_price": "<gas_price>"}
            for i in range(size)
        })
        # NOTE(review): prompt text (including the "ouput" typo) is kept
        # verbatim because it is part of the published dataset content.
        instruction = f"Given the summary and the gas price for {window}, please predict the next {window}'s gas price and summary in json format. And ouput only the json data, the response should only contain {example_output}"

        df = pairs.assign(instruction=instruction)
        bucket.append(combine_window(df, size, "week"))

dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
token = os.getenv("HF_TOKEN")
# Push the dataset to the Hugging Face Hub
dataset_dict.push_to_hub("Howard881010/gas-week-mixed-mixed-fact", token=token)

# Load the dataset back from the Hub and export each split to CSV
load_from_huggingface("Howard881010/gas-week-mixed-mixed-fact", "mixed-mixed-fact", "week")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading readme:   0%|          | 0.00/606 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/716k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/78.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/80.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3099 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/378 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/381 [00:00<?, ? examples/s]