In [1]:
%reload_ext autoreload
%autoreload 2
from MultimodalDataset.GasData import GasDataProcessor
from MultimodalDataset.MedicalData import MedicalDataProcessor
from MultimodalDataset.ClimateData import ClimateDataProcessor
from src.utils import load_config, get_max_token_size
import yaml
from datasets import Dataset, DatasetDict, load_dataset
import pandas as pd

# For Climate Dataset

In [6]:
file_path = "config/default/climate.yaml"

# Keep the file's original text so it can be written back verbatim when the
# sweep finishes or is interrupted. Round-tripping through yaml.safe_load /
# yaml.dump does not preserve comments, and yaml.dump sorts keys by default,
# so without the restore below the checked-in config would be left rewritten
# and stuck on whichever window size was processed last.
with open(file_path, 'r') as file:
    original_yaml_text = file.read()
config = yaml.safe_load(original_yaml_text)

try:
    # Sweep matching input/output windows from 1 to 7.
    for i in range(1, 8):
        config['dataset']['input_window'] = i
        config['dataset']['output_window'] = i
        config['dataset']['hf_repo'] = f"Howard881010/climate-{i}day-inContext"

        # load_config() is called with a path, so the per-window settings are
        # written to the file before it is re-read.
        with open(file_path, 'w') as outfile:
            yaml.dump(config, outfile, default_flow_style=False)

        print(f"Saved configuration with input_window={i} and output_window={i}.")
        climate_cfg = load_config(file_path)

        processor = ClimateDataProcessor(climate_cfg['dataset'])
        climate_dataset = processor.get_dataset()
        # Token-size maxima for the text+time -> text+time task (instruction-2)
        # and the text+time -> time task (instruction-4).
        max_length_1 = get_max_token_size(climate_dataset, 'input_text_time', 'output_text_time', 'instruction-2')
        max_length_2 = get_max_token_size(climate_dataset, 'input_text_time', 'output_time', 'instruction-4')

        # processor.push_to_huggingface(climate_dataset)
        print(max_length_1, max_length_2)
finally:
    # Put the config file back exactly as it was found, even on KeyboardInterrupt.
    with open(file_path, 'w') as outfile:
        outfile.write(original_yaml_text)


Saved configuration with input_window=1 and output_window=1.
instruction_1:  Given the weather information of the first 1 day, predict the weather information of the next 1 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_2_date": "YYYY-MM-DD",
    "day_2_weather_forecast": "Weather description"
}
instruction_2:  Given the weather information of the first 1 day, predict the weather information of the next 1 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_2_date": "YYYY-MM-DD",
    "day_2_weather_forecast": "Weather description",
    "day_2_temp": "A Float Number"
}
instruction_3:  Given the weather information of the first 1 day, predict the weather information of the next 1 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_2_date": "YYYY-MM-DD",
    "day_2_weather_forecast": "Weather description"
}
instruction_4:  Given the weather information of

KeyboardInterrupt: 

In [None]:
# Re-publish every climate-{i}day-finetuned dataset in a flat
# (input, output, instruction) layout, stacking the four task variants of each
# split on top of each other.

# (input column, output column, instruction column) for each task variant.
task_columns = [
    ('input_text', 'output_text', 'instruction-1'),            # t2t
    ('input_text_time', 'output_text_time', 'instruction-2'),  # tT2tT
    ('input_text_time', 'output_text', 'instruction-3'),       # tT2t
    ('input_text_time', 'output_time', 'instruction-4'),       # tT2T
]

for i in range(1, 8):
    data = load_dataset(f'Howard881010/climate-{i}day-finetuned')

    # One stacked DataFrame per split.
    data_all = {}
    for split in ['train', 'test', 'valid']:
        data_split = pd.DataFrame(data[split])

        # Select each variant's three columns and give them common names so
        # the variants can be concatenated row-wise.
        task_frames = [
            data_split[[input_col, output_col, instruction_col]].rename(
                columns={input_col: 'input', output_col: 'output', instruction_col: 'instruction'}
            )
            for input_col, output_col, instruction_col in task_columns
        ]

        # ignore_index=True: each variant carries the same 0..n-1 index, so a
        # plain concat would yield duplicate labels that Dataset.from_pandas
        # then publishes as a spurious `__index_level_0__` column.
        data_all[split] = pd.concat(task_frames, ignore_index=True)

    # Convert each stacked DataFrame back to Hugging Face Dataset format,
    # explicitly keeping the pandas index out of the published columns.
    dataset_dict = DatasetDict({
        split: Dataset.from_pandas(frame, preserve_index=False)
        for split, frame in data_all.items()
    })

    # Push the processed dataset to Hugging Face Hub
    dataset_dict.push_to_hub(f'Howard881010/climate-{i}day')


# For Medical Dataset

In [2]:
import yaml

file_path = "config/default/medical.yaml"

# Keep the file's original text so it can be written back verbatim when the
# sweep finishes or is interrupted. Round-tripping through yaml.safe_load /
# yaml.dump does not preserve comments, and yaml.dump sorts keys by default,
# so without the restore below the checked-in config would be left rewritten
# and stuck on whichever window size was processed last.
with open(file_path, 'r') as file:
    original_yaml_text = file.read()
config = yaml.safe_load(original_yaml_text)

try:
    # Sweep matching input/output windows from 1 to 7.
    for i in range(1, 8):
        config['dataset']['input_window'] = i
        config['dataset']['output_window'] = i
        config['dataset']['hf_repo'] = f"Howard881010/medical-{i}day-inContext"

        # load_config() is called with a path, so the per-window settings are
        # written to the file before it is re-read.
        with open(file_path, 'w') as outfile:
            yaml.dump(config, outfile, default_flow_style=False)

        print(f"Saved configuration with input_window={i} and output_window={i}.")
        medical_cfg = load_config(file_path)

        processor = MedicalDataProcessor(medical_cfg['dataset'])
        medical_dataset = processor.get_dataset()
        # Token-size maxima for the text+time -> text+time task (instruction-2)
        # and the text+time -> time task (instruction-4).
        max_length_1 = get_max_token_size(medical_dataset, 'input_text_time', 'output_text_time', 'instruction-2')
        max_length_2 = get_max_token_size(medical_dataset, 'input_text_time', 'output_time', 'instruction-4')
        # Publish this window size's dataset through the processor.
        processor.push_to_huggingface(medical_dataset)
        print(max_length_1, max_length_2)
finally:
    # Put the config file back exactly as it was found, even on KeyboardInterrupt.
    with open(file_path, 'w') as outfile:
        outfile.write(original_yaml_text)

Saved configuration with input_window=1 and output_window=1.
Given the medical information of the first 1 day, predict the medical information of the next 1 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_2_date": "YYYY-MM-DD",
    "day_2_medical_notes": "Medical description"
}
Given the medical information of the first 1 day, predict the medical information of the next 1 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_2_date": "YYYY-MM-DD",
    "day_2_medical_notes": "Medical description",
    "day_2_Heart_Rate": "A Float Number"
}
Given the medical information of the first 1 day, predict the medical information of the next 1 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_2_date": "YYYY-MM-DD",
    "day_2_medical_notes": "Medical description"
}
Given the medical information of the first 1 day, predict the medical information of the next 1 day

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1232 768
Saved configuration with input_window=2 and output_window=2.
Given the medical information of the first 2 day, predict the medical information of the next 2 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_3_date": "YYYY-MM-DD",
    "day_3_medical_notes": "Medical description",
    "day_4_date": "YYYY-MM-DD",
    "day_4_medical_notes": "Medical description"
}
Given the medical information of the first 2 day, predict the medical information of the next 2 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_3_date": "YYYY-MM-DD",
    "day_3_medical_notes": "Medical description",
    "day_3_Heart_Rate": "A Float Number",
    "day_4_date": "YYYY-MM-DD",
    "day_4_medical_notes": "Medical description",
    "day_4_Heart_Rate": "A Float Number"
}
Given the medical information of the first 2 day, predict the medical information of the next 2 day. Output the result strictly in the following JSON 

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

2097 1306
Saved configuration with input_window=3 and output_window=3.
Given the medical information of the first 3 day, predict the medical information of the next 3 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_4_date": "YYYY-MM-DD",
    "day_4_medical_notes": "Medical description",
    "day_5_date": "YYYY-MM-DD",
    "day_5_medical_notes": "Medical description",
    "day_6_date": "YYYY-MM-DD",
    "day_6_medical_notes": "Medical description"
}
Given the medical information of the first 3 day, predict the medical information of the next 3 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_4_date": "YYYY-MM-DD",
    "day_4_medical_notes": "Medical description",
    "day_4_Heart_Rate": "A Float Number",
    "day_5_date": "YYYY-MM-DD",
    "day_5_medical_notes": "Medical description",
    "day_5_Heart_Rate": "A Float Number",
    "day_6_date": "YYYY-MM-DD",
    "day_6_medical_notes": "Medical 

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

2915 1827
Saved configuration with input_window=4 and output_window=4.
Given the medical information of the first 4 day, predict the medical information of the next 4 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_5_date": "YYYY-MM-DD",
    "day_5_medical_notes": "Medical description",
    "day_6_date": "YYYY-MM-DD",
    "day_6_medical_notes": "Medical description",
    "day_7_date": "YYYY-MM-DD",
    "day_7_medical_notes": "Medical description",
    "day_8_date": "YYYY-MM-DD",
    "day_8_medical_notes": "Medical description"
}
Given the medical information of the first 4 day, predict the medical information of the next 4 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_5_date": "YYYY-MM-DD",
    "day_5_medical_notes": "Medical description",
    "day_5_Heart_Rate": "A Float Number",
    "day_6_date": "YYYY-MM-DD",
    "day_6_medical_notes": "Medical description",
    "day_6_Heart_Rate": "A F

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3713 2245
Saved configuration with input_window=5 and output_window=5.
Given the medical information of the first 5 day, predict the medical information of the next 5 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_6_date": "YYYY-MM-DD",
    "day_6_medical_notes": "Medical description",
    "day_7_date": "YYYY-MM-DD",
    "day_7_medical_notes": "Medical description",
    "day_8_date": "YYYY-MM-DD",
    "day_8_medical_notes": "Medical description",
    "day_9_date": "YYYY-MM-DD",
    "day_9_medical_notes": "Medical description",
    "day_10_date": "YYYY-MM-DD",
    "day_10_medical_notes": "Medical description"
}
Given the medical information of the first 5 day, predict the medical information of the next 5 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_6_date": "YYYY-MM-DD",
    "day_6_medical_notes": "Medical description",
    "day_6_Heart_Rate": "A Float Number",
    "day_7_date": "YYYY-MM

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

4605 2733
Saved configuration with input_window=6 and output_window=6.
Given the medical information of the first 6 day, predict the medical information of the next 6 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_7_date": "YYYY-MM-DD",
    "day_7_medical_notes": "Medical description",
    "day_8_date": "YYYY-MM-DD",
    "day_8_medical_notes": "Medical description",
    "day_9_date": "YYYY-MM-DD",
    "day_9_medical_notes": "Medical description",
    "day_10_date": "YYYY-MM-DD",
    "day_10_medical_notes": "Medical description",
    "day_11_date": "YYYY-MM-DD",
    "day_11_medical_notes": "Medical description",
    "day_12_date": "YYYY-MM-DD",
    "day_12_medical_notes": "Medical description"
}
Given the medical information of the first 6 day, predict the medical information of the next 6 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_7_date": "YYYY-MM-DD",
    "day_7_medical_notes": "Medi

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

5495 3137
Saved configuration with input_window=7 and output_window=7.
Given the medical information of the first 7 day, predict the medical information of the next 7 day. Output the result strictly in the following JSON format and no additional text:
{
    "day_8_date": "YYYY-MM-DD",
    "day_8_medical_notes": "Medical description",
    "day_9_date": "YYYY-MM-DD",
    "day_9_medical_notes": "Medical description",
    "day_10_date": "YYYY-MM-DD",
    "day_10_medical_notes": "Medical description",
    "day_11_date": "YYYY-MM-DD",
    "day_11_medical_notes": "Medical description",
    "day_12_date": "YYYY-MM-DD",
    "day_12_medical_notes": "Medical description",
    "day_13_date": "YYYY-MM-DD",
    "day_13_medical_notes": "Medical description",
    "day_14_date": "YYYY-MM-DD",
    "day_14_medical_notes": "Medical description"
}
Given the medical information of the first 7 day, predict the medical information of the next 7 day. Output the result strictly in the following JSON format and 

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

6404 3623


In [9]:
# Re-publish every medical-{i}day-finetuned dataset in a flat
# (input, output, instruction) layout, stacking the four task variants of each
# split on top of each other.

# (input column, output column, instruction column) for each task variant.
task_columns = [
    ('input_text', 'output_text', 'instruction-1'),            # t2t
    ('input_text_time', 'output_text_time', 'instruction-2'),  # tT2tT
    ('input_text_time', 'output_text', 'instruction-3'),       # tT2t
    ('input_text_time', 'output_time', 'instruction-4'),       # tT2T
]

for i in range(1, 8):
    data = load_dataset(f'Howard881010/medical-{i}day-finetuned')

    # One stacked DataFrame per split.
    data_all = {}
    for split in ['train', 'test', 'valid']:
        data_split = pd.DataFrame(data[split])

        # Select each variant's three columns and give them common names so
        # the variants can be concatenated row-wise.
        task_frames = [
            data_split[[input_col, output_col, instruction_col]].rename(
                columns={input_col: 'input', output_col: 'output', instruction_col: 'instruction'}
            )
            for input_col, output_col, instruction_col in task_columns
        ]

        # ignore_index=True: each variant carries the same 0..n-1 index, so a
        # plain concat would yield duplicate labels that Dataset.from_pandas
        # then publishes as a spurious `__index_level_0__` column.
        data_all[split] = pd.concat(task_frames, ignore_index=True)

    # Convert each stacked DataFrame back to Hugging Face Dataset format,
    # explicitly keeping the pandas index out of the published columns.
    dataset_dict = DatasetDict({
        split: Dataset.from_pandas(frame, preserve_index=False)
        for split, frame in data_all.items()
    })

    # Push the processed dataset to Hugging Face Hub
    dataset_dict.push_to_hub(f'Howard881010/medical-{i}day')

Downloading readme:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6042 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/751 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/796 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/710 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5921 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/743 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/779 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/710 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.30M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5812 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/729 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/756 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/711 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.62M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5692 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/713 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/746 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/711 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/22.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5575 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/696 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/734 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/711 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5458 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/678 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/723 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/711 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.78M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.01M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5337 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/670 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/706 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/713 [00:00<?, ?B/s]

In [11]:
import re

# Sample model output: a JSON-like object with one date entry and one forecast
# entry per day. The closing brace is absent, so the text is scanned with a
# regular expression rather than parsed as JSON.
data = '''{
    "day_3_date": "2023-09-25",
    "day_3_weather_forecast": "Rainfall will continue to shift eastward from the Pacific Northwest, with prolonged wet weather in the Mid-Atlantic due to the stalled cold front. Expect wet conditions in the Northeast from Friday to Sunday. Light to moderate precipitation is anticipated across the Midwest, northern Plains, and northern Great Lakes midweek through the weekend, with increasing snowfall at higher elevations. Models show a low pressure system tracking east into the Mid-Atlantic. Periodic heavy rains are expected, particularly in the Mid-Atlantic and New England, with the heaviest rainfall focus shifting north on Saturday. Temperatures will rise by 10 to 15 degrees above normal across the South through the Gulf Coast, while below normal temperatures will extend from the Northwest to the West. Below normal temperatures are also expected in the Midwest into Saturday. The humidity levels will remain high, particularly in the Southeast, influencing higher heat indices. Wind speeds are predicted to be moderate to breezy, with variations due to local storm activity. Overall, expect warm temperatures, increased chances of rain, and high humidity, leading to uncomfortable heat indices in the Southeast.",
    "day_4_date": "2023-09-26",
    "day_4_weather_forecast": "Rainfall will continue to shift northward from the Mid-Atlantic, with prolonged wet weather in the Northeast due to the stalled low pressure system. Expect wet conditions in the Great Lakes from Sunday to Tuesday. Light to moderate precipitation is anticipated across the Ohio Valley, northern Plains, and northern Rockies midweek through the weekend, with increasing snowfall at higher elevations. Models show a low pressure system tracking north into the Great Lakes. Periodic heavy rains are expected, particularly in the Great Lakes and New England, with the heaviest rainfall focus shifting north on Monday. Temperatures will rise by 10 to 15 degrees above normal across the South through the Gulf Coast, while below normal temperatures will extend from the Northwest to the West. Below normal temperatures are also expected in the Midwest into Monday. The humidity levels will remain high, particularly in the Southeast, influencing higher heat indices. Wind speeds are predicted to be moderate to breezy, with variations due to local storm activity. Overall, expect warm temperatures, increased chances of rain, and high humidity, leading to uncomfortable heat indices in the Southeast."'''

text_key_name = 'weather_forecast'

# Group 1 captures a `"day_N_date": "..."` entry and group 2 the next
# `"day_N_<text_key_name>": "..."` entry; DOTALL lets the lazy gap between
# them cross the newline separating the two entries.
day_entry_pattern = re.compile(
    fr'("day_\d+_date":\s*"[^"]+").*?("day_\d+_{text_key_name}":\s*"[^"]+")',
    re.DOTALL,
)
matches = day_entry_pattern.findall(data)

# Glue each (date, forecast) pair back together as one comma-separated string.
combined_result = [", ".join(pair) for pair in matches]

combined_result




['"day_3_date": "2023-09-25", "day_3_weather_forecast": "Rainfall will continue to shift eastward from the Pacific Northwest, with prolonged wet weather in the Mid-Atlantic due to the stalled cold front. Expect wet conditions in the Northeast from Friday to Sunday. Light to moderate precipitation is anticipated across the Midwest, northern Plains, and northern Great Lakes midweek through the weekend, with increasing snowfall at higher elevations. Models show a low pressure system tracking east into the Mid-Atlantic. Periodic heavy rains are expected, particularly in the Mid-Atlantic and New England, with the heaviest rainfall focus shifting north on Saturday. Temperatures will rise by 10 to 15 degrees above normal across the South through the Gulf Coast, while below normal temperatures will extend from the Northwest to the West. Below normal temperatures are also expected in the Midwest into Saturday. The humidity levels will remain high, particularly in the Southeast, influencing high