Upload data to a Hugging Face dataset

In [18]:
import pandas as pd
import os
import json
import os
from utils import split_data, convert_to_parquet, combine_window
import ast
from datetime import datetime


# Handle the numerical data

In [3]:
def build_daily_records(frame, states):
    """Build one record per date with per-state temperature and precipitation.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the 'name', 'datetime', 'temp' and 'precip' columns.
    states : list of str
        State names to keep, in the order they should appear in each dict.

    Returns
    -------
    list of dict
        One ``{'datetime': date, 'temp': {state: value}, 'precip': {state: value}}``
        entry per distinct 'datetime' value.

    Raises
    ------
    KeyError
        If a requested state has no row for one of the dates.
    """
    records = []
    for date, group in frame.groupby('datetime'):
        by_state = group.set_index('name')
        temp_by_state = by_state['temp'].to_dict()
        precip_by_state = by_state['precip'].to_dict()
        records.append({
            'datetime': date,
            'temp': {state: temp_by_state[state] for state in states},
            'precip': {state: precip_by_state[state] for state in states},
        })
    return records


# Raw per-state daily observations.
file_path = "/home/ubuntu/multimodal/Dataset/Climate-raw/numerical_2021-05-04_2023-12-04.csv"
climate_data = pd.read_csv(file_path)
df = climate_data[['name', 'datetime', 'temp', 'precip', 'longitude']]

# State names ordered by ascending longitude (left to right on a map).
longitude_by_state = df.set_index('name')['longitude'].to_dict()
sorted_states = sorted(longitude_by_state, key=longitude_by_state.get)

# Only California is exported here. Passing `sorted_states` instead yields the
# all-states variant (presumably how the temp_precip.csv read by a later cell
# was produced -- TODO confirm).
result = build_daily_records(df, ["California"])

# Create the final DataFrame
new_df = pd.DataFrame(result)
new_df.to_csv('/home/ubuntu/multimodal/Dataset/Climate-raw/temp_precip_cal.csv', index=False)


In [22]:
def add_day_of_week(date_str):
    """Return ``date_str`` followed by its weekday name in parentheses.

    ``date_str`` must be formatted as ``YYYY-MM-DD``; anything else makes
    ``datetime.strptime`` raise ``ValueError``.
    """
    weekday_name = datetime.strptime(date_str, '%Y-%m-%d').strftime('%A')
    return f"{date_str} ({weekday_name})"

def collapse_json(data):
    """Flatten a stringified dict of text snippets into one plain string.

    ``data`` is expected to be the string form of a Python dict literal that
    maps a section name to a list of strings. Each entry is rendered as
    ``"key: v1 v2 ..."``, entries are separated by a single space, and
    underscores are replaced by spaces.

    Parameters
    ----------
    data : str or missing value
        The literal to collapse. Missing values (NaN/None) yield ``""``.

    Returns
    -------
    str
        The collapsed text, or ``""`` when the input is missing, cannot be
        parsed, or does not evaluate to a dict.
    """
    if pd.isna(data):
        return ""

    try:
        data = ast.literal_eval(data)
    except (ValueError, SyntaxError):
        print(f"Error parsing JSON string: {data}")
        return ""

    # A valid literal that is not a mapping (e.g. a list) has no key/value
    # pairs to collapse; treat it like any other unparsable input.
    if not isinstance(data, dict):
        print(f"Error parsing JSON string: {data}")
        return ""

    parts = []
    for key, values in data.items():
        # A bare scalar (including a plain string) is a single snippet; joining
        # a string directly would insert a space between every character.
        if not isinstance(values, (list, tuple)):
            values = [values]
        parts.append(f"{key}: " + " ".join(str(value) for value in values))

    # Separate entries with a space so one entry's last snippet does not run
    # straight into the next entry's key.
    collapsed_string = " ".join(parts).replace("_", " ")
    return collapsed_string.strip()


In [25]:
file_path = "/home/ubuntu/multimodal/Dataset/Climate-raw/summarized_temp_precip_v1.csv"
num_file_path = "/home/ubuntu/multimodal/Dataset/Climate-raw/temp_precip.csv"

df = pd.read_csv(file_path)   # from 05-01 - 12-01 
df_num = pd.read_csv(num_file_path)  # from 05-04 - 12-04

# Trim both frames to the overlapping date range, then reset the indexes.
# Column assignment below aligns on index labels, so without the reset the
# summaries (labels 3..N-1) would be paired with numeric rows three positions
# later and the final three rows would end up as NaN.
df = df.iloc[3:].reset_index(drop=True)
df_num = df_num.iloc[:-3].reset_index(drop=True)
assert len(df) == len(df_num), (
    f"summary rows ({len(df)}) and numeric rows ({len(df_num)}) do not line up"
)

# The numeric columns are stored as stringified dicts; parse them back.
df['temp'] = df_num['temp'].apply(ast.literal_eval)
df['precip'] = df_num['precip'].apply(ast.literal_eval)
df['summary'] = df['summary'].apply(collapse_json)
df['Time'] = df['Time'].apply(add_day_of_week)

# Build the combined payload first: it needs the plain-text summary, which the
# next statement replaces with its JSON-wrapped form.
df['summary_temp_precip'] = df.apply(lambda x: json.dumps({"Time": x['Time'], "summary": x['summary'], "temperature": x['temp'], "precipitation": x['precip']}), axis=1)
df['summary'] = df.apply(lambda x: json.dumps({"Time": x['Time'], "summary": x['summary']}), axis=1)

# Targets are the following row's values; the last row has no successor and
# therefore gets NaN.
df['fut_summary'] = df['summary'].shift(-1)
df['fut_summary_temp_precip'] = df['summary_temp_precip'].shift(-1)


df_temp = df[['summary', 'fut_summary', 'summary_temp_precip', 'fut_summary_temp_precip']]
df_temp.to_csv('/home/ubuntu/multimodal/Dataset/Climate/temp_precip.csv', index=False)

3    {'Hawaii': 77.4, 'Alaska': 46.8, 'Oregon': 51....
4    {'Hawaii': 78.0, 'Alaska': 46.5, 'Oregon': 50....
5    {'Hawaii': 77.5, 'Alaska': 45.8, 'Oregon': 57....
6    {'Hawaii': 77.9, 'Alaska': 45.9, 'Oregon': 58....
7    {'Hawaii': 77.8, 'Alaska': 46.4, 'Oregon': 60....
Name: temp, dtype: object


In [26]:
# Combined summary / numeric CSV written by the previous cell.
file_path = '/home/ubuntu/multimodal/Dataset/Climate/temp_precip.csv'

# Split this single CSV with utils.split_data. Presumably it writes the
# train*/val*/test* files that the later cells read -- TODO confirm in utils.
split_data(file_path)  # takes the CSV file path, not a directory

# Case 3: text + number => text + number

In [33]:
dir = '../Dataset/Climate'
window_sizes = [1, 2, 3]

def create_mixed_mixed(filename, window_size, unit):
    """Build the "text + number => text + number" examples for one split file.

    Parameters
    ----------
    filename : str
        File name inside ``dir``. Only names starting with 'train', 'test' or
        'val' are processed.
    window_size : int
        Number of historical steps described in the instruction and passed to
        ``combine_window``.
    unit : str
        Name of one time step (e.g. "day"); used in the instruction text, in
        the example-output key and in the ``combine_window`` call.

    Returns
    -------
    The result of ``combine_window`` for a split file, or ``None`` for any
    other file name.
    """
    # Anything that is not a train/test/val split is ignored.
    if not filename.startswith(('train', 'test', 'val')):
        return None

    summaries = pd.read_csv(os.path.join(dir, filename))

    # Pluralise the unit for windows longer than one step.
    window = f"{window_size} {unit}" if window_size == 1 else f"{window_size} {unit}s"

    # The example keys must match the keys stored in 'summary_temp_precip'
    # ("temperature", "precipitation") so the prompt describes the real output.
    example_output = json.dumps({
        f"{unit}_{1 + window_size}": {"Time": "...", "summary": "...", "temperature": "...", "precipitation": "..."}
    })
    instruction = (
        f"Given the weather summary, temperature, and precipitation of 50 states for {window}, "
        f"please predict the next 1 {unit}'s temperature, precipitation, and weather summary within a JSON. "
        f"The example output is {example_output}"
    )

    df = summaries[['summary_temp_precip', 'fut_summary_temp_precip']].rename(columns={'summary_temp_precip': 'input', 'fut_summary_temp_precip': 'output'})
    df['instruction'] = instruction
    df['pred_output'] = "Not available"
    # Windowing is delegated to utils.combine_window; pass the caller's unit
    # through rather than a fixed "day".
    return combine_window(df, window_size, unit)

for window_size in window_sizes:
    # Fresh per-split collections for this window size.
    dataframe_train, dataframe_test, dataframe_val = [], [], []
    buckets = {'test': dataframe_test, 'train': dataframe_train, 'val': dataframe_val}

    for filename in os.listdir(dir):
        df = create_mixed_mixed(filename, window_size, "day")
        # Route the frame to the list whose prefix matches the file name;
        # files matching no prefix are dropped.
        for prefix, bucket in buckets.items():
            if filename.startswith(prefix):
                bucket.append(df)
                break

    dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
    token = os.getenv("HF_TOKEN")
    # Publish one Hub repository per window size.
    repo_id = f"Howard881010/climate-{window_size}_day-mixed-mixed-precip"
    dataset_dict.push_to_hub(repo_id, token=token)

    # Load the dataset from Hugging Face Hub
    # load_from_huggingface(f"Howard881010/climate-{window_size}_day-mixed-mixed-cal", "mixed-mixed-cal", f"{window_size}_day", "Climate")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/599 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/599 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/602 [00:00<?, ?B/s]

# Case 4: text => text


In [31]:
dir = '../Dataset/Climate'

window_sizes = [1, 2, 3]

def create_text_text(filename, window_size, unit):
    """Build the "text => text" examples for one split file.

    Parameters
    ----------
    filename : str
        File name inside ``dir``. Only names starting with 'train', 'test' or
        'val' are processed.
    window_size : int
        Number of historical steps described in the instruction and passed to
        ``combine_window``.
    unit : str
        Name of one time step (e.g. "day"); used in the instruction text, in
        the example-output key and in the ``combine_window`` call.

    Returns
    -------
    The result of ``combine_window`` for a split file, or ``None`` for any
    other file name.
    """
    # Anything that is not a train/test/val split is ignored.
    if not filename.startswith(('train', 'test', 'val')):
        return None

    summaries = pd.read_csv(os.path.join(dir, filename))

    # Pluralise the unit for windows longer than one step.
    window = f"{window_size} {unit}" if window_size == 1 else f"{window_size} {unit}s"

    example_output = json.dumps({f"{unit}_{1+window_size}": {"Time": "...", "summary": "..."}})
    instruction = f"Given the weather summary for {window}, please predict the next 1 {unit}'s weather summary within a JSON. The example output is {example_output}"

    df = summaries[['summary', 'fut_summary']].rename(columns={'summary': 'input', 'fut_summary': 'output'})
    df['instruction'] = instruction
    df['pred_output'] = "Not available"
    # Windowing is delegated to utils.combine_window; pass the caller's unit
    # through rather than a fixed "day".
    return combine_window(df, window_size, unit)

for window_size in window_sizes:
    # Fresh per-split collections for this window size.
    dataframe_train, dataframe_test, dataframe_val = [], [], []
    split_targets = (
        ('test', dataframe_test),
        ('train', dataframe_train),
        ('val', dataframe_val),
    )

    for filename in os.listdir(dir):
        df = create_text_text(filename, window_size, "day")
        # First matching prefix decides the split; unmatched files are dropped.
        matching = [target for prefix, target in split_targets if filename.startswith(prefix)]
        if matching:
            matching[0].append(df)

    dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
    token = os.getenv("HF_TOKEN")
    # Publish one Hub repository per window size.
    repo_id = f"Howard881010/climate-{window_size}_day-text-text-precip"
    dataset_dict.push_to_hub(repo_id, token=token)

    # Load the dataset from Hugging Face Hub
    # load_from_huggingface(f"Howard881010/climate-{window_size}_day-text-text", "text-text", f"{window_size}_day", "Climate")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

# Put all data together

In [34]:
dir = '../Dataset/Climate'
window_size = 3
dataframe_train, dataframe_test, dataframe_val = [], [], []

# Prefix of the split file name -> list collecting that split's frames.
split_buckets = (
    ('test', dataframe_test),
    ('train', dataframe_train),
    ('val', dataframe_val),
)

for filename in os.listdir(dir):
    target = next(
        (bucket for prefix, bucket in split_buckets if filename.startswith(prefix)),
        None,
    )
    for size in range(1, window_size + 1):
        # Both task variants are generated for every window size up to `window_size`.
        df1 = create_text_text(filename, size, "day")
        df2 = create_mixed_mixed(filename, size, "day")
        if target is not None:
            target.extend([df1, df2])

dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
token = os.getenv("HF_TOKEN")
# Publish everything as a single Hub dataset.
dataset_dict.push_to_hub("Howard881010/climate-precip", token=token)


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/607 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Howard881010/climate-precip/commit/fed5be6872ede6a505d31a77ecf57eb167237335', commit_message='Upload dataset', commit_description='', oid='fed5be6872ede6a505d31a77ecf57eb167237335', pr_url=None, pr_revision=None, pr_num=None)