Upload data to huggingface dataset

In [1]:
import pandas as pd
import os
import json
import os
from utils import split_data, convert_to_parquet, combine_window, combine_window_multiple_output
import ast
from datetime import datetime


# Handle the numerical data

In [2]:
# Initialize an empty DataFrame to hold the combined data
combined_df = pd.DataFrame()

file_path = "/home/ubuntu/multimodal/Dataset/Climate-raw/numerical_2021-05-04_2023-12-04.csv"
# Path to the directory containing the CSV files
climate_data = pd.read_csv(file_path)
df = climate_data[['name', 'datetime', 'temp', 'longitude']]

result = []
# sort the state by longitude from left to right
latitude_dict = df.set_index('name')['longitude'].to_dict()
sorted_states = sorted(latitude_dict.keys(), key = lambda x: latitude_dict[x])

# Group by 'datetime' and create the dictionary for 'temp' for each group
for date, group in df.groupby('datetime'):
    temp_dict = group.set_index('name')['temp'].to_dict()
    
    sorted_temp = {state: temp_dict[state] for state in sorted_states}
    result.append({'datetime': date, 'temp': sorted_temp})
    # only for california
    # cal_temp = {"California": temp_dict['California']}
    # result.append({'datetime': date, 'temp': cal_temp})
    
    

# Create the final DataFrame
new_df = pd.DataFrame(result)
new_df.to_csv('/home/ubuntu/multimodal/Dataset/Climate-raw/procoess_num.csv', index=False)


In [3]:
def add_day_of_week(date_str):
    date_obj = datetime.strptime(date_str, '%Y-%m-%d')
    day_of_week = date_obj.strftime('%A')
    return f"{date_str} ({day_of_week})"

def collapse_json(data):
    collapsed_string = ""
    data = ast.literal_eval(data)
    for key, values in data.items():
        collapsed_string += f"{key}: " + " ".join(values)
    collapsed_string = collapsed_string.replace("_", " ")
    return collapsed_string.strip()


In [4]:
file_path = "/home/ubuntu/multimodal/Dataset/Climate-raw/summarized_temperature_v1.csv"
num_file_path = "/home/ubuntu/multimodal/Dataset/Climate-raw/temp_cal.csv"

df = pd.read_csv(file_path)   # from 05-01 - 12-01 
df_num = pd.read_csv(num_file_path)  # from 05-04 - 12-04

# so the overall dataset is from 05-04 - 12-01

df = df.iloc[3:]
print(df['Time'].head(5))
print(df_num['datetime'].head(5))

df['temp'] = df_num['temp'].apply(ast.literal_eval).shift(3)
df['summary'] = df['summary'].apply(collapse_json)
df['Time'] = df['Time'].apply(add_day_of_week)
df['summary_temp'] = df.apply(lambda x: json.dumps({"Time": x['Time'], "summary": x['summary'], "temperature": x['temp']}), axis=1)
df['summary'] = df.apply(lambda x: json.dumps({"Time": x['Time'], "summary": x['summary']}), axis=1)

df['fut_summary'] = df['summary'].shift(-1)
df['fut_summary_temp'] = df['summary_temp'].shift(-1)


df_temp = df[['summary', 'fut_summary', 'summary_temp', 'fut_summary_temp']]
df_temp.to_csv('/home/ubuntu/multimodal/Dataset/Climate/cal/temp-only/temp_cal.csv', index=False)


3    2021-05-04
4    2021-05-05
5    2021-05-06
6    2021-05-07
7    2021-05-08
Name: Time, dtype: object
0    2021-05-04
1    2021-05-05
2    2021-05-06
3    2021-05-07
4    2021-05-08
Name: datetime, dtype: object


In [5]:
file_path = '/home/ubuntu/multimodal/Dataset/Climate/cal/temp-only/temp_cal.csv'

# Loop through each file and split the data
split_data(file_path)  # Adjust the directory path as needed

# Case 3 text + number  => text + number

In [11]:
dir = '../Dataset/Climate/cal/temp-only/'
window_sizes = [1, 2, 3]

def create_mixed_mixed(filename, window_size, unit):
    if (filename.startswith('train') or filename.startswith('test') or filename.startswith('val')):
        path = os.path.join(dir, filename)
        summaries = pd.read_csv(path)

        if window_size == 1:
            window = f"{window_size} {unit}"
        else:
            window = f"{window_size} {unit}s"
        example_output = {}
        for i in range(window_size):
            example_output[f"{unit}_{1 + i + window_size}"] = {"Time": "...", "summary": "...", "temprature": "..."}
        example_output = json.dumps(example_output)
        instruction = f"Given the weather summary and the temprature for {window}, please predict the next {window} day's temprature and weather summary within a JSON. The example output is {example_output}"
        
        df = summaries[['summary_temp', 'fut_summary_temp']].rename(columns={'summary_temp': 'input', 'fut_summary_temp': 'output'})
        df['instruction'] = instruction
        df['pred_output'] = "Not available"
        # skip the first and last historical_size days
        return combine_window_multiple_output(df, window_size, "day")

for window_size in window_sizes:
    dataframe_train = []
    dataframe_test = []
    dataframe_val = []
    for filename in os.listdir(dir):
        df = create_mixed_mixed(filename, window_size, "day")
        if filename.startswith('test'):
            dataframe_test.append(df)
        elif filename.startswith('train'):
            dataframe_train.append(df)
        elif filename.startswith('val'):
            dataframe_val.append(df)

    dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
    token = os.getenv("HF_TOKEN")
    # Push the dataset to the Hugging Face Hub
    dataset_dict.push_to_hub(f"Howard881010/climate-{window_size}_day-mixed-mixed-cal-multi", token=token)

    # Load the dataset from Hugging Face Hub
    # load_from_huggingface(f"Howard881010/climate-{window_size}_day-mixed-mixed-cal", "mixed-mixed-cal", f"{window_size}_day", "Climate")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/598 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/599 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/599 [00:00<?, ?B/s]

# Case 4 text => text


In [10]:
dir = '../Dataset/Climate/cal/temp-only/'

window_sizes = [1, 2, 3]

def create_text_text(filename, window_size, unit):
    if (filename.startswith('train') or filename.startswith('test') or filename.startswith('val')):
        path = os.path.join(dir, filename)
        summaries = pd.read_csv(path)

        if window_size == 1:
            window = f"{window_size} {unit}"
        else:
            window = f"{window_size} {unit}s"
        example_output = {}
        for i in range(window_size):
            example_output[f"{unit}_{1 + i + window_size}"] = {"Time": "...", "summary": "..."}
        example_output = json.dumps(example_output)
        instruction = f"Given the weather summary for {window}, please predict the next {window_size} day's weather summary within a JSON. The example output is {example_output}"
        
        df = summaries[['summary', 'fut_summary']].rename(columns={'summary': 'input', 'fut_summary': 'output'})
        df['instruction'] = instruction
        df['pred_output'] = "Not available"
        # skip the first and last historical_size days
        return combine_window_multiple_output(df, window_size, "day")

for window_size in window_sizes:
    dataframe_train = []
    dataframe_test = []
    dataframe_val = []
    for filename in os.listdir(dir):
        df = create_text_text(filename, window_size, "day")
        if filename.startswith('test'):
            dataframe_test.append(df)
        elif filename.startswith('train'):
            dataframe_train.append(df)
        elif filename.startswith('val'):
            dataframe_val.append(df)
            
    dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
    token = os.getenv("HF_TOKEN")
    # Push the dataset to the Hugging Face Hub
    dataset_dict.push_to_hub(f"Howard881010/climate-{window_size}_day-text-text-cal-multi", token=token)

    # Load the dataset from Hugging Face Hub
    # load_from_huggingface(f"Howard881010/climate-{window_size}_day-text-text", "text-text", f"{window_size}_day", "Climate")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/598 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/598 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/599 [00:00<?, ?B/s]

# Put all data together

In [12]:
dir = '../Dataset/Climate/cal/temp-only/'
dataframe_train = []
dataframe_test = []
dataframe_val = []
window_size = 3


for filename in os.listdir(dir):
    for size in range(1, window_size+1):
        df1 = create_text_text(filename, size, "day")
        df2 = create_mixed_mixed(filename, size, "day")
        if filename.startswith('test'):
            dataframe_test.append(df1)
            dataframe_test.append(df2)
        elif filename.startswith('train'):
            dataframe_train.append(df1)
            dataframe_train.append(df2)
        elif filename.startswith('val'):
            dataframe_val.append(df1)
            dataframe_val.append(df2)
        
dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
token = os.getenv("HF_TOKEN")
# Push the dataset to the Hugging Face Hub
dataset_dict.push_to_hub(f"Howard881010/climate-cal-multi", token=token)


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/606 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Howard881010/climate-cal-multi/commit/a060d415b4f7a2dbc5fe139c1e16bcad71608860', commit_message='Upload dataset', commit_description='', oid='a060d415b4f7a2dbc5fe139c1e16bcad71608860', pr_url=None, pr_revision=None, pr_num=None)

# Count the maximum token

In [9]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Replace 'mistral-7b-instruct' with the actual model name if available
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
hf_dataset = f"Howard881010/climate-cal-multi"
dataset = load_dataset(hf_dataset, split="test")
data = pd.DataFrame(dataset)
tokens = []
for idx, row in data.iterrows():
    token = tokenizer.encode(row['input'])
    num_tokens = len(token)
    tokens.append(num_tokens)
# Retrieve the maximum number of tokens

print(f"Maximum number of tokens: {max(tokens)}")


Maximum number of tokens: 1386
