In [3]:
from datasets import Dataset, DatasetDict
import pandas as pd
from icecream import ic
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


# Load Df

In [4]:
# Toy dataset for the walkthrough: ten integer "text" values (1..10) paired with
# boolean labels (4 True, 6 False).
df = pd.DataFrame(
    {
        "text": list(range(1, 11)),
        "label": [True, False, True, False, True, False, True, False, False, False],
    }
)
df

Unnamed: 0,text,label
0,1,True
1,2,False
2,3,True
3,4,False
4,5,True
5,6,False
6,7,True
7,8,False
8,9,False
9,10,False


# Train, Test Split

In [5]:
# Hold out 30% of the rows as a test set; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the test split displayed below contains only False labels — consider
# stratify=df['label'] if class balance between the splits matters.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [6]:
# Inspect the training split; rows keep their original index labels, in shuffled order.
train_df

Unnamed: 0,text,label
0,1,True
7,8,False
2,3,True
9,10,False
4,5,True
3,4,False
6,7,True


In [7]:
# Inspect the held-out test split.
test_df

Unnamed: 0,text,label
8,9,False
1,2,False
5,6,False


# Turning to Huggingface Dataset

In [8]:
# Convert the training DataFrame to a Hugging Face Dataset.
# preserve_index=False discards the (shuffled) pandas index during conversion, so no
# "__index_level_0__" column is created and no follow-up remove_columns() call is needed.
# This also keeps the cell working when the frame has a default or a named index,
# where removing a hard-coded "__index_level_0__" column would raise.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 7
})

In [9]:
# Convert the test DataFrame to a Hugging Face Dataset.
# preserve_index=False discards the (shuffled) pandas index during conversion, so no
# "__index_level_0__" column is created and no follow-up remove_columns() call is needed.
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
test_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 3
})

# Turning to Huggingface DatasetDict

In [10]:
# Bundle the train and test datasets into one DatasetDict keyed by split name
dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 7
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3
    })
})

In [11]:
# Debug-print the DatasetDict with icecream. ic() also returns its argument, which is why the
# same structure appears a second time as the cell's own output.
ic(dataset_dict)

ic| dataset_dict: DatasetDict({
                      train: Dataset({
                          features: ['text', 'label'],
                          num_rows: 7
                      })
                      test: Dataset({
                          features: ['text', 'label'],
                          num_rows: 3
                      })
                  })


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 7
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3
    })
})

In [12]:
# Peek at one training example: integer-indexing a split returns that row as a dict.
ic(dataset_dict["train"][3])

ic| dataset_dict["train"][3]: {'label': False, 'text': 10}


{'text': 10, 'label': False}

# 함수정리 (Wrapping the steps into a function)

In [16]:
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split

def df_to_hg_dataset_dict(df, test_size=0.3, random_state=42):
    '''
    Convert a pandas DataFrame into a Hugging Face DatasetDict for SFT
    (Supervised Fine-Tuning).

    The frame is split into train and test parts with scikit-learn's
    train_test_split, and each part is converted to a Dataset.

    Args:
        df: Source DataFrame. Expected columns are "text" and "label"; this is
            a convention of the notebook and is not validated here.
        test_size: Forwarded to train_test_split; defaults to 0.3 (30% test).
        random_state: Forwarded to train_test_split so the split is
            repeatable; defaults to 42.

    Returns:
        DatasetDict with the keys 'train' and 'test'.
    '''
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # preserve_index=False drops the pandas index during conversion. After the
    # shuffle the index is no longer a plain range, and it would otherwise be
    # materialised as an extra column (e.g. "__index_level_0__") that then has
    # to be removed by its hard-coded name.
    return DatasetDict({
        'train': Dataset.from_pandas(train_df, preserve_index=False),
        'test': Dataset.from_pandas(test_df, preserve_index=False),
    })

In [17]:
# Run the helper on the toy DataFrame and display the resulting train/test DatasetDict.
datadict = df_to_hg_dataset_dict(df)
datadict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 7
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3
    })
})

In [15]:
# Spot-check one training row from the helper's output.
# NOTE(review): this cell's execution count (15) is lower than the cells above it (16, 17), so the
# notebook was executed out of order — confirm it still works with Restart Kernel -> Run All.
ic(datadict["train"][3])

ic| datadict["train"][3]: {'label': False, 'text': 10}


{'text': 10, 'label': False}