# Import Library

In [2]:
# Import library untuk load dataset dari Hugging Face

from pathlib import Path

import datasets
import pandas as pd
from datasets import get_dataset_config_names
from datasets import load_dataset

# Load Dataset XL-Sum dari Platform Hugging Face

In [4]:
# XL-Sum is a multilingual dataset; this project only uses its Indonesian
# configuration. The three official splits are fetched in the order
# train -> test -> validation.
train_dataset, test_dataset, val_dataset = (
    load_dataset("csebuetnlp/xlsum", name="indonesian", split=split_name)
    for split_name in ("train", "test", "validation")
)

# Show the features and row count of every split.
for split_label, split_data in (
    ("TRAIN SET", train_dataset),
    ("Test SET", test_dataset),
    ("VALIDATION SET", val_dataset),
):
    print(f"{split_label}: {split_data} \n")

TRAIN SET: Dataset({
    features: ['id', 'url', 'title', 'summary', 'text'],
    num_rows: 38242
}) 

Test SET: Dataset({
    features: ['id', 'url', 'title', 'summary', 'text'],
    num_rows: 4780
}) 

VALIDATION SET: Dataset({
    features: ['id', 'url', 'title', 'summary', 'text'],
    num_rows: 4780
}) 



In [7]:
# Materialise each split as a pandas DataFrame for further processing.
train_df, test_df, val_df = (
    pd.DataFrame(split_data)
    for split_data in (train_dataset, test_dataset, val_dataset)
)

# Row counts per split and overall for the Indonesian XL-Sum data.
split_sizes = {
    "Train": len(train_df),
    "Test": len(test_df),
    "Validation": len(val_df),
}
total = sum(split_sizes.values())

print(f"Total Data XLSUM Bahasa Indonesia: {total}")
for split_label, row_count in split_sizes.items():
    print(f"{split_label} Data: {row_count}")

Total Data XLSUM Bahasa Indonesia: 47802
Train Data: 38242
Test Data: 4780
Validation Data: 4780


# Clean Dataset
##### Setelah tahap load dataset, dataset XL-Sum selanjutnya melalui tahap cleaning data (cek null values, cek duplikasi, dan hapus kolom yang tidak digunakan).

## Cek Null Values

In [8]:
# Count missing values per column in every split.
for split_label, frame in (
    ("Train", train_df),
    ("Test", test_df),
    ("Validation", val_df),
):
    null_counts = frame.isna().sum()
    print(f"{split_label} dataset null values:")
    print(f"{null_counts}\n")

Train dataset null values:
id         0
url        0
title      0
summary    0
text       0
dtype: int64

Test dataset null values:
id         0
url        0
title      0
summary    0
text       0
dtype: int64

Validation dataset null values:
id         0
url        0
title      0
summary    0
text       0
dtype: int64



## Cek Duplikasi data

In [10]:
# Count fully duplicated rows (all columns equal) in every split.
for split_label, frame in (
    ("Train", train_df),
    ("Test", test_df),
    ("Validation", val_df),
):
    duplicate_count = frame.duplicated().sum()
    print(f"{split_label} dataset duplicate values: {duplicate_count}\n")

Train dataset duplicate values: 0

Test dataset duplicate values: 0

Validation dataset duplicate values: 0



## Hapus kolom yang tidak digunakan
##### Dataset pada tugas peringkas teks otomatis hanya membutuhkan dua kolom, yaitu "text" dan "summary"; kolom selain itu akan dihapus.

In [None]:
# Summarization only needs the article body ("text") and its reference
# summary ("summary"); every other column is dropped.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
unused_columns = ['id', 'url', 'title']
train_df = train_df.drop(columns=unused_columns, errors="ignore")
test_df = test_df.drop(columns=unused_columns, errors="ignore")
val_df = val_df.drop(columns=unused_columns, errors="ignore")

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly instead of being interpolated into an f-string
# (which printed a stray "None" after every report).
print("DETAIL TRAIN DATASET:")
train_df.info()
print()

print("DETAIL TEST DATASET:")
test_df.info()
print()

print("DETAIL VALIDATION DATASET:")
val_df.info()
print()

DETAIL TRAIN DATASET:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38242 entries, 0 to 38241
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   summary  38242 non-null  object
 1   text     38242 non-null  object
dtypes: object(2)
memory usage: 597.7+ KB
None

DETAIL TEST DATASET:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4780 entries, 0 to 4779
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   summary  4780 non-null   object
 1   text     4780 non-null   object
dtypes: object(2)
memory usage: 74.8+ KB
None

DETAIL VALIDATION DATASET:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4780 entries, 0 to 4779
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   summary  4780 non-null   object
 1   text     4780 non-null   object
dtypes: object(2)
memory usage: 74.8+ KB
None



## Simpan Dataset

##### Agar evaluasi model semakin robust, seluruh split dataset (train, test, dan validation) digabung dan disimpan dalam format .csv. File ini selanjutnya digunakan pada tahap K-fold Cross Validation.

In [35]:
# Merge the three splits into a single DataFrame with a fresh 0..n-1 index.
combined_df = pd.concat([train_df, test_df, val_df], ignore_index=True)

# Write the merged data to CSV. to_csv() does not create missing
# directories, so the target folder is created first; otherwise the cell
# fails with OSError on a fresh checkout where "output/" does not exist.
output_path = Path("output/xlsum_indonesia.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
combined_df.to_csv(output_path, index=False)