In [15]:
from datasets import load_dataset

# Load and create the dataset

In [16]:
# Directory containing the four raw CSV files read by this notebook.
raw_data_dir = "rawData"

# One CSV path per data source (India/US x bank/school, going by the file names).
path_to_raw_data_in_bank = f"{raw_data_dir}/in_bank.csv"
path_to_raw_data_in_school = f"{raw_data_dir}/in_school.csv"
path_to_raw_data_us_bank = f"{raw_data_dir}/us_bank.csv"
path_to_raw_data_us_school = f"{raw_data_dir}/us_school.csv"

In [17]:
def _load_csv(csv_path):
    """Load a single CSV file with the ``csv`` builder of ``datasets``.

    Returns whatever ``load_dataset`` returns for a single ``data_files``
    path (for the IN bank file this notebook prints a ``DatasetDict`` with
    one ``train`` split).
    """
    return load_dataset("csv", data_files=csv_path)


# Load the four sources in the same order as their path constants.
ds_in_bank = _load_csv(path_to_raw_data_in_bank)
ds_in_school = _load_csv(path_to_raw_data_in_school)
ds_us_bank = _load_csv(path_to_raw_data_us_bank)
ds_us_school = _load_csv(path_to_raw_data_us_school)

In [18]:
# Print the summary of the IN bank dataset (split names, features, row count).
print(ds_in_bank)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
})


In [19]:
# Peek at the first row of the raw IN bank data to check the text/label columns.
print(ds_in_bank["train"][0])

{'text': '05-06-2025 Dear Arjun Krishnan Nair, Account XX4521 debited INR 18750.00 on 05-06-2025 09:15:22 IST by ACH-DR-TP ACH', 'label': 'IN_Bank'}


In [20]:
from datasets import DatasetDict

In [23]:
def split_train_validation_test(dataset, holdout_fraction=0.2, seed=42):
    """Split one ``Dataset`` into train / validation / test partitions.

    The rows are shuffled and ``holdout_fraction`` of them is held out; the
    held-out part is then shuffled and split again with ``test_size=0.5``
    to produce the validation and test partitions.

    Args:
        dataset: The ``Dataset`` to split (here, the ``train`` split of a
            loaded CSV).
        holdout_fraction: Fraction of rows taken out of the training
            partition and shared between validation and test.
        seed: Seed passed to both ``train_test_split`` calls so the split
            is reproducible.

    Returns:
        A ``DatasetDict`` with the keys ``train``, ``validation`` and ``test``.
    """
    train_holdout = dataset.train_test_split(
        test_size=holdout_fraction, shuffle=True, seed=seed
    )
    validation_test = train_holdout['test'].train_test_split(
        test_size=0.5, shuffle=True, seed=seed
    )
    return DatasetDict({
        'train': train_holdout['train'],
        # The "train" side of the second split is used as the validation set.
        'validation': validation_test['train'],
        'test': validation_test['test'],
    })


# Split every source separately so that each one contributes to all three
# partitions before the sources are merged.
ds_in_bank_split = split_train_validation_test(ds_in_bank['train'])
ds_in_school_split = split_train_validation_test(ds_in_school['train'])
ds_us_bank_split = split_train_validation_test(ds_us_bank['train'])
ds_us_school_split = split_train_validation_test(ds_us_school['train'])


In [22]:
# Show the first row of the IN bank train partition produced by the split.
print(ds_in_bank_split["train"][0])

{'text': '05-12-2024 Dear Customer, INR 25000.00 credited to A/c XX3695 on 05-12-24 at 14:20:45 IST. Ref: NEFT/SALARY/EMP789456', 'label': 'IN_Bank'}


# Combining datasets

In [24]:
from datasets import concatenate_datasets

In [None]:
# Per-source DatasetDicts, listed in the order their rows are concatenated.
source_splits = [
    ds_in_bank_split,
    ds_in_school_split,
    ds_us_bank_split,
    ds_us_school_split,
]


def _merge_and_shuffle(split_name):
    """Concatenate one partition across all sources, then shuffle it (seed 42)."""
    merged = concatenate_datasets([source[split_name] for source in source_splits])
    return merged.shuffle(seed=42)


ds_combined_train = _merge_and_shuffle("train")
ds_combined_validate = _merge_and_shuffle("validation")
ds_combined_test = _merge_and_shuffle("test")

# Bundle the three merged partitions under the usual split names.
ds_combined = DatasetDict({
    "train": ds_combined_train,
    "validation": ds_combined_validate,
    "test": ds_combined_test,
})

In [26]:
# Persist the combined train/validation/test DatasetDict to ./data/mail_dataset.
ds_combined.save_to_disk("./data/mail_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 335/335 [00:00<00:00, 53651.99 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 42/42 [00:00<00:00, 10497.01 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 43/43 [00:00<00:00, 17704.43 examples/s]


# Load dataset from disk

In [27]:
from datasets import load_from_disk
# Reload the dataset that was written to ./data/mail_dataset.
ds = load_from_disk("./data/mail_dataset")
# Bare expression: displays the DatasetDict summary as the cell output.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 335
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 42
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 43
    })
})

In [28]:
# Inspect the column types of the train split before the label column is converted.
print(ds["train"].features)

{'text': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None)}


# Converting the string labels to ClassLabel ids

In [29]:
# Class names in id order: the position of each name in the list is its integer id.
# The identifier spelling `lable_names` is kept because a later cell references it.
lable_names = ["IN_Bank", "IN_School", "US_Bank", "US_School"]

# Name -> id lookup derived from the list positions (IN_Bank=0 ... US_School=3).
label2IdMap = {name: idx for idx, name in enumerate(lable_names)}

In [30]:
from datasets import ClassLabel

# Convert the "label" column to a ClassLabel feature built from the class names,
# then align the resulting integer ids with label2IdMap.
label_feature = ClassLabel(names=lable_names)
ds = (
    ds.cast_column("label", label_feature)
      .align_labels_with_mapping(label2IdMap, "label")
)

Casting the dataset: 100%|██████████| 335/335 [00:00<00:00, 50888.84 examples/s]
Casting the dataset: 100%|██████████| 42/42 [00:00<00:00, 20854.83 examples/s]
Casting the dataset: 100%|██████████| 43/43 [00:00<00:00, 12434.85 examples/s]
Aligning the labels: 100%|██████████| 335/335 [00:00<00:00, 94874.53 examples/s]
Aligning the labels: 100%|██████████| 42/42 [00:00<00:00, 24973.17 examples/s]
Aligning the labels: 100%|██████████| 43/43 [00:00<00:00, 24699.41 examples/s]


In [31]:
# Check the column types again after the label column has been cast.
print(ds["train"].features)

{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['IN_Bank', 'IN_School', 'US_Bank', 'US_School'], id=None)}


In [32]:
# Save the label-encoded dataset to its own directory, separate from ./data/mail_dataset.
ds.save_to_disk("./data/mail_dataset_labeled")

Saving the dataset (1/1 shards): 100%|██████████| 335/335 [00:00<00:00, 81748.42 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 42/42 [00:00<00:00, 17031.88 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 43/43 [00:00<00:00, 17695.75 examples/s]


In [33]:
# Reload the labeled copy from disk and display the train split's features
# to check what was stored.
ds_labeled = load_from_disk("./data/mail_dataset_labeled")
ds_labeled["train"].features


{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['IN_Bank', 'IN_School', 'US_Bank', 'US_School'], id=None)}