In [1]:
import os
import shutil
import datasets
import pandas as pd

from datasets import load_dataset
from tqdm.auto import tqdm

In [2]:
# Enable tqdm's pandas integration (adds progress-bar variants such as `progress_apply`).
# NOTE(review): no `progress_*` call appears in the visible cells — confirm it is used later.
tqdm.pandas()

In [3]:
def download_data(subset, cache_dir='.cache', clear_cache=True):
    """Load one configuration of the 'glue' dataset through `load_dataset`.

    Args:
        subset: Configuration name forwarded to `load_dataset` as its second
            positional argument (the notebook passes e.g. 'sst2', 'qnli').
        cache_dir: Directory forwarded to `load_dataset` as `cache_dir`.
            Defaults to '.cache', the location that used to be hard-coded.
        clear_cache: When True (the default, matching the previous behavior),
            an existing `cache_dir` is deleted before loading, so nothing left
            over from an earlier call is reused.

    Returns:
        The object returned by `load_dataset`; the notebook indexes it by
        split name (e.g. ``result['train']``).
    """
    # NOTE(review): objects returned by earlier calls may still reference files
    # under `cache_dir` -- confirm they are no longer needed before wiping it.
    if clear_cache and os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)

    # Use a separate name instead of rebinding the `subset` argument, so the
    # configuration name stays available while debugging.
    dataset = load_dataset('glue', subset, cache_dir=cache_dir)

    return dataset

In [5]:
# Download the GLUE 'sst2' configuration and export each split to CSV.
sst2 = download_data('sst2')

sst2_train = sst2['train']
sst2_val = sst2['validation']
sst2_test = sst2['test']

# `Dataset.to_pandas()` converts the split in one step instead of having
# pandas iterate over it row by row.
sst2_train_df = sst2_train.to_pandas()
sst2_val_df = sst2_val.to_pandas()
sst2_test_df = sst2_test.to_pandas()

# (label used in the printed summary, file-name suffix, frame)
sst2_exports = (
    ('Train', 'train', sst2_train_df),
    ('Val', 'val', sst2_val_df),
    ('Test', 'test', sst2_test_df),
)

for split_label, _, split_df in sst2_exports:
    print(f"SST2 {split_label} Rows: {split_df.shape[0]} | Columns: {split_df.shape[1]}")

# `to_csv` does not create missing parent directories, so make sure the output
# folder exists; otherwise a run on a fresh checkout fails here.
sst2_out_dir = './datasets/sst2'
os.makedirs(sst2_out_dir, exist_ok=True)

for _, file_suffix, split_df in sst2_exports:
    split_df.to_csv(f'{sst2_out_dir}/sst2_{file_suffix}.csv', index=False)

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

SST2 Train Rows: 67349 | Columns: 3
SST2 Val Rows: 872 | Columns: 3
SST2 Test Rows: 1821 | Columns: 3


In [6]:
# Download the GLUE 'qnli' configuration and export each split to CSV.
qnli = download_data('qnli')

qnli_train = qnli['train']
qnli_val = qnli['validation']
qnli_test = qnli['test']

# `Dataset.to_pandas()` converts the split in one step instead of having
# pandas iterate over it row by row.
qnli_train_df = qnli_train.to_pandas()
qnli_val_df = qnli_val.to_pandas()
qnli_test_df = qnli_test.to_pandas()

# (label used in the printed summary, file-name suffix, frame)
qnli_exports = (
    ('Train', 'train', qnli_train_df),
    ('Val', 'val', qnli_val_df),
    ('Test', 'test', qnli_test_df),
)

for split_label, _, split_df in qnli_exports:
    print(f"QNLI {split_label} Rows: {split_df.shape[0]} | Columns: {split_df.shape[1]}")

# `to_csv` does not create missing parent directories, so make sure the output
# folder exists; otherwise a run on a fresh checkout fails here.
qnli_out_dir = './datasets/qnli'
os.makedirs(qnli_out_dir, exist_ok=True)

for _, file_suffix, split_df in qnli_exports:
    split_df.to_csv(f'{qnli_out_dir}/qnli_{file_suffix}.csv', index=False)

Downloading data:   0%|          | 0.00/10.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/104743 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5463 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5463 [00:00<?, ? examples/s]

QNLI Train Rows: 104743 | Columns: 4
QNLI Val Rows: 5463 | Columns: 4
QNLI Test Rows: 5463 | Columns: 4


In [7]:
# Download the GLUE 'qqp' configuration and export each split to CSV.
qqp = download_data('qqp')

qqp_train = qqp['train']
qqp_val = qqp['validation']
qqp_test = qqp['test']

# `Dataset.to_pandas()` converts the split in one step instead of having
# pandas iterate over it row by row.
qqp_train_df = qqp_train.to_pandas()
qqp_val_df = qqp_val.to_pandas()
qqp_test_df = qqp_test.to_pandas()

# (label used in the printed summary, file-name suffix, frame)
qqp_exports = (
    ('Train', 'train', qqp_train_df),
    ('Val', 'val', qqp_val_df),
    ('Test', 'test', qqp_test_df),
)

for split_label, _, split_df in qqp_exports:
    print(f"QQP {split_label} Rows: {split_df.shape[0]} | Columns: {split_df.shape[1]}")

# `to_csv` does not create missing parent directories, so make sure the output
# folder exists; otherwise a run on a fresh checkout fails here.
qqp_out_dir = './datasets/qqp'
os.makedirs(qqp_out_dir, exist_ok=True)

for _, file_suffix, split_df in qqp_exports:
    split_df.to_csv(f'{qqp_out_dir}/qqp_{file_suffix}.csv', index=False)

Downloading data:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/363846 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/40430 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/390965 [00:00<?, ? examples/s]

QQP Train Rows: 363846 | Columns: 4
QQP Val Rows: 40430 | Columns: 4
QQP Test Rows: 390965 | Columns: 4


In [8]:
# Download the GLUE 'mnli' configuration and export its splits to CSV.
mnli = download_data('mnli')

# Only the *matched* validation/test splits are exported; the mismatched
# splits that are also generated for this configuration are not used here.
mnli_train = mnli['train']
mnli_val = mnli['validation_matched']
mnli_test = mnli['test_matched']

# `Dataset.to_pandas()` converts the split in one step instead of having
# pandas iterate over it row by row.
mnli_train_df = mnli_train.to_pandas()
mnli_val_df = mnli_val.to_pandas()
mnli_test_df = mnli_test.to_pandas()

# (label used in the printed summary, file-name suffix, frame)
mnli_exports = (
    ('Train', 'train', mnli_train_df),
    ('Val', 'val', mnli_val_df),
    ('Test', 'test', mnli_test_df),
)

for split_label, _, split_df in mnli_exports:
    print(f"MNLI {split_label} Rows: {split_df.shape[0]} | Columns: {split_df.shape[1]}")

# `to_csv` does not create missing parent directories, so make sure the output
# folder exists; otherwise a run on a fresh checkout fails here.
mnli_out_dir = './datasets/mnli'
os.makedirs(mnli_out_dir, exist_ok=True)

for _, file_suffix, split_df in mnli_exports:
    split_df.to_csv(f'{mnli_out_dir}/mnli_{file_suffix}.csv', index=False)

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

MNLI Train Rows: 392702 | Columns: 4
MNLI Val Rows: 9815 | Columns: 4
MNLI Test Rows: 9796 | Columns: 4
