In [1]:
# Import Python library for working with dataframes
import pandas as pd

# Import Dataset and DatasetDict classes from the datasets library that helps us prepare our own dataset for use in training and evaluating machine learning models
from datasets import Dataset, DatasetDict

# Import Python library that helps us extract certain target patterns from strings with regular expressions
import re

# Fixed random seed. It is passed to every train/dev/test split below so that the same partitions are produced on every run
SEED = 42

In [None]:
# Read the csv file into a dataframe
df = pd.read_csv(filepath_or_buffer='../data/All_adjudicated_ELL_data_1022.csv')

# Peek at the first two rows to check that the file loaded as expected
df.head(n=2)

In [2]:
# Base names of the score columns
cols = ['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions']

# Each base name appears in the data in two versions, suffixed with _1 and _2
cols1 = [f'{name}_1' for name in cols]
cols2 = [f'{name}_2' for name in cols]

In [None]:
# Rough word count per text: split on whitespace, count the pieces, then summarize the distribution
(
    df['Text']
    .str.split()
    .str.len()
    .describe()
)

## Build Dataset

Create a DatasetDict that will hold the dataset partitions. Saving this to disk promotes reproducibility by guaranteeing that different scripts are accessing the same data splits. I find that it also helps to organize our research code.

It is possible to tokenize at this stage, but I prefer to tokenize at the last minute. This affords us the flexibility of changing tokenization schemes, which could be useful if we want to test different pretrained models (that may use different tokenizers).

In [47]:
def build_datadict(df):
    """Turn the raw dataframe into a DatasetDict with train/dev/test partitions.

    Keeps only the 'ID' and 'Text' columns plus the score columns listed in
    the notebook-level `cols1` and `cols2`, lower-cases every kept column
    name, and splits the rows 70% / 15% / 15% using the notebook-level `SEED`.

    Args:
        df: pandas DataFrame containing 'ID', 'Text' and every column named
            in `cols1` and `cols2`.

    Returns:
        DatasetDict with the keys 'train', 'dev' and 'test'.
    """
    # Columns we want to carry over into the dataset
    wanted = ['ID', 'Text', *cols1, *cols2]

    # Lower-case every kept column name (e.g., 'ID' -> 'id', 'Overall_1' -> 'overall_1')
    lowercase_names = {name: name.lower() for name in wanted}
    selected = df[wanted].rename(columns=lowercase_names)

    # Convert the dataframe into a Dataset object, dropping the dataframe index
    full_ds = Dataset.from_pandas(selected, preserve_index=False)

    # First split: 70% train, 30% held out.
    # The shared SEED keeps the split reproducible.
    first_split = full_ds.train_test_split(test_size=0.3, seed=SEED)

    # Second split: halve the held-out 30% into development and test sets (15% each)
    second_split = first_split['test'].train_test_split(test_size=0.5, seed=SEED)

    return DatasetDict({
        'train': first_split['train'],
        'dev': second_split['train'],
        'test': second_split['test'],
    })

In [48]:
# Build the train/dev/test partitions from the loaded dataframe
dd = build_datadict(df)

In [13]:
# Display the DatasetDict to check the features and row counts of each partition
dd

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'overall_1', 'cohesion_1', 'syntax_1', 'vocabulary_1', 'phraseology_1', 'grammar_1', 'conventions_1', 'overall_2', 'cohesion_2', 'syntax_2', 'vocabulary_2', 'phraseology_2', 'grammar_2', 'conventions_2'],
        num_rows: 6216
    })
    dev: Dataset({
        features: ['id', 'text', 'overall_1', 'cohesion_1', 'syntax_1', 'vocabulary_1', 'phraseology_1', 'grammar_1', 'conventions_1', 'overall_2', 'cohesion_2', 'syntax_2', 'vocabulary_2', 'phraseology_2', 'grammar_2', 'conventions_2'],
        num_rows: 1332
    })
    test: Dataset({
        features: ['id', 'text', 'overall_1', 'cohesion_1', 'syntax_1', 'vocabulary_1', 'phraseology_1', 'grammar_1', 'conventions_1', 'overall_2', 'cohesion_2', 'syntax_2', 'vocabulary_2', 'phraseology_2', 'grammar_2', 'conventions_2'],
        num_rows: 1332
    })
})

In [14]:
# Persist the DatasetDict (all three partitions) to disk. We will be using it in the next notebooks.
dd.save_to_disk('../data/raw_ellipse.hf')

Saving the dataset (0/1 shards):   0%|          | 0/6216 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1332 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1332 [00:00<?, ? examples/s]

# Load dataset back in and save to csv

In [2]:
# Reload the DatasetDict from the path it was saved to above
dd = DatasetDict.load_from_disk('../data/raw_ellipse.hf')

In [11]:
# Stack the three partitions into a single dataframe, tagging each row with the partition it came from
partition_names = ['train', 'dev', 'test']
frames = [dd[name].to_pandas() for name in partition_names]

# keys/names put the partition label in an outer index level called 'partition'
stacked = pd.concat(frames, keys=partition_names, names=['partition'])

# Move the partition label into a regular column, then discard the leftover per-partition row index
df = stacked.reset_index(level=0).reset_index(drop=True)

# Write the combined table to csv (the dataframe index is written as the first column)
df.to_csv('../data/ellipse_partitioned.csv')

In [13]:
import zipfile
from io import BytesIO
import os

# Build the zip archive in memory first, then write it to disk in one go
zip_buffer = BytesIO()

# 'w' starts a fresh archive in the empty buffer. Zip64 is left at its default
# (enabled) so the archive is not capped at the classic zip size/entry limits.
with zipfile.ZipFile(zip_buffer, 'w', compression=zipfile.ZIP_DEFLATED) as zip_file:
    # Walk the three needed columns in lockstep; this avoids df.iterrows(),
    # which builds a full Series for every row
    for partition, text_id, text in zip(df['partition'], df['id'], df['text']):
        # 'dev' texts are filed under train/ together with the 'train' texts;
        # everything else goes under test/
        folder = 'train' if partition in ('train', 'dev') else 'test'

        # Member name is e.g., train/{text_id}.txt
        # Zip member names use '/' as the separator regardless of the host OS
        member_name = f"{folder}/{text_id}.txt"

        zip_file.writestr(member_name, text)

# The archive is complete once the ZipFile context has closed; dump the bytes to disk
with open('../data/ellipse_partitioned.zip', 'wb') as f:
    f.write(zip_buffer.getvalue())