In [1]:
# Import Python library for working with dataframes
import pandas as pd

# Import Dataset and DatasetDict classes from the datasets library that helps us prepare our own dataset for use in training and evaluating machine learning models
from datasets import Dataset, DatasetDict

# Import Python library that helps us extract certain target patterns from strings with regular expressions
import re

# Setting a seed helps us replicate results across multiple runs.
# SEED is passed to both train_test_split calls in build_datadict, so the train/dev/test split is the same on every run.
SEED = 42

In [2]:
# Load the csv file. The path is relative, so it resolves against the directory the notebook is run from.
# Per the preview below, each row holds one essay (ID, Text) plus two raters' names and scores
# in parallel `*_1` / `*_2` columns.
df = pd.read_csv('../data/All_adjudicated_ELL_data_1022.csv')

# Allows us to take a quick look at the first two rows of the loaded dataframe
df.head(2)

Unnamed: 0,Filename,ID,Text,Rater_1,Overall_1,Cohesion_1,Syntax_1,Vocabulary_1,Phraseology_1,Grammar_1,...,Identifying_Info_1,Rater_2,Overall_2,Cohesion_2,Syntax_2,Vocabulary_2,Phraseology_2,Grammar_2,Conventions_2,Identifying_Info_2
0,2021000071.txt,2021000071,"To the Principal,\r\n\r\nI think that policy 1...",hannah-page,3,3,4,4,4,3,...,0,alorapruitt,3,4,3,3,3,3,3,0
1,2021000501.txt,2021000501,"Dear, TEACHER_NAME\r\n\r\nI think phone policy...",hannah-page,3,3,2,3,3,3,...,0,alorapruitt,3,4,3,4,3,4,3,0


In [3]:
# Base names of the seven score columns. Each one appears twice in the dataframe:
# with a `_1` suffix for the first rater and a `_2` suffix for the second rater.
cols = [
    'Overall',
    'Cohesion',
    'Syntax',
    'Vocabulary',
    'Phraseology',
    'Grammar',
    'Conventions',
]
cols1 = [f"{name}_1" for name in cols]
cols2 = [f"{name}_2" for name in cols]

## Partitions by Specific Raters

In [4]:
def get_per_rater(rater_name, data=None, score_cols=None):
    """Collect every row that `rater_name` scored, with that rater's scores in unsuffixed columns.

    For each rater slot (1 and 2), the rows whose `Rater_<slot>` value matches
    `rater_name` are selected, and the rater's `<score>_<slot>` values are copied
    into new `<score>` columns (e.g. `Overall_1` -> `Overall`). The rows from
    slot 1 come first in the result, followed by the rows from slot 2; the
    original index labels and all original columns are kept.

    Parameters
    ----------
    rater_name : str
        Rater to look up. It is passed to `Series.str.match`, so it is treated
        as a regular expression anchored at the start of the rater name.
    data : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level `df`.
    score_cols : list of str, optional
        Base names of the score columns. Defaults to the notebook-level `cols`.

    Returns
    -------
    pandas.DataFrame
        The matching rows with one extra column per score name. A row in which
        the rater matches both slots appears twice, once per slot.
    """
    source = df if data is None else data
    names = cols if score_cols is None else score_cols

    parts = []
    for slot in ("1", "2"):
        # na=False: a missing rater name counts as "no match". Without it the mask
        # contains NaN and boolean indexing raises instead of skipping the row.
        is_rater = source[f"Rater_{slot}"].str.match(rater_name, na=False)
        rows = source[is_rater]
        own_scores = {name: rows[f"{name}_{slot}"] for name in names}
        parts.append(rows.assign(**own_scores))
    return pd.concat(parts)

In [5]:
# Rows scored by each of these two raters (in either rater slot), with that rater's own scores
# copied into unsuffixed columns. Their indexes are later removed from the general training pool.
train_ap = get_per_rater("alorapruitt")
train_bb = get_per_rater("brittnybyrom")

In [6]:
def get_per_rater_pair(name1, name2, data=None, score_cols=None):
    """Select the rows scored by the pair (`name1`, `name2`) and orient them consistently.

    A row is kept when both `Rater_1` and `Rater_2` match one of the two names.
    Rows stored in reversed order (`name2` in slot 1, `name1` in slot 2) have
    their `Rater_*` labels and their `<score>_1` / `<score>_2` values swapped,
    so that in the returned frame slot 1 always belongs to `name1` and slot 2
    to `name2`. Rows already in that order are returned unchanged.

    Parameters
    ----------
    name1, name2 : str
        The two raters. Each is passed to `Series.str.match`, so it is treated
        as a regular expression anchored at the start of the rater name.
    data : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level `df`.
    score_cols : list of str, optional
        Base names of the score columns to swap. Defaults to the notebook-level `cols`.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows (original index labels kept); the input
        frame is not modified.

    NOTE(review): only the rater labels and the score columns are swapped. Any
    other per-rater column (the data preview shows `Identifying_Info_1` /
    `Identifying_Info_2`) is left in its original slot - confirm that is acceptable.
    """
    source = df if data is None else data
    names = cols if score_cols is None else score_cols

    # na=False: rows with a missing rater name are treated as non-matching
    # instead of putting NaN into the boolean mask.
    either = "|".join([name1, name2])
    in_pair = (
        source["Rater_1"].str.match(either, na=False)
        & source["Rater_2"].str.match(either, na=False)
    )
    out_df = source[in_pair].copy()

    # Only rows where name2 sits in slot 1 and name1 in slot 2 need swapping.
    reversed_rows = (
        out_df["Rater_1"].str.match(name2, na=False)
        & out_df["Rater_2"].str.match(name1, na=False)
    )

    slot_pairs = [("Rater_1", "Rater_2")] + [(f"{name}_1", f"{name}_2") for name in names]
    for first, second in slot_pairs:
        first_vals, second_vals = out_df[first], out_df[second]
        # Compute both replacements before assigning either, so the second
        # swap still reads the untouched original values.
        new_first = first_vals.mask(reversed_rows, second_vals)
        new_second = second_vals.mask(reversed_rows, first_vals)
        out_df[first] = new_first
        out_df[second] = new_second
    return out_df

# Rows in which both rater slots are filled by 'jbarton8' and 'sulynnn'.
# In the cells shown here only `test.index` is used afterwards, to keep these rows out of `train1`.
test = get_per_rater_pair('jbarton8', 'sulynnn')

In [37]:
# Every row that is not in a rater-specific partition (train_ap, train_bb) or in the rater-pair partition (test).
train_idx = set(df.index) - set(train_ap.index) - set(train_bb.index) - set(test.index)
# Select with a boolean mask instead of df.loc[list(train_idx)]: a set has no defined iteration order,
# so listing it would leave the row order of train1 to an implementation detail. The mask keeps df's order.
train1 = df.loc[df.index.isin(train_idx)]
# NOTE(review): train1 is not referenced again in the cells shown; build_datadict is later called on the full df - confirm intended.

## Build Dataset

Create a DatasetDict that will hold the dataset partitions. Saving this to disk promotes reproducibility by guaranteeing that different scripts are accessing the same data splits. I find that it also helps to organize our research code.

It is possible to tokenize at this stage, but I prefer to tokenize at the last minute. This affords us the flexibility of changing tokenization schemes, which could be useful if we want to test different pretrained models (that may use different tokenizers).

In [47]:
def build_datadict(df):
    """Turn the ratings dataframe into a DatasetDict with train/dev/test splits.

    Keeps the 'ID' and 'Text' columns plus every score column listed in
    `cols1` and `cols2`, lower-cases those column names, and converts the
    result to a `Dataset` with `preserve_index=False`.

    The rows are split 70% / 15% / 15% into 'train', 'dev' and 'test': 30% is
    held out first, and that held-out part is then cut in half. Both cuts are
    seeded with `SEED`, so repeated runs produce the same partitions.
    """
    keep = ['ID', 'Text'] + cols1 + cols2
    lowered = {name: name.lower() for name in keep}
    full_ds = Dataset.from_pandas(
        df[keep].rename(columns=lowered),
        preserve_index=False,
    )

    # First cut: 70% for training, 30% held out.
    outer = full_ds.train_test_split(test_size=0.3, seed=SEED)
    # Second cut: halve the held-out part into development and test sets.
    inner = outer['test'].train_test_split(test_size=0.5, seed=SEED)

    return DatasetDict({
        'train': outer['train'],
        'dev': inner['train'],
        'test': inner['test'],
    })

In [48]:
# Build the train/dev/test splits.
# NOTE(review): this passes the full `df`, not the rater-filtered `train1` built earlier - confirm that is intended.
dd = build_datadict(df)

In [13]:
# Display the DatasetDict: one entry per split with its feature names and row count.
dd

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'overall_1', 'cohesion_1', 'syntax_1', 'vocabulary_1', 'phraseology_1', 'grammar_1', 'conventions_1', 'overall_2', 'cohesion_2', 'syntax_2', 'vocabulary_2', 'phraseology_2', 'grammar_2', 'conventions_2'],
        num_rows: 6216
    })
    dev: Dataset({
        features: ['id', 'text', 'overall_1', 'cohesion_1', 'syntax_1', 'vocabulary_1', 'phraseology_1', 'grammar_1', 'conventions_1', 'overall_2', 'cohesion_2', 'syntax_2', 'vocabulary_2', 'phraseology_2', 'grammar_2', 'conventions_2'],
        num_rows: 1332
    })
    test: Dataset({
        features: ['id', 'text', 'overall_1', 'cohesion_1', 'syntax_1', 'vocabulary_1', 'phraseology_1', 'grammar_1', 'conventions_1', 'overall_2', 'cohesion_2', 'syntax_2', 'vocabulary_2', 'phraseology_2', 'grammar_2', 'conventions_2'],
        num_rows: 1332
    })
})

In [14]:
# Save the DatasetDict object. We will be using it in the next notebooks.
# The path is relative, so it resolves against the directory the notebook is run from.
dd.save_to_disk('../data/raw_ellipse.hf')

Saving the dataset (0/1 shards):   0%|          | 0/6216 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1332 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1332 [00:00<?, ? examples/s]