In [1]:
import pandas as pd
import seaborn as sns
sns.set_theme(font='Liberation Serif',
              rc={'figure.figsize': (7.5,3.75),
                  'font.size': 11,
                 })
import matplotlib.pyplot as plt
from pathlib import Path
df = pd.read_parquet('../data/efcamdat_shatz_distro.parquet')

# Downsample

In [58]:
languages = ['Portuguese', 'Mandarin', 'Spanish', 'Russian', 'German', 'Arabic']
num_to_sample = df.l1.value_counts()['Arabic'] # by text

# downsample other languages to number of observations for Arabic
df_sampled = (df[df.l1.isin(languages)]
              .groupby(['l1'])
              .sample(n=num_to_sample, random_state=42)
             )
df_sampled.head(2)

Unnamed: 0,writing_id,learner_id,learner_id_categorical,nationality,l1,cefr,cefr_numeric,level,unit,topic_id_original,...,secondary_topic,topic_to_keep,date,time,grade,wordcount,mtld,text,text_corrected,sample
588741,618611,84027,84027.0,sa,Arabic,A2,2,4,5,29,...,thank,hair,41662,00:21:49.077,95,36,60.48,"\n\t Dear Ali, I had really great time with y...","Dear Ali, I had really great time with your fa...",alternative
295439,316249,99467,99467.0,sa,Arabic,A2,2,5,7,39,...,went,both topics,41011,18:32:27.440,97,68,80.92,"\n\t Hi, friend I'm so sorry that I missed y...","Hi, friend I'm so sorry that I missed your wed...",main


# Store unused data for Domain Adaptation

In [4]:
unused_df = df.loc[df.index.difference(df_sampled.index)]

In [5]:
unused_df.to_parquet('../data/efcamdat_excluded_from_split.parquet')

# Split sampled data

In [6]:
from sklearn.model_selection import GroupShuffleSplit 

def split_by_group(dataframe, group_var, test_size=.2, random_state=42, threeway=True):
    splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    split = splitter.split(dataframe, groups=dataframe[group_var].astype(int))
    A_inds, B_inds = next(split)
    train = dataframe.iloc[A_inds]
    dev_test = dataframe.iloc[B_inds]
    if not threeway:
        return train, dev_test
    else:
        splitter = GroupShuffleSplit(n_splits=1, test_size=.5, random_state=random_state)
        split = splitter.split(dev_test, groups=dev_test[group_var].astype(int))
        A_inds, B_inds = next(split)
        dev = dev_test.iloc[A_inds]
        test = dev_test.iloc[B_inds]
        return train, dev, test

train, dev, test = split_by_group(df_sampled, 'learner_id')

In [7]:
any(train.learner_id.isin(test.learner_id))

False

# Dataset Dict Single Texts

In [None]:
from datasets import Dataset, DatasetDict, ClassLabel

def make_dataset(dataframe):
    dataframe = dataframe[['writing_id',
                           'learner_id',
                           'l1',
                           'cefr',
                           'grade',
                           'text',
                          ]]

    return (Dataset
            .from_pandas(dataframe.reset_index(drop=True))
            .class_encode_column('l1')
            .rename_column('l1', 'labels')
           )

datadict = DatasetDict(
    {
        'train': make_dataset(train),
        'dev': make_dataset(dev),
        'test': make_dataset(test),
    }
)

Casting to class labels:   0%|          | 0/141 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/15 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/18 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/18 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

In [16]:
datadict.save_to_disk('../data/efcamdat_dataset')

# Dataset Dict Concatenated Texts

In [34]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', use_fast=True)

tokenize = lambda text: len(tokenizer(text)['token_ids'])

def concat_texts(dataframe):
    dataframe['tok_count'] = dataframe['text'].apply(tokenize)


In [52]:
grouped = test.groupby(['cefr', 'learner_id'])['text'].agg(lambda x: '\n'.join(x.str.strip()))

In [54]:
tokenize = lambda text: tokenizer(text, return_length=True)['length']
tok_counts = grouped.apply(tokenize)

Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


In [55]:
tok_counts

cefr  learner_id
A1    20              35
      88              49
      118             60
      183             40
      255             68
                    ... 
C1    161478         192
      161597         785
      173136        1089
      173210         865
      173848         403
Name: text, Length: 7014, dtype: int64

In [37]:
train['text'].transform(tokenize)

588741    2
295439    2
47907     2
29443     2
95253     2
         ..
457324    2
407898    2
254091    2
493453    2
63941     2
Name: text, Length: 140469, dtype: int64

In [25]:
test['text'].str.split().str.len()

567502     50
106593     40
638882     94
236285     96
691168    104
         ... 
616134     57
414827     44
427060     43
683520    116
266538     73
Name: text, Length: 17711, dtype: int64