In [41]:
import pandas as pd
import seaborn as sns
sns.set(rc={'figure.figsize':(11,8)})
import matplotlib.pyplot as plt
from pathlib import Path

# All data files live in a `data` directory that sits next to the current
# working directory (i.e. under the working directory's parent).
DATA = Path.cwd().parent.joinpath('data')
EFCAMDAT = DATA.joinpath('efcamdat_shatz_distro.parquet')

# Load the full corpus once; later cells derive every subset from `df`.
df = pd.read_parquet(EFCAMDAT)

# Downsample

In [42]:
languages = ['Portuguese', 'Mandarin', 'Spanish', 'Russian', 'German', 'Arabic']
df_selected = df[df.l1.isin(languages)]

# Balance the classes by number of texts: every selected L1 is cut down to the
# size of the smallest one. The minimum is computed instead of hard-coding the
# 'Arabic' count, so `.sample(n=...)` cannot fail if another L1 turns out to be
# the smallest. Zero counts are ignored because a categorical `l1` column would
# otherwise report the unselected languages with a count of 0.
# (To balance by learner instead, use df.groupby(['l1']).learner_id.nunique().)
l1_counts = df_selected.l1.value_counts()
num_to_sample = int(l1_counts[l1_counts > 0].min())

# Draw the same number of texts from each L1, reproducibly.
df_sampled = (df_selected
              .groupby(['l1'])
              .sample(n=num_to_sample, random_state=42)
             )
df_sampled

Unnamed: 0,writing_id,learner_id,learner_id_categorical,nationality,l1,cefr,cefr_numeric,level,unit,topic_id_original,...,secondary_topic,topic_to_keep,date,time,grade,wordcount,mtld,text,text_corrected,sample
588741,618611,84027,84027.0,sa,Arabic,A2,2,4,5,29,...,thank,hair,41662,00:21:49.077,95,36,60.480000,"\n\t Dear Ali, I had really great time with y...","Dear Ali, I had really great time with your fa...",alternative
295439,316249,99467,99467.0,sa,Arabic,A2,2,5,7,39,...,went,both topics,41011,18:32:27.440,97,68,80.920000,"\n\t Hi, friend I'm so sorry that I missed y...","Hi, friend I'm so sorry that I missed your wed...",main
47907,476692,37927,37927.0,sa,Arabic,A1,1,1,3,3,...,live,live,41072,16:30:39.113,85,25,43.750000,\n\t Hi Anna! My name's Metan. I'm from Riyad...,Hi Anna! My name's Melan. I'm from Riyadh in S...,main
29443,686258,41934,41934.0,sa,Arabic,A1,1,1,1,1,...,english,both topics,41341,09:53:35.030,80,24,40.320000,\n\t My name is Ahmed. I'm from saudi arabia....,My name is Ahmed. I'm from saudi arabia. I'm l...,main
95253,673691,26968,26968.0,sa,Arabic,A1,1,1,8,8,...,dessert,both topics,41356,12:29:33.260,80,21,61.740000,\n\t Hi This is the menu. Main course: chicke...,Hi This is the menu. Main course: chicken and ...,main
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407898,863294,162730,162730.0,mx,Spanish,A1,1,1,2,2,...,there,one,41450,23:42:12.423,90,23,17.191919,\n\t This is my Office. in my office there ar...,This is my Office. in my office there are a lo...,alternative
254091,447569,174421,174421.0,mx,Spanish,A2,2,4,6,30,...,born,born,41076,02:00:19.997,97,63,38.731991,\n\t I was born on April 7th 1984 in Veracruz...,I was born on April 7th 1984 in Veracruz. I mo...,main
493453,603934,63764,63764.0,mx,Spanish,A1,1,2,3,11,...,peopl,summer,41472,01:34:27.880,90,72,85.383529,"\n\t Hello my friend, in my country there are...","bello my friend, in my country there are a lot...",alternative
63941,929001,27027,27027.0,mx,Spanish,A1,1,1,4,4,...,live,live,41549,23:18:25.247,90,31,38.440000,\n\t Enrique is my collegues and he live in S...,Enrique is my colleagues and he live in San Lu...,main


# Store unused data for Domain Adaptation

In [47]:
# Everything in `df` whose index label was not drawn into `df_sampled`.
# NOTE(review): exclusion is per row, not per learner, so this pool may contain
# other texts by learners who end up in dev/test - confirm that this is
# acceptable for the domain-adaptation use.
unused_index = df.index.difference(df_sampled.index)
unused_df = df.loc[unused_index]

In [48]:
# Persist the rows that were left out of the sampled set.
excluded_path = DATA / 'efcamdat_excluded_from_split.parquet'
unused_df.to_parquet(excluded_path)

# Split sampled data

In [102]:
from sklearn.model_selection import GroupShuffleSplit 

def _group_split_once(dataframe, group_var, test_size, random_state):
    """Split `dataframe` once into two frames via a single GroupShuffleSplit draw."""
    splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    # The grouping column is cast to int, so it must hold integer-convertible values.
    groups = dataframe[group_var].astype(int)
    first_inds, second_inds = next(splitter.split(dataframe, groups=groups))
    return dataframe.iloc[first_inds], dataframe.iloc[second_inds]


def split_by_group(dataframe, group_var, test_size=.2, random_state=42, threeway=True):
    """Split a dataframe into train/(dev/)test using GroupShuffleSplit on `group_var`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to split.
    group_var : str
        Name of the column used as the grouping key (cast to int).
    test_size : float, default .2
        Forwarded to ``GroupShuffleSplit`` for the first split; the held-out
        part is what gets divided into dev and test when ``threeway`` is True.
    random_state : int, default 42
        Seed used for the first split and, if ``threeway``, for the second.
    threeway : bool, default True
        If True, the held-out part is split again with ``test_size=.5``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, dev, test)`` when ``threeway`` is True, otherwise
        ``(train, dev_test)``.
    """
    train, dev_test = _group_split_once(dataframe, group_var, test_size, random_state)
    if not threeway:
        return train, dev_test

    # Halve the held-out part, again by group, reusing the same seed.
    dev, test = _group_split_once(dev_test, group_var, .5, random_state)
    return train, dev, test

# Three-way split of the balanced sample, grouped on the learner id column.
train, dev, test = split_by_group(df_sampled, group_var='learner_id')

In [104]:
# Write each split to its own parquet file under DATA.
split_files = (
    (train, 'efcamdat_train.parquet'),
    (dev, 'efcamdat_dev.parquet'),
    (test, 'efcamdat_test.parquet'),
)
for split_df, filename in split_files:
    split_df.to_parquet(DATA / filename)

In [109]:
# Leakage check: True if any learner_id appears in more than one split.
# All three pairs are compared - checking only train vs. test would miss a
# learner shared between dev and either of the other two splits.
split_pairs = [(train, dev), (train, test), (dev, test)]
any(
    left.learner_id.isin(right.learner_id).any()
    for left, right in split_pairs
)

False

# Dataset Dict

In [110]:
from datasets import Dataset, DatasetDict


In [114]:
def make_dataset(dataframe):
    """Convert `dataframe` into a `Dataset` holding only the modelling columns.

    The frame is narrowed to the columns listed in `keep`, its index is
    replaced by a fresh default one (the old index is dropped), and the
    result is passed to `Dataset.from_pandas`.
    """
    keep = ['writing_id', 'learner_id', 'l1', 'cefr', 'grade', 'text']
    trimmed = dataframe[keep].reset_index(drop=True)
    return Dataset.from_pandas(trimmed)

# Bundle the three splits into one DatasetDict, keyed by split name.
named_splits = (('train', train), ('dev', dev), ('test', test))
datadict = DatasetDict(
    {split_name: make_dataset(split_df) for split_name, split_df in named_splits}
)

In [115]:
# Display the DatasetDict summary (features and row count of each split).
datadict

DatasetDict({
    train: Dataset({
        features: ['writing_id', 'learner_id', 'l1', 'cefr', 'grade', 'text'],
        num_rows: 140469
    })
    dev: Dataset({
        features: ['writing_id', 'learner_id', 'l1', 'cefr', 'grade', 'text'],
        num_rows: 17668
    })
    test: Dataset({
        features: ['writing_id', 'learner_id', 'l1', 'cefr', 'grade', 'text'],
        num_rows: 17711
    })
})