This notebook works for loading and reshaping the Eigsti and Nadig datasets.

In [82]:
# LOADING THE REQUIRED PACKAGES
import os
os.system('pip install pylangacq')
import pylangacq 
import pandas as pd
from sklearn.model_selection import train_test_split

Defaulting to user installation because normal site-packages is not writeable


In [52]:
# WRITING A FUNCTION FOR LOADING DATA
# This function loads each file in the directory individually, and turns it into a dataframe. Then it creates columns for age,
# id, and group. Finally, it binds the individual dataframes together.

## Accessing single files
def dataload(datapath):
    df = pd.DataFrame()

    for subject in os.listdir(datapath):
        #print(subject)
        pylang_obj = pylangacq.read_chat(path = datapath, match = subject)
        d = pd.DataFrame(pylang_obj.utterances())
        d["age"] = pylang_obj.ages(months=True)[0]
        d["id"] = pylang_obj.headers()[0]['PID']
        d["group"] = pylang_obj.headers()[0]['Participants']['CHI']['group']
        df = pd.concat([df, d])

    return(df)

In [53]:
# RUNNING THE DATALOAD FUNCTION
eigstig = dataload(os.path.join("/work", "exam", "ASD_classification", "data", "eigstig"))

In [54]:
# CREATING A COLUMN WHERE EACH UTTERANCE IS A STRING
# Accessing the word-keys in the nested dicts in the tokens column and appending them to a string in a new tokens column

words = ""
tokens2 = []

for row in eigstig['tokens']:
    for list in row:
        #print(list['word'])
        words += list['word'] + " "
    tokens2.append(words)
    words = ""

eigstig['tokens2'] = tokens2

In [55]:
# CLEANING THE DF
# Dropping unnecessary columns
eigstig = eigstig.drop(columns=['tokens'])
eigstig = eigstig.drop(columns=['tiers'])
eigstig = eigstig.drop(columns=['time_marks'])

In [56]:
# We can read from the description of the datasets on talkbank.org - and see it here - that Eigstig annotated the typically
# developing children with TD and Nadig used TYP. For consistency, we will recode all variables that are grouped TYP to TD.
# Eigstig also has a group called DD (for developmental delay). These children have developmental delay, but not ASD. This group
# will be filtered out.
print(eigstig['group'].unique())

# Recoding TYP to TD
eigstig = eigstig.replace('TYP','TD') # xxx not necessary in the eigstig data

# Dropping the rows from the DD group
eigstig = eigstig[eigstig.group != 'DD']

# Checking the variables
print(eigstig['group'].unique())

['TD' 'DD' 'ASD']
['TD' 'ASD']


In [57]:
eigstig

# Possible issue: In the Nadig data, there are a lot of rows/utterances that consist only of a punctuation, etc. xxx

Unnamed: 0,participant,age,id,group,tokens2
0,INV1,45.633333,11312/a-00032743-1,TD,we've got sort of a bumblebee theme here becau...
1,INV1,45.633333,11312/a-00032743-1,TD,.
2,INV1,45.633333,11312/a-00032743-1,TD,mmmm .
3,INV1,45.633333,11312/a-00032743-1,TD,hm .
4,INV1,45.633333,11312/a-00032743-1,TD,and you know what ?
...,...,...,...,...,...
380,INV1,48.400000,11312/a-00032761-1,ASD,"yeah , where's the hospital ?"
381,CHI,48.400000,11312/a-00032761-1,ASD,"I don't know , it's two blocks here ."
382,INV1,48.400000,11312/a-00032761-1,ASD,is that the hospital right there ?
383,CHI,48.400000,11312/a-00032761-1,ASD,and the vinɤs and clean is up the air .


In [58]:
# SAVING THE DF AS A CSV FILE
#eigstig.to_csv('df_eigstig.csv', index = True) # Not necessary to save untill cleaning is done

In [59]:
print(eigstig['participant'].unique())

# Dropping the rows that are not participant == CHI
eigstig = eigstig[eigstig.participant == 'CHI']

# Dropping the column participant (since this is always CHI now)
eigstig = eigstig.drop(columns=['participant'])

# Dummy coding a diagnosis column
eigstig['diagnosis'] = eigstig['group'].replace("TD", 0)
eigstig['diagnosis'] = eigstig['diagnosis'].replace("ASD", 1)

['INV1' 'INV2' 'MOT' 'CHI' 'FAT' 'INV' 'MOM']


In [60]:
def preprocess_tokens(df):
    """This function removes weird and redundant characters and spaces
    """

    # Remove punctuations and numbers
    df['tokens2'] = df['tokens2'].str.replace('[^a-zA-Z]', ' ', regex=True)

    # Single character removal
    df['tokens2'] = df['tokens2'].str.replace(r"\s+[a-zA-Z]\s+", ' ', regex=True)

    # Removing multiple spaces
    df['tokens2'] = df['tokens2'].str.replace(r'\s+', ' ', regex=True)

    # Drop spaces created when deleting single period values
    df = df[df.tokens2 != ' ']

    return df

eigstig = preprocess_tokens(eigstig)

In [61]:
eigstig.to_csv('data/dataframes/data_eigstig.csv', index = False)

eigstig

# Tokens2 has a lot of columns that have only a ".". I used the utterances() function to load the data, so it is
# only the verbal utterances that are included in this dataset. If we look in the original dataset, we see that
# the "."'s indicate rows where the child communicated non-verbally, e.g. by nodding, shaking their head, counting
# on their fingers, etc. The frequecy of these in the dataset could likely also tell us something about the differ-
# ence in language/communication between TD and ASD children, but for our purpose, we will remove these from the 
# dataset, because we are looking at verbal language.

Unnamed: 0,age,id,group,tokens2,diagnosis
110,45.633333,11312/a-00032743-1,TD,cow,0
135,45.633333,11312/a-00032743-1,TD,tree,0
141,45.633333,11312/a-00032743-1,TD,bandaid,0
147,45.633333,11312/a-00032743-1,TD,ow ipep,0
162,45.633333,11312/a-00032743-1,TD,brush,0
...,...,...,...,...,...
373,48.400000,11312/a-00032761-1,ASD,and the bunny,1
377,48.400000,11312/a-00032761-1,ASD,yeah,1
381,48.400000,11312/a-00032761-1,ASD,I don know it two blocks here,1
383,48.400000,11312/a-00032761-1,ASD,and the vin and clean is up the air,1


In [62]:
# What is the age range in the data?:
print(eigstig.min(axis=0))
print(eigstig.max(axis=0))
# the age range goes from 32.6 months to 78.3 months, or in years:
print(32.6/12)
print(78.3/12)
# Age range: 2.7-6.5 years old

age                        32.6
id           11312/a-00032742-1
group                       ASD
tokens2                  Mommy 
diagnosis                     0
dtype: object
age                        78.3
id           11312/a-00032789-1
group                        TD
tokens2                    zzz 
diagnosis                     1
dtype: object
2.716666666666667
6.5249999999999995


# Splitting the data up into age groups
Since the Eigstig data is collected from children between the ages of 2 and 6 years old, we will split the data into age groups and run the models on data from each separate age group. We do not want to train a model on the full age range, since we are comparing developmental delay of speech between the groups. This means that if we have too wide age groups, then the model may be confused and classify a 6 y.o. autistic child as a 3 y.o. typically developing child.

(NB: we may have too little data in the 2- and 6-year-old groups, since the youngest child is 2.7 years old and the oldest child is 6.5 years old).

We will have the following age groups:

    -   2 year olds: 24-36 months
    -   3 year olds: 36-48 months
    -   4 year olds: 48-60 months
    -   5 year olds: 60-72 months
    -   6 year olds: 72-84 months

In [77]:
# The age groups in months:
eigstig.to_csv('data/dataframes/data_eigstig.csv', index = False)

eigstig_age2 = eigstig[(eigstig.age >= 24) & (eigstig.age < 36)]
eigstig_age3 = eigstig[(eigstig.age >= 36) & (eigstig.age < 48)]
eigstig_age4 = eigstig[(eigstig.age >= 48) & (eigstig.age < 60)]
eigstig_age5 = eigstig[(eigstig.age >= 60) & (eigstig.age < 72)]
eigstig_age6 = eigstig[(eigstig.age >= 72) & (eigstig.age < 84)]
#eigstig_age3to4 = eigstig[(eigstig.age) >= 72 & (eigstig.age < 84)]
#eigstig_age5to6 = eigstig[(eigstig.age) >= 72 & (eigstig.age < 84)]

eigstig_age2.to_csv('data/dataframes/data_eigstig_age2.csv', index = False)
eigstig_age3.to_csv('data/dataframes/data_eigstig_age3.csv', index = False)
eigstig_age4.to_csv('data/dataframes/data_eigstig_age4.csv', index = False)
eigstig_age5.to_csv('data/dataframes/data_eigstig_age5.csv', index = False)
eigstig_age6.to_csv('data/dataframes/data_eigstig_age6.csv', index = False)

print(len(eigstig_age2))
print(len(eigstig_age3))
print(len(eigstig_age4))
print(len(eigstig_age5))
print(len(eigstig_age6))

210
2435
1286
729
209


# Preparing data for LR and NN models (Text and Label)

In [78]:
# Creating a dataframe with only text and label features
eigstig_age3_text_label = eigstig_age3.drop(columns=['age', 'id', 'group']).rename(columns = {'tokens2':'text', 'diagnosis':'label'})
print(eigstig_age3_text_label)
eigstig_age3_text_label.to_csv('data/dataframes/data_eigstig_age3_text_label.csv', index = False)

                    text  label
110                 cow       0
135                tree       0
141             bandaid       0
147             ow ipep       0
162               brush       0
..                   ...    ...
314  now we need hammer       0
317           your head       0
318          oooh grrrr       0
320         not hurt no       0
322  does that hurts no       0

[2435 rows x 2 columns]


# Preparing the data to fit the LogReg and NN classifiers
This means that the data must be in a shape of a class, which contains a tuple of three dictionaries: taining, validation, and test data. Inside each subset is a tuple with a dictionary which contains features (list) and number of rows (value). Features is a list which contains text and labels - so for our data, these would be tokens2, age, and group/diagnosis.

In [48]:
# # Class with functions - not working
# class createDatasetDict:

#     def split_data(self, data):
#         self.data = data

#         # Split dataset into train, test, val (70, 15, 15)
#         train, test = train_test_split(df, test_size=0.15)
#         train, val = train_test_split(train, test_size=0.15)

#         # Turning the split dataframes into dicts
#         train = Dataset.from_dict(train)
#         val = Dataset.from_dict(val)
#         test = Dataset.from_dict(test)

#         return(train, val, test)


#     def create_dicts(self, train, val, test):
#         corpus_dict = datasets.DatasetDict({
#             "train":self.train, 
#             "val":self.val, 
#             "test":self.test
#             })
    
#         return(corpus_dict)

# if __name__=="__main__":
#     createDatasetDict()

In [101]:
# For the logreg and nn
from sklearn.model_selection import train_test_split
import datasets
from datasets import Dataset

data = pd.read_csv("/work/exam/ASD_classification/data/dataframes/data_eigstig_age3_text_label.csv")

distribution = data.groupby(['label'])['label'].count()
print(distribution)

class createDatasetDict:

    # Split dataset into train, test, val (70, 15, 15)
    #def split_data():
    train, test = train_test_split(data, test_size=0.15)
    train, val = train_test_split(train, test_size=0.15)
    #    return (train, test, val)

    distribution = train.groupby(['label'])['label'].count()
    print(distribution)
    distribution = val.groupby(['label'])['label'].count()
    print(distribution)
    distribution = test.groupby(['label'])['label'].count()
    print(distribution)

    # Turning the split dataframes into dicts
    train = Dataset.from_dict(train)
    val = Dataset.from_dict(val)
    test = Dataset.from_dict(test)

    corpus_dict = datasets.DatasetDict({
        "train":train, 
        "val":val, 
        "test":test
        })

label
0    1730
1     705
Name: label, dtype: int64
label
0    1240
1     518
Name: label, dtype: int64
label
0    225
1     86
Name: label, dtype: int64
label
0    265
1    101
Name: label, dtype: int64


In [90]:
dd = createDatasetDict()
dd = dd.corpus_dict
dd

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1758
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 311
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 366
    })
})

In [88]:
# print(dd.corpus_dict)
# print(type(dd.corpus_dict))
#dd.corpus_dict['train']['tokens2']

# For BERT

In [None]:
import numpy as np
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import load_dataset, load_metric

In [None]:
#task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

In [None]:
label_list = ['TD', 'ASD']

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# For the BERT
class createDatasetDict:

    model_checkpoint = "distilbert-base-uncased"

    # Split dataset into train, test, val (70, 15, 15)
    train, test = train_test_split(data, test_size=0.15)
    train, val = train_test_split(train, test_size=0.15)

    # Convert to numpy
    x_train = train['tokens2'].values[train_idx]
    x_test = test['tokens2'].values[test_idx]
    x_val = val['tokens2'].values[val_idx]

    # Turning the split dataframes into dicts
    x_train = Dataset.from_dict(x_train)
    x_val = Dataset.from_dict(x_val)
    x_test = Dataset.from_dict(x_test)

    # Create tokenizer from pretrained model
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    corpus_dict = datasets.DatasetDict({
        "train": [train, x_train], 
        "val": [val, x_val], 
        "test": [test, x_test]
        })


train_tok = tokenizer(list(x_train), truncation=True, padding=True, max_length=20)
val_tok = tokenizer(list(x_val), truncation=True, padding=True, max_length=20)
test_tok = tokenizer(list(x_test),  truncation=True, padding=True, max_length=20)