This notebook works for loading and reshaping the Eigsti and Nadig datasets.

In [118]:
# LOADING THE REQUIRED PACKAGES
import os
os.system('pip install pylangacq')
import pylangacq 
import pandas as pd

Defaulting to user installation because normal site-packages is not writeable


In [119]:
# WRITING A FUNCTION FOR LOADING DATA
# This function loads each file in the directory individually, and turns it into a dataframe. Then it creates columns for age,
# id, and group. Finally, it binds the individual dataframes together.

## Accessing single files
def dataload(datapath):
    df = pd.DataFrame()

    for subject in os.listdir(datapath):
        #print(subject)
        pylang_obj = pylangacq.read_chat(path = datapath, match = subject)
        d = pd.DataFrame(pylang_obj.utterances())
        d["age"] = pylang_obj.ages(months=True)[0]
        d["id"] = pylang_obj.headers()[0]['PID']
        d["group"] = pylang_obj.headers()[0]['Participants']['CHI']['group']
        df = pd.concat([df, d])

    return(df)

In [120]:
# RUNNING THE DATALOAD FUNCTION ON ALL FILES
all_data = dataload(os.path.join("/work", "exam", "ASD_classification", "data", "corpus"))

In [121]:
# CREATING A COLUMN WHERE EACH UTTERANCE IS A STRING
# Accessing the word-keys in the nested dicts in the tokens column and appending them to a string in a new tokens column

words = ""
tokens2 = []

for row in all_data['tokens']:
    for list in row:
        #print(list['word'])
        words += list['word'] + " "
    tokens2.append(words)
    words = ""

all_data['tokens2'] = tokens2

In [122]:
# CLEANING THE DF
# Dropping unnecessary columns
all_data = all_data.drop(columns=['tokens'])
all_data = all_data.drop(columns=['tiers'])
all_data = all_data.drop(columns=['time_marks'])

In [123]:
# We can read from the description of the datasets on talkbank.org - and see it here - that Eigstig annotated the typically
# developing children with TD and Nadig used TYP. For consistency, we will recode all variables that are grouped TYP to TD.
# Eigstig also has a group called DD (for developmental delay). These children have developmental delay, but not ASD. This group
# will be filtered out.
print(all_data['group'].unique())

# Recoding TYP to TD
all_data = all_data.replace('TYP','TD')

# Dropping the rows from the DD group
all_data = all_data[all_data.group != 'DD']

# Checking the variables
print(all_data['group'].unique())

['TD' 'DD' 'ASD' 'TYP']
['TD' 'ASD']


In [124]:
all_data

# Possible issue: In the Nadig data, there are a lot of rows/utterances that consist only of a punctuation, etc. 

Unnamed: 0,participant,age,id,group,tokens2
0,INV1,45.633333,11312/a-00032743-1,TD,we've got sort of a bumblebee theme here becau...
1,INV1,45.633333,11312/a-00032743-1,TD,.
2,INV1,45.633333,11312/a-00032743-1,TD,mmmm .
3,INV1,45.633333,11312/a-00032743-1,TD,hm .
4,INV1,45.633333,11312/a-00032743-1,TD,and you know what ?
...,...,...,...,...,...
309,MOT,28.800000,11312/a-00005262-1,TD,oh that feels great .
310,CHI,28.800000,11312/a-00005262-1,TD,haircut .
311,MOT,28.800000,11312/a-00005262-1,TD,thank_you .
312,CHI,28.800000,11312/a-00005262-1,TD,.


In [125]:
# SAVING THE DF AS A CSV FILE
all_data.to_csv('all_data.csv', index = True)

In [133]:
print(all_data['participant'].unique())

# Dropping the rows that are not participant == CHI
CHI_data = all_data[all_data.participant == 'CHI']

# Dropping the column participant (since this is always CHI now)
CHI_data = CHI_data.drop(columns=['participant'])

# Dummy coding a diagnosis column
CHI_data['diagnosis'] = CHI_data['group'].replace("TD", 0)
CHI_data['diagnosis'] = CHI_data['diagnosis'].replace("ASD", 1)

['INV1' 'INV2' 'MOT' 'CHI' 'FAT' 'INV' 'MOM']


In [134]:
def preprocess_tokens(df):
    """This function removes weird and redundant characters and spaces
    """

    # Remove punctuations and numbers
    df['tokens2'] = df['tokens2'].str.replace('[^a-zA-Z]', ' ', regex=True)

    # Single character removal
    df['tokens2'] = df['tokens2'].str.replace(r"\s+[a-zA-Z]\s+", ' ', regex=True)

    # Removing multiple spaces
    df['tokens2'] = df['tokens2'].str.replace(r'\s+', ' ', regex=True)

    # Drop spaces created when deleting single period values
    df = df[df.tokens2 != ' ']

    return df

CHI_data = preprocess_tokens(CHI_data)

In [135]:
CHI_data.to_csv('CHI_data.csv', index = False)

CHI_data

# Tokens2 has a lot of columns that have only a ".". I used the utterances() function to load the data, so it is
# only the verbal utterances that are included in this dataset. If we look in the original dataset, we see that
# the "."'s indicate rows where the child communicated non-verbally, e.g. by nodding, shaking their head, counting
# on their fingers, etc. The frequecy of these in the dataset could likely also tell us something about the differ-
# ence in language/communication between TD and ASD children, but for our purpose, we will remove these from the 
# dataset, because we are looking at verbal language.

Unnamed: 0,age,id,group,tokens2,diagnosis
110,45.633333,11312/a-00032743-1,TD,cow,0
135,45.633333,11312/a-00032743-1,TD,tree,0
141,45.633333,11312/a-00032743-1,TD,bandaid,0
147,45.633333,11312/a-00032743-1,TD,ow ipep,0
162,45.633333,11312/a-00032743-1,TD,brush,0
...,...,...,...,...,...
300,28.800000,11312/a-00005262-1,TD,love you haircut,0
304,28.800000,11312/a-00005262-1,TD,no Mama no not just ponytail,0
306,28.800000,11312/a-00005262-1,TD,I just your hair,0
310,28.800000,11312/a-00005262-1,TD,haircut,0


In [136]:
# Creating a short CHI dataframe with only text and label features
CHI_data_short = CHI_data.drop(columns=['age', 'id', 'group']).rename(columns = {'tokens2':'text', 'diagnosis':'label'})
print(CHI_data_short)
CHI_data_short.to_csv('CHI_data_short.csv', index = False)

                                 text  label
110                              cow       0
135                             tree       0
141                          bandaid       0
147                          ow ipep       0
162                            brush       0
..                                ...    ...
300                 love you haircut       0
304     no Mama no not just ponytail       0
306                 I just your hair       0
310                          haircut       0
313  now make some my something else       0

[7120 rows x 2 columns]


# Preparing the data to fit the LogReg and NN classifiers
This means that the data must be in a shape of a class, which contains a tuple of three dictionaries: taining, validation, and test data. Inside each subset is a tuple with a dictionary which contains features (list) and number of rows (value). Features is a list which contains text and labels - so for our data, these would be tokens2, age, and group/diagnosis.

In [31]:
# # Class with functions - not working
# class createDatasetDict:

#     def split_data(self, data):
#         self.data = data

#         # Split dataset into train, test, val (70, 15, 15)
#         train, test = train_test_split(df, test_size=0.15)
#         train, val = train_test_split(train, test_size=0.15)

#         # Turning the split dataframes into dicts
#         train = Dataset.from_dict(train)
#         val = Dataset.from_dict(val)
#         test = Dataset.from_dict(test)

#         return(train, val, test)


#     def create_dicts(self, train, val, test):
#         corpus_dict = datasets.DatasetDict({
#             "train":self.train, 
#             "val":self.val, 
#             "test":self.test
#             })
    
#         return(corpus_dict)

# if __name__=="__main__":
#     createDatasetDict()

In [114]:
# For the logreg and nn
class createDatasetDict:

    # Split dataset into train, test, val (70, 15, 15)
    train, test = train_test_split(data, test_size=0.15)
    train, val = train_test_split(train, test_size=0.15)

    # Turning the split dataframes into dicts
    train = Dataset.from_dict(train)
    val = Dataset.from_dict(val)
    test = Dataset.from_dict(test)

    corpus_dict = datasets.DatasetDict({
        "train":train, 
        "val":val, 
        "test":test
        })

In [111]:
from sklearn.model_selection import train_test_split
import datasets
from datasets import Dataset

data = pd.read_csv("/work/exam/ASD_classification/CHI_data.csv")

dd = createDatasetDict()

In [113]:
# print(dd.corpus_dict)
# print(type(dd.corpus_dict))
dd.corpus_dict['train']['tokens2']

['no . ',
 'I to fall down . ',
 'oh help ! ',
 'hey , a blocks ! ',
 'hey ! ',
 '. ',
 '. ',
 'yeah . ',
 "what's this for ? ",
 'put it on his hands and on his head . ',
 'alright . ',
 'yeah . ',
 'horsie ? ',
 "I don't know . ",
 "it can't work anywhere . ",
 "and you're making my face . ",
 '. ',
 'vroom . ',
 'touch your foot . ',
 'his face . ',
 'this . ',
 "they're like in summertime . ",
 '. ',
 "oh ‡ what's that ? ",
 'oooh , grrrr . ',
 'no look at me . ',
 'good night , lion . ',
 '. ',
 'and then , that . ',
 'no . ',
 '. ',
 'yeah . ',
 'put your clothes back on . ',
 '. ',
 '. ',
 '. ',
 "I don't know . ",
 'yeah . ',
 'he lost he lost lost his train . ',
 'she drinking milk . ',
 'tricky make these . ',
 'numbers and letters . ',
 'tea for two . ',
 'I want //. ',
 'no Daddy . ',
 'no . ',
 'ah . ',
 'Mommy . ',
 'yeah . ',
 'yeah . ',
 'Mama , I will glue it . ',
 '. ',
 'mimi . ',
 'no . ',
 'but the giraffe came out of his cage . ',
 "they can't get inside ! ",
 'fo

# For BERT

In [89]:
import numpy as np
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import load_dataset, load_metric

In [90]:
#task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

In [88]:
label_list = ['TD', 'ASD']

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# For the BERT
class createDatasetDict:

    model_checkpoint = "distilbert-base-uncased"

    # Split dataset into train, test, val (70, 15, 15)
    train, test = train_test_split(data, test_size=0.15)
    train, val = train_test_split(train, test_size=0.15)

    # Convert to numpy
    x_train = train['tokens2'].values[train_idx]
    x_test = test['tokens2'].values[test_idx]
    x_val = val['tokens2'].values[val_idx]

    # Turning the split dataframes into dicts
    x_train = Dataset.from_dict(x_train)
    x_val = Dataset.from_dict(x_val)
    x_test = Dataset.from_dict(x_test)

    # Create tokenizer from pretrained model
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    corpus_dict = datasets.DatasetDict({
        "train": [train, x_train], 
        "val": [val, x_val], 
        "test": [test, x_test]
        })


train_tok = tokenizer(list(x_train), truncation=True, padding=True, max_length=20)
val_tok = tokenizer(list(x_val), truncation=True, padding=True, max_length=20)
test_tok = tokenizer(list(x_test),  truncation=True, padding=True, max_length=20)