This notebook works for loading and reshaping the Eigsti and Nadig datasets.

In [1]:
# LOADING THE REQUIRED PACKAGES
import os
os.system('pip install pylangacq')
import pylangacq 
import pandas as pd

Defaulting to user installation because normal site-packages is not writeable
Collecting pylangacq
  Downloading pylangacq-0.17.0-py3-none-any.whl (65 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 65.2/65.2 kB 688.2 kB/s eta 0:00:00
Collecting tabulate[widechars]>=0.8.9
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate, pylangacq
Successfully installed pylangacq-0.17.0 tabulate-0.9.0


In [2]:
# WRITING A FUNCTION FOR LOADING DATA
# This function loads each file in the directory individually, and turns it into a dataframe. Then it creates columns for age,
# id, and group. Finally, it binds the individual dataframes together.

## Accessing single files
def dataload(datapath):
    df = pd.DataFrame()

    for subject in os.listdir(datapath):
        #print(subject)
        pylang_obj = pylangacq.read_chat(path = datapath, match = subject)
        d = pd.DataFrame(pylang_obj.utterances())
        d["age"] = pylang_obj.ages(months=True)[0]
        d["id"] = pylang_obj.headers()[0]['PID']
        d["group"] = pylang_obj.headers()[0]['Participants']['CHI']['group']
        df = pd.concat([df, d])

    return(df)

In [3]:
# RUNNING THE DATALOAD FUNCTION ON ALL FILES
all_data = dataload(os.path.join("/work", "exam", "ASD_classification", "data", "corpus"))

In [4]:
# CREATING A COLUMN WHERE EACH UTTERANCE IS A STRING
# Accessing the word-keys in the nested dicts in the tokens column and appending them to a string in a new tokens column

words = ""
tokens2 = []

for row in all_data['tokens']:
    for list in row:
        #print(list['word'])
        words += list['word'] + " "
    tokens2.append(words)
    words = ""

all_data['tokens2'] = tokens2

In [5]:
# CLEANING THE DF
# Dropping unnecessary columns
all_data = all_data.drop(columns=['tokens'])
all_data = all_data.drop(columns=['tiers'])
all_data = all_data.drop(columns=['time_marks'])

In [6]:
# We can read from the description of the datasets on talkbank.org - and see it here - that Eigstig annotated the typically
# developing children with TD and Nadig used TYP. For consistency, we will recode all variables that are grouped TYP to TD.
# Eigstig also has a group called DD (for developmental delay). These children have developmental delay, but not ASD. This group
# will be filtered out.
print(all_data['group'].unique())

# Recoding TYP to TD
all_data = all_data.replace('TYP','TD')

# Dropping the rows from the DD group
all_data = all_data[all_data.group != 'DD']

# Checking the variables
print(all_data['group'].unique())

['TD' 'DD' 'ASD' 'TYP']
['TD' 'ASD']


In [7]:
all_data

# Possible issue: In the Nadig data, there are a lot of rows/utterances that consist only of a punctuation, etc. 

Unnamed: 0,participant,age,id,group,tokens2
0,INV1,45.633333,11312/a-00032743-1,TD,we've got sort of a bumblebee theme here becau...
1,INV1,45.633333,11312/a-00032743-1,TD,.
2,INV1,45.633333,11312/a-00032743-1,TD,mmmm .
3,INV1,45.633333,11312/a-00032743-1,TD,hm .
4,INV1,45.633333,11312/a-00032743-1,TD,and you know what ?
...,...,...,...,...,...
309,MOT,28.800000,11312/a-00005262-1,TD,oh that feels great .
310,CHI,28.800000,11312/a-00005262-1,TD,haircut .
311,MOT,28.800000,11312/a-00005262-1,TD,thank_you .
312,CHI,28.800000,11312/a-00005262-1,TD,.


In [9]:
# SAVING THE DF AS A CSV FILE
all_data.to_csv('all_data.csv', index = True)

In [14]:
print(all_data['participant'].unique())

# Dropping the rows that are not participant == CHI
CHI_data = all_data[all_data.participant == 'CHI']

# Dropping the column participant (since this is always CHI now)
CHI_data = CHI_data.drop(columns=['participant'])

CHI_data.to_csv('CHI_data.csv', index = False)

['INV1' 'INV2' 'MOT' 'CHI' 'FAT' 'INV' 'MOM']


In [18]:
CHI_data

# Tokens2 has a lot of columns that have only a ".". I used the utterances() function to load the data, so it is
# only the verbal utterances that are included in this dataset. If we look in the original dataset, we see that
# the "."'s indicate rows where the child communicated non-verbally, e.g. by nodding, shaking their head, counting
# on their fingers, etc. The frequecy of these in the dataset could likely also tell us something about the differ-
# ence in language/communication between TD and ASD children, but for our purpose, we will remove these from the 
# dataset, because we are looking at verbal language.

Unnamed: 0,age,id,group,tokens2
50,45.633333,11312/a-00032743-1,TD,.
61,45.633333,11312/a-00032743-1,TD,.
72,45.633333,11312/a-00032743-1,TD,.
86,45.633333,11312/a-00032743-1,TD,.
88,45.633333,11312/a-00032743-1,TD,.
...,...,...,...,...
304,28.800000,11312/a-00005262-1,TD,no Mama no not just ponytail .
306,28.800000,11312/a-00005262-1,TD,I just your hair .
310,28.800000,11312/a-00005262-1,TD,haircut .
312,28.800000,11312/a-00005262-1,TD,.


# Preparing the data to fit the LogReg and NN classifiers
This means that the data must be in a shape of a class, which contains a tuple of three dictionaries: taining, validation, and test data. Inside each subset is a tuple with a dictionary which contains features (list) and number of rows (value). Features is a list which contains text and labels - so for our data, these would be tokens2, age, and group/diagnosis.

In [31]:
# # Class with functions - not working
# class createDatasetDict:

#     def split_data(self, data):
#         self.data = data

#         # Split dataset into train, test, val (70, 15, 15)
#         train, test = train_test_split(df, test_size=0.15)
#         train, val = train_test_split(train, test_size=0.15)

#         # Turning the split dataframes into dicts
#         train = Dataset.from_dict(train)
#         val = Dataset.from_dict(val)
#         test = Dataset.from_dict(test)

#         return(train, val, test)


#     def create_dicts(self, train, val, test):
#         corpus_dict = datasets.DatasetDict({
#             "train":self.train, 
#             "val":self.val, 
#             "test":self.test
#             })
    
#         return(corpus_dict)

# if __name__=="__main__":
#     createDatasetDict()

In [40]:
class createDatasetDict:

    # Split dataset into train, test, val (70, 15, 15)
    train, test = train_test_split(df, test_size=0.15)
    train, val = train_test_split(train, test_size=0.15)

    # Turning the split dataframes into dicts
    train = Dataset.from_dict(train)
    val = Dataset.from_dict(val)
    test = Dataset.from_dict(test)

    corpus_dict = datasets.DatasetDict({
        "train":train, 
        "val":val, 
        "test":test
        })

In [None]:
from sklearn.model_selection import train_test_split
import datasets
from datasets import Dataset

data = pd.read_csv("/work/exam/ASD_classification/CHI_data.csv")

dd = createDatasetDict()

In [45]:
print(dd.corpus_dict)
print(type(dd.corpus_dict))

DatasetDict({
    train: Dataset({
        features: ['age', 'id', 'group', 'tokens2'],
        num_rows: 6035
    })
    val: Dataset({
        features: ['age', 'id', 'group', 'tokens2'],
        num_rows: 1065
    })
    test: Dataset({
        features: ['age', 'id', 'group', 'tokens2'],
        num_rows: 1253
    })
})
<class 'datasets.dataset_dict.DatasetDict'>
