This notebook works for loading and reshaping the Eigsti dataset.

In [1]:
# LOADING THE REQUIRED PACKAGES
import os
os.system('pip install pylangacq')
import pylangacq 
import pandas as pd
from sklearn.model_selection import train_test_split

Defaulting to user installation because normal site-packages is not writeable
Collecting pylangacq
  Downloading pylangacq-0.17.0-py3-none-any.whl (65 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 65.2/65.2 kB 180.1 kB/s eta 0:00:00
Collecting tabulate[widechars]>=0.8.9
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate, pylangacq
Successfully installed pylangacq-0.17.0 tabulate-0.9.0


In [2]:
# WRITING A FUNCTION FOR LOADING DATA
# This function loads each file in the directory individually, and turns it into a dataframe. Then it creates columns for age,
# id, and group. Finally, it binds the individual dataframes together.

## Accessing single files
def dataload(datapath):
    df = pd.DataFrame()

    for subject in os.listdir(datapath):
        pylang_obj = pylangacq.read_chat(path = datapath, match = subject)
        d = pd.DataFrame(pylang_obj.utterances())
        d["age"] = pylang_obj.ages(months=True)[0]
        d["id"] = pylang_obj.headers()[0]['PID']
        d["group"] = pylang_obj.headers()[0]['Participants']['CHI']['group']
        df = pd.concat([df, d])

    return(df)

In [3]:
# RUNNING THE DATALOAD FUNCTION
eigstig = dataload(os.path.join("..", "data", "eigstig"))

In [4]:
# CREATING A COLUMN WHERE EACH UTTERANCE IS A STRING
# Accessing the word-keys in the nested dicts in the tokens column and appending them to a string in a new tokens column

words = ""
tokens2 = []

for row in eigstig['tokens']:
    for list in row:
        #print(list['word'])
        words += list['word'] + " "
    tokens2.append(words)
    words = ""

eigstig['tokens2'] = tokens2

In [5]:
# CLEANING THE DF
# Dropping unnecessary columns
eigstig = eigstig.drop(columns=['tokens'])
eigstig = eigstig.drop(columns=['tiers'])
eigstig = eigstig.drop(columns=['time_marks'])

In [6]:
# The dataset also has a group called DD (for developmental delay). These children have developmental delay, but not ASD. This group
# will be filtered out.
print(eigstig['group'].unique())

# Dropping the rows from the DD group
eigstig = eigstig[eigstig.group != 'DD']

# Checking the variables
print(eigstig['group'].unique())

['TD' 'DD' 'ASD']
['TD' 'ASD']


In [7]:
print(eigstig['participant'].unique())

# Dropping the rows that are not participant == CHI
eigstig = eigstig[eigstig.participant == 'CHI']

# Dropping the column participant (since this is always CHI now)
eigstig = eigstig.drop(columns=['participant'])

# Dummy coding a diagnosis column
eigstig['diagnosis'] = eigstig['group'].replace("TD", 0)
eigstig['diagnosis'] = eigstig['diagnosis'].replace("ASD", 1)

['INV1' 'INV2' 'MOT' 'CHI' 'FAT' 'INV' 'MOM']


In [8]:
def preprocess_tokens(df):
    """This function removes weird and redundant characters and spaces
    """

    # Remove punctuations and numbers
    df['tokens2'] = df['tokens2'].str.replace('[^a-zA-Z]', ' ', regex=True)

    # Single character removal
    df['tokens2'] = df['tokens2'].str.replace(r"\s+[a-zA-Z]\s+", ' ', regex=True)

    # Removing multiple spaces
    df['tokens2'] = df['tokens2'].str.replace(r'\s+', ' ', regex=True)

    # Drop spaces created when deleting single period values
    df = df[df.tokens2 != ' ']

    return df

eigstig = preprocess_tokens(eigstig)

# Investigating the age groups of the data

In [None]:
# What is the age range in the data?:
print(eigstig.min(axis=0))
print(eigstig.max(axis=0))
# the age range goes from 32.6 months to 78.3 months, or in years:
print(32.6/12)
print(78.3/12)
# Age range: 2.7-6.5 years old

In [15]:

# The age groups in months:
eigstig_age2 = eigstig[(eigstig.age >= 24) & (eigstig.age < 36)]
eigstig_age3 = eigstig[(eigstig.age >= 36) & (eigstig.age < 48)]
eigstig_age4 = eigstig[(eigstig.age >= 48) & (eigstig.age < 60)]
eigstig_age5 = eigstig[(eigstig.age >= 60) & (eigstig.age < 72)]
eigstig_age6 = eigstig[(eigstig.age >= 72) & (eigstig.age < 84)]

4869

210
2435
1286
729
209


0    1730
1     705
Name: diagnosis, dtype: int64

In [21]:
# Inspecting the age groups
print(len(eigstig))
print("")
print(len(eigstig_age2))
print(len(eigstig_age3))
print(len(eigstig_age4))
print(len(eigstig_age5))
print(len(eigstig_age6))

eigstig['diagnosis'].value_counts()


4869

210
2435
1286
729
209


1    2514
0    2355
Name: diagnosis, dtype: int64

# Preparing data for the models (Text and Label)

In [10]:
# Creating a dataframe with only text and label features
eigstig_text_label = eigstig.drop(columns=['age', 'id', 'group']).rename(columns = {'tokens2':'text', 'diagnosis':'label'})

eigstig_text_label.to_csv('../data/dataframes/tester.csv', index = False)