In [15]:
from os import listdir
from os.path import isfile, join
import pandas as pd

# Datasets filenames
directory = "ados_datasets/"
files = [join(directory, f) for f in listdir(directory) \
         if isfile(join(directory, f)) and f.startswith("ados") and f.endswith(".txt")]
print(files)

dataframes = []  # Array of clean dataframes
for f in files:
    df = pd.read_table(f, header=1, sep="\t")  # Read dataset using 2nd row values as columns headers
    columns_to_drop = []  #
    
    # Delete columns with more than 25% of NaN values
    for col in sorted(df.columns):
        count_nan = df[col].isnull().sum()
        ratio_nan = count_nan / len(df)
        if ratio_nan > 0.25:
            del df[col]
    
    # Remove duplicate columns (just keep one of them)
    for col in sorted(df.columns):
        if col.endswith(".1"):
            if col[:-2] in df:
                del df[col]
            else:
                df.rename(columns = {col : col[:-2]}, inplace=True)
    
    dataframes.append(df)
        
# Concatenate dataframes
concat_df = pd.concat(dataframes, join="inner", ignore_index=True)
concat_df

# Results
print("\nTOTAL ROWS:", len(concat_df))
print("TOTAL COLUMNS:", len(concat_df.columns))

print("\nCOLUMNS:")
for col in sorted(concat_df.columns):
    print(col)
    
concat_df

['ados_datasets/ados2_200102.txt', 'ados_datasets/ados1_200102.txt']

TOTAL ROWS: 715
TOTAL COLUMNS: 32

COLUMNS:
ADOS Diagnosis Classification
Age in months at the time of the interview/test/sampling/imaging.
Anxiety
Communication + Social Interaction Total
Communication Total
Date on which the interview/genetic test/sampling/imaging/biospecimen was completed. MM/DD/YYYY
Facial Expressions Directed to Others
Hand and Finger and Other Complex Mannerisms
Imagination/Creativity
Immediate Echolalia
Overactivity
Overall ADOS Diagnosis
Pointing
Quality of Social Overtures
Response to Joint Attention
Response to Name
Self-Injurious Behavior
Sex of the subject
Shared Enjoyment in Interaction
Social Interaction Total
Spontaneous Initiation of Joint Attention
Stereotyped Behaviors and Restricted Interests Total
Stereotyped/Idiosyncratic Use of Words or Phrases
Subject ID how it's defined in lab/project
Tantrums, Aggression, Negative or Disruptive Behavior
The NDAR Global Unique Identifier (GUID

Unnamed: 0,collection_id,dataset_id,The NDAR Global Unique Identifier (GUID) for research subject,Subject ID how it's defined in lab/project,Date on which the interview/genetic test/sampling/imaging/biospecimen was completed. MM/DD/YYYY,Age in months at the time of the interview/test/sampling/imaging.,Sex of the subject,Immediate Echolalia,Stereotyped/Idiosyncratic Use of Words or Phrases,Pointing,...,"Tantrums, Aggression, Negative or Disruptive Behavior",Anxiety,Communication Total,Social Interaction Total,Communication + Social Interaction Total,Stereotyped Behaviors and Restricted Interests Total,ADOS Diagnosis Classification,Overall ADOS Diagnosis,collection_title,promoted_subjectkey
0,9,8161,NDARAD604YHN,N5V9N,12/06/2008,34,M,3.0,1.0,1.0,...,0.0,0.0,6,9,15,3.0,Autism,PDD-NOS,Biomarkers of Autism at 12 Months: From Brain ...,NDARAD604YHN
1,9,8161,NDARAF706DLN,P6C6P,08/19/2010,39,M,0.0,0.0,0.0,...,0.0,0.0,0,0,0,0.0,nonspectrum/typical,typical-speech for articulation,Biomarkers of Autism at 12 Months: From Brain ...,NDARAF706DLN
2,9,8161,NDARAF706DLN,P6C6P,02/18/2011,45,M,0.0,0.0,0.0,...,0.0,0.0,0,0,0,0.0,typical,typical,Biomarkers of Autism at 12 Months: From Brain ...,NDARAF706DLN
3,9,8161,NDARAG550PL8,L4Q5T,08/14/2009,33,M,0.0,0.0,0.0,...,0.0,0.0,1,1,2,1.0,nonspec,nonspec,Biomarkers of Autism at 12 Months: From Brain ...,NDARAG550PL8
4,9,8161,NDARAJ898KYP,L5S3Z,03/04/2011,35,M,0.0,1.0,1.0,...,0.0,0.0,6,12,18,4.0,Autism,AD (High Functioning),Biomarkers of Autism at 12 Months: From Brain ...,NDARAJ898KYP
5,9,8161,NDARAL181JRQ,R3E9X,01/06/2010,38,M,0.0,0.0,0.0,...,0.0,0.0,0,0,0,0.0,Typical,Control,Biomarkers of Autism at 12 Months: From Brain ...,NDARAL181JRQ
6,9,8161,NDARAL769UD4,T4H5K,04/07/2009,31,F,1.0,1.0,1.0,...,0.0,0.0,6,2,8,4.0,typical,GDD,Biomarkers of Autism at 12 Months: From Brain ...,NDARAL769UD4
7,9,8161,NDARAL769UD4,T4H5K,08/22/2009,36,F,1.0,0.0,0.0,...,0.0,0.0,4,1,5,3.0,Non-spectrum,Autistic Features (but no diag ASD),Biomarkers of Autism at 12 Months: From Brain ...,NDARAL769UD4
8,9,8161,NDARAM956UYD,Y4R4M,02/14/2012,35,M,1.0,0.0,0.0,...,1.0,0.0,3,4,7,1.0,non-spec,nonspec;wasASD;possADHD,Biomarkers of Autism at 12 Months: From Brain ...,NDARAM956UYD
9,9,8161,NDARAN001AMK,Q3K9L,02/25/2012,50,M,1.0,1.0,1.0,...,0.0,0.0,7,10,17,5.0,Autism,A.D.,Biomarkers of Autism at 12 Months: From Brain ...,NDARAN001AMK
