In [130]:
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import re
import json
from difflib import SequenceMatcher
from itertools import combinations

def get_filenames(path, exclude=None):
    """
    Returns list of datasets filenames in path

    Parameters:
        path: directory to scan (not recursive).
        exclude: optional iterable of bare filenames (no directory part)
            that must be left out of the result.

    Returns:
        Sorted list of join(path, name) for every regular file in path whose
        name starts with "ados" and ends with ".txt" and is not excluded.
    """
    # None instead of a mutable [] default; a set gives cheap membership tests.
    excluded = set(exclude) if exclude else set()
    # Sorting makes the processing order deterministic; listdir order is arbitrary.
    return sorted(
        join(path, name)
        for name in listdir(path)
        if name.startswith("ados")
        and name.endswith(".txt")
        and name not in excluded
        and isfile(join(path, name))
    )

def delete_nan_columns(df, max_nan_ratio):
    """
    Drop, in place, every column of df whose share of NaN values is
    strictly greater than max_nan_ratio. Returns nothing.
    """
    row_total = len(df)
    # Decide first, delete afterwards; columns are examined in sorted order.
    too_sparse = [
        name
        for name in sorted(df.columns)
        if df[name].isnull().sum() / row_total > max_nan_ratio
    ]
    for name in too_sparse:
        del df[name]
            
def has_required_columns(df, required_columns):
    """
    Check if df has required_columns

    Returns True when every name in required_columns is a column of df
    (extra columns in df are allowed, and so is an exact match), else False.
    """
    # issubset (<=) rather than a proper-subset test (<): a frame whose columns
    # are exactly the required ones must also qualify.
    return set(required_columns).issubset(df.columns)

def remove_preffix(df, col):
    """
    Handles appearances of preffixes in columns names

    A prefix is one letter a-e (either case), one or two digits, an optional
    "a"/"A" and a dot, e.g. "A1." or "B12a.". When col starts with such a
    prefix, the remainder (whitespace-stripped) is the clean name:
      - if the clean name already exists in df (case-insensitive), col is deleted;
      - otherwise col is renamed to the clean name.
    df is modified in place.

    Returns:
        True if col carried a prefix, False otherwise.
    """
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    pattern = re.compile(r"^[a-eA-E]{1}[0-9]{1,2}[aA]?\.")
    match = pattern.match(col)
    if not match:
        return False

    clean_col = col[match.end():].strip()
    existing_lower = {name.lower() for name in df.columns}
    if clean_col.lower() in existing_lower:
        # The unprefixed column is already present: drop the prefixed duplicate.
        del df[col]
    else:
        df.rename(columns={col: clean_col}, inplace=True)
    return True

def remove_suffix(df, col):
    """
    Handles appearances of suffixes in columns names

    A suffix is a dot followed by a single digit 1-9 at the very end of the
    name (e.g. "anxiety.1"). When col has such a suffix:
      - if the name without the suffix is already a column of df, col is deleted;
      - otherwise col is renamed to the name without the suffix.
    df is modified in place; nothing is returned.
    """
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    pattern = re.compile(r".*\.[1-9]$")
    if not pattern.match(col):
        return

    base_name = col[:-2]  # strip the two-character ".N" suffix
    if base_name in df:
        del df[col]
    else:
        df.rename(columns={col: base_name}, inplace=True)
            
def similarity(a, b):
    """
    Similarity ratio of strings a and b as computed by
    difflib.SequenceMatcher (no junk filter).
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def generate_similar_names(df, min_similarity=0.9):
    """
    Build a dictionary mapping a column name of df to a later column name
    whose similarity to it is strictly above min_similarity.

    Every unordered pair of columns is compared once; when a name is similar
    to several later names, the last such pairing is the one kept.
    """
    similar = {}
    for first, second in combinations(df.columns, 2):
        if similarity(first, second) > min_similarity:
            similar[first] = second
    return similar
            
def merge_similar_columns(df, similar_names):
    """
    Merges columns with similar names, using the non-NaN values if possible

    Parameters:
        df: dataframe, modified in place.
        similar_names: dict mapping a column to keep -> the column to fold
            into it.

    For each column of df that is a key of similar_names, NaN cells are filled
    from the mapped column and the mapped column is then deleted. A pair is
    skipped when either column no longer exists (already merged away by an
    earlier pair) or when a column is mapped to itself.
    """
    # Iterate over a snapshot: columns are deleted while looping.
    for col in list(df.columns):
        if col not in similar_names:
            continue
        other = similar_names[col]
        # With chained/shared mappings such as {a: b, b: c} or {a: c, b: c} one
        # side may already be gone; indexing it would raise KeyError.
        if col == other or col not in df.columns or other not in df.columns:
            continue
        # Combine both columns into the first one
        df[col] = df[col].fillna(df[other])
        # Delete the second column
        del df[other]

In [131]:
files = get_filenames("ados_datasets/")

required_columns = [
    "ADOS Diagnosis Classification"
]

clean_frames = []  # One cleaned dataframe per input file
for f in files:
    # Read dataset using 2nd row values as columns headers
    df = pd.read_table(f, header=1, sep="\t")

    # Optional filter, currently disabled: skip files lacking required_columns.
    # if not has_required_columns(df, required_columns):
    #     continue

    # Delete columns with more than 80% of NaN values in THIS file.
    # (Pruning the accumulated frame here instead would make the surviving
    # columns depend on the order in which files are read.)
    delete_nan_columns(df, max_nan_ratio=0.80)

    # Remove duplicate columns (just keep one of them)
    seen_lower = set()
    for col in sorted(df.columns):
        # Check for columns ending with ".1" or ".2" or ...
        remove_suffix(df, col)
        if col not in df.columns:
            # remove_suffix deleted or renamed the column; nothing is left
            # under the old name to process.
            # NOTE(review): a column renamed here is not re-checked for a
            # prefix under its new name.
            continue

        # Remove case-insensitive duplicates, keeping the first one seen
        if col.lower() in seen_lower:
            del df[col]
            continue
        seen_lower.add(col.lower())

        # Check for columns starting with a "A12.a" pattern and rename them
        remove_preffix(df, col)

    # Transform columns names to lowercase
    df.columns = [name.lower() for name in df.columns]

    clean_frames.append(df)
    print("Adding file: {}".format(f))

# Concatenate all cleaned dataframes in one pass (outer join on columns)
if clean_frames:
    concat_df = pd.concat(clean_frames, join="outer", ignore_index=True)
else:
    concat_df = pd.DataFrame()

# Generate similar names dictionary
similar_names = generate_similar_names(concat_df)
print("\nSIMILAR NAMES:", json.dumps(similar_names, indent=1))

# Add special cases (not similar names, but corresponding columns)
# TODO

# Merge columns with similar names
merge_similar_columns(concat_df, similar_names)

# Delete columns with more than 25% of NaN values
delete_nan_columns(concat_df, 0.25)

# Stats
print("\nCOLUMNS:")
for col in sorted(concat_df.columns):
    print(col)
print("\nTOTAL COLUMNS:", len(concat_df.columns))

print("\nNumber of rows with...")
nan_per_row = concat_df.isnull().sum(axis=1)  # computed once, reused for every count
for i in range(len(concat_df.columns) + 1):
    rows_with_i_nans = np.sum(nan_per_row == i)
    print("\t... {} NaN features: {}".format(i, rows_with_i_nans))
print("\nTOTAL ROWS:", len(concat_df))

Adding file: ados_datasets/ados1_200102.txt
Adding file: ados_datasets/ados1_200102_1250.txt
Adding file: ados_datasets/ados1_200102_19.txt
Adding file: ados_datasets/ados1_200102_2382.txt
Adding file: ados_datasets/ados1_200102_9.txt
Adding file: ados_datasets/ados1_200701_1250.txt
Adding file: ados_datasets/ados1_200701_1946.txt
Adding file: ados_datasets/ados1_201201_1250.txt
Adding file: ados_datasets/ados1_201201_19.txt
Adding file: ados_datasets/ados1_201201_1946.txt
Adding file: ados_datasets/ados1_201201_2080.txt
Adding file: ados_datasets/ados1_201201_2368.txt
Adding file: ados_datasets/ados2_200102.txt
Adding file: ados_datasets/ados2_200102_1250.txt
Adding file: ados_datasets/ados2_200102_2382.txt
Adding file: ados_datasets/ados2_200102_9.txt
Adding file: ados_datasets/ados2_200701_1250.txt
Adding file: ados_datasets/ados2_200701_1946.txt
Adding file: ados_datasets/ados2_201201_1250.txt
Adding file: ados_datasets/ados2_201201_19.txt
Adding file: ados_datasets/ados2_201201_19

In [140]:
# Diagnosis codes:
# -1: unknown
# 0: non-spectrum
# 1: autism spectrum
# 2: autism

def updateDiagnosis(diagnosis):
    """
    Map a raw diagnosis value to a numeric code.

    Numeric values, given as numbers or as numeric strings:
        0-3  -> 0
        4-5  -> 1
        6-10 -> 2
    Text values (lowercased, with '-', ' ', '.', '/' removed), checked in order:
        contains '?', 'nan', 'dd' or 'unknown'  -> -1
        contains 'non', 'not' or 'typical'      -> 0
        contains 'spectrum'                     -> 1
        contains 'aut' or 'asd'                 -> 2
    None is treated as missing and returns -1.

    Returns:
        One of -1, 0, 1, 2, or the normalized string itself when no rule
        matches, so that unmapped values stay visible to the caller.
    """
    if diagnosis is None:
        # str(None) is 'None', which would otherwise match 'non' below.
        return -1

    # Handle numbers and numeric strings uniformly ('2', '4', 7, 10.0, ...).
    try:
        score = float(diagnosis)
    except (TypeError, ValueError):
        score = None
    if score is not None:
        # NaN and non-integer scores match none of these and fall through.
        if score in (0, 1, 2, 3):
            return 0
        if score in (4, 5):
            return 1
        if score in (6, 7, 8, 9, 10):
            return 2

    new = str(diagnosis).lower()
    new = re.sub('[- ./]', '', new)
    if '?' in new or 'nan' in new or 'dd' in new or 'unknown' in new:
        return -1
    if 'non' in new or 'not' in new or 'typical' in new:
        return 0
    if 'spectrum' in new:
        return 1
    if 'aut' in new or 'asd' in new:
        return 2
    return new
    
# Derive the coded 'diagnosis' column by passing each raw
# 'ados diagnosis classification' value through updateDiagnosis.
concat_df['diagnosis'] = concat_df['ados diagnosis classification'].apply(updateDiagnosis)

In [141]:
# Write the combined frame as tab-separated text, opening the file in append mode.
# NOTE(review): mode='a' means each re-run of this cell appends another copy of
# the data to current_data.txt; header=0 is falsy, so presumably no header row
# is written — confirm both are intended.
concat_df.to_csv('ados_datasets/current_data.txt', sep='\t', mode='a', header=0)

In [142]:
# Count missing values per column (axis=0 sums down the rows).
concat_df.isnull().sum(axis=0)

ados diagnosis classification                                                                        83
age in months at the time of the interview/test/sampling/imaging.                                     0
anxiety                                                                                            1009
collection_id                                                                                         0
collection_title                                                                                      0
dataset_id                                                                                            0
date on which the interview/genetic test/sampling/imaging/biospecimen was completed. mm/dd/yyyy       0
hand and finger and other complex mannerisms                                                        986
imagination/creativity                                                                              975
immediate echolalia                                             

In [138]:
# Frequency of each diagnosis code.
# NOTE(review): this cell's execution count (138) is lower than the cells above
# it (140-143); re-run top to bottom to confirm the displayed counts are current.
concat_df['diagnosis'].value_counts()

 0    2161
 2    1399
 1     338
-1     174
Name: diagnosis, dtype: int64

In [143]:
# (rows, columns) of the combined frame.
concat_df.shape

(4072, 21)