In [6]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

# Importing the dataset with library pandas
dataset = pd.read_csv('TADPOLE_InputData.csv')
labels_train = pd.read_csv('TADPOLE_TargetData_train.csv')
labels_test = pd.read_csv('TADPOLE_TargetData_test.csv')
target = pd.read_csv('TADPOLE_PredictTargetData_valid.csv')

# Drop meaningless rows and columns. A good practice is to drop rows before columns.
dataset = dataset[~np.isnan(dataset['PTID_Key'])] # Drop patients with no ID, since they cannot be used for learning or prediction. [] slices the rows in dataframe.
dataset = dataset.dropna(axis=1, how='all') # Drop empty columns in dataset 

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# These columns are time stamps that do not carry particular meaning, so dropped for now.
badColumns = ['update_stamp_UCSFFSL_02_01_16_UCSFFSL51ALL_08_01_16',
    'update_stamp_UCSFFSX_11_02_15_UCSFFSX51_08_01_16',
    'update_stamp_UCBERKELEYAV45_10_17_16',
    'update_stamp_DTIROI_04_30_14']
dataset = dataset.drop(badColumns,axis=1) # Remove this section from objDataset

In [8]:
 # Unfortunately, some numerical columns contain non-numerical character such as '>' in  '>1300'.
# The strategy is to convert these columns to floats by extracting only numbers. For example, '>1300' goes to 1300
columnsObjToNum = ['ABETA_UPENNBIOMK9_04_19_17','TAU_UPENNBIOMK9_04_19_17','PTAU_UPENNBIOMK9_04_19_17','COMMENT_UPENNBIOMK9_04_19_17']
for column in columnsObjToNum:
    colIdx = dataset.columns.get_loc(column)
    rowIdx = np.where(dataset[column].apply(type).values == str)[0] # Find all str type elements in each column, which may or may not contain non-numerical characters such as '<' or '>'.
    for row in rowIdx: # iterate through each row of string type element in the column
        dataset.iloc[row,colIdx] = float(re.sub("[^0-9.]","",dataset[column].values[row])) # Find the float/int number in the string, and cast to float type. 
dataset[columnsObjToNum] = dataset[columnsObjToNum].astype(float) # cast each column to float type

In [9]:
# Convert date columns to date format in dataset, since they are currently imported as object columns
for column in dataset: # variable 'column' is a string
    if dataset[column].dtype == 'object' and dataset[column].str.match('[0-9]+/[0-9]+/[0-9]+').sum() > 0: # returns true if this column contains at least one string that matches date format.
        dataset[column] = pd.to_datetime(dataset[column],format="%m/%d/%y",errors='coerce') # convert string to date

In [17]:
# Some numerical columns contain only one/few possible values, which are more likely to be categorical than numerical features.
# As a result, such columns are converted to objective dtype. e.g. some column with only -4 and nan.
columnsNumToCat = []
for column in dataset:
    psbVal = dataset[column].unique()
    if psbVal.dtype == 'float64' and psbVal.size <= 20:
        columnsNumToCat.append(column)
        validRowIdx = dataset[column].notnull()
        dataset[column].loc[validRowIdx] = dataset[column].loc[validRowIdx].astype(str)

In [20]:
# Tally the data types of all data columns, and then separate them according to dtype.
dtypeCounts = dataset.dtypes.value_counts(); # Count the number of columns for each data type. Turns out to be only 'float64' and 'object'.
numDataset = dataset.select_dtypes(include=['float'])
objDataset = dataset.select_dtypes(include=['object'])
dateDataset = dataset.select_dtypes(include=['datetime64']) # select dates from objDataset for variable dateDataset

In [21]:
# Imputing missing data in numDataset
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values = 'NaN', strategy = 'mean',axis = 0) # impute numerical columns
imp = imp.fit(numDataset)
numX = imp.transform(numDataset) # Extract data from numData as numX in dtype ndarray
#numAttributes = numDataset.columns.values # Names of columns in numX. Executed after imputation since all NaN columns are dropped.

In [22]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
objDataset.loc[-1,:] = np.repeat(np.nan,objDataset.shape[1]) # Append one NaN to the end of each column so that NaN must be a class for each column.
objDataset = objDataset.fillna(value=' ') # LabelEncoder does not work with NaN, so NaN is converted to a space ' ', which is always sorted as the first class by LabelEncoder.
le = LabelEncoder()
catClasses = np.array([]) # initiate an empty list of attribute names
catAttributes = np.array([]) # initiate an empty list of attribute names
for i in range(objDataset.shape[1]):
    objDataset.iloc[:,i] = le.fit_transform(objDataset.iloc[:,i]) # encode column i
    classes = le.classes_ # All the labels in column i, including ' ', which was translated from NaN
    classes[0] = 'NaN' # Replace ' ' with the attribute of the column
    catClasses = np.append(catClasses,classes) # Append column attribute followed by all its labels to catAttribute
    catAttributes = np.append(catAttributes, np.repeat(objDataset.columns.values[i],classes.size))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
