# Import basic packages (e.g., numpy and pandas)

In [257]:
import numpy as np
import pandas as pd

# Read in and prepare the data set 

In [258]:
# Names of the informative columns to keep, in the order they should appear
col = [
    # identifiers and demographics
    'DAY', 'patientid', 'Gender', 'Age', 'Marital Status', 'Education (Numeric)',
    # ethnicity indicators
    'Hispanic', 'Caucasian', 'Black', 'Asian', 'NHPI', 'Native American', 'Multi', 'Others',
    'Household', 'Income',
    # morning check-in
    'AM Stress', 'AM Hunger', 'AM Sleep', 'AM Sleep hours',
    'AM Weight',
    'Percent Weight change (from prev week)',
    'Percent Weight change (from prev day)',
    # afternoon and evening check-ins
    'PM Stress',
    'EVE Stress',
    'Number of Episodes Previous Day',
    # last entry of the selection
    'Episode',
]
# Load the check-in file
data = pd.read_csv(
    "/Users/KVTran/Documents/Research/WPI/Slip_Buddy/REU_2017/data/questions.csv"
)
# List every column name of the raw file (only displayed in an interactive session)
list(data)
# Keep just the selected columns
data = data[col]

# Convert data to numerical

In [259]:
# Convert the categorical variables into numerical ones
# Use the fit_transform() method from sklearn.preprocessing.LabelEncoder()
# Notice that for binary data (Hispanic, Episode) the answer is Yes/No. Hence, we subtract 1 
from sklearn import preprocessing
# Module-level encoder used by toNumeric(); fit_transform() is called on it once per column
le = preprocessing.LabelEncoder()
def toNumeric(data, nominalVariables):
    """
    Replace each listed column of the data dataframe with integer codes

    data: the dataframe to modify; the encoded values are written back into it
    nominalVariables: list of the names of the columns to encode

    Every column is passed to fit_transform() of the module-level encoder le.
    For the "Hispanic" and "Episode" columns the resulting codes are shifted down by 1.
    Return nothing
    """
    shiftedColumns = ("Hispanic", "Episode")
    for column in nominalVariables:
        codes = le.fit_transform(data[column])
        # the two Yes/No columns get their codes lowered by one
        data[column] = codes - 1 if column in shiftedColumns else codes
        
# Columns that are treated as categorical and handed to toNumeric()
nominalVariables = ["Gender", "Marital Status","Hispanic", "Episode"]
# Encode those columns; toNumeric() writes the result back into data
toNumeric(data, nominalVariables)

# Find out which columns contain NaN

In [260]:
# Check for NaN values in every columns
def checkNaN(data, col):  
    """
    This function goes through the given columns and checks each one for NaN

    data: the dataframe to inspect (it is not modified)
    col: iterable of the names of the columns to check

    Return the list of the names of the columns that contain at least one NaN,
    in the same order as they appear in col.
    If no column contains NaN, print "no null values found" and return an empty
    list, so the result can always be iterated over.
    """
    colWithNaN = [name for name in col if data[name].isnull().any()]
    if not colWithNaN:
        print("no null values found")
    return colWithNaN
    
    
# Collect the names of the selected columns that contain NaN
colWithNaN = checkNaN(data, col)
# colWithNaN

# Replace NaN with appropriate values

In [261]:
# Remove the row whose index label is the last one in the dataframe.
# NOTE(review): the original note says this row "was added in" while the data was being
# converted to numerical, but nothing in this file shows a row being added -- confirm
# against the CSV that the last row is not a real observation.
data = data.drop(data.index[-1])

#
def replaceNaN(data, col):
    """
    Fill the NaN values of the data dataframe with per-patient means

    data: the dataframe to modify; it must have a 'patientid' column
    col: iterable of the names of the columns to examine

    For each patientid and each column that contains NaN, form the data block of
    that patient's rows in that column and compute its mean (NaN are ignored).
    The mean is rounded to the nearest integer, except for the two
    'Percent Weight change' columns, which keep the exact mean.
    The NaN cells of the block are then replaced with that value.
    If a patient has no value at all in a column, there is nothing to average
    and those cells are left as NaN.
    Return nothing
    """
    # columns whose mean must not be rounded
    unroundedCols = ('Percent Weight change (from prev week)',
                     'Percent Weight change (from prev day)')
    # checkNaN may give back None when nothing is missing; treat that as "no columns"
    colWithNaN = checkNaN(data, col) or []
    if not colWithNaN:
        return
    # rows without a patientid cannot be matched to any patient, so skip that key
    for i in data["patientid"].dropna().unique():
        patientRows = data["patientid"] == i
        for j in colWithNaN:
            meanVal = data.loc[patientRows, j].mean()
            # every value of this patient is missing: round() would fail on NaN
            if np.isnan(meanVal):
                continue
            if j not in unroundedCols:
                meanVal = round(meanVal)
            # Notice: using .loc[,] to assign value to the df itself instead of to a copy
            # only the NaN spots of this patient's block are overwritten
            data.loc[patientRows & data[j].isnull(), j] = meanVal

# Fill the NaN values of the selected columns; replaceNaN() modifies data directly
replaceNaN(data, col)

# Export the final data set

In [296]:
# Write the cleaned dataframe to CSV; index = False leaves the row index out of the file
data.to_csv("/Users/KVTran/Documents/Research/WPI/Slip_Buddy/REU_2017/data/questions_cleaned.csv", index = False)

In [297]:
# (number of rows, number of columns) of the cleaned dataframe; shown only in an interactive session
data.shape

(1151, 27)