# Data Preprocessing

In [9]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler, LabelEncoder
import pickle

# get cleaned data
file = open('../data/data_cleaned.save', 'rb')
df, X, y, subject_id = pickle.load(file)
file.close()


# create preprocessor
ordinal_ftrs = ['EDUC']
ordinal_cats = [list(range(1, 24))]

onehot_ftrs = ['Gender', 'SES']
minmax_ftrs = ['Age', 'MMSE', 'nWBV']
std_ftrs = ['eTIV', 'ASF']

# collect all the encoders
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', OrdinalEncoder(categories = ordinal_cats), ordinal_ftrs),
        ('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'), onehot_ftrs),
        ('minmax', MinMaxScaler(), minmax_ftrs),
        ('std', StandardScaler(), std_ftrs)])


file = open('../data/preprocessor.save', 'wb')
pickle.dump((preprocessor),file)
file.close()

In [13]:
from sklearn.model_selection import GroupShuffleSplit, GroupKFold#, StratifiedGroupKFold
from sklearn.pipeline import Pipeline

random_state = 5

# create split objects for initial GroupShuffleSplit and subsequent GroupKFold, random state to ensure reproducibility
gss = GroupShuffleSplit(n_splits=1, test_size=.2, random_state=random_state)
group_kfold_train = GroupKFold(n_splits=4)


# separate test set from other sets    
for i_other,i_test in gss.split(X, y, subject_id):
    X_other, y_other, groups_other = X.iloc[i_other], y.iloc[i_other], subject_id.iloc[i_other]
    X_test, y_test, groups_test = X.iloc[i_test], y.iloc[i_test], subject_id.iloc[i_test]

# check to see if each target class is represented in test
print(y_test.value_counts())
print(y_other.value_counts())

# do GroupKFolds split for training and validation data
for train_idx, val_idx in group_kfold_train.split(X_other, y_other, groups_other):
    X_train = X.iloc[train_idx, :]
    X_val = X.iloc[val_idx, :]
    y_train = y.iloc[train_idx]
    y_val = y.iloc[val_idx]

    # check to see if each target class is represented in train and val
    print("train:\n", y_train.value_counts())
    print("val:\n", y_val.value_counts())

    # preprocess the data
    clf = Pipeline(steps=[('preprocessor', preprocessor)])
    X_train_prep = clf.fit_transform(X_train)
    X_val_prep = clf.transform(X_val)
    X_test_prep = clf.transform(X_test)

#         #check shapes
#         print(X_train.shape)
#         print(X_train_prep.shape)
#         display(pd.DataFrame(X_train))
#         display(pd.DataFrame(X_train_prep))


#     file = open('../data/data_preprocessed.save', 'wb')
#     pickle.dump((X_train_prep, X_val_prep, X_test_prep, y_train, y_val, y_test),file)
#     file.close()

# # check distribution of data into sets
# print("train proportion: ", X_train_prep.shape[0]/X.shape[0])
# print("val proportion: ", X_val_prep.shape[0]/X.shape[0])
# print("test proportion: ", X_test_prep.shape[0]/X.shape[0])

df_train = pd.DataFrame(data = X_train_prep)
df_val = pd.DataFrame(data = X_val_prep)

# frac_missing = sum(df_train.isnull().sum(axis=1)!=0)/df_train.shape[0]
# frac_missing

# examine how much data is missing from the dataset
perc_missing_per_ftr = df_val.isnull().sum(axis=0)/df_val.shape[0]
print('fraction of missing values in features:')
print(perc_missing_per_ftr[perc_missing_per_ftr > 0])
frac_missing = sum(df_val.isnull().sum(axis=1)!=0)/df_val.shape[0]
print('fraction of points with missing values:',frac_missing)

0.0    31
0.5    30
1.0    11
Name: CDR, dtype: int64
0.0    175
0.5     93
1.0     28
Name: CDR, dtype: int64
train:
 0.0    122
0.5     77
1.0     23
Name: CDR, dtype: int64
val:
 0.0    44
0.5    24
1.0     6
Name: CDR, dtype: int64
train:
 0.0    127
0.5     76
1.0     19
Name: CDR, dtype: int64
val:
 0.0    39
0.5    25
1.0    10
Name: CDR, dtype: int64
train:
 0.0    133
0.5     68
1.0     21
Name: CDR, dtype: int64
val:
 0.5    33
0.0    33
1.0     8
Name: CDR, dtype: int64
train:
 0.0    116
0.5     82
1.0     24
Name: CDR, dtype: int64
val:
 0.0    50
0.5    19
1.0     5
Name: CDR, dtype: int64
fraction of missing values in features:
Series([], dtype: float64)
fraction of points with missing values: 0.0


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
y_train_prep = le.fit_transform(y_train)

log_reg = LogisticRegression(solver='saga', random_state=5, max_iter = 2000)

log_reg.fit(X_train_prep, y_train_prep)

LogisticRegression(max_iter=2000, random_state=5, solver='saga')