### 1. Loading data for training models

- The original data was pre-processed with MySQL to extract the features.
- Instead of MySQL, pymssql or pandas (in parallel) could also be used.

In [1]:
import csv
import re

# load the training data, separating features and labels
def load_data(filepath):
    """Read a labelled CSV file into feature rows and integer labels.

    The first row is treated as the header. For every following non-blank
    row, columns 2-8 are parsed as integer features and column 9 as the
    integer label. Commas inside a field (thousands separators such as
    "1,900") are stripped before conversion.

    Args:
        filepath: path of the CSV file to read.

    Returns:
        A ``(data, label, headers)`` tuple: ``data`` is a list of integer
        feature lists, ``label`` the matching list of integer labels, and
        ``headers`` the raw header row as a list of strings.

    Raises:
        ValueError: if a feature or label field is not an integer.
        IndexError: if a non-blank row has fewer than ten columns.
    """
    data = []
    label = []
    with open(filepath) as f:
        f_csv = csv.reader(f)
        headers = next(f_csv)
        for row in f_csv:
            # csv.reader yields [] for a blank line (e.g. a trailing newline
            # at the end of the file); indexing it would raise IndexError.
            if not row:
                continue
            data.append([int(item.replace(',', '')) for item in row[2:9]])
            label.append(int(row[9]))
    return data, label, headers

# loading final data set for prediction, return data and ID
def load_data_testfinal(filepath):
    """Read the CSV file to predict on into feature rows and row IDs.

    The first row is treated as the header and discarded. For every
    following non-blank row, column 0 is parsed as the integer ID and
    columns 2-8 as integer features. Commas inside a field (thousands
    separators) are stripped before conversion.

    Args:
        filepath: path of the CSV file to read.

    Returns:
        A ``(data, ID)`` tuple: ``data`` is a list of integer feature lists
        and ``ID`` the list of integer IDs, in the same row order.

    Raises:
        ValueError: if an ID or feature field is not an integer.
    """
    ID = []
    data = []
    with open(filepath) as f:
        f_csv = csv.reader(f)
        next(f_csv)  # skip the header row; it is not returned
        for row in f_csv:
            # csv.reader yields [] for a blank line (e.g. a trailing newline
            # at the end of the file); row[0] would raise IndexError.
            if not row:
                continue
            ID.append(int(row[0].replace(',', '')))
            data.append([int(item.replace(',', '')) for item in row[2:9]])
    return data, ID

In [2]:
# Training split: feature rows, labels and the CSV header row.
train_data, train_label, train_headers = load_data('./data/train/TRAIN.csv')
# DEV1 split: the cells below score every model against test_data/test_label.
test_data, test_label, test_headers = load_data('./data/train/DEV1.csv')

In [3]:
# Show which CSV columns feed the models, followed by the second parsed
# feature row as a sample. Parenthesised single-argument print calls behave
# the same on Python 2 and Python 3.
print("The features which have been used for training models are: ")
print(train_headers[2:9])
print(train_data[1])

The features which have been used for training models are: 
['year_of_birth', 'state_code', 'trans_all', 'trans_Lipids', 'trans_Hypertension', 'trans_Depression', 'trans_Diabetes']
[1900, 2, 92, 12, 46, 0, 6]



Comments for potential improvement:
- Extract more features to describe each instance in more detail and improve the performance of the model.
- Add data augmentation to increase the size of the training set and reduce overfitting.

### 2. The evaluation of potential base models
- Models used: Naive Bayes, KNN, Logistic Regression, Decision Tree.
- Algorithm tuning: grid search.
- Measurement: AUC and accuracy.

In [4]:
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score

In [5]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

# Fit on the training split, then predict hard labels for the held-out split.
model.fit(train_data, train_label)
predictions = list(model.predict(test_data))

# Keep only column 1 of predict_proba as the ranking score for AUC
# (presumably the probability of label 1 -- confirm against model.classes_).
proba = model.predict_proba(test_data)
probabilities = [p[1] for p in proba]

# calculate auc
auc = roc_auc_score(test_label, probabilities)

# %-formatting keeps the printed text identical on Python 2 while also
# being valid Python 3.
print(model)
print('AUC: %s' % auc)
print("Accuracy: %s" % accuracy_score(test_label, predictions))
# Per-class breakdown, disabled by default:
# print(classification_report(test_label, predictions))

GaussianNB()
AUC: 0.904442768446
Accuracy: 0.918


In [6]:
from sklearn.neighbors import KNeighborsClassifier

def evaluation_metric_KNN(train_data,train_label,test_data,test_label):
    """Fit k-NN classifiers with several leaf sizes and print their scores.

    One KNeighborsClassifier is built per candidate ``leaf_size``; each is
    fitted on the training split and scored on the test split. For every
    model the estimator repr, AUC and accuracy are printed.

    Args:
        train_data: feature rows used for fitting.
        train_label: labels matching ``train_data``.
        test_data: feature rows used for scoring.
        test_label: labels matching ``test_data``.

    Returns:
        None. Results are printed only.
    """
    # NOTE(review): the scikit-learn docs describe leaf_size as a BallTree /
    # KDTree construction parameter affecting speed and memory; n_neighbors
    # or weights may be the more informative parameters to sweep -- confirm.
    leaf_sizes = [10, 30, 50]
    models = [KNeighborsClassifier(leaf_size=size) for size in leaf_sizes]

    for model in models:
        # Fit, then predict hard labels for the accuracy score.
        model.fit(train_data, train_label)
        predictions = list(model.predict(test_data))

        # Column 1 of predict_proba is used as the ranking score for AUC.
        probabilities = [p[1] for p in model.predict_proba(test_data)]
        auc = roc_auc_score(test_label, probabilities)

        # Portable print calls: same text on Python 2, valid on Python 3.
        print(model)
        print('AUC: %s' % auc)
        print("Accuracy: %s" % accuracy_score(test_label, predictions))
        # print(classification_report(test_label, predictions))
        print('')

evaluation_metric_KNN(train_data,train_label,test_data,test_label)

KNeighborsClassifier(algorithm='auto', leaf_size=10, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
AUC: 0.943176414249
Accuracy: 0.9335

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
AUC: 0.943395832881
Accuracy: 0.9335

KNeighborsClassifier(algorithm='auto', leaf_size=50, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
AUC: 0.942795143089
Accuracy: 0.9335



In [7]:
from sklearn.linear_model import LogisticRegression

def evaluation_metric_LR(train_data,train_label,test_data,test_label):
    """Fit logistic-regression models over several tolerances and print scores.

    One LogisticRegression (C fixed at 0.1, L2 penalty) is built per
    candidate stopping tolerance ``tol``; each is fitted on the training
    split and scored on the test split. For every model the estimator repr,
    AUC and accuracy are printed.

    Args:
        train_data: feature rows used for fitting.
        train_label: labels matching ``train_data``.
        test_data: feature rows used for scoring.
        test_label: labels matching ``test_data``.

    Returns:
        None. Results are printed only.
    """
    # The swept value is passed as `tol`; the regularisation strength C
    # stays fixed at 0.1.
    tol_to_test = [0.0001, 0.001]

    models = [
        LogisticRegression(C=0.1, intercept_scaling=1, dual=False,
                           fit_intercept=True, penalty='l2', tol=tol)
        for tol in tol_to_test
    ]

    for model in models:
        # Fit, then predict hard labels for the accuracy score.
        model.fit(train_data, train_label)
        predictions = list(model.predict(test_data))

        # Column 1 of predict_proba is used as the ranking score for AUC.
        probabilities = [p[1] for p in model.predict_proba(test_data)]
        auc = roc_auc_score(test_label, probabilities)

        # Portable print calls: same text on Python 2, valid on Python 3.
        print(model)
        print('AUC: %s' % auc)
        print("Accuracy: %s" % accuracy_score(test_label, predictions))
        # print(classification_report(test_label, predictions))
        print('')

evaluation_metric_LR(train_data,train_label,test_data,test_label)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
AUC: 0.955553126841
Accuracy: 0.9225

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.001,
          verbose=0, warm_start=False)
AUC: 0.954630400577
Accuracy: 0.9235



In [8]:
from sklearn.tree import DecisionTreeClassifier

def evaluation_metric_tree(train_data,train_label,test_data,test_label,min_auc=0.963):
    """Fit decision trees over several random seeds and print the best ones.

    One DecisionTreeClassifier (max_depth=5, max_features=5,
    min_samples_split=10, min_samples_leaf=3) is built per candidate
    ``random_state``; each is fitted on the training split and scored on the
    test split. Only models whose AUC exceeds ``min_auc`` are printed.

    Args:
        train_data: feature rows used for fitting.
        train_label: labels matching ``train_data``.
        test_data: feature rows used for scoring.
        test_label: labels matching ``test_data``.
        min_auc: models with AUC at or below this value are not reported.
            Defaults to 0.963.

    Returns:
        None. Results are printed only.
    """
    # Only the seed is swept. A wider grid is left disabled here:
    # max_depth in [5, 6], max_features in [5, 6],
    # min_samples_leaf in [1, 2, 3], min_samples_split in [7, 10].
    s_to_test = [9334, 12343, 6217, 7307]

    models = [
        DecisionTreeClassifier(max_depth=5, max_features=5,
                               min_samples_split=10, min_samples_leaf=3,
                               random_state=seed)
        for seed in s_to_test
    ]

    for model in models:
        # Fit, then predict hard labels for the accuracy score.
        model.fit(train_data, train_label)
        predictions = list(model.predict(test_data))

        # Column 1 of predict_proba is used as the ranking score for AUC.
        probabilities = [p[1] for p in model.predict_proba(test_data)]
        auc = roc_auc_score(test_label, probabilities)

        # Report only the seeds that clear the AUC cutoff.
        if auc > min_auc:
            # Portable print calls: same text on Python 2, valid on Python 3.
            print(model)
            print('AUC: %s' % auc)
            print("Accuracy: %s" % accuracy_score(test_label, predictions))
            # print(classification_report(test_label, predictions))
            print('')

evaluation_metric_tree(train_data,train_label,test_data,test_label)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=5, max_leaf_nodes=None, min_samples_leaf=3,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            presort=False, random_state=9334, splitter='best')
AUC: 0.963508929421
Accuracy: 0.9445

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=5, max_leaf_nodes=None, min_samples_leaf=3,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            presort=False, random_state=12343, splitter='best')
AUC: 0.963488072136
Accuracy: 0.9405

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=5, max_leaf_nodes=None, min_samples_leaf=3,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            presort=False, random_state=6217, splitter='best')
AUC: 0.963275327835
Accuracy: 0.9435

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
     

Comments:
- Performance: NB < KNN < LR < Decision Tree.
- NB: Generative classifier; assumes the features are conditionally independent given the class.
- KNN: Performs poorly when the classes are imbalanced.
- LR: Has difficulty handling nonlinear relationships between features.
- Decision Tree: Consider the interaction between features, able to handle nonlinear relationships between features.

### 3. Ensemble methods
- Boosting: Gradient boosting.
- Stacking: simple voting.

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

def evaluation_metric_GBDT(train_data,train_label,test_data,test_label,min_auc=0.96):
    """Fit gradient-boosting models over several seeds and print the best ones.

    One GradientBoostingClassifier (n_estimators=100, learning_rate=1.0,
    max_depth=5, max_features=5, min_samples_split=10, subsample=0.8) is
    built per candidate ``random_state``; each is fitted on the training
    split and scored on the test split. Only models whose AUC exceeds
    ``min_auc`` are printed.

    Args:
        train_data: feature rows used for fitting.
        train_label: labels matching ``train_data``.
        test_data: feature rows used for scoring.
        test_label: labels matching ``test_data``.
        min_auc: models with AUC at or below this value are not reported.
            Defaults to 0.96.

    Returns:
        None. Results are printed only.
    """
    # Only the seed is swept. A disabled variant of this search covers
    # every random_state in range(100, 300).
    s_to_test = [124, 179]

    models = [
        GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                   max_depth=5, max_features=5,
                                   min_samples_split=10, subsample=0.8,
                                   random_state=seed)
        for seed in s_to_test
    ]

    for model in models:
        # Fit, then predict hard labels for the accuracy score.
        model.fit(train_data, train_label)
        predictions = list(model.predict(test_data))

        # Column 1 of predict_proba is used as the ranking score for AUC.
        probabilities = [p[1] for p in model.predict_proba(test_data)]
        auc = roc_auc_score(test_label, probabilities)

        # Report only the seeds that clear the AUC cutoff.
        if auc > min_auc:
            # Portable print calls: same text on Python 2, valid on Python 3.
            print(model)
            print('AUC: %s' % auc)
            print("Accuracy: %s" % accuracy_score(test_label, predictions))
            # print(classification_report(test_label, predictions))
            print('')

evaluation_metric_GBDT(train_data,train_label,test_data,test_label)

GradientBoostingClassifier(init=None, learning_rate=1.0, loss='deviance',
              max_depth=5, max_features=5, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=10,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=124, subsample=0.8, verbose=0,
              warm_start=False)
AUC: 0.962204097706
Accuracy: 0.9425

GradientBoostingClassifier(init=None, learning_rate=1.0, loss='deviance',
              max_depth=5, max_features=5, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=10,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=179, subsample=0.8, verbose=0,
              warm_start=False)
AUC: 0.961936290173
Accuracy: 0.9445



In [10]:
from sklearn.ensemble import VotingClassifier

# Four depth-5 decision trees that differ in seed and leaf/split minimums.
model1 = DecisionTreeClassifier(max_depth=5, max_features=5, min_samples_leaf=2, min_samples_split=7, random_state=9334)
model2 = DecisionTreeClassifier(max_depth=5, max_features=5, min_samples_leaf=3, min_samples_split=10, random_state=12343)
model3 = DecisionTreeClassifier(max_depth=5, max_features=5, min_samples_leaf=2, min_samples_split=7, random_state=6217)
model4 = DecisionTreeClassifier(max_depth=5, max_features=5, min_samples_leaf=3, min_samples_split=10, random_state=7307)
estimators = [
    ('tree9334', model1),
    ('tree12343', model2),
    ('tree6217', model3),
    ('tree7307', model4),
]

# create the ensemble model (soft voting over the four trees)
ensemble = VotingClassifier(estimators, voting='soft')

# Fit on the training split, then predict hard labels for the held-out split.
ensemble.fit(train_data, train_label)
predictions = list(ensemble.predict(test_data))
print(ensemble)

# Column 1 of predict_proba is used as the ranking score for AUC.
proba = ensemble.predict_proba(test_data)
probabilities = [p[1] for p in proba]

# calculate auc
auc = roc_auc_score(test_label, probabilities)

# Portable print calls: same text on Python 2, valid on Python 3.
print('AUC: %s' % auc)
print("Accuracy: %s" % accuracy_score(test_label, predictions))
print('')

VotingClassifier(estimators=[('tree9334', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=5, max_leaf_nodes=None, min_samples_leaf=2,
            min_samples_split=7, min_weight_fraction_leaf=0.0,
            presort=False, random_state=9334, splitter='best')), ('tre...t=10, min_weight_fraction_leaf=0.0,
            presort=False, random_state=7307, splitter='best'))],
         voting='soft', weights=None)
AUC: 0.964480044585
Accuracy: 0.9455



Comments:
- Performance: Boosting < Stacking.
- Boosting: Iterative sampling that targets "hard" instances. However, its effect here is limited.
- Stacking: Multiple models.

### 4. Final prediction
The final model is the stacking method above.

In [11]:
# Final training set: feature rows, labels and the CSV header row.
train_final_data, train_final_label, train_final_headers = load_data('./data/train/TRAIN_Final.csv')
# Final set to predict on: feature rows plus the ID parsed from each row.
test_final_data, test_final_ID = load_data_testfinal('./data/train/TEST_Final.csv')

In [12]:
# `model` is the same object as `ensemble`, so refitting here replaces the
# state fitted during evaluation.
model = ensemble
# Train on the final training set (TRAIN_Final.csv) loaded in the previous
# cell rather than on the development training split.
model.fit(train_final_data, train_final_label)
# Column 1 of predict_proba is the score written to the submission file.
proba = model.predict_proba(test_final_data)
probabilities = [p[1] for p in proba]

In [13]:
import sys

headers = ['Patient_ID','Diabetes']
# test_final_ID and probabilities are both in TEST_Final.csv row order, so
# pairing them positionally yields one (ID, score) tuple per row.
rows = list(zip(test_final_ID, probabilities))

# The csv module needs newline translation disabled on the output file,
# otherwise rows end in "\r\r\n" on Windows. Python 3 spells that
# newline=''; Python 2 uses binary mode.
if sys.version_info[0] >= 3:
    out_file = open("./data/prediction/preds.csv", "w", newline='')
else:
    out_file = open("./data/prediction/preds.csv", "wb")

with out_file as f:
    preds_csv = csv.writer(f)
    preds_csv.writerow(headers)
    preds_csv.writerows(rows)
print("Wrote predictions.")

Wrote predictions.
