In [16]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn import metrics
import tensorflow as tf 
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.datasets import mnist
from sklearn.model_selection import KFold
from tensorflow.python.keras import backend as K
from keras import optimizers
from sklearn.svm import SVC
from keras.layers import BatchNormalization
import csv


def load_data(filename, skiprows = 1):
    """
    Read a comma-delimited numeric file into a numpy ndarray.

    Inputs:
        filename: file to read, given as a string (passed straight to
            np.loadtxt).
        skiprows: number of leading lines to skip before parsing. The
            default of 1 skips a single leading line, e.g. a header row.

    Outputs:
        The parsed contents of the file, returned as a numpy ndarray.
    """
    parsed = np.loadtxt(filename, delimiter=',', skiprows=skiprows)
    return parsed

In [17]:
# Fixed seed so the row order produced by the shuffle below is the same on
# every Restart & Run All (an unseeded shuffle changes every run).
SHUFFLE_SEED = 42

# load training data
training_data = load_data('train_2008.csv', 1)
# shuffle the rows in place with a private, seeded generator so the global
# numpy random state is left untouched
np.random.RandomState(SHUFFLE_SEED).shuffle(training_data)
# get all of the parameters (every column except the last)
train_data = training_data[:, :-1]
# get all the targets (the last column)
train_target = training_data[:, -1]

# load testing data
# NOTE(review): the column indices computed on train_data below are applied to
# these arrays as-is, so this assumes the test files share the training
# feature-column layout -- confirm.
test_data_08 = load_data('test_2008.csv', 1)
test_data_12 = load_data('test_2012.csv', 1)

# find any column which contains only one value in all its rows (such as interview month/year)
useless_columns = np.all(train_data == train_data[0, :], axis=0)
indices_to_remove = np.flatnonzero(useless_columns).tolist()

# delete those columns from every feature matrix
train_data = np.delete(train_data, indices_to_remove, 1)
test_data_08 = np.delete(test_data_08, indices_to_remove, 1)
test_data_12 = np.delete(test_data_12, indices_to_remove, 1)

In [33]:
# get top k features
num_params = 15
# SelectKBest is used with its default score function; only the per-feature
# scores are read from the fitted selector below.
test = SelectKBest(k=num_params)
fit = test.fit(train_data, train_target)

# Rank the features by descending score. A stable argsort gives every column
# its own position, so two features with an identical score can no longer put
# the same column index into best_indices twice (which is what repeated
# max() + list.index() did, since index() always returns the first match).
# NaN scores sort to the end and are therefore picked last.
scores = np.asarray(fit.scores_, dtype=float)
ranking = np.argsort(-scores, kind='mergesort')

best_indices = ranking[:num_params].tolist()
best_scores = scores[best_indices].tolist()
print(best_scores, best_indices)

[6354.274281844813, 2001.0085936420126, 1769.558765044904, 1740.0016910066265, 1591.5415341456326, 1512.8712783768926, 1367.2313990321109, 1363.064708408233, 1361.5278545849462, 1270.6635529152434, 1208.6145868555109, 979.8610848576337, 925.6815819690952, 874.2389835343976, 841.0199437722187] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


In [34]:
# Keep only the columns listed in best_indices (in that order) in all three
# feature matrices.
# NOTE(review): this rebinds train_data / test_data_08 / test_data_12 to their
# reduced versions, so running the cell a second time indexes the
# already-reduced arrays rather than the originals.
train_data, test_data_08, test_data_12 = (
    matrix[:, best_indices]
    for matrix in (train_data, test_data_08, test_data_12)
)

In [35]:
def _build_model(n_features):
    """
    Build the (uncompiled) feed-forward binary classifier trained in each fold.

    Inputs:
        n_features: number of input columns per sample.

    Outputs:
        A keras Sequential model made of Dense(600) -> Dense(300) ->
        Dense(100) -> Dense(1), each followed by BatchNormalization and an
        activation (tanh, tanh, relu, sigmoid), with Dropout(0.2) after the
        first two activations.
    """
    model = Sequential()
    model.add(Dense(600, input_shape=(n_features,), kernel_initializer='normal'))
    model.add(BatchNormalization())
    model.add(Activation('tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(300))
    model.add(BatchNormalization())
    model.add(Activation('tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(100))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(BatchNormalization())
    model.add(Activation('sigmoid'))
    return model


# 5-fold validation: train one fresh network per fold, score it on the
# held-out fold with ROC AUC, and keep the model with the highest AUC.
kf = KFold(n_splits=5)
models = []
aucs = []

for train_index, test_index in kf.split(train_data):
    X_train, y_train = train_data[train_index], train_target[train_index]
    X_test, y_test = train_data[test_index], train_target[test_index]

    # size the input layer from the data actually fed to the network
    model = _build_model(X_train.shape[1])
    model.compile(loss="binary_crossentropy", optimizer='adam')

    # validation_split holds out the last 20% of this fold's training rows
    # for keras' own per-epoch validation loss; X_test is never seen here
    history = model.fit(X_train, y_train, validation_split=0.2, epochs=10, batch_size=150)

    # predict() returns one column per output unit; flatten to a 1-D score
    # vector, as the prediction-export cells do
    pred = model.predict(X_test).ravel()
    models.append(model)

    false_pos_rate, true_pos_rate, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    fold_auc = metrics.auc(false_pos_rate, true_pos_rate)
    print(fold_auc)
    aucs.append(fold_auc)

# pick the model from the fold with the highest held-out AUC
max_auc = max(aucs)
best_index = aucs.index(max_auc)
best_model = models[best_index]

Train on 41386 samples, validate on 10347 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.7546193859088351
Train on 41386 samples, validate on 10347 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.7402173065802008
Train on 41387 samples, validate on 10347 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.7515264281742701
Train on 41387 samples, validate on 10347 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.7546760302079677
Train on 41387 samples, validate on 10347 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.7651763442834025


In [36]:
# 2008 predictions: score the 2008 test set with the selected model and write
# one "id,target" row per sample, where id is the 0-based row position.
predictions = best_model.predict(test_data_08).flatten()
# newline='' is what the csv module requires of the file object it writes to;
# without it, platforms that translate '\n' (e.g. Windows) end up with a blank
# line after every row.
with open('test_2008_predictions.csv', mode='w', newline='') as test_2008_predictions:
    prediction_writer = csv.writer(test_2008_predictions, delimiter=',')
    prediction_writer.writerow(['id', 'target'])
    prediction_writer.writerows(enumerate(predictions))

In [37]:
# 2012 predictions: score the 2012 test set with the selected model and write
# one "id,target" row per sample, where id is the 0-based row position.
predictions = best_model.predict(test_data_12).flatten()
# newline='' is what the csv module requires of the file object it writes to;
# without it, platforms that translate '\n' (e.g. Windows) end up with a blank
# line after every row.
with open('test_2012_predictions.csv', mode='w', newline='') as test_2012_predictions:
    prediction_writer = csv.writer(test_2012_predictions, delimiter=',')
    prediction_writer.writerow(['id', 'target'])
    prediction_writer.writerows(enumerate(predictions))