In [59]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
def create_model(optimizer='rmsprop', init='glorot_uniform'):
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=4, kernel_initializer=init, activation='relu'))
    model.add(Dense(8, kernel_initializer=init, activation='relu'))
    model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [4]:
# fix random seed for reproducibility
seed = 117
np.random.seed(seed)

In [78]:
dataset = pd.read_csv('../data/400k_TRAIN_All_features.csv')
dataset.head()

Unnamed: 0,source,sink,common_neighbors,num_cnb,t1,t2,t3,t4,t1BYnum_cnb,t2BYnum_cnb,...,source_in_degree,source_out_degree,sink_in_degree,sink_out_degree,source_outBYin_ratio,source_inBYout_ratio,sink_outBYin_ratio,sink_inBYout_ratio,AAprediction,JAprediction
0,1709040,1695026,"[4575620, 3709961, 3503374, 4332049, 4761107, ...",39,38,0,38,0,0.974359,0.0,...,640,35279,52,0,55.123438,0.018141,0.0,0.0,3.669171,0.001105
1,20388,4700562,"[3553485, 2726331, 3550837]",3,3,0,3,0,1.0,0.0,...,4841,761793,4,0,157.362735,0.006355,0.0,0.0,0.262151,4e-06
2,4067692,2495732,[],0,0,0,0,0,0.0,0.0,...,109,10458,1,0,95.944954,0.010423,0.0,0.0,0.0,0.0
3,2912811,345595,[],0,0,0,0,0,0.0,0.0,...,83,4829,3,0,58.180723,0.017188,0.0,0.0,0.0,0.0
4,3077895,3951788,"[3637241, 3376555, 3149884]",3,3,0,3,0,1.0,0.0,...,355,1149,5,0,3.23662,0.308964,0.0,0.0,0.421203,0.002562


In [22]:
test = pd.read_csv('../data/2k_TEST_All_features.csv')

In [23]:
testFM = test.drop(['source', 'sink', 'common_neighbors','num_cnb', 't1', 't2', 't3', 't4',
       't1BYnum_cnb', 't2BYnum_cnb', 't3BYnum_cnb', 't4BYnum_cnb','source_in_degree', 'source_out_degree','sink_out_degree', 'source_inBYout_ratio', 'sink_outBYin_ratio'], axis=1)

In [24]:
testFM.head()

Unnamed: 0,Id,sink_in_degree,source_outBYin_ratio,sink_inBYout_ratio,AAprediction,JAprediction
0,1,3,0.813725,0.0,0.0,0.0
1,2,289,8.717949,0.0,0.407705,0.00626
2,3,2,16.0,0.0,0.0,0.0
3,4,24,5.25,1.714286,1.238898,0.0625
4,5,165,8.9,2.844828,0.802812,0.012072


In [25]:
testFM['sink_in_degree'] = np.log(testFM['sink_in_degree']+1)
testFM['source_outBYin_ratio'] = np.log(testFM['sink_in_degree']+1)
testFM.head()

Unnamed: 0,Id,sink_in_degree,source_outBYin_ratio,sink_inBYout_ratio,AAprediction,JAprediction
0,1,1.386294,0.869742,0.0,0.0,0.0
1,2,5.669881,1.897602,0.0,0.407705,0.00626
2,3,1.098612,0.741276,0.0,0.0,0.0
3,4,3.218876,1.439569,1.714286,1.238898,0.0625
4,5,5.111988,1.810252,2.844828,0.802812,0.012072


In [63]:
dataset = dataset[(dataset['source_out_degree']<500)|(dataset['source_out_degree']>20000)]

In [79]:
dataset1 = dataset[(dataset['source_out_degree']<500)&(dataset['exist_in_training_dataset']==0)].sample(16000)

In [80]:
dataset2 = dataset[(dataset['source_out_degree']<500)&(dataset['exist_in_training_dataset']==1)].sample(16000)

In [81]:
dataset3 = dataset[(dataset['source_out_degree']>500)&(dataset['exist_in_training_dataset']==0)].sample(300)

In [82]:
dataset4 = dataset[(dataset['source_out_degree']>500)&(dataset['exist_in_training_dataset']==1)].sample(300)

In [83]:
dataset = pd.concat(objs=[dataset1,dataset2,dataset3,dataset4], axis=0)
dataset.head()

Unnamed: 0,source,sink,common_neighbors,num_cnb,t1,t2,t3,t4,t1BYnum_cnb,t2BYnum_cnb,...,source_in_degree,source_out_degree,sink_in_degree,sink_out_degree,source_outBYin_ratio,source_inBYout_ratio,sink_outBYin_ratio,sink_inBYout_ratio,AAprediction,JAprediction
243741,4428300,1146716,[],0,0,0,0,0,0.0,0.0,...,30,118,1,0,3.933333,0.254237,0.0,0.0,0.0,0.0
225743,279424,4052835,[],0,0,0,0,0,0.0,0.0,...,10,52,61,0,5.2,0.192308,0.0,0.0,0.0,0.0
238440,1524739,1782435,[],0,0,0,0,0,0.0,0.0,...,5,28,2,0,5.6,0.178571,0.0,0.0,0.0,0.0
213554,4031191,1232988,[],0,0,0,0,0,0.0,0.0,...,18,125,5,0,6.944444,0.144,0.0,0.0,0.0,0.0
382918,1908990,3786750,[],0,0,0,0,0,0.0,0.0,...,2,59,1,0,29.5,0.033898,0.0,0.0,0.0,0.0


In [51]:
dataset.columns

Index(['source', 'sink', 'common_neighbors', 'num_cnb', 't1', 't2', 't3', 't4',
       't1BYnum_cnb', 't2BYnum_cnb', 't3BYnum_cnb', 't4BYnum_cnb',
       'exist_in_training_dataset', 'source_in_degree', 'source_out_degree',
       'sink_in_degree', 'sink_out_degree', 'source_outBYin_ratio',
       'source_inBYout_ratio', 'sink_outBYin_ratio', 'sink_inBYout_ratio',
       'AAprediction', 'JAprediction'],
      dtype='object')

In [84]:
trainFM = dataset.drop(['source', 'sink', 'common_neighbors','num_cnb', 't1', 't2', 't3', 't4',
       't1BYnum_cnb', 't2BYnum_cnb', 't3BYnum_cnb', 't4BYnum_cnb','source_in_degree', 'source_out_degree','sink_out_degree', 'source_inBYout_ratio', 'sink_outBYin_ratio'], axis=1)

In [85]:
trainFM.head()

Unnamed: 0,exist_in_training_dataset,sink_in_degree,source_outBYin_ratio,sink_inBYout_ratio,AAprediction,JAprediction
243741,0,1,3.933333,0.0,0.0,0.0
225743,0,61,5.2,0.0,0.0,0.0
238440,0,2,5.6,0.0,0.0,0.0
213554,0,5,6.944444,0.0,0.0,0.0
382918,0,1,29.5,0.0,0.0,0.0


In [54]:
trainFM['sink_in_degree'] = np.log(trainFM['sink_in_degree']+1)

In [55]:
trainFM['source_outBYin_ratio'] = np.log(trainFM['sink_in_degree']+1)
trainFM.head()

Unnamed: 0,exist_in_training_dataset,sink_in_degree,source_outBYin_ratio,sink_inBYout_ratio,AAprediction,JAprediction
344683,0,1.098612,0.741276,0.0,0.0,0.0
308116,0,1.386294,0.869742,0.0,0.0,0.0
239593,0,0.693147,0.526589,0.0,0.0,0.0
265912,0,0.693147,0.526589,0.0,0.0,0.0
365872,0,0.693147,0.526589,0.0,0.0,0.0


In [None]:
# load pima indians dataset
#dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",")
# split into input (X) and output (Y) variables
X = trainFM.drop(['exist_in_training_dataset','sink_inBYout_ratio'], axis=1)
Y = trainFM['exist_in_training_dataset']
# create model
model = KerasClassifier(build_fn=create_model)
model = Sequential()
model.add(Dense(12, input_dim=4, init='uniform', activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(8, init='uniform', activation='relu'))
model.add(Dropout(0.05))
model.add(Dense(4, init='uniform', activation='tanh'))
model.add(Dense(8, init='uniform', activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(4, init='uniform', activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(1, init='uniform', activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(X, Y, epochs=150, batch_size=10,  verbose=2)
# calculate predictions
predictions = model.predict(testFM.drop(['sink_inBYout_ratio','Id'], axis=1))

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]
  
  app.launch_new_instance()


Epoch 1/150
 - 6s - loss: 0.4585 - acc: 0.8106
Epoch 2/150
 - 5s - loss: 0.3816 - acc: 0.8456
Epoch 3/150
 - 5s - loss: 0.3711 - acc: 0.8503
Epoch 4/150
 - 5s - loss: 0.3660 - acc: 0.8518
Epoch 5/150
 - 5s - loss: 0.3641 - acc: 0.8521
Epoch 6/150
 - 5s - loss: 0.3600 - acc: 0.8533
Epoch 7/150
 - 5s - loss: 0.3602 - acc: 0.8530
Epoch 8/150
 - 5s - loss: 0.3590 - acc: 0.8540
Epoch 9/150
 - 5s - loss: 0.3573 - acc: 0.8535
Epoch 10/150
 - 5s - loss: 0.3560 - acc: 0.8554
Epoch 11/150
 - 5s - loss: 0.3555 - acc: 0.8561
Epoch 12/150
 - 5s - loss: 0.3572 - acc: 0.8552
Epoch 13/150
 - 5s - loss: 0.3580 - acc: 0.8551
Epoch 14/150
 - 5s - loss: 0.3575 - acc: 0.8548
Epoch 15/150
 - 5s - loss: 0.3563 - acc: 0.8540
Epoch 16/150
 - 5s - loss: 0.3525 - acc: 0.8581
Epoch 17/150
 - 5s - loss: 0.3542 - acc: 0.8565
Epoch 18/150
 - 5s - loss: 0.3532 - acc: 0.8559
Epoch 19/150
 - 5s - loss: 0.3538 - acc: 0.8553
Epoch 20/150
 - 5s - loss: 0.3536 - acc: 0.8557
Epoch 21/150
 - 5s - loss: 0.3535 - acc: 0.8558
E