In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from datetime import datetime,timedelta
import keras
from keras.models import Sequential,Model
from keras.layers import Dense,Activation,Dropout,Input
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras.optimizers import Adam
from keras.layers.recurrent import LSTM
# from keras_tqdm import TQDMNotebookCallback
# from ipywidgets import interact

from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
output_notebook()

% matplotlib inline

print("All dependencies imported!! TF: {} ; Keras :{}".format(tf.__version__,keras.__version__))

!(date +%d\ %B\ %G)

All dependencies imported!! TF: 1.0.1 ; Keras :2.0.1
20 June 2017


In [2]:
def plot_data_stats(rucio_data):
    sns.set_context('poster')
    
    ax = sns.countplot(x='activity',data= rucio_data)
    ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
    plt.show()
    gx= sns.countplot(x='transfer-endpoint', data = rucio_data)
    gx.set_xticklabels(gx.get_xticklabels(), rotation=30)
    plt.show()
    vx = sns.countplot(x='protocol', data = rucio_data)
    plt.show()
    bx= sns.countplot(x='src-type', data=rucio_data)
    plt.show()
    cx= sns.countplot(x='dst-type', data=rucio_data)
    plt.show()
    
def preprocess_data(rucio_data):
    
    fields_to_drop = ['account','reason','checksum-adler','checksum-md5','guid','request-id','transfer-id','tool-id',
                      'transfer-link','name','previous-request-id','src-url','dst-url', 'Unnamed: 0']
    timestamps = ['started_at', 'submitted_at','transferred_at']

    #DROP FIELDS , CHANGE TIME FORMAT
    rucio_data = rucio_data.drop(fields_to_drop, axis=1)
    for timestamp in timestamps:
        rucio_data[timestamp]= pd.to_datetime(rucio_data[timestamp], infer_datetime_format=True)
    rucio_data['delay'] = rucio_data['started_at'] - rucio_data['submitted_at']
    rucio_data['delay'] = rucio_data['delay'].astype('timedelta64[s]')
    
    rucio_data = rucio_data.sort_values(by='submitted_at')

    rucio_data = rucio_data.drop(timestamps, axis=1)
    
    src_encoder = LabelEncoder()
    dst_encoder = LabelEncoder()
    scope_encoder = LabelEncoder()
    type_encoder = LabelEncoder()
    activity_encoder = LabelEncoder()
    protocol_encoder = LabelEncoder()
    t_endpoint_encoder = LabelEncoder()

    src_encoder.fit(rucio_data['src-rse'].unique())
    dst_encoder.fit(rucio_data['dst-rse'].unique())
    scope_encoder.fit(rucio_data['scope'].unique())
    type_encoder.fit(rucio_data['src-type'].unique())
    activity_encoder.fit(rucio_data['activity'].unique())
    protocol_encoder.fit(rucio_data['protocol'].unique())
    t_endpoint_encoder.fit(rucio_data['transfer-endpoint'].unique())

    rucio_data['src-rse'] = src_encoder.transform(rucio_data['src-rse'])
    rucio_data['dst-rse'] = dst_encoder.transform(rucio_data['dst-rse'])
    rucio_data['scope'] = scope_encoder.transform(rucio_data['scope'])
    rucio_data['src-type'] = type_encoder.transform(rucio_data['src-type'])
    rucio_data['dst-type'] = type_encoder.transform(rucio_data['dst-type'])
    rucio_data['activity'] = activity_encoder.transform(rucio_data['activity'])
    rucio_data['protocol'] = protocol_encoder.transform(rucio_data['protocol'])
    rucio_data['transfer-endpoint'] = t_endpoint_encoder.transform(rucio_data['transfer-endpoint'])
    
    return rucio_data

# Load and preprocess data

In [3]:
rucio_data = pd.read_csv('may.csv')
rucio_data = rucio_data[0:30000]
rucio_data.head(10)

Unnamed: 0.1,Unnamed: 0,account,activity,bytes,checksum-adler,checksum-md5,dst-rse,dst-type,dst-url,duration,...,src-rse,src-type,src-url,started_at,submitted_at,tool-id,transfer-endpoint,transfer-id,transfer-link,transferred_at
0,0,,Production Input,533918,aadc03c6,,CERN-PROD_DATADISK,DISK,gsiftp://eosatlassftp.cern.ch:2811/eos/atlas/a...,10,...,TOKYO-LCG2_DATADISK,DISK,srm://lcg-se01.icepp.jp:8446/srm/managerv2?SFN...,2017-05-28 07:00:38,2017-05-28 06:45:22,rucio-conveyor,https://fts3.cern.ch:8446,c9f91f7b-f91b-5df5-ae11-e9dcd249c244,https://fts3.cern.ch:8449/fts3/ftsmon/#/job/c9...,2017-05-28 07:00:48
1,1,,Production Input,591160,68fe0316,,CERN-PROD_DATADISK,DISK,gsiftp://eosatlassftp.cern.ch:2811/eos/atlas/a...,9,...,TOKYO-LCG2_DATADISK,DISK,srm://lcg-se01.icepp.jp:8446/srm/managerv2?SFN...,2017-05-28 07:03:02,2017-05-28 06:56:11,rucio-conveyor,https://fts3.cern.ch:8446,ff43c9e0-9716-5176-9c1e-3f0990f87745,https://fts3.cern.ch:8449/fts3/ftsmon/#/job/ff...,2017-05-28 07:03:11
2,2,,Production Input,512581,3deb51ec,,CERN-PROD_DATADISK,DISK,gsiftp://eosatlassftp.cern.ch:2811/eos/atlas/a...,9,...,TOKYO-LCG2_DATADISK,DISK,srm://lcg-se01.icepp.jp:8446/srm/managerv2?SFN...,2017-05-28 06:58:12,2017-05-28 06:45:31,rucio-conveyor,https://fts3.cern.ch:8446,3d1d5437-700d-54a3-a0ba-46d0b3f11360,https://fts3.cern.ch:8449/fts3/ftsmon/#/job/3d...,2017-05-28 06:58:21
3,3,,Production Input,478343,62736ced,,CERN-PROD_DATADISK,DISK,gsiftp://eosatlassftp.cern.ch:2811/eos/atlas/a...,8,...,TOKYO-LCG2_DATADISK,DISK,srm://lcg-se01.icepp.jp:8446/srm/managerv2?SFN...,2017-05-28 07:02:51,2017-05-28 06:56:11,rucio-conveyor,https://fts3.cern.ch:8446,ff43c9e0-9716-5176-9c1e-3f0990f87745,https://fts3.cern.ch:8449/fts3/ftsmon/#/job/ff...,2017-05-28 07:02:59
4,4,,Production Input,513541,965123c5,,NET2_DATADISK,DISK,srm://atlas.bu.edu:8443/srm/v2/server?SFN=/gpf...,3,...,CERN-PROD_DATADISK,DISK,gsiftp://eosatlassftp.cern.ch:2811/eos/atlas/a...,2017-05-28 07:02:10,2017-05-28 06:56:06,rucio-conveyor,https://fts.usatlas.bnl.gov:8446,853cf990-478d-58b6-8669-d8a3396efd11,https://fts.usatlas.bnl.gov:8449/fts3/ftsmon/#...,2017-05-28 07:02:13
5,5,,Production Input,540985,8f11a53a,,CERN-PROD_DATADISK,DISK,gsiftp://eosatlassftp.cern.ch:2811/eos/atlas/a...,9,...,TOKYO-LCG2_DATADISK,DISK,srm://lcg-se01.icepp.jp:8446/srm/managerv2?SFN...,2017-05-28 07:01:43,2017-05-28 06:45:34,rucio-conveyor,https://fts3.cern.ch:8446,934b88e7-2ec3-552a-80c6-7f0d1822d175,https://fts3.cern.ch:8449/fts3/ftsmon/#/job/93...,2017-05-28 07:01:52
6,6,,Production Input,458226,eb1f491b,,CA-VICTORIA-WESTGRID-T2_DATADISK,DISK,srm://charon01.westgrid.ca:8443/srm/managerv2?...,4,...,INFN-T1_DATADISK,DISK,srm://storm-fe.cr.cnaf.infn.it:8444/srm/manage...,2017-05-28 06:59:24,2017-05-28 06:56:07,rucio-conveyor,https://fts.usatlas.bnl.gov:8446,fbf37172-2014-5a55-84da-a52af118bd2a,https://fts.usatlas.bnl.gov:8449/fts3/ftsmon/#...,2017-05-28 06:59:28
7,7,,Production Input,531690,114fc3a3,,NET2_DATADISK,DISK,srm://atlas.bu.edu:8443/srm/v2/server?SFN=/gpf...,8,...,IFIC-LCG2_DATADISK,DISK,srm://srmv2.ific.uv.es:8443/srm/managerv2?SFN=...,2017-05-28 07:01:52,2017-05-28 06:56:09,rucio-conveyor,https://fts.usatlas.bnl.gov:8446,a0a67cc1-eafe-5e5c-abc6-041e1061ec33,https://fts.usatlas.bnl.gov:8449/fts3/ftsmon/#...,2017-05-28 07:02:00
8,8,,Production Input,527060,c920ba10,,CERN-PROD_DATADISK,DISK,gsiftp://eosatlassftp.cern.ch:2811/eos/atlas/a...,9,...,TOKYO-LCG2_DATADISK,DISK,srm://lcg-se01.icepp.jp:8446/srm/managerv2?SFN...,2017-05-28 06:58:09,2017-05-28 06:56:11,rucio-conveyor,https://fts3.cern.ch:8446,19f8a214-b283-5e0e-ba9b-551cd5004f55,https://fts3.cern.ch:8449/fts3/ftsmon/#/job/19...,2017-05-28 06:58:18
9,9,,Production Input,478330207,191570e4,,RRC-KI-T1_DATADISK,DISK,srm://sdrm.t1.grid.kiae.ru:8443/srm/managerv2?...,27,...,MWT2_DATADISK,DISK,srm://uct2-dc1.uchicago.edu:8443/srm/managerv2...,2017-05-28 07:01:58,2017-05-28 05:16:40,rucio-conveyor,https://fts.usatlas.bnl.gov:8446,fa222ff4-c5d6-52bf-aa2b-85e1dd058474,https://fts.usatlas.bnl.gov:8449/fts3/ftsmon/#...,2017-05-28 07:02:25


In [4]:
rucio_data = preprocess_data(rucio_data)
durations = rucio_data['duration']
rucio_data = rucio_data.drop(['duration'], axis=1)

In [7]:
print(rucio_data.shape, durations.shape)

(30000, 10) (30000,)


In [32]:
def get_and_preprocess_data(path='may.csv'):
    
    rucio_data = pd.read_csv(path)
    rucio_data = rucio_data[0:30000]
    rucio_data = preprocess_data(rucio_data)
    durations = rucio_data['duration']
    rucio_data = rucio_data.drop(['duration'], axis=1)
    inputs = rucio_data.as_matrix()
    outputs = durations.as_matrix()
    print(inputs.shape, outputs.shape)
    return inputs, outputs

# splitting data into test and training set

In [41]:
def split_data(rucio_data,durations, batch_size=512, num_timesteps=1, split_frac=0.9):
    
    n_batches = int(rucio_data.shape[0] / batch_size)
    
    rucio_data = rucio_data[0:n_batches*batch_size]
    durations = durations[0:n_batches*batch_size]
    
    x = np.stack(np.split(rucio_data, batch_size))
    y = np.stack(np.split(durations, batch_size))
    
    print(x.shape, y.shape)
    
    split_idx = int(batch_size*split_frac)
    trainX, trainY = x[:split_idx], y[:split_idx]
    testX, testY = x[split_idx:], y[split_idx:]
#     print(trainX.shape)
    return trainX, trainY, testX, testY

# trainX, trainY, testX, testY = split_data(inputs, outputs)

# Build model

In [42]:
def build_model():

    model = Sequential()
    layers = [10, 10, 1]
    
    model.add(LSTM(layers[0], input_shape=(None, 10), return_sequences=True))
    model.add(Dropout(0.4))
    
    model.add(LSTM(layers[1], return_sequences=False))
    model.add(Dropout(0.4))
    
    model.add(Dense(layers[2]))
    model.add(Activation("linear"))
    
    start = time.time()
    model.compile(loss="mse", optimizer="rmsprop")
    print ("Compilation Time : ", time.time() - start)
    return model

import keras.callbacks as cb

class LossHistory(cb.Callback):
    def on_train_begin(self, logs={}):
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        batch_loss = logs.get('loss')
        self.losses.append(batch_loss)



# def build_model(n_steps):
    
#     layers = [10, 10, 1]
#     model_inputs = Input(shape=[None,n_steps, 10])
    
#     layer_1 = LSTM(layers[0], return_sequences=True)(model_inputs)
#     layer_2 = LSTM(layers[1], return_sequences=False)(layer_1)
    
#     model_output = Dense(layer[2], activation='linear')
    
#     model = Model(input=model_inputs, output=model_output)
    
    

In [43]:
import time
def run_network(model=None,data=None, epochs=1, batch=128):
    
    print('\n Loading data...')
    if data is None:
        rucio_data, durations = get_and_preprocess_data()

        print('\n Data Loaded and preprocesses!!....')
        print('\n Moving on to splitting and reshaping data...')
#         trainX, trainY, testX, testY = split_data(inputs, outputs,batch_size=512, split_frac=0.9)
        print('\n Data split into train and test sets.. ')
    else:
        trainX, trainY, testX, testY = data
    

    try:
        start_time = time.time()
        
        if model is None:
            model = build_model()

            history = LossHistory()

            print('Training model...')
            training = model.fit(trainX, trainY, epochs=epochs, batch_size=batch,
                                 validation_split=0.1, callbacks=[history], verbose=1)

            print("Training duration : {0}".format(time.time() - start_time))
            score = model.evaluate(trainX, trainY, verbose=0)

            print("Network's training score [MSE]: {0}".format(score))
            print("Training finished !!!!!!")
            return training, data, model, history.losses
        
    except KeyboardInterrupt:
        print('KeyboardInterrupt')
        return model, history.losses
    
def plot_losses(losses):
    sns.set_context('poster')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(losses)
    ax.set_title('Loss per batch')
    fig.show()
    print(len(losses))

In [44]:
training, data, model, losses = run_network()
plot_losses(losses)


 Loading data...
(30000, 10) (30000,)

 Data Loaded and preprocesses!!....

 Moving on to splitting and reshaping data...
(512, 58, 10) (512, 58)

 Data split into train and test sets.. 
Compilation Time :  0.031991004943847656
Training model...


ValueError: Error when checking model target: expected activation_3 to have shape (None, 1) but got array with shape (460, 58)