# Between nodes

In [1]:
import pandas as pd
import numpy as np
import time

def get_delta(row):
    if (row['Method']=='send_file_to_filenode'):
        return 0
    t1 = row['Time']
    t2 = row['Id']
    t = t1-t2  # pandas._libs.tslibs.timedeltas.Timedelta
    t = t.seconds * 1000000 + t.microseconds
    return t/1000000

#path = './data/mean_110ms_latency/50x10/'
#path = './data/mean_110ms_latency/25x1000/'
path = './data/mean_110ms_latency/25x100000/'
path1 = path + 'nameNode.log'
path2 = path + 'client.log'

# Load name node data from csv file
data1 = pd.read_csv(path1, delimiter=';', header=None, skiprows=2)
data1.columns = ['LogTime', 'Time', 'Node', 'Method', 'Id'] 
data2 = pd.read_csv(path2, delimiter=';', header=None)
data2.columns = ['LogTime', 'Time', 'Node', 'Method', 'Id'] 
data = pd.concat([data1, data2])


# Manipulate
data['Time'] =  pd.to_datetime(data['Time'])
data['Id'] =  pd.to_datetime(data['Id'])
data['Method'] = data.apply(lambda row: ('0 step' if row['Node'] == 'Client' else ('1 step' if row['Method'] == 'update_filelist' else '2 step')), axis=1)
data = data.sort_values(['Id', 'Time'], ascending=[1, 1])
data['Delta'] = data.apply(get_delta, axis=1)

# Drop unnecessary columns
data = data.drop("Time", axis=1)
data = data.drop("LogTime", axis=1)

# Pivot
data = data.pivot(index='Id', columns='Method', values='Delta')
data.head(5)

Method,0 step,1 step,2 step
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-11-20 09:06:21.720720,0.0,2.481955,5.388917
2019-11-20 09:06:28.933955,0.0,2.597015,5.401449
2019-11-20 09:06:36.142227,0.0,2.465766,5.340151
2019-11-20 09:06:43.411038,0.0,2.515043,5.393263
2019-11-20 09:06:50.756400,0.0,2.525574,5.314126


In [2]:
# First file list update
data['1 step'].describe()

count    25.000000
mean      2.530602
std       0.046588
min       2.464067
25%       2.500503
50%       2.531885
75%       2.549729
max       2.673954
Name: 1 step, dtype: float64

In [3]:
# Second file list update
data['2 step'].describe()

count    25.000000
mean      5.409429
std       0.055534
min       5.314126
25%       5.378177
50%       5.403222
75%       5.454978
max       5.559336
Name: 2 step, dtype: float64

# Inter arrival rate

In [4]:

def get_delta(row):
    if (pd.isnull(row['PrevTime'])):
        return 0
    t1 = row['Time']
    t2 = row['PrevTime']
    t = t1-t2  # pandas._libs.tslibs.timedeltas.Timedelta
    t = (t.seconds * 1000000 + t.microseconds)/1000000
    return t if t < 10 else 0  # Special case when method changes

path = './data/mean_110ms_latency/25x1000/'
path1 = path + 'nameNode.log'

# We just need name node data
data = pd.read_csv(path1, delimiter=';', header=None, skiprows=2)
data.columns = ['LogTime', 'Time', 'Node', 'Method', 'Id'] 

# Manipulate
data['Time'] =  pd.to_datetime(data['Time'])
data['Id'] =  pd.to_datetime(data['Id'])

# Now we need to sort by Method and then Time
data = data.sort_values(['Method', 'Time'], ascending=[1, 1])
data['PrevTime'] = data['Time'].shift()
data['Delta'] = data.apply(get_delta, axis=1)

# Drop unnecessary columns
data = data.drop("Time", axis=1)
data = data.drop("LogTime", axis=1)


# Pivot
data = data.pivot(index='Id', columns='Method', values='Delta')
data.head(5)

Method,rupdate_filelist,update_filelist
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-11-20 09:02:14.013464,0.0,0.0
2019-11-20 09:02:16.368776,2.388565,2.367866
2019-11-20 09:02:18.816217,2.44389,2.472921
2019-11-20 09:02:21.263157,2.446791,2.428145
2019-11-20 09:02:23.637047,2.285331,2.364878


In [5]:
# First file list update
print(data['update_filelist'].describe())
print(data.sort_values(['update_filelist']).head(2))

count    25.000000
mean      2.276277
std       0.476096
min       0.000000
25%       2.350835
50%       2.365391
75%       2.387263
max       2.472921
Name: update_filelist, dtype: float64
Method                      rupdate_filelist  update_filelist
Id                                                           
2019-11-20 09:02:14.013464          0.000000         0.000000
2019-11-20 09:03:10.898566          2.302672         2.304323


In [7]:
# Second file list update
print(data['rupdate_filelist'].describe())
print(data.sort_values(['rupdate_filelist']).head(2))

count    25.000000
mean      2.274086
std       0.476313
min       0.000000
25%       2.332645
50%       2.380429
75%       2.392749
max       2.446791
Name: rupdate_filelist, dtype: float64
Method                      rupdate_filelist  update_filelist
Id                                                           
2019-11-20 09:02:14.013464          0.000000         0.000000
2019-11-20 09:02:54.374877          2.280142         2.310353
