
# Statistical Analysis of ETA Logs

This notebook extends the normalized correlation analysis by adding descriptive statistics:
- Mean execution time
- Standard deviation
- Z-scores
- 95th percentile (p95)
- 99th percentile (p99)

These metrics help identify anomalies and performance outliers.


In [1]:

import pandas as pd
import numpy as np


In [2]:

# Read the cleaned ETA log export and parse its 'datetime' column into pandas
# datetimes in the same step, then preview the first rows.
eta_logs = pd.read_csv('datasets/cleaned_eta_logs.csv').assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)
eta_logs.head()


Unnamed: 0,datetime,agent_type,pid,transaction_id,execution_time,source_file,line_number,date,hour,minute,is_slow,is_very_slow,is_critical
0,2025-11-06 00:00:00.850,eta_agent,7964,11017737610524,4.121,time6.txt,82537,2025-11-06,0,0,False,False,False
1,2025-11-06 00:00:04.978,eta_agent,7773,11017737601406,0.099,time6.txt,44569,2025-11-06,0,0,False,False,False
2,2025-11-06 00:00:05.251,eta_agent,7690,11017737607061,10.955,time6.txt,1,2025-11-06,0,0,False,False,False
3,2025-11-06 00:00:05.782,eta_agent,8144,11017737599261,3.988,time6.txt,128222,2025-11-06,0,0,False,False,False
4,2025-11-06 00:00:05.995,eta_agent,7771,11017737599253,4.455,time6.txt,40181,2025-11-06,0,0,False,False,False



### Statistical Formulas

- **Mean**:
$$
\mu = \frac{\sum_{i=1}^{n} x_i}{n}
$$

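- **Standard Deviation** (sample form with an $n - 1$ denominator, which is what pandas `Series.std()` computes by default):
$$
\sigma = \sqrt{\frac{\sum_{i=1}^{n}(x_i - \mu)^2}{n - 1}}
$$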

- **Z-score**:
$$
z_i = \frac{x_i - \mu}{\sigma}
$$

- **95th Percentile (p95)**:
$$
p_{95} = \text{value below which } 95\% \text{ of the sorted data falls}
$$

- **99th Percentile (p99)**:
$$
p_{99} = \text{value below which } 99\% \text{ of the sorted data falls}
$$


In [6]:

# Descriptive statistics for execution_time.
# Every statistic below is a pandas Series method; these skip missing values
# by default, so the mean, spread and percentiles all describe the same set
# of observations.
execution_times = eta_logs['execution_time']
mean_val = execution_times.mean()
# ddof=1 is the pandas default (sample standard deviation, n - 1 denominator);
# it is spelled out so the choice of estimator is visible.
std_val = execution_times.std(ddof=1)
# Series.quantile uses linear interpolation like np.percentile, but ignores
# NaN instead of turning the whole statistic into NaN.
p95 = execution_times.quantile(0.95)
p99 = execution_times.quantile(0.99)

# Z-score of each row: signed distance from the mean, in standard deviations.
# Stored on eta_logs as the 'z_score' column.
eta_logs['z_score'] = (execution_times - mean_val) / std_val

# Summary displayed as the cell output.
{
    'mean_execution_time': mean_val,
    'std_dev': std_val,
    'p95': p95,
    'p99': p99,
}


{'mean_execution_time': np.float64(3.69212020659224),
 'std_dev': np.float64(5.999948978345758),
 'p95': np.float64(14.229),
 'p99': np.float64(24.958149999999993)}

In [4]:

# Keep the rows that are extreme by either criterion: a z-score above 3, or an
# execution time above the 99th-percentile value (p99). Preview the first few.
outlier_mask = eta_logs['z_score'].gt(3) | eta_logs['execution_time'].gt(p99)
outliers = eta_logs.loc[outlier_mask]
outliers.head()


Unnamed: 0,datetime,agent_type,pid,transaction_id,execution_time,source_file,line_number,date,hour,minute,is_slow,is_very_slow,is_critical,z_score
1909,2025-11-06 00:22:30.014,eta_agent,7760,11017737640276,26.321,time6.txt,15674,2025-11-06,0,22,True,False,False,3.771512
2082,2025-11-06 00:24:09.114,eta_agent,7760,11017737638263,25.41,time6.txt,15679,2025-11-06,0,24,True,False,False,3.619677
2099,2025-11-06 00:24:22.106,eta_agent,7864,11017737627648,25.952,time6.txt,71450,2025-11-06,0,24,True,False,False,3.710012
2107,2025-11-06 00:24:26.079,eta_agent,7976,11017737633002,29.538,time6.txt,84724,2025-11-06,0,24,True,False,False,4.307683
5314,2025-11-06 01:06:22.670,eta_agent,7962,11017737689460,23.451,time6.txt,78059,2025-11-06,1,6,True,False,False,3.293175
