## Import libraries

In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
from matplotlib.dates import DateFormatter
import plotly.express as px

## Data Preparation and Exploration

In [60]:
# Load the raw time-series CSV and preview its first rows.
csv_path = 'Tavily Data Analysis Home Test - Time Series.csv'
df = pd.read_csv(csv_path)
print(df.head())

             DATE_TIME  VOLUME  RESP_AVG
0   2025-03-22 8:00:00     442  7.085964
1   2025-03-22 9:00:00     418  8.405124
2  2025-03-22 11:00:00     507  7.779719
3  2025-03-22 12:00:00     559  6.877425
4  2025-03-22 13:00:00     602  8.178685


In [61]:
# Understand the data: column dtypes / null counts, then summary statistics.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() emits a stray "None" line.
df.info()
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1706 entries, 0 to 1705
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   DATE_TIME  1706 non-null   object 
 1   VOLUME     1706 non-null   int64  
 2   RESP_AVG   1706 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 40.1+ KB
None
             VOLUME     RESP_AVG
count   1706.000000  1706.000000
mean    2640.644197     7.848615
std     2410.114417     2.171984
min      362.000000     4.594024
25%     1136.000000     6.904281
50%     2052.500000     7.694853
75%     3297.000000     8.506829
max    23259.000000    64.354239


In [62]:
# Visualize request volume over time.
# DATE_TIME is loaded as plain strings (object dtype) whose hour is not
# zero-padded (e.g. "2025-03-22 8:00:00"), so parse it explicitly instead of
# relying on Plotly to recognise the strings as dates. df itself is not modified.
timestamps = pd.to_datetime(df['DATE_TIME'])

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=timestamps,
        y=df['VOLUME'],
        mode='lines+markers',
        name='VOLUME',  # label the trace after the column it actually plots
    )
)
fig.update_layout(title='Time Series Data', xaxis_title='Date', yaxis_title='VOLUME')
fig.show()

## An interim summary of what our data looks like and what we have in it:
- **DATE_TIME** :
Indicates the date and time of the measurement. In this time series, it serves as the time index.
- **VOLUME** : 
The number of requests (traffic) that the system received in this time period. Values that are very high or very low relative to the average can indicate an abnormal load or an unexpected drop in activity.
- **RESP_AVG** : 
The average response time of the system for all requests in that time period

## Feature Engineering
- Normalization and consideration of the relationship between the metrics

In [63]:
# Natural log of the request count, and response time per unit of log-volume.
log_volume = np.log(df['VOLUME'])
resp_ratio = df['RESP_AVG'] / log_volume

df['log_VOLUME'] = log_volume
df['resp_per_logreq'] = resp_ratio

# Standardise the ratio: subtract its mean and divide by its standard
# deviation (pandas' default Series.std()).
centered = resp_ratio - resp_ratio.mean()
df['z_resp_per_logreq'] = centered / resp_ratio.std()


In [64]:
# Preview the first five rows, now including the engineered columns.
df.head(n=5)

Unnamed: 0,DATE_TIME,VOLUME,RESP_AVG,log_VOLUME,resp_per_logreq,z_resp_per_logreq
0,2025-03-22 8:00:00,442,7.085964,6.09131,1.163291,0.40174
1,2025-03-22 9:00:00,418,8.405124,6.035481,1.392619,1.13342
2,2025-03-22 11:00:00,507,7.779719,6.228511,1.24905,0.675357
3,2025-03-22 12:00:00,559,6.877425,6.326149,1.087142,0.158785
4,2025-03-22 13:00:00,602,8.178685,6.400257,1.277868,0.767304


## Visualize data and identify anomalies by setting a fixed threshold

We normalized the VOLUME column by taking its natural logarithm and created a new feature, `resp_per_logreq`, that captures the relationship between RESP_AVG and VOLUME. We then use the z-score of this feature to highlight the points that fall outside the chosen threshold; each such point is a potential anomaly.

In [71]:
# Scatter of resp_per_logreq over time, coloured by its z-score.
# The colour scale is pinned to a fixed symmetric z-score window.
color_range = [-5, 5]

# Plot from a derived frame with DATE_TIME parsed to real datetimes: the column
# is loaded as strings with non-zero-padded hours, so an explicit conversion
# avoids depending on Plotly's date-string detection. The previous
# reset_index() only added an unused "index" column, so it is dropped.
plot_df = df.assign(DATE_TIME=pd.to_datetime(df['DATE_TIME']))

fig = px.scatter(
    plot_df,
    x='DATE_TIME',
    y='resp_per_logreq',
    color='z_resp_per_logreq',
    color_continuous_scale='RdYlBu_r',
    range_color=color_range,
    title='resp_per_logreq colored by Z-score',
)
fig.update_traces(marker=dict(size=5, opacity=0.7))
fig.show()