# Multivariate Time Series - Data Discovery

Initial exploration of the Azure Anomaly Detector multivariate sample dataset.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Notebook-wide plotting defaults: seaborn's "whitegrid" theme first, then a
# wide default figure size that applies to any figure created without an
# explicit figsize.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (14, 5)})

## 1. Load and Inspect

In [None]:
# Read the sample CSV, parsing the "timestamp" column as datetimes and using
# it as the index so the frame behaves as a time series from here on.
df = pd.read_csv(
    "multivariate_sample_data.csv",
    parse_dates=["timestamp"],
).set_index("timestamp")

# Quick orientation: dimensions, covered period and the sampling frequency
# pandas infers from the index (printed as None when it cannot infer one).
first_ts, last_ts = df.index.min(), df.index.max()
print(f"Shape: {df.shape}")
print(f"Time range: {first_ts} → {last_ts}")
print(f"Frequency: {pd.infer_freq(df.index)}")

# Left as the final expression so the notebook renders the first rows.
df.head()

In [None]:
# Print the index type, each column's dtype and non-null count, and the
# frame's memory usage.
df.info()

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
# Kept as the cell's final expression so the notebook renders the table.
df.describe()

## 2. Missing Values

In [None]:
# Count NaN entries in every column, then report the per-column breakdown
# followed by the grand total across the whole frame.
missing = df.isna().sum()
print("Missing values per column:", missing, sep="\n")
print(f"\nTotal missing: {missing.sum()}")

## 3. Time Series Plots

In [None]:
# One stacked panel per variable on a shared time axis, so the series can be
# compared against each other at the same timestamps.
#
# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without
# it, a frame with a single column yields a bare Axes object and both the
# zip() below and axes[-1] fail.
n_series = len(df.columns)
fig, axes = plt.subplots(
    n_series, 1, figsize=(14, 3 * n_series), sharex=True, squeeze=False
)
axes = axes.ravel()  # (n, 1) -> (n,): same 1-D layout the multi-column case had

for ax, col in zip(axes, df.columns):
    ax.plot(df.index, df[col], linewidth=0.7)
    ax.set_ylabel(col, fontsize=10)
    ax.tick_params(labelsize=8)

# The x-axis is shared, so only the bottom panel gets the axis label.
axes[-1].set_xlabel("Timestamp")
fig.suptitle("Individual Time Series", fontsize=14, y=1.01)
plt.tight_layout()
plt.show()

## 4. Distributions

In [None]:
# Histogram with a KDE overlay for every variable, laid out side by side in a
# single row.
#
# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without
# it, a frame with a single column yields a bare Axes object and the zip()
# below fails.
fig, axes = plt.subplots(1, len(df.columns), figsize=(16, 4), squeeze=False)
axes = axes.ravel()  # (1, n) -> (n,): same 1-D layout the multi-column case had

for ax, col in zip(axes, df.columns):
    sns.histplot(df[col], kde=True, ax=ax, bins=50)
    ax.set_title(col, fontsize=10)
    ax.tick_params(labelsize=8)

fig.suptitle("Value Distributions", fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# One vertical box plot per variable, laid out side by side in a single row.
#
# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without
# it, a frame with a single column yields a bare Axes object and the zip()
# below fails.
fig, axes = plt.subplots(1, len(df.columns), figsize=(16, 4), squeeze=False)
axes = axes.ravel()  # (1, n) -> (n,): same 1-D layout the multi-column case had

for ax, col in zip(axes, df.columns):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(col, fontsize=10)

fig.suptitle("Box Plots", fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlation between all columns (DataFrame.corr defaults to
# Pearson), drawn as an annotated heatmap with the diverging colour map
# centred on zero.
corr = df.corr()

plt.figure(figsize=(8, 6))
heatmap_style = dict(
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    square=True,
)
sns.heatmap(corr, **heatmap_style)
plt.title("Pearson Correlation Matrix")
plt.tight_layout()
plt.show()

In [None]:
# Scatter-plot matrix over a random subset of at most 2000 rows; the fixed
# seed makes the drawn subset reproducible between runs.
n_points = min(2000, len(df))
subset = df.sample(n_points, random_state=42)
marker_style = {"s": 5, "alpha": 0.3}

sns.pairplot(subset, plot_kws=marker_style)
plt.suptitle("Pairwise Scatter Plots", y=1.02)
plt.show()

## 6. Rolling Statistics

In [None]:
# Rolling mean with a ±2σ envelope drawn over the raw signal, one panel per
# variable on a shared time axis.
#
# NOTE(review): 144 samples equals one day only if the data is sampled every
# 10 minutes — confirm against the "Frequency" printed when the data is loaded.
window = 144  # 1 day at 10-min intervals

# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without
# it, a frame with a single column yields a bare Axes object and both the
# zip() below and axes[-1] fail.
n_series = len(df.columns)
fig, axes = plt.subplots(
    n_series, 1, figsize=(14, 3 * n_series), sharex=True, squeeze=False
)
axes = axes.ravel()  # (n, 1) -> (n,): same 1-D layout the multi-column case had

for ax, col in zip(axes, df.columns):
    series = df[col]

    # Build the rolling window once and derive each statistic a single time,
    # instead of recomputing the mean three times and the std twice.
    rolling = series.rolling(window)
    roll_mean = rolling.mean()
    roll_std = rolling.std()

    ax.plot(df.index, series, alpha=0.3, linewidth=0.5, label="raw")
    ax.plot(df.index, roll_mean, linewidth=1.2, label="rolling mean")
    ax.fill_between(
        df.index,
        roll_mean - 2 * roll_std,
        roll_mean + 2 * roll_std,
        alpha=0.15, label="±2σ band"
    )
    ax.set_ylabel(col, fontsize=10)
    ax.legend(loc="upper right", fontsize=8)

# The x-axis is shared, so only the bottom panel gets the axis label.
axes[-1].set_xlabel("Timestamp")
fig.suptitle(f"Rolling Mean & ±2σ (window={window}, ~1 day)", fontsize=14, y=1.01)
plt.tight_layout()
plt.show()

## 7. Stationarity Check (ADF Test)

In [None]:
from statsmodels.tsa.stattools import adfuller

# Run the Augmented Dickey-Fuller test on every column (NaNs dropped first,
# lag order chosen by AIC) and collect one result row per variable.
# A column is flagged "stationary" when its p-value is below 0.05.
results = []
for col in df.columns:
    adf_output = adfuller(df[col].dropna(), autolag="AIC")
    # The first two entries of the returned tuple are the test statistic and
    # the p-value; the remaining entries are not used here.
    adf_stat, p_value = adf_output[0], adf_output[1]
    results.append(
        dict(
            variable=col,
            adf_statistic=adf_stat,
            p_value=p_value,
            stationary=p_value < 0.05,
        )
    )

# Left as the final expression so the notebook renders the result table.
pd.DataFrame(results)

## 8. Summary

In [None]:
# Final recap of the facts gathered above: size, variables, covered period,
# inferred sampling frequency and total number of missing cells.
start, end = df.index.min(), df.index.max()
summary_lines = [
    "=== Dataset Summary ===",
    f"Samples:    {len(df)}",
    f"Variables:  {len(df.columns)} — {list(df.columns)}",
    f"Time range: {start} → {end}",
    f"Duration:   {end - start}",
    f"Frequency:  {pd.infer_freq(df.index)}",
    f"Missing:    {df.isnull().sum().sum()}",
]
print("\n".join(summary_lines))