In [13]:
#from fbprophet import Prophet
import pandas as pd
class DailySpendingAnomalyDetector:
  """
  This class detects anomalies in daily spending data using IQR and Prophet.
  """

  def __init__(self, data):
    """
    Initializes the class with daily spending data.

    Args:
        data (pandas.DataFrame): A DataFrame with a 'date' column (datetime) and a 'spending' column (numeric).
    """
    self.data = data.copy()  # Avoid modifying original data

  def _get_iqr(self):
    """
    Calculates the Interquartile Range (IQR) of the spending data.

    Returns:
        float: The IQR value.
    """
    q1 = self.data['spending'].quantile(0.25)
    q3 = self.data['spending'].quantile(0.75)
    return q1, q3

  def detect_anomalies_iqr(self, num_std=1.5):
    """
    Identifies anomalies in the daily spending data based on IQR thresholds.

    Args:
        num_std (float, optional): The number of standard deviations used for thresholds. Defaults to 1.5.

    Returns:
        pandas.DataFrame: A copy of the original data with a new column 'is_anomaly' (boolean) indicating anomalies.
    """
    q1, q3 = self._get_iqr()  # Unpack the returned tuple to access q1 and q3
    iqr = q3 - q1  # Calculate IQR using the unpacked values
    lower_bound = q1 - (num_std * iqr)
    upper_bound = q3 + (num_std * iqr)
      
    #iqr = self._get_iqr()
    #lower_bound = q1 - (num_std * iqr)
    #upper_bound = q3 + (num_std * iqr)
    self.data['is_anomaly_iqr'] = ~self.data['spending'].between(lower_bound, upper_bound)
    return self.data.copy()


In [15]:
 
# Example usage
data = pd.DataFrame({
  'date': pd.to_datetime(['2024-04-01', '2024-04-02', '2024-04-03', '2024-04-04', '2024-04-05', '2024-04-06', '2024-04-07']),
  'spending': [100, 120, 80, 95, 150, 110, 70]
})

anomaly_detector = DailySpendingAnomalyDetector(data)
data_with_iqr_anomalies = anomaly_detector.detect_anomalies_iqr()
#data_with_prophet_anomalies = anomaly_detector.detect_anomalies_prophet()

print(data_with_iqr_anomalies)
#print(data_with_prophet_anomalies)



        date  spending  is_anomaly_iqr
0 2024-04-01       100           False
1 2024-04-02       120           False
2 2024-04-03        80           False
3 2024-04-04        95           False
4 2024-04-05       150           False
5 2024-04-06       110           False
6 2024-04-07        70           False


In [17]:
import pandas as pd

def detect_anomalies_ewma(data, alpha=0.2, threshold_multiplier=2):
    """
    Detect anomalies in time series data using exponential weighted moving average (EWMA).

    Args:
    - data (DataFrame): DataFrame containing the time series data with a 'date' column and a numeric column to analyze.
    - alpha (float): Smoothing factor for EWMA. Default is 0.2.
    - threshold_multiplier (float): Multiplier for the anomaly threshold (e.g., 2 times the IQR). Default is 2.

    Returns:
    - DataFrame: Input DataFrame with an additional 'is_anomaly_iqr' column indicating whether each data point is an anomaly.
    """
    # Calculate EWMA
    data['ewma'] = data['spending'].ewm(alpha=alpha, adjust=False).mean()

    # Calculate the difference between actual spending and EWMA
    data['diff'] = data['spending'] - data['ewma']

    # Define anomaly threshold based on IQR
    Q1 = data['diff'].quantile(0.25)
    Q3 = data['diff'].quantile(0.75)
    IQR = Q3 - Q1
    threshold_iqr = threshold_multiplier * IQR

    # Flag anomalies based on IQR
    data['is_anomaly_ewma'] = (data['diff'] < (Q1 - threshold_iqr)) | (data['diff'] > (Q3 + threshold_iqr))

    return data

# Example usage
data = pd.DataFrame({
    'date': pd.to_datetime(['2024-04-01', '2024-04-02', '2024-04-03', '2024-04-04', '2024-04-05', '2024-04-06', '2024-04-07']),
    'spending': [100, 120, 80, 95, 150, 110, 70]
})

data_with_anomalies = detect_anomalies_ewma(data)
print(data_with_anomalies[['date', 'spending', 'is_anomaly_ewma']])


        date  spending  is_anomaly_ewma
0 2024-04-01       100            False
1 2024-04-02       120            False
2 2024-04-03        80            False
3 2024-04-04        95            False
4 2024-04-05       150            False
5 2024-04-06       110            False
6 2024-04-07        70            False


In [10]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler

def detect_anomalies_lstm(data, window_size=3, threshold_multiplier=2):
    """
    Detect anomalies in time series data using LSTM.

    Args:
    - data (DataFrame): DataFrame containing the time series data with a 'date' column and a numeric column to analyze.
    - window_size (int): Number of previous time steps to use as input features for LSTM. Default is 3.
    - threshold_multiplier (float): Multiplier for the anomaly threshold (e.g., 2 times the standard deviation). Default is 2.

    Returns:
    - DataFrame: Input DataFrame with an additional 'is_anomaly_lstm' column indicating whether each data point is an anomaly.
    """
    # Normalize the data
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(data['spending'].values.reshape(-1, 1))

    # Create sequences of data for LSTM
    X, y = [], []
    for i in range(window_size, len(scaled_data)):
        X.append(scaled_data[i - window_size:i, 0])
        y.append(scaled_data[i, 0])
    X, y = np.array(X), np.array(y)

    # Reshape data for LSTM
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))

    # Build LSTM model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(X.shape[1], 1)))
    model.add(LSTM(units=50))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X, y, epochs=100, batch_size=32)

    # Predictions using the trained LSTM model
    predictions = model.predict(X)
    residuals = np.abs(scaled_data[window_size:] - predictions)

    # Calculate threshold for anomalies
    threshold = threshold_multiplier * residuals.std()

    # Flag anomalies based on threshold
    data['is_anomaly_lstm'] = False
    data.iloc[window_size:, data.columns.get_loc('is_anomaly_lstm')] = residuals > threshold

    return data

# Example usage
data = pd.DataFrame({
    'date': pd.to_datetime(['2024-04-01', '2024-04-02', '2024-04-03', '2024-04-04', '2024-04-05', '2024-04-06', '2024-04-07']),
    'spending': [100, 120, 80, 95, 150, 110, 70]
})

data_with_anomalies_lstm = detect_anomalies_lstm(data)
print(data_with_anomalies_lstm[['date', 'spending', 'is_anomaly_lstm']])


2024-05-07 22:28:05.005682: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/100


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.3337
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 0.3222
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 0.3110
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 0.3001
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 0.2895
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 0.2790
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 0.2686
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 0.2582
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 0.2480
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 0.2378
Epoch 11/100
[1m1/1[0

In [18]:
from fbprophet import Prophet
import pandas as pd
import numpy as np

def detect_anomalies_prophet(data, threshold_multiplier=2):
    """
    Detect anomalies in time series data using Prophet.

    Args:
    - data (DataFrame): DataFrame containing the time series data with a 'date' column and a numeric column to analyze.
    - threshold_multiplier (float): Multiplier for the anomaly threshold (e.g., 2 times the standard deviation). Default is 2.

    Returns:
    - DataFrame: Input DataFrame with an additional 'is_anomaly_prophet' column indicating whether each data point is an anomaly.
    """
    # Prepare data for Prophet
    prophet_data = data.rename(columns={'date': 'ds', 'spending': 'y'})

    # Fit Prophet model
    model = Prophet()
    model.fit(prophet_data)

    # Make predictions
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)

    # Calculate residuals
    residuals = np.abs(prophet_data['y'] - forecast['yhat'])

    # Calculate threshold for anomalies
    threshold = threshold_multiplier * residuals.std()

    # Flag anomalies based on threshold
    data['is_anomaly_prophet'] = False
    data.loc[residuals > threshold, 'is_anomaly_prophet'] = True

    return data

# Example usage
data = pd.DataFrame({
    'date': pd.to_datetime(['2024-04-01', '2024-04-02', '2024-04-03', '2024-04-04', '2024-04-05', '2024-04-06', '2024-04-07']),
    'spending': [100, 120, 80, 95, 150, 110, 70]
})

data_with_anomalies_prophet = detect_anomalies_prophet(data)
print(data_with_anomalies_prophet[['date', 'spending', 'is_anomaly_prophet']])


ModuleNotFoundError: No module named 'fbprophet'

In [19]:
conda install -c conda-forge prophet

Channels:
 - conda-forge
 - defaults
Platform: osx-64
Collecting package metadata (repodata.json): done
Solving environment: \ ^C

Note: you may need to restart the kernel to use updated packages.
