In [9]:
# For target_1 "at_risk_event".
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix


data = pd.read_csv('D:/dissertation/session 1/feature.csv')

# Split the dataset into three parts based on the cluster label.
data_0 = data[data['cluster_label'] == 0]
data_1 = data[data['cluster_label'] == 1]
data_2 = data[data['cluster_label'] == 2]

# Define model parameters tailored for each cluster.
params = {
    0: {'n_estimators': 200, 'max_samples': 'auto', 'contamination': 0.03, 'random_state': 42},
    1: {'n_estimators': 200, 'max_samples': 'auto', 'contamination': 0.03, 'random_state': 42},
    2: {'n_estimators': 200, 'max_samples': 'auto', 'contamination': 0.04, 'random_state': 42}
}

# List of features to use in the model.
features = ['is_non_working_hour', 'is_weekend', 'adjusted_non_working_hour_risk_normalized', 
            'adjusted_weekend_risk_normalized', 'first_non_working_hour', 'first_weekend', 
            'time_to_working_hour_normalized', 'spend_normalized', 'spend_diff_flag']

test_results = []
clusters = [data_0, data_1, data_2]
cluster_labels = [0, 1, 2]

for subset, label in zip(clusters, cluster_labels):
    # Split each cluster into training and testing datasets.
    X_train, X_test = train_test_split(subset, test_size=0.2, random_state=42)
    
    # Initialize and train the Isolation Forest model using the parameters defined above.
    iso = IsolationForest(**params[label])
    iso.fit(X_train[features])
    
    # Predict on the testing dataset to identify outliers (anomalies).
    X_test['scores'] = iso.decision_function(X_test[features])
    X_test['outlier_label'] = iso.predict(X_test[features])
    
    test_results.append(X_test)

# Combine all testing results into a single DataFrame.
final_test_data = pd.concat(test_results)

# Convert the 'at_risk_event' from Boolean to a binary format suitable for scoring.
final_test_data['true_label'] = final_test_data['at_risk_event'].apply(lambda x: 1 if x else 0)

# Convert the Isolation Forest output from -1 and 1 to 1 and 0 respectively for easier comparison.
final_test_data['predicted_label'] = (final_test_data['outlier_label'] == -1).astype(int)

# Calculate performance metrics to evaluate the effectiveness of the model.
precision = precision_score(final_test_data['true_label'], final_test_data['predicted_label'])
recall = recall_score(final_test_data['true_label'], final_test_data['predicted_label'])
f1 = f1_score(final_test_data['true_label'], final_test_data['predicted_label'])

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Compute the confusion matrix.
conf_matrix = confusion_matrix(final_test_data['true_label'], final_test_data['predicted_label'])
print("Confusion Matrix:")
print(conf_matrix)

Precision: 0.3757
Recall: 0.5084
F1 Score: 0.4321
Confusion Matrix:
[[20070   452]
 [  263   272]]


In [10]:
# For target_2 "at_risk_event".
import pandas as pd
from sklearn.ensemble import IsolationForest

data = pd.read_csv('D:/dissertation/session 1/feature.csv')

# Define model parameters for each cluster based on predefined settings.
params = {
    0: {'n_estimators': 200, 'max_samples': 'auto', 'contamination': 0.01, 'random_state': 42},
    1: {'n_estimators': 200, 'max_samples': 'auto', 'contamination': 0.01, 'random_state': 42},
    2: {'n_estimators': 200, 'max_samples': 'auto', 'contamination': 0.02, 'random_state': 42}
}

# Define the list of features that will be used in the model.
features = ['is_non_working_hour', 'is_weekend', 'prob_non_working_hours_normalized', 'prob_weekend_normalized',
            'first_and_second_non_working_hour', 'first_and_second_weekend', 
            'spend_normalized', 'spend_diff', 'spend_diff_flag', 'first_two_high_spend']

test_results = []

# Split the dataset by cluster labels and apply the Isolation Forest model to each cluster.
for cluster_label in data['cluster_label'].unique():
    cluster_data = data[data['cluster_label'] == cluster_label]

    # Train the Isolation Forest model using the specified parameters for the cluster.
    iso = IsolationForest(**params[cluster_label])
    iso.fit(cluster_data[features])
    
    # Predict anomalies and calculate decision function scores.
    cluster_data['scores'] = iso.decision_function(cluster_data[features])
    cluster_data['outlier_label'] = iso.predict(cluster_data[features])

    # Convert the outlier labels from -1, 1 to 1, 0.
    cluster_data['outlier_label'] = (cluster_data['outlier_label'] == -1).astype(int)

    # Save the prediction results for further analysis.
    test_results.append(cluster_data)

# Combine the results from all clusters into a single DataFrame.
final_test_data = pd.concat(test_results)

final_test_data['date'] = pd.to_datetime(final_test_data['date'])
final_test_data.sort_values(by=['individual_id', 'date'], inplace=True)

# Initialize window_id as NA and processed_flag as False for new data marking.
final_test_data['window_id'] = pd.NA
final_test_data['processed_flag'] = False

# Detect and mark anomaly windows.
window_id = 0
for index, row in final_test_data.iterrows():
    if row['outlier_label'] == 1 and not row['processed_flag']:
        current_datetime = row['date']
        end_datetime = current_datetime + pd.Timedelta(days=4)
        
        # Select records within a 5-day window.
        mask = (final_test_data['individual_id'] == row['individual_id']) & \
               (final_test_data['date'] >= current_datetime) & \
               (final_test_data['date'] <= end_datetime)
        
        # Start marking from the current record to ensure the window starts from the first anomalous event.
        current_record_index = final_test_data[(final_test_data['individual_id'] == row['individual_id']) & \
                                               (final_test_data['date'] == current_datetime)].index
        start_index = current_record_index[current_record_index >= index][0]
        
        mask = (mask) & (final_test_data.index >= start_index)
        
        # Assign window ID to all records within the window and mark them as processed.
        final_test_data.loc[mask, 'window_id'] = window_id
        final_test_data.loc[mask, 'processed_flag'] = True
        window_id += 1

final_test_data.drop(columns=['processed_flag'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_data['scores'] = iso.decision_function(cluster_data[features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_data['outlier_label'] = iso.predict(cluster_data[features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_data['outlier_label'] = (cluster_data['outlier_label'] == 

In [11]:
def second_target_eval_func(df,
                            second_target_pred_col_name="second_target_pred",
                            at_risk_behaviour_window_col_name="at_risk_behaviour_window"):
    """
    Evaluate the predictions for the second target, at_risk_behaviour_window.

    Args:
        df (pandas.DataFrame): the dataframe with your predictions and the target window.
        second_target_pred_col_name (string): the name of the column with your predictions.
        at_risk_behaviour_window_col_name (string): the name of the column with the target window.

    Returns:
        pandas.DataFrame: the original dataframe with 2 additional columns:
            target_window_found: has value True if window found, False if window wasnt found, NaN if event isnt in a window;
            correct_predictions: has value True if prediction was in a window, False if outside, NaN if no prediction made.
        float: precision.
        float: recall.
    """

    df_copy = df.copy()

    # Identify the windows found.
    windows_found = df_copy.groupby(at_risk_behaviour_window_col_name).agg({second_target_pred_col_name:"max"})[second_target_pred_col_name]
    windows_missed = ~windows_found

    # Join the windows found with the df.
    df_copy = pd.merge(df_copy, pd.DataFrame(windows_found), left_on=at_risk_behaviour_window_col_name, right_index=True, how="left", suffixes=("", "_found"))
    df_copy = df_copy.rename(columns={second_target_pred_col_name + "_found":"target_window_found"})

    # Identify the events where a prediction was correctly or incorrectly made.
    correct_predictions = df_copy[df_copy[second_target_pred_col_name]][at_risk_behaviour_window_col_name].notna()
    df_copy["correct_predictions"] = correct_predictions
    num_predictions = sum(df_copy[second_target_pred_col_name])
    missed_predictions = (correct_predictions == False).sum()

    # Calculate metrics.
    """
    TP is the number of windows correctly identified.
    FP is the number of predictions made incorrectly.
    Additional predictions made for a window after the first do not affect the precision or recall (ie the FP wont change).
    """
    TP = windows_found.sum()
    FP = missed_predictions
    support = len(windows_found)
    precision = TP/(TP + FP)
    recall = TP/support

    return df_copy, precision, recall

In [14]:
def check_window_id(row):
    if not pd.isna(row['window_id']):
        return True
    else:
        return False

final_test_data["my_pred_column"] = final_test_data.apply(lambda row: check_window_id(row), axis=1)

In [15]:
df_with_pred_evaluations, precision, recall = second_target_eval_func(df=final_test_data,
                                                                      second_target_pred_col_name="my_pred_column")

print("precision of IF:", precision, "recall of IF:", recall)

precision of IF: 0.04509894155545329 recall of IF: 0.2156215621562156
