In [None]:
%pip install transformers "sagemaker" "s3fs" "boto3" "pandas" "gremlinpython" "numpy" "seaborn" "matplotlib" "scikit-learn" --upgrade --quiet

In [None]:

%load_ext autoreload
%autoreload 2

### Environment Setup
- Import libraries

In [None]:
# Loyalty Abuse Preliminary Code
# Import libraries
import random
import pandas as pd
from datetime import datetime, timedelta
import sagemaker
from sagemaker import RandomCutForest
import boto3
import asyncio
import gremlin_python
from gremlin_python.structure.graph import Graph
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.graph_traversal import __
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os
import time



- Initialize SageMaker session

In [None]:

# Initialize SageMaker session
# Set this to an existing bucket name to override the session's default bucket.
sagemaker_session_bucket = None
if sagemaker_session_bucket is None:
    # No bucket given: fall back to the default bucket of a plain session.
    sagemaker_session_bucket = sagemaker.Session().default_bucket()


try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised, so look the role up through IAM instead.
    # On SageMaker Studio the role is typically named 'sagemaker_execution_role';
    # when running locally, export SAGEMAKER_EXECUTION_ROLE_NAME (preferred, keeps
    # account details out of the notebook) or edit the placeholder below.
    # Note: IAM's RoleName expects the role *name*, not its ARN.
    iam = boto3.client('iam')
    role_name = os.environ.get('SAGEMAKER_EXECUTION_ROLE_NAME', 'your-role-arn')
    role = iam.get_role(RoleName=role_name)['Role']['Arn']


# Session used by the rest of the notebook, pinned to the chosen bucket.
sagemaker_session = sagemaker.Session(default_bucket=sagemaker_session_bucket)
sm_client = boto3.client('sagemaker', region_name=sagemaker_session.boto_region_name)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sagemaker_session.default_bucket()}")
print(f"sagemaker session region: {sagemaker_session.boto_region_name}")

# Step 1:  Generate Complex Synthetic Data
### Helper functions
This code cell generates complex synthetic user data for a loyalty program trial, including both normal and abusive user accounts.

In [None]:

# Step 1: Generate Complex Synthetic Data
def generate_complex_synthetic_user_data(num_users=1000, abuse_percentage=0.1, seed=None):
    """
    Generates complex synthetic data for loyalty program trial abuse with additional features.

    Args:
        num_users: Total number of rows to generate (normal + abusive). Must be >= 0.
        abuse_percentage: Fraction of users, in [0, 1], generated with the abusive profile.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is used so the
            generated values are reproducible; when ``None`` the global ``random`` module
            is used, as before.

    Returns:
        pandas.DataFrame with one row per user. Normal users come first, followed by
        abusive users; the ``abusive_account`` column holds 0 / 1 respectively.

    Raises:
        ValueError: if ``num_users`` is negative or ``abuse_percentage`` is outside [0, 1].
    """
    if num_users < 0:
        raise ValueError(f"num_users must be >= 0, got {num_users}")
    if not 0 <= abuse_percentage <= 1:
        raise ValueError(f"abuse_percentage must be within [0, 1], got {abuse_percentage}")

    rng = random.Random(seed) if seed is not None else random

    # The small epsilon absorbs binary floating point error (100 * 0.57 evaluates to
    # 56.99999999999999), which plain int() truncation would turn into 56 abusers.
    abuse_count = min(int(num_users * abuse_percentage + 1e-9), num_users)
    normal_count = num_users - abuse_count

    email_domains = ['gmail.com', 'yahoo.com', 'hotmail.com']
    locations = ['New York', 'California', 'Texas', 'London', 'Berlin']
    columns = [
        'user_id', 'email', 'ip_address', 'device_id', 'location', 'trial_uses',
        'transaction_value', 'transaction_count', 'time_spent', 'loyalty_points',
        'signup_date', 'abusive_account',
    ]

    # Value ranges that distinguish the two populations.
    normal_profile = {
        'prefix': 'user',
        'trial_uses': (0, 2),
        'transaction_value': (50, 500),
        'transaction_count': (1, 10),
        'time_spent': (5, 100),          # Time spent in loyalty program (minutes)
        'loyalty_points': (100, 10000),
        'signup_days_ago': (30, 365),
        'label': 0,
    }
    abuser_profile = {
        'prefix': 'abuser',
        'trial_uses': (3, 10),           # Abusive users exploit multiple trials
        'transaction_value': (10, 50),   # Lower transaction values
        'transaction_count': (5, 50),    # Abusers perform many small transactions
        'time_spent': (1, 10),           # Less time spent
        'loyalty_points': (10, 100),     # Abusive users have few points
        'signup_days_ago': (1, 30),      # Recent signups
        'label': 1,
    }

    # One reference timestamp for the whole batch.
    now = datetime.now()

    def _make_user(index, profile):
        """Build a single user record drawn from the given profile's ranges."""
        prefix = profile['prefix']
        return {
            'user_id': f"{prefix}_{index}",
            'email': f"{prefix}{index}@{rng.choice(email_domains)}",
            'ip_address': ".".join(str(rng.randint(0, 255)) for _ in range(4)),
            'device_id': f"device_{rng.randint(1000, 9999)}",
            'location': rng.choice(locations),
            'trial_uses': rng.randint(*profile['trial_uses']),
            'transaction_value': round(rng.uniform(*profile['transaction_value']), 2),
            'transaction_count': rng.randint(*profile['transaction_count']),
            'time_spent': round(rng.uniform(*profile['time_spent']), 2),
            'loyalty_points': rng.randint(*profile['loyalty_points']),
            'signup_date': now - timedelta(days=rng.randint(*profile['signup_days_ago'])),
            'abusive_account': profile['label'],
        }

    data = [_make_user(i, normal_profile) for i in range(normal_count)]
    data.extend(_make_user(i, abuser_profile) for i in range(abuse_count))

    # Passing `columns` keeps the schema stable even when no rows are generated.
    return pd.DataFrame(data, columns=columns)

# Generate synthetic data (seeded so that Restart & Run All reproduces the same dataset)
df = generate_complex_synthetic_user_data(num_users=10000, abuse_percentage=0.2, seed=42)


The code extracts the 'abusive_account' column from the dataframe and stores it in a separate variable, then drops the same column from the original dataframe.

In [None]:
# Set the ground-truth labels aside and remove them from the feature frame.
# The guard keeps the cell idempotent: re-running it after the column has already
# been dropped leaves `abusive_accounts` and `df` untouched instead of raising KeyError.
if 'abusive_account' in df.columns:
    abusive_accounts = df['abusive_account'].copy()
    df = df.drop(columns=['abusive_account'])


# Step 2: Exploratory Data Analysis (EDA)

This code performs exploratory data analysis (EDA) on the dataset, which is a crucial step in understanding the data and identifying potential issues or patterns.

In [None]:

# Step 2: Exploratory Data Analysis (EDA)

# Identifier-like columns carry no signal for the correlation analysis.
excluded_columns = {'user_id', 'email', 'ip_address', 'device_id', 'location'}
columns = [column for column in df.columns if column not in excluded_columns]

# `columns` still contains the datetime `signup_date`; DataFrame.corr() no longer drops
# non-numeric columns silently on recent pandas (numeric_only defaults to False from 2.0),
# so restrict the correlation input to numeric dtypes explicitly.
numeric_columns = df[columns].select_dtypes(include='number').columns


# Display basic statistics
print("\nBasic statistics for the dataset:")
print(df.describe())

# Plot distribution of trial uses and transaction values
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
sns.histplot(df['trial_uses'], bins=20, kde=True)
plt.title('Distribution of Trial Uses')

plt.subplot(1, 2, 2)
sns.histplot(df['transaction_value'], bins=20, kde=True)
plt.title('Distribution of Transaction Value')
plt.show()

# Correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(df[numeric_columns].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Time spent vs Transaction Value (for detecting anomalies)
plt.figure(figsize=(8, 6))
sns.scatterplot(x='time_spent', y='transaction_value', hue='trial_uses', data=df)
plt.title('Time Spent vs Transaction Value')
plt.show()


# Step 3: Preprocess Data

This code preprocesses the data by scaling numerical features and saving the preprocessed data to a CSV file in the 'dataset' directory.

In [None]:

# Step 3: Preprocess Data
# Preprocess for the model

os.makedirs('dataset', exist_ok=True)

# Numeric behaviour features fed to the models (order defines the column order of df_scaled).
feature_columns = ['trial_uses', 'transaction_value', 'transaction_count', 'time_spent', 'loyalty_points']

# Fill gaps only in the model features and on a new frame: a frame-wide in-place
# fillna(0) would also write 0 into the datetime/string columns and make the cell
# non-idempotent for anything that displayed `df` earlier.
features = df[feature_columns].fillna(0)

# Standardise each feature to zero mean / unit variance.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(features)

# Local header-less CSV copy of the scaled matrix.
np.savetxt('dataset/train.csv', df_scaled, fmt='%s', delimiter=",")


This code uploads the preprocessed training data to an Amazon S3 bucket for use in model training with SageMaker.

In [None]:
# Upload preprocessed data to S3 for training
train_file = 'loyalty-program-abuse/train.csv'
train_path = 's3://{}/{}'.format(sagemaker_session.default_bucket(), train_file)

# pandas writes directly to the s3:// URL (s3fs is installed in the first cell).
# NOTE(review): to_csv emits a header row (0,1,2,3,4) here, unlike the local
# np.savetxt copy -- confirm that scripts/train.py expects a header.
scaled_frame = pd.DataFrame(df_scaled)
scaled_frame.to_csv(train_path, index=False)

# Step 4: Train an Anomaly Detection Model (RCF & Autoencoder)
### Training with Random Cut Forest (RCF) built-in algorithm in Sagemaker

This code sets up an Amazon SageMaker Random Cut Forest (RCF) model for anomaly detection, specifying the execution role, instance configuration, and output path.

In [None]:

# Step 4: Train an Anomaly Detection Model (RCF)
# Setup RCF model using Amazon SageMaker
# Only infrastructure settings are given here; no algorithm hyperparameters are
# passed to the estimator in this cell.
rcf = RandomCutForest(
    role=role,  # execution role resolved in the session setup cell
    instance_count=1,
    instance_type='ml.m5.xlarge',
    # Model artifacts are written under the session's default bucket.
    output_path=f's3://{sagemaker_session.default_bucket()}/loyalty-program-abuse/output/'
)

The next cell trains a Random Cut Forest (RCF) model using the scaled data from the previous step.

In [None]:
# Train the RCF model
# record_set() wraps the scaled feature matrix in the input object that fit() consumes.
# NOTE(review): presumably this uploads the data to S3 in the format the built-in
# algorithm expects -- confirm against the SageMaker SDK docs.
rcf_train_data = rcf.record_set(df_scaled)
rcf.fit(rcf_train_data)

### Training a custom autoencoder algorithm with PyTorch

This code sets up an autoencoder model using Amazon SageMaker's PyTorch estimator, specifying various configurations such as the entry point, source directory, AWS role, framework version, instance type, and hyperparameters.

In [None]:
# Setup autoencoder model using Amazon SageMaker
import sagemaker
from sagemaker.pytorch import PyTorch

# Estimator for the custom autoencoder. The training code itself lives in
# scripts/train.py, which is not part of this notebook.
autoencoder = PyTorch(
    entry_point='train.py',
    source_dir='scripts',
    role=role,
    framework_version='1.8.0',
    py_version='py3',
    instance_count=1,
    instance_type='ml.m5.xlarge',
    # Same output prefix as the RCF estimator.
    output_path=f's3://{sagemaker_session.default_bucket()}/loyalty-program-abuse/output/',
    # NOTE(review): presumably parsed as command-line arguments by train.py -- verify
    # that the script accepts exactly these names.
    hyperparameters={
        'epochs': 100,
        'batch-size': 32,
        'learning-rate': 0.001,
    }
)


This line of code trains the autoencoder model using the training data stored in an Amazon S3 bucket location specified by the SageMaker session and train_file variables.

In [None]:
# Start the autoencoder training job, pointing its 'training' channel at the CSV
# uploaded to the session's default bucket.
training_channels = {'training': f's3://{sagemaker_session.default_bucket()}/{train_file}'}
autoencoder.fit(training_channels)

# [**TODO**] this section is not ready yet 
# Step 5: Deploy Models for a real-time detection endpoint

In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorchModel
from sagemaker.amazon.amazon_estimator import get_image_uri


In [None]:
# Assume we have already trained our PyTorch and RCF models
# Both estimators must have completed fit() in this kernel; model_data is read
# from each estimator object.
pytorch_model_data = autoencoder.model_data
rcf_model_data = rcf.model_data


In [None]:
# Wrap the trained autoencoder artifacts in a deployable model. The inference
# handlers live in scripts/inference.py, which is not part of this notebook.
pytorch_model = PyTorchModel(
    model_data=pytorch_model_data,  # Artifact location taken from autoencoder.model_data above
    role=role,  # Execution role resolved in the session setup cell
    entry_point='inference.py',
    source_dir='scripts',
    framework_version='1.8.0',  # Use the version that matches your training
    py_version='py3',
    sagemaker_session=sagemaker_session,
)

In [None]:
# Create model containers
# Build one container definition per model; both are passed to create_model below.
pytorch_container = pytorch_model.prepare_container_def(instance_type='ml.m5.xlarge')
rcf_container = rcf.create_model().prepare_container_def(instance_type='ml.m5.xlarge')

In [None]:
# Create the multi-model
# NOTE(review): per the CreateModel API, a `Containers` list without an
# InferenceExecutionConfig is presumably treated as a *serial* inference pipeline
# (the PyTorch output would be fed to RCF), not as independently addressable
# models -- confirm this is the intended topology.
# NOTE(review): the model name is fixed, so re-running this cell will likely fail
# while a model with this name still exists.
multi_model_name = 'multi-model-pytorch-rcf'
create_model_response = sagemaker_session.boto_session.client('sagemaker').create_model(
    ModelName=multi_model_name,
    ExecutionRoleArn=role,
    Containers=[pytorch_container,rcf_container]
)

In [None]:
# Create endpoint configuration
# Single production variant serving the model created above on one ml.m5.xlarge instance.
endpoint_config_name = 'multi-model-endpoint-config'
create_endpoint_config_response = sagemaker_session.boto_session.client('sagemaker').create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[{
        'InstanceType': 'ml.m5.xlarge',
        'InitialInstanceCount': 1,
        'ModelName': multi_model_name,
        'VariantName': 'AllTraffic'
    }]
)

In [None]:
# Create endpoint
endpoint_name = 'multi-model-pytorch-rcf-endpoint'
endpoint_client = sagemaker_session.boto_session.client('sagemaker')
create_endpoint_response = endpoint_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name
)

print(f"Endpoint '{endpoint_name}' is being created...")

# create_endpoint only starts provisioning and returns immediately. Block until the
# endpoint is InService so that the inference cell that follows does not hit an
# endpoint that is still being created.
endpoint_client.get_waiter('endpoint_in_service').wait(EndpointName=endpoint_name)
print(f"Endpoint '{endpoint_name}' is in service.")

In [None]:
import sagemaker
import numpy as np
from sagemaker.predictor import Predictor
from sagemaker.serializers import NumpySerializer
from sagemaker.deserializers import NumpyDeserializer

# Set up the SageMaker session
# Note: this rebinds `sagemaker_session` to a fresh default session, replacing the
# one created in the session setup cell.
sagemaker_session = sagemaker.Session()

# Define your endpoint name
endpoint_name = 'multi-model-pytorch-rcf-endpoint'

# Create a Predictor object
# Requests and responses are (de)serialized as NumPy arrays.
predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=NumpySerializer(),
    deserializer=NumpyDeserializer()
)

# Function to perform inference
def perform_inference(data):
    """Send `data` to the module-level `predictor` and return its response.

    Args:
        data: Feature values as a NumPy array or anything `np.asarray` accepts
            (e.g. a list of floats). A 1-D input is treated as a single record
            and reshaped to one row.

    Returns:
        Whatever `predictor.predict` returns for the (at least 2-D) input.
    """
    # Coerce first so plain lists/tuples work too; ndarray inputs are passed through as-is.
    data = np.asarray(data)

    # Ensure the data is 2-dimensional
    if data.ndim == 1:
        data = data.reshape(1, -1)

    # Perform prediction
    return predictor.predict(data)

# Example: Inference
# NOTE(review): `data` is built here but never used -- the call below scores the
# full `df_scaled` matrix instead. Confirm which input was intended.
data = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
result = perform_inference(df_scaled)
print("Inference Result:", result)

# If you're using an RCF model, you might want to use a 2D array instead:
# data = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]], dtype=np.float32)
# result = perform_inference(data)
# print("Inference Result:", result)

# Step 5: Deploy the Model for real-time detection

The code deploys the trained Random Cut Forest (RCF) model to a SageMaker endpoint for real-time fraud detection.

In [None]:

# Step 5: Deploy the Model for real-time detection
import time
# Deploy the trained Random Cut Forest estimator. The timestamp suffix keeps the
# endpoint name unique per run; the 'loyalty-abuse-prediction' prefix is what the
# cleanup function at the end of the notebook matches on.
endpoint_name=f'loyalty-abuse-prediction-rfc-{int(time.time())}'
predictor_1 = rcf.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    endpoint_name=endpoint_name,
)


This code block sets up an Amazon SageMaker PyTorchModel for deployment and creates an endpoint for making inferences.

In [None]:
import time
from sagemaker.pytorch import PyTorchModel
import boto3
from botocore.config import Config

# Wrap the trained autoencoder artifacts in a deployable model. The inference
# handlers live in scripts/inference.py, which is not part of this notebook.
model = PyTorchModel(
    model_data=autoencoder.model_data,  # Artifact location reported by the trained estimator
    role=role,  # Execution role resolved in the session setup cell
    entry_point='inference.py',
    source_dir='scripts',
    framework_version='1.8.0',  # Use the version that matches your training
    py_version='py3',
    sagemaker_session=sagemaker_session,
)

# Timestamp suffix keeps the endpoint name unique per run; the prefix is what the
# cleanup function at the end of the notebook matches on.
endpoint_name=f'loyalty-abuse-prediction-autoencoder-{int(time.time())}'
predictor_2 = model.deploy(
                             initial_instance_count=1,
                             endpoint_name=endpoint_name,
                             instance_type='ml.m4.xlarge')

#Increase the timeout to 120
# The runtime client is replaced on predictor_2's session object, so the longer
# read/connect timeouts apply to every call made through that session.
config = Config(read_timeout=120, connect_timeout=120)
predictor_2.sagemaker_session.sagemaker_runtime_client = boto3.client('sagemaker-runtime', config=config)

# Step 6: Detect Anomalies
The code block creates two copies of the original dataframe, computes anomaly scores using a pre-trained predictor, adds the anomaly scores as a new column to one of the dataframes, and flags the top 5% anomalies as suspicious.

In [None]:

# Step 6: Detect Anomalies
# Use the deployed model to predict anomalies in the dataset

# Give each model its own copy of the user data. Plain assignment would only alias
# `df`, so the autoencoder's 'anomaly_score' / 'is_suspicious' columns would
# overwrite the RCF ones (and both would leak into `df` itself).
df_rfc = df.copy()
df_autoencoder = df.copy()

# Score every row with the RCF endpoint and pull the scalar score out of each record.
anomaly_scores = predictor_1.predict(df_scaled)
anomalies = np.array([record.label['score'].float32_tensor.values[0] for record in anomaly_scores])

# Add anomaly scores to the RCF copy of the dataframe
df_rfc['anomaly_score'] = anomalies

# Flag top 5% anomalies
df_rfc['is_suspicious'] = df_rfc['anomaly_score'] > np.percentile(df_rfc['anomaly_score'], 95)


This code cell calculates anomaly scores with the autoencoder endpoint, sets a threshold at the mean plus two standard deviations of the scores, and flags the instances whose score exceeds that threshold as suspicious anomalies in the dataset.

In [None]:
# Score every row with the autoencoder endpoint.
# NOTE(review): presumably one reconstruction-error value per row -- depends on
# scripts/inference.py, which is not visible here.
anomaly_scores = predictor_2.predict(df_scaled)

# Anomaly cut-off: two standard deviations above the mean score.
threshold = np.mean(anomaly_scores) + 2 * np.std(anomaly_scores)

# Store the scores as a flat column.
df_autoencoder['anomaly_score'] = anomaly_scores.flatten()

# Boolean mask of rows above the cut-off, flattened to 1D to match the column shape.
anomalies = (anomaly_scores > threshold).flatten()

# Flag rows whose score exceeds the threshold
df_autoencoder['is_suspicious'] = anomalies


# Step 8: Statistical Analysis
This code performs statistical analysis on the anomaly scores generated by the fraud detection model.

In [None]:

# Step 8: Statistical Analysis
print("\nSummary statistics for the anomaly scores(RFC algorithm):")
print(df_rfc['anomaly_score'].describe())

# Perform statistical test to assess anomaly score distribution (e.g., normality)
# SciPy documents that the Shapiro-Wilk p-value may be inaccurate for N > 5000 (and
# warns about it), so test a fixed-seed random sample of at most 5000 scores.
shapiro_max_n = 5000
rfc_scores = df_rfc['anomaly_score']
rfc_sample = rfc_scores.sample(n=min(len(rfc_scores), shapiro_max_n), random_state=42)
stat, p_value = stats.shapiro(rfc_sample)
print(f"\nShapiro-Wilk test for normality: stat={stat}, p-value={p_value}")
if p_value > 0.05:
    print("Anomaly scores are normally distributed.")
else:
    print("Anomaly scores are not normally distributed.")


In [None]:
print("\nSummary statistics for the anomaly scores(Autoencoder):")
print(df_autoencoder['anomaly_score'].describe())

# Perform statistical test to assess anomaly score distribution (e.g., normality)
# SciPy documents that the Shapiro-Wilk p-value may be inaccurate for N > 5000 (and
# warns about it), so test a fixed-seed random sample of at most 5000 scores.
shapiro_max_n = 5000
autoencoder_scores = df_autoencoder['anomaly_score']
autoencoder_sample = autoencoder_scores.sample(
    n=min(len(autoencoder_scores), shapiro_max_n), random_state=42
)
stat, p_value = stats.shapiro(autoencoder_sample)
print(f"\nShapiro-Wilk test for normality: stat={stat}, p-value={p_value}")
if p_value > 0.05:
    print("Anomaly scores are normally distributed.")
else:
    print("Anomaly scores are not normally distributed.")


# Step 9: Visualize Anomalies
The next cells visualize the distribution of anomaly scores and highlight the suspicious users based on their anomaly scores and user IDs.

In [None]:

# Step 9: Visualize Anomalies
# Histogram (with KDE overlay) of the scores stored in df_rfc.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_rfc['anomaly_score'], bins=50, kde=True, ax=ax)
ax.set_title("Anomaly Score Distribution")
plt.show()

# Plot suspicious users
# Only rows flagged as suspicious are shown, one point per user_id.
suspicious_df = df_rfc.loc[df_rfc['is_suspicious']]
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=suspicious_df, x='user_id', y='anomaly_score', ax=ax)
ax.set_title("Suspicious Users Based on Anomaly Scores")
plt.show()

In [None]:
# Histogram (with KDE overlay) of the scores stored in df_autoencoder.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_autoencoder['anomaly_score'], bins=50, kde=True, ax=ax)
ax.set_title("Anomaly Score Distribution")
plt.show()

# Plot suspicious users
# Only rows flagged as suspicious are shown, one point per user_id.
suspicious_df = df_autoencoder.loc[df_autoencoder['is_suspicious']]
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=suspicious_df, x='user_id', y='anomaly_score', ax=ax)
ax.set_title("Suspicious Users Based on Anomaly Scores")
plt.show()

# Step 10: Delete Sagemaker Endpoints
This code cell defines a function called delete_sagemaker_endpoints() that uses the boto3 library to list and delete any existing SageMaker endpoints with names starting with 'loyalty-abuse-prediction'.

In [None]:
# Step 10: Delete Sagemaker Endpoints
import boto3
def delete_sagemaker_endpoints(prefix='loyalty-abuse-prediction'):
    """Delete every SageMaker endpoint whose name starts with `prefix`.

    Args:
        prefix: Endpoint-name prefix to match. Defaults to the prefix used by the
            deployment cells of this notebook.

    Only the endpoints themselves are deleted; nothing else is cleaned up here.
    """
    sm = boto3.client('sagemaker')

    # list_endpoints is paginated; walk every page so that matching endpoints beyond
    # the first page are not silently left running.
    paginator = sm.get_paginator('list_endpoints')
    matching_names = [
        endpoint['EndpointName']
        for page in paginator.paginate()
        for endpoint in page['Endpoints']
        if endpoint['EndpointName'].startswith(prefix)
    ]

    # Delete after the listing is complete so deletions cannot disturb pagination.
    for sagemaker_endpoint_name in matching_names:
        print(f'Deleting endpoint: {sagemaker_endpoint_name}')
        sm.delete_endpoint(EndpointName=sagemaker_endpoint_name)




This code snippet uses the AWS SDK for Python (Boto3) to list all the existing SageMaker endpoints in your AWS account.

In [None]:
# Show the endpoints returned by a single list_endpoints call.
# NOTE(review): the API is paginated, so this presumably shows only the first page
# when the account has many endpoints -- confirm if a complete listing is needed.
sm = boto3.client('sagemaker')
endpoints = sm.list_endpoints()['Endpoints']
endpoints

Delete the active endpoints that start with 'loyalty-abuse-prediction'

In [None]:
# Remove the endpoints created by this notebook (names starting with 'loyalty-abuse-prediction').
delete_sagemaker_endpoints()