In [None]:
!pip install seaborn
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
# Simulation parameters
num_parties = 100000
lookback_months = 24

# Core observation window
core_start_date = datetime(2020, 1, 1)
core_end_date = datetime(2023, 1, 1)

# The lookback window is approximated with 30-day months
# (24 * 30 = 720 days before the core start, not 24 calendar months).
lookback_start_date = core_start_date - lookback_months * timedelta(days=30)
# Helper function to generate random dates
def random_date(start, end):
    """Return ``start`` shifted forward by a random whole number of days.

    The offset is drawn uniformly from ``0 .. (end - start).days - 1`` using
    NumPy's global random state, so the result keeps ``start``'s time of day
    and is always strictly before ``end``.

    Args:
        start: Lower bound (inclusive) of the range.
        end: Upper bound (exclusive) of the range.

    Returns:
        ``start`` plus the random day offset. If the range spans less than
        one whole day, ``start`` itself is returned and no random number is
        consumed.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    span_days = (end - start).days
    if span_days < 0:
        raise ValueError(f"end ({end}) must not be earlier than start ({start})")
    if span_days == 0:
        # np.random.randint(0, 0) would raise "low >= high"; the only
        # whole-day offset that fits in the range is zero.
        return start
    return start + timedelta(days=int(np.random.randint(0, span_days)))

# Generate parties data
np.random.seed(42)  # For reproducibility

# Columns are built one at a time, in this order, because every random column
# consumes the seeded global RNG; reordering them would change the output.
party_columns = {}
party_columns['party_id'] = [f'PARTY_{i}' for i in range(num_parties)]
party_columns['validity_start_time'] = [
    random_date(lookback_start_date, core_end_date) for _ in range(num_parties)
]
party_columns['is_entity_deleted'] = np.random.choice(
    [True, False], num_parties, p=[0.1, 0.9]
)
party_columns['source_system'] = np.random.choice(
    ['CRM', 'ERP', 'External'], num_parties
)
party_columns['type'] = np.random.choice(
    ['Individual', 'Business'], num_parties, p=[0.7, 0.3]
)
parties = pd.DataFrame(party_columns)


# Display the first 100 records of the 'parties' table
print("\nTop 100 records from 'parties' table:")
print(parties.head(100))
# Generate transactions data
# Convert the id column to a NumPy array once. Passing the Series itself to
# np.random.choice repeats the same Series -> array conversion on every call
# (400k+ times here). Drawing from the array consumes the global RNG in
# exactly the same way, so the generated data is unchanged.
party_ids = parties['party_id'].to_numpy()

transactions = []
for i in range(num_parties * 2):  # Assuming ~2 transactions per party on average
    source = np.random.choice(party_ids)
    dest = np.random.choice(party_ids)
    while source == dest:  # Ensure no self-transactions (redraw the destination)
        dest = np.random.choice(party_ids)
    transactions.append({
        'transaction_id': f'TXN_{i}',
        'source_party_id': source,
        'destination_party_id': dest,
        # Uniform amount in [10, 100000), rounded to cents
        'amount': round(np.random.uniform(10, 100000), 2),
        # Transactions cover the lookback period as well as the core window
        'timestamp': random_date(lookback_start_date, core_end_date)
    })
transactions_df = pd.DataFrame(transactions)


print("\nTop 100 records from 'transactions' table:")
print(transactions_df.head(100))

# Visualizations

# 1. Distribution of transaction amounts (histogram with a KDE overlay)
plt.figure(figsize=(10, 6))
amount_ax = sns.histplot(transactions_df['amount'], bins=50, kde=True, color='blue')
amount_ax.set_title('Distribution of Transaction Amounts')
amount_ax.set_xlabel('Amount')
amount_ax.set_ylabel('Frequency')
plt.show()

# 2. Party count by source system
plt.figure(figsize=(8, 6))
source_ax = sns.countplot(data=parties, x='source_system', palette='Set2')
source_ax.set_title('Count of Parties by Source System')
source_ax.set_xlabel('Source System')
source_ax.set_ylabel('Count')
plt.show()
# Generate risk cases
# For every month start between lookback_start_date and core_end_date, label
# a handful of parties as risky ("highest_income", "structuring_activity") and
# 300 randomly drawn parties as "negative". Every random draw below goes
# through NumPy's global random state, so statement order determines the output.
risk_cases = []
months = pd.date_range(lookback_start_date, core_end_date, freq='MS')
for month_start in months:
    # Generate positive cases
    # Sum incoming amounts per destination party over the 60 days that
    # precede month_start (window is [month_start - 60d, month_start)).
    positive_cases = transactions_df[
        (transactions_df['timestamp'] >= month_start - timedelta(days=60)) &
        (transactions_df['timestamp'] < month_start)
    ].groupby('destination_party_id').agg({'amount': 'sum'}).reset_index()
    positive_cases = positive_cases.nlargest(10, 'amount')  # Top 10 parties by total received amount
    # NOTE(review): sampled from the whole transactions table, with no date
    # filter and no amount pattern -- the picks are unrelated to month_start.
    structuring_cases = transactions_df.sample(10)  # Random structuring activity

    for case in positive_cases.itertuples():
        risk_cases.append({
            'party_id': case.destination_party_id,
            'risk_type': 'highest_income',
            'risk_date': month_start
        })
    for case in structuring_cases.itertuples():
        risk_cases.append({
            'party_id': case.source_party_id,
            'risk_type': 'structuring_activity',
            'risk_date': month_start
        })

    # Generate negative cases
    # Parties are drawn with replacement from all parties; nothing excludes
    # parties that were labelled as risky above.
    for _ in range(300):
        risk_cases.append({
            'party_id': np.random.choice(parties['party_id']),
            'risk_type': 'negative',
            'risk_date': month_start
        })

    # Apply 0.1% chance for opposite state
    # NOTE(review): this check runs once per month, not once per case. The
    # popped element is always the last 'negative' case appended just above,
    # so the else-branch of the conditional below is never taken, and the
    # resulting label 'positive' is not used by any other case. Confirm
    # whether a per-case flip was intended.
    if np.random.random() <= 0.001:
        opposite_case = risk_cases.pop()
        opposite_case['risk_type'] = (
            'positive' if opposite_case['risk_type'] == 'negative' else 'negative'
        )
        risk_cases.append(opposite_case)

risk_cases_df = pd.DataFrame(risk_cases)


print("\nTop 100 records from 'risk_cases' table:")
print(risk_cases_df.head(100))


# 3. Risk types count (bars ordered from most to least frequent)
risk_type_order = risk_cases_df['risk_type'].value_counts().index
plt.figure(figsize=(8, 6))
risk_ax = sns.countplot(
    data=risk_cases_df,
    x='risk_type',
    order=risk_type_order,
    palette='muted',
)
risk_ax.set_title('Risk Types Count')
risk_ax.set_xlabel('Risk Type')
risk_ax.set_ylabel('Count')
plt.show()
# Additional visualizations

# 4. Transaction trends over time
# NOTE(review): this adds a 'month' column to transactions_df itself, so the
# column is also present in any later export of that frame -- confirm that
# is intended.
transactions_df['month'] = transactions_df['timestamp'].dt.to_period('M')
monthly_transactions = transactions_df.groupby('month').size()

plt.figure(figsize=(12, 6))
monthly_transactions.plot.line(
    title='Monthly Transactions Trend',
    ylabel='Number of Transactions',
    xlabel='Month',
)
plt.show()

# 5. Party type distribution
plt.figure(figsize=(8, 6))
type_ax = sns.countplot(data=parties, x='type', palette='pastel')
type_ax.set_title('Party Type Distribution')
type_ax.set_xlabel('Type')
type_ax.set_ylabel('Count')
plt.show()
# Save each table to a CSV file in the working directory (no index column)
parties_csv, transactions_csv, risk_cases_csv = (
    "parties.csv",
    "transactions.csv",
    "risk_cases.csv",
)

for frame, csv_path in (
    (parties, parties_csv),
    (transactions_df, transactions_csv),
    (risk_cases_df, risk_cases_csv),
):
    frame.to_csv(csv_path, index=False)


# Upload to S3
def upload_to_s3(file_name, bucket_name, object_name=None):
    """Upload a local file to an S3 bucket and print a confirmation line.

    Args:
        file_name: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        object_name: Key to store the file under; when ``None`` the local
            ``file_name`` is used as the key.
    """
    s3 = boto3.client('s3')
    key = file_name if object_name is None else object_name
    s3.upload_file(file_name, bucket_name, key)
    print(f"Uploaded {file_name} to S3 bucket {bucket_name} as {key}")

# Destination bucket for the generated files
bucket_name = "fraud-isv-samples-usecases"

# Each CSV is stored under its local file name as the object key
for csv_file in (parties_csv, transactions_csv, risk_cases_csv):
    upload_to_s3(csv_file, bucket_name)

# Row-count summary of everything that was generated
print("Synthetic data generation, visualization, and upload completed:")
print(f"- Parties: {len(parties)}")
print(f"- Transactions: {len(transactions_df)}")
print(f"- Risk Cases: {len(risk_cases_df)}")
#Create Manifest Files for Data Exploratory Dashboard

import json

# Create manifest files for each dataset
def create_manifest(file_name, s3_bucket_name):
    """Write a JSON manifest describing one CSV object in S3.

    The manifest uses the ``fileLocations`` / ``globalUploadSettings`` layout
    of Amazon QuickSight S3 manifest files and points at
    ``s3://<s3_bucket_name>/<file_name>``.

    Args:
        file_name: Name of the CSV file; used both as the S3 key in the URI
            and to derive the manifest's local file name.
        s3_bucket_name: Bucket that holds (or will hold) the CSV.

    Returns:
        The local path of the manifest file that was written:
        ``<name>.manifest.json`` for ``<name>.csv``, otherwise
        ``<file_name>.manifest.json``.
    """
    manifest_data = {
        "fileLocations": [
            {
                "URIs": [
                    f"s3://{s3_bucket_name}/{file_name}"
                ]
            }
        ],
        "globalUploadSettings": {
            "format": "CSV",
            "delimiter": ",",
            # The CSVs in this file are written by pandas' to_csv, which
            # quotes fields with double quotes; a single-quote qualifier
            # would make quoted fields parse incorrectly.
            "textqualifier": "\"",
            "containsHeader": "true"
        }
    }

    # Swap only a trailing ".csv". str.replace would rewrite every ".csv"
    # occurrence and, for a name without that suffix, return the name
    # unchanged -- so the manifest would overwrite the data file itself.
    if file_name.endswith(".csv"):
        base_name = file_name[:-len(".csv")]
    else:
        base_name = file_name
    manifest_file = f"{base_name}.manifest.json"

    with open(manifest_file, "w") as f:
        json.dump(manifest_data, f, indent=4)
    print(f"Manifest file created: {manifest_file}")

    return manifest_file

# Generate manifest files
# bucket_name is deliberately NOT reassigned here: the manifests must point at
# the same bucket the CSVs were uploaded to. Overwriting it with a placeholder
# made every manifest URI reference a different bucket than the data.
parties_manifest = create_manifest(parties_csv, bucket_name)
transactions_manifest = create_manifest(transactions_csv, bucket_name)
risk_cases_manifest = create_manifest(risk_cases_csv, bucket_name)

# Upload manifest files to S3 (once; a verbatim second create-and-upload pass
# that only repeated the same work has been removed)
upload_to_s3(parties_manifest, bucket_name)
upload_to_s3(transactions_manifest, bucket_name)
upload_to_s3(risk_cases_manifest, bucket_name)

print("Manifest files created and uploaded to S3.")





In [None]:
#new code 
import json
import boto3

# Prepare Neptune bulk loader manifest
def create_neptune_manifest(file_name, s3_bucket_name, s3_prefix, region):
    """Write a one-entry JSON descriptor for a CSV stored under an S3 prefix.

    The file contains a single-element list whose entry records the object's
    S3 path (``s3://<bucket>/<prefix>/<file_name>``), the CSV format settings
    and the region.

    NOTE(review): the Neptune bulk loader API takes the S3 URI of the data
    itself as its source -- confirm that a JSON descriptor in this shape is
    actually consumed by anything downstream.

    Args:
        file_name: Name of the CSV file; also used to derive the local name
            of the descriptor.
        s3_bucket_name: Bucket holding the CSV.
        s3_prefix: Key prefix ("folder") the CSV is expected under.
        region: AWS Region recorded in the descriptor.

    Returns:
        The local path of the descriptor that was written:
        ``<name>.neptune.manifest.json`` for ``<name>.csv``, otherwise
        ``<file_name>.neptune.manifest.json``.
    """
    s3_path = f"s3://{s3_bucket_name}/{s3_prefix}/{file_name}"
    manifest_data = {
        "fileName": s3_path,
        "fileFormat": "csv",
        "delimiter": ",",
        "headerRow": True,
        "region": region,
    }

    # Swap only a trailing ".csv". str.replace would leave a name without
    # that suffix unchanged, and the descriptor would then overwrite the
    # data file itself.
    if file_name.endswith(".csv"):
        base_name = file_name[:-len(".csv")]
    else:
        base_name = file_name
    manifest_file = f"{base_name}.neptune.manifest.json"

    with open(manifest_file, "w") as f:
        json.dump([manifest_data], f, indent=4)
    print(f"Neptune manifest file created: {manifest_file}")
    return manifest_file

# Settings for the Neptune manifests
region = "us-east-1"  # Replace with your AWS Region
s3_prefix = "neptune-data"  # Define a folder for Neptune in your S3 bucket

# Build one manifest per CSV, in the order parties, transactions, risk cases
(
    parties_neptune_manifest,
    transactions_neptune_manifest,
    risk_cases_neptune_manifest,
) = (
    create_neptune_manifest(csv_name, bucket_name, s3_prefix, region)
    for csv_name in (parties_csv, transactions_csv, risk_cases_csv)
)

# Upload each manifest under the Neptune prefix
for neptune_manifest in (
    parties_neptune_manifest,
    transactions_neptune_manifest,
    risk_cases_neptune_manifest,
):
    upload_to_s3(neptune_manifest, bucket_name, f"{s3_prefix}/{neptune_manifest}")

# Endpoint used when starting the Neptune bulk load
neptune_cluster_endpoint = "your-neptune-endpoint"  # Replace with your Neptune cluster endpoint

def initiate_bulk_load(neptune_endpoint, manifest_s3_path, iam_role_arn, s3_bucket_region=None):
    """Start a Neptune bulk-loader job and return the service response.

    Args:
        neptune_endpoint: Neptune cluster endpoint. Either a full URL
            (``https://host:8182``) or a bare host name; a bare host gets
            ``https://`` and, if no port is given, Neptune's default port
            8182.
        manifest_s3_path: S3 URI passed to the loader as ``source``.
        iam_role_arn: ARN of the IAM role Neptune assumes to read from S3.
        s3_bucket_region: Region of the S3 bucket. Defaults to the region the
            boto3 client resolved from the environment.

    Returns:
        The ``start_loader_job`` response dictionary.
    """
    if "://" in neptune_endpoint:
        endpoint_url = neptune_endpoint
    else:
        host = neptune_endpoint if ":" in neptune_endpoint else f"{neptune_endpoint}:8182"
        endpoint_url = f"https://{host}"

    # The boto3 service name is 'neptunedata' ('neptune-data' is not a known
    # service), and the data-plane client has to be pointed at the cluster
    # endpoint explicitly -- the endpoint argument was previously ignored.
    neptune_client = boto3.client('neptunedata', endpoint_url=endpoint_url)
    response = neptune_client.start_loader_job(
        source=manifest_s3_path,
        format="csv",
        # s3BucketRegion is a required parameter of start_loader_job.
        s3BucketRegion=s3_bucket_region or neptune_client.meta.region_name,
        iamRoleArn=iam_role_arn,
    )
    print("Bulk load initiated:", response)
    return response

# IAM role passed to the bulk loader -- replace with your own role ARN
neptune_iam_role_arn = "arn:aws:iam::123456789012:role/YourNeptuneIAMRole"

# S3 URIs of the three uploaded Neptune manifests
manifest_paths = [
    f"s3://{bucket_name}/{s3_prefix}/{manifest_name}"
    for manifest_name in (
        parties_neptune_manifest,
        transactions_neptune_manifest,
        risk_cases_neptune_manifest,
    )
]

# Start one bulk-load job per manifest
for manifest_path in manifest_paths:
    initiate_bulk_load(neptune_cluster_endpoint, manifest_path, neptune_iam_role_arn)

print("Data import to Neptune initiated.")
