In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Number of synthetic trade records to generate
num_records = 10000

# Seed NumPy's global (legacy) RNG so the draws below repeat on a fresh kernel.
# The ORDER of the np.random calls in this cell defines the dataset: inserting,
# removing or reordering a draw changes every value generated after it.
np.random.seed(42)

# --- Trade-level attributes -------------------------------------------------
# IDs are "T" + the 1-based record number, zero-padded to five digits.
trade_id = [f"T{str(i).zfill(5)}" for i in range(1, num_records + 1)]
trade_type = np.random.choice(['Buy', 'Sell'], size=num_records)
instrument_type = np.random.choice(['Equity', 'Bond', 'Derivative'], size=num_records)
# mean/sigma parameterise the underlying normal distribution of log(value).
trade_value = np.random.lognormal(mean=13, sigma=1, size=num_records)
# randint's upper bound is exclusive, so volumes fall in 1..99,999.
trade_volume = np.random.randint(1, 100000, size=num_records)
# Exclusive upper bound again: IDs range over CP1..CP49 (at most 49 counterparties).
counterparty_ids = [f"CP{np.random.randint(1, 50)}" for _ in range(num_records)]

# --- Counterparty-level attributes (one value per counterparty) --------------
# sorted() rather than list(set(...)): iteration order of a set of strings
# depends on Python's per-process hash randomisation, so without sorting the
# scores drawn below would be attached to different counterparties on every
# kernel restart even though the NumPy seed is fixed.
unique_counterparties = sorted(set(counterparty_ids))
counterparty_risk_scores = {cp: np.random.uniform(0, 1) for cp in unique_counterparties}
# Kept under its own name so the per-counterparty lookup is not overwritten by
# the per-trade list built below.
counterparty_failure_counts = {cp: np.random.randint(0, 10) for cp in unique_counterparties}

# Broadcast the per-counterparty values onto each trade.
counterparty_risk_score = [counterparty_risk_scores[cp] for cp in counterparty_ids]
counterparty_failures = [counterparty_failure_counts[cp] for cp in counterparty_ids]

# A trade fails with probability failures / 20, i.e. between 0 (no recorded
# failures) and 0.45 (nine recorded failures) depending on its counterparty.
settlement_status = [
    'Fail' if np.random.rand() < (cf / 20) else 'Success' for cf in counterparty_failures
]

# --- Independent per-trade features -----------------------------------------
# Normal(48, 10); presumably hours -- the unit is not stated anywhere in this cell.
settlement_duration = np.random.normal(48, 10, size=num_records)
market_volatility = np.random.uniform(0.1, 0.7, size=num_records)  # Increased max range to allow high volatility
liquidity = np.random.uniform(0.01, 0.1, size=num_records)
processing_time = np.random.randint(1, 60, size=num_records)  # 1..59
manual_intervention = np.random.choice([0, 1], size=num_records, p=[0.7, 0.3])
# Duration plus integer jitter in -5..4 (upper bound of randint is exclusive).
time_to_settle = settlement_duration + np.random.randint(-5, 5, size=num_records)
# NOTE(review): this is failures / 100, which differs from the failures / 20
# probability actually used to draw settlement_status above -- confirm intent.
counterparty_failure_rate = np.array(counterparty_failures) / 100
currencies = np.random.choice(['USD', 'EUR', 'GBP', 'INR', 'JPY'], size=num_records)
settlement_type = np.random.choice(['FoP', 'AoP'], size=num_records)
client_ids = [f"CL{np.random.randint(1, 50)}" for _ in range(num_records)]  # CL1..CL49

# Build a fixed universe of 50 ISIN-like codes: "INE", a six-digit zero-padded
# sequence number, then one randomly chosen upper-case letter or digit.
isin_suffix_pool = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
known_isins = []
for seq in range(50):
    suffix = np.random.choice(isin_suffix_pool)
    known_isins.append(f"INE{seq:06d}{suffix}")

# Each trade references one code drawn uniformly from that universe.
isins = np.random.choice(known_isins, size=num_records)

# Generic failure reasons, listed from most to least likely.
failure_reasons_ordered = [
    'Account Difference',
    'Invalid Counterparty',
    'Currency Difference',
    'Quantity Difference',
    'Documentation Issue',
    'Client Instruction Missing',
    'Issues at the CSD',
]

# The raw weights add up to 0.85, so rescale them into a proper probability
# vector (np.random.choice requires p to sum to 1).
failure_weights = np.array([0.20, 0.15, 0.12, 0.11, 0.10, 0.09, 0.08])
failure_weights = failure_weights / failure_weights.sum()

# Trades above this volume get the funds / short-delivery failure reasons.
is_high_trade_volume = trade_volume > 80000

# One entry per trade; stays None for every trade that did not fail.
settlement_failure_reason = [None] * num_records

# Rules are evaluated in priority order and the first match wins:
#   1. high volume (> 80,000)  -> funds / short-delivery reason, by trade side
#   2. liquidity < 0.02        -> 'Market Closure'
#   3. volatility > 0.5        -> 'Technical Error'
#   4. otherwise               -> weighted draw from failure_reasons_ordered
# The loop is kept sequential (not vectorised) so the random draws happen in
# record order.
for i in range(num_records):
    if settlement_status[i] != 'Fail':
        continue

    if is_high_trade_volume[i]:
        if trade_type[i] == 'Buy':
            settlement_failure_reason[i] = np.random.choice(
                ['Insufficient Funds', 'Counterparty Short to Deliver'], p=[0.7, 0.3]
            )
        elif trade_type[i] == 'Sell':
            settlement_failure_reason[i] = np.random.choice(
                ['Insufficient Funds at the Counterparty', 'Client Short to Deliver'], p=[0.6, 0.4]
            )
    elif liquidity[i] < 0.02:
        settlement_failure_reason[i] = 'Market Closure'
    elif market_volatility[i] > 0.5:
        settlement_failure_reason[i] = 'Technical Error'
    else:
        settlement_failure_reason[i] = np.random.choice(failure_reasons_ordered, p=failure_weights)

# Sanity check of the rules above. Each special reason is only ever written
# under its own condition, and the generic pool contains none of them, so a
# separate "re-draw if wrongly assigned" pass per reason could never trigger.
# A single asserting pass is used instead so that a future change to the rules
# fails loudly rather than being silently patched over.
high_volume_reasons = (
    'Insufficient Funds',
    'Counterparty Short to Deliver',
    'Insufficient Funds at the Counterparty',
    'Client Short to Deliver',
)
for i, reason in enumerate(settlement_failure_reason):
    if reason is None:
        continue
    if reason == 'Market Closure':
        assert liquidity[i] < 0.02, f"record {i}: 'Market Closure' with liquidity >= 0.02"
    elif reason == 'Technical Error':
        assert market_volatility[i] > 0.5, f"record {i}: 'Technical Error' with volatility <= 0.5"
    elif reason in high_volume_reasons:
        assert trade_volume[i] > 80000, f"record {i}: '{reason}' with trade_volume <= 80,000"

# Matching flag: 95% 'Matched', 5% 'Not Matched', drawn independently per trade.
trade_matched = np.random.choice(['Matched', 'Not Matched'], size=num_records, p=[0.95, 0.05])

# Expected settlement date = 2024-01-01 plus a random offset of 0..364 days.
expected_dates_raw = []
for _ in range(num_records):
    offset_days = np.random.randint(0, 365)
    expected_dates_raw.append(datetime(2024, 1, 1) + timedelta(days=offset_days))
expected_settlement_dates = pd.to_datetime(expected_dates_raw)

# Successful trades settle on the expected date; every other trade settles
# 1..5 days later. The delay is only drawn for non-successful trades.
actual_settlement_dates = []
for expected, status in zip(expected_settlement_dates, settlement_status):
    if status == 'Success':
        actual_settlement_dates.append(expected)
    else:
        delay_days = np.random.randint(1, 6)
        actual_settlement_dates.append(expected + timedelta(days=delay_days))

# Whole-day gap between actual and expected settlement.
days_difference = [
    (actual - expected).days
    for actual, expected in zip(actual_settlement_dates, expected_settlement_dates)
]

# Column label -> generated values, listed in the order the columns should
# appear in the output file.
column_values = [
    ('Trade ID', trade_id),
    ('Trade Type', trade_type),
    ('Instrument Type', instrument_type),
    ('Trade Value', trade_value),
    ('Trade Volume', trade_volume),
    ('Counterparty ID', counterparty_ids),
    ('Counterparty Risk Score', counterparty_risk_score),
    ('Counterparty Failures', counterparty_failures),
    ('Settlement Status', settlement_status),
    ('Settlement Failure Reason', settlement_failure_reason),
    ('Settlement Duration', settlement_duration),
    ('Market Volatility', market_volatility),
    ('Liquidity', liquidity),
    ('Processing Time', processing_time),
    ('Manual Intervention', manual_intervention),
    ('Time to Settle', time_to_settle),
    ('Counterparty Failure Rate', counterparty_failure_rate),
    ('Currency', currencies),
    ('Settlement Type', settlement_type),
    ('Client ID', client_ids),
    ('ISIN', isins),
    ('Trade Matched or Not', trade_matched),
    ('Expected Settlement Date', expected_settlement_dates),
    ('Actual Settlement Date', actual_settlement_dates),
    ('Days Difference', days_difference),
]

# dict() keeps the pair order, so the DataFrame columns follow the list above.
data = dict(column_values)
df = pd.DataFrame(data)

# Write the dataset next to the notebook, without the positional index column.
df.to_csv('TradeSettlementDetails.csv', index=False)

print("Dataset created and saved as 'TradeSettlementDetails.csv'.")


Dataset created and saved as 'TradeSettlementDetails.csv'.


In [2]:
import pandas as pd
# Load the CSV written by the generation cell back from disk. No parse_dates
# argument is given, so the two settlement-date columns are not converted to
# datetimes by this call.
dataset = pd.read_csv(filepath_or_buffer='TradeSettlementDetails.csv')

# Bare last expression -> rich (truncated) display of the frame.
dataset

Unnamed: 0,Trade ID,Trade Type,Instrument Type,Trade Value,Trade Volume,Counterparty ID,Counterparty Risk Score,Counterparty Failures,Settlement Status,Settlement Failure Reason,...,Time to Settle,Counterparty Failure Rate,Currency,Settlement Type,Client ID,ISIN,Trade Matched or Not,Expected Settlement Date,Actual Settlement Date,Days Difference
0,T00001,Buy,Derivative,6.117879e+05,94974,CP15,0.197086,2,Success,,...,42.461085,0.02,JPY,AoP,CL24,INE000004E,Matched,2024-02-21,2024-02-21,0
1,T00002,Sell,Derivative,7.307262e+05,16986,CP8,0.458036,2,Success,,...,61.836701,0.02,INR,FoP,CL43,INE000026S,Not Matched,2024-09-11,2024-09-11,0
2,T00003,Buy,Derivative,1.317804e+05,92472,CP29,0.802380,7,Fail,Insufficient Funds,...,32.604111,0.07,EUR,AoP,CL7,INE000024D,Matched,2024-07-01,2024-07-03,2
3,T00004,Buy,Bond,3.243996e+05,21069,CP42,0.914575,2,Success,,...,42.245322,0.02,INR,AoP,CL11,INE000028L,Matched,2024-04-09,2024-04-09,0
4,T00005,Buy,Derivative,5.392381e+05,13532,CP39,0.153635,9,Success,,...,49.037923,0.09,INR,FoP,CL36,INE000031G,Matched,2024-03-19,2024-03-19,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,T09996,Sell,Derivative,7.986397e+05,6858,CP34,0.944301,6,Success,,...,53.863479,0.06,JPY,AoP,CL33,INE000011H,Matched,2024-11-05,2024-11-05,0
9996,T09997,Buy,Derivative,2.931886e+05,65604,CP14,0.744404,2,Success,,...,32.956419,0.02,EUR,AoP,CL17,INE0000385,Matched,2024-02-14,2024-02-14,0
9997,T09998,Sell,Equity,8.856600e+05,75400,CP36,0.325795,6,Fail,Documentation Issue,...,32.857306,0.06,EUR,AoP,CL33,INE000045O,Matched,2024-08-06,2024-08-09,3
9998,T09999,Sell,Bond,1.632633e+06,58087,CP48,0.639493,2,Success,,...,41.631755,0.02,EUR,FoP,CL35,INE000001N,Not Matched,2024-02-04,2024-02-04,0


In [3]:
# Print a summary of the reloaded frame: index range, column names,
# non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Trade ID                   10000 non-null  object 
 1   Trade Type                 10000 non-null  object 
 2   Instrument Type            10000 non-null  object 
 3   Trade Value                10000 non-null  float64
 4   Trade Volume               10000 non-null  int64  
 5   Counterparty ID            10000 non-null  object 
 6   Counterparty Risk Score    10000 non-null  float64
 7   Counterparty Failures      10000 non-null  int64  
 8   Settlement Status          10000 non-null  object 
 9   Settlement Failure Reason  1850 non-null   object 
 10  Settlement Duration        10000 non-null  float64
 11  Market Volatility          10000 non-null  float64
 12  Liquidity                  10000 non-null  float64
 13  Processing Time            10000 non-null  int6

In [4]:
# Count how many trades fall into each settlement status (class balance).
dataset['Settlement Status'].value_counts()

Settlement Status
Success    8150
Fail       1850
Name: count, dtype: int64

In [5]:
# Number of distinct values in each column (missing values are not counted
# by default).
dataset.nunique()

Trade ID                     10000
Trade Type                       2
Instrument Type                  3
Trade Value                  10000
Trade Volume                  9508
Counterparty ID                 49
Counterparty Risk Score         49
Counterparty Failures           10
Settlement Status                2
Settlement Failure Reason       13
Settlement Duration          10000
Market Volatility            10000
Liquidity                    10000
Processing Time                 59
Manual Intervention              2
Time to Settle               10000
Counterparty Failure Rate       10
Currency                         5
Settlement Type                  2
Client ID                       49
ISIN                            50
Trade Matched or Not             2
Expected Settlement Date       365
Actual Settlement Date         370
Days Difference                  6
dtype: int64