In [39]:
import numpy as np
import pandas as pd
import os
import random
import time

In [3]:
def create_customers_table(num_customers: int, random_state: int = 0) -> pd.DataFrame:
    np.random.seed(random_state)
    
    rows = []
    
    for i in range(num_customers):
        x_coord = np.random.uniform(0, 100)
        y_coord = np.random.uniform(0, 100)
        
        mean_spent = np.random.uniform(5, 100)
        std_spent = mean_spent/2
        
        mean_daily_txs = np.random.uniform(0, 4)

        rows.append([i, x_coord, y_coord, mean_spent, std_spent, mean_daily_txs])
    
    return pd.DataFrame(rows, columns=['customer_id', 'x_coord', 'y_coord', 'mean_spent', 'std_spent', 'mean_daily_txs'])

In [5]:
def create_terminal_table(num_terminals: int, random_state: int = 0) -> pd.DataFrame:
    np.random.seed(random_state)
    
    rows = []
    
    for i in range(num_terminals):
        x_coord = np.random.uniform(0, 100)
        y_coord = np.random.uniform(0, 100)
        
        rows.append([i, x_coord, y_coord])
    
    return pd.DataFrame(rows, columns=['terminal_id', 'x_coord', 'y_coord'])

In [None]:
def get_terminals_near_customer(customer: pd.Series, terminals: pd.DataFrame, range: int = 10) -> np.ndarray:
    return terminals[(terminals['x_coord'] - customer['x_coord'])**2 + (terminals['y_coord'] - customer['y_coord'])**2 <= range**2]['terminal_id'].values

In [28]:
def generate_transactions(customer: pd.Series, start_date: pd.Timestamp, n_days: int) -> pd.DataFrame:
    random.seed(int(customer['customer_id']))
    np.random.seed(int(customer['customer_id']))
    
    customer_transactions = []
    
    if (len(customer["avail_terminals"]) == 0):
        return pd.DataFrame(customer_transactions)
    
    for day in range(n_days):
        n_transactions = np.random.poisson(customer['mean_daily_txs'])
        
        if (n_transactions == 0):
            continue
        
        for i in range(n_transactions):
            time_since_start_date = int(np.random.normal(86400/2, 20000))
            
            if (time_since_start_date < 0 or time_since_start_date > 86400):
                continue
            
            amount = np.random.normal(customer['mean_spent'], customer['std_spent'])
            
            if (amount < 0):
                amount = np.random.uniform(0, customer['mean_spent']**2)
            
            amount = np.round(amount, 2)
            
            terminal_id = random.choice(customer["avail_terminals"])
            
            customer_transactions.append([customer['customer_id'],
                                          terminal_id,
                                          amount,
                                          day,
                                          time_since_start_date + day*86400])
    
    customer_transactions = pd.DataFrame(customer_transactions, columns=["customer_id", "terminal_id", "amount", "days_since_start_date", "time_since_start_date"])
    
    if (len(customer_transactions) > 0):
        customer_transactions["transaction_datetime"] = pd.to_datetime(customer_transactions["time_since_start_date"], unit='s', origin=start_date)
    
    return customer_transactions

In [65]:
def generate_dataset(num_customers: int, num_terminals: int, num_days: int, start_date: pd.Timestamp, radius: int) -> tuple:
    customers = create_customers_table(num_customers, random_state=0)
    terminals = create_terminal_table(num_terminals, random_state=1)
    
    customers["avail_terminals"] = customers.apply(get_terminals_near_customer, axis=1, args=(terminals, radius))
    
    transactions = customers.groupby("customer_id").apply(
        lambda x : generate_transactions(x.iloc[0], start_date, num_days)
        ).reset_index(drop=True)
    
    transactions.sort_values("transaction_datetime", inplace=True)
    transactions.reset_index(drop=True, inplace=True)
    transactions.reset_index(inplace=True)
    
    transactions.rename(columns={"index": "transaction_id"}, inplace=True)
    
    return (customers, terminals, transactions)

In [None]:
def add_frauds(customers: pd.DataFrame, terminals: pd.DataFrame, transactions: pd.DataFrame) -> pd.DataFrame:
    transactions["fraud"] = 0
    transactions["fraud_type"] = 0
    
    # Scenario 1 - all transactions that exceed 220 spent are fraudulent
    transactions.loc[transactions["amount"] > 220, "fraud"] = 1
    transactions.loc[transactions["amount"] > 220, "fraud_type"] = 1
    
    # Scenario 2 - every day, two terminals are drawn randomly and all transactions on those
    # terminals are fraudulent for the next 28 days
    for day in range(transactions["days_since_start_date"].max()):
        fraudulent_terminals = terminals["terminal_id"].sample(2, random_state=day)
        
        fraudulent_transactions = transactions[(transactions["days_since_start_date"] >= day) &
                                               (transactions["days_since_start_date"] < day + 28) &
                                               (transactions["terminal_id"].isin(fraudulent_terminals))]
        
        transactions.loc[fraudulent_transactions.index, "fraud"] = 1
        transactions.loc[fraudulent_transactions.index, "fraud_type"] = 2
    
    # Scenario 3 - every day, 3 customers are drawn randomly and for the next 14 days 1/3 of
    # their transactions are multiplied by 5 and marked as fraudulent
    for day in range(transactions["days_since_start_date"].max()):
        fraudulent_customers = customers["customer_id"].sample(3, random_state=day)
        
        fraudulent_transactions = transactions[(transactions["days_since_start_date"] >= day) &
                                               (transactions["days_since_start_date"] < day + 14) &
                                               (transactions["customer_id"].isin(fraudulent_customers))]
        
        n_fraudulent_transactions = len(fraudulent_transactions)
        
        random.seed(day)
        index_frauds = random.sample(list(fraudulent_transactions.index.values), k=int(n_fraudulent_transactions/3))
        
        transactions.loc[index_frauds, "amount"] *= 5
        transactions.loc[index_frauds, "fraud"] = 1
        transactions.loc[index_frauds, "fraud_type"] = 3
    
    return transactions

In [66]:
#start_date = pd.to_datetime(time.time(), unit='s')
start_date = pd.to_datetime("2018-04-01")
customers, terminals, transactions = generate_dataset(5000, 10000, 183, start_date, 5)

  transactions = customers.groupby("customer_id").apply(
  transactions = customers.groupby("customer_id").apply(


In [67]:
add_frauds(customers, terminals, transactions)

Unnamed: 0,transaction_id,customer_id,terminal_id,amount,days_since_start_date,time_since_start_date,transaction_datetime,fraud,fraud_type
0,0,596,3156,57.16,0,31,2018-04-01 00:00:31,0,0
1,1,4961,3412,81.51,0,130,2018-04-01 00:02:10,0,0
2,2,2,1365,146.00,0,476,2018-04-01 00:07:56,0,0
3,3,4128,8737,64.49,0,569,2018-04-01 00:09:29,0,0
4,4,927,9906,50.99,0,634,2018-04-01 00:10:34,0,0
...,...,...,...,...,...,...,...,...,...
1754167,1754167,161,655,54.24,182,15810996,2018-09-30 23:56:36,0,0
1754168,1754168,4342,6181,1.23,182,15811058,2018-09-30 23:57:38,0,0
1754169,1754169,618,1502,6.62,182,15811101,2018-09-30 23:58:21,0,0
1754170,1754170,4056,3067,55.40,182,15811192,2018-09-30 23:59:52,0,0


In [68]:
transactions.to_csv("datasets/transactions.csv", index=False)