In [1]:
#!pip install pandarallel

Defaulting to user installation because normal site-packages is not writeable
Collecting pandarallel
  Downloading pandarallel-1.6.3.tar.gz (12 kB)
Collecting dill>=0.3.1
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[K     |████████████████████████████████| 110 kB 2.3 MB/s eta 0:00:01
Collecting psutil
  Downloading psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[K     |████████████████████████████████| 280 kB 13.9 MB/s eta 0:00:01
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25ldone
[?25h  Created wheel for pandarallel: filename=pandarallel-1.6.3-py3-none-any.whl size=16449 sha256=fb1fa613523ff3922aad5c3ef6eb416dd91dc3f41c6916fafdf8182bfadfa963
  Stored in directory: /home/ubuntu/.cache/pip/wheels/16/f5/91/d7efa7c4911ae1cf1aff825f902382eb69a1855f9987a7d17c
Successfully built pandarallel
Installing collected packages: dill, psutil, pandarallel
Su

In [2]:
import os
import sys
import pickle

import numpy as np
import pandas as pd

import datetime
import time

import random

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

from tqdm import tqdm
tqdm.pandas()

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
def generate_customer_profiles_table(n_customers, random_state=0):
    """Build a table of randomly drawn customer profiles.

    For every customer the following are drawn, in this order, from NumPy's
    global random generator:

    * ``x_customer_id``, ``y_customer_id``: uniform on [0, 100)
    * ``mean_amount``: uniform on [5, 100); ``std_amount`` is half of it
    * ``mean_nb_tx_per_day``: uniform on [0, 4)

    Parameters
    ----------
    n_customers : int
        Number of profiles to generate; ``CUSTOMER_ID`` runs from 0 to
        ``n_customers - 1``.
    random_state : int, default 0
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        *global* generator as a side effect.

    Returns
    -------
    pandas.DataFrame
        One row per customer with columns ``CUSTOMER_ID``, ``x_customer_id``,
        ``y_customer_id``, ``mean_amount``, ``std_amount`` and
        ``mean_nb_tx_per_day``.
    """
    profile_columns = ['CUSTOMER_ID',
                       'x_customer_id', 'y_customer_id',
                       'mean_amount', 'std_amount',
                       'mean_nb_tx_per_day']

    np.random.seed(random_state)

    def draw_profile(customer_id):
        # The order of the draws is part of the contract: changing it would
        # change the table produced for a given seed.
        x_coord = np.random.uniform(0, 100)
        y_coord = np.random.uniform(0, 100)
        mean_amount = np.random.uniform(5, 100)  # Arbitrary (but sensible) value
        tx_per_day = np.random.uniform(0, 4)     # Arbitrary (but sensible) value
        return [customer_id,
                x_coord, y_coord,
                mean_amount, mean_amount / 2,
                tx_per_day]

    profiles = [draw_profile(customer_id) for customer_id in range(n_customers)]

    return pd.DataFrame(profiles, columns=profile_columns)

In [4]:
def generate_terminal_profiles_table(n_terminals, random_state=0):
    """Build a table of randomly located payment terminals.

    Each terminal receives an ``(x, y)`` location whose coordinates are drawn
    uniformly on [0, 100) from NumPy's global random generator (x first,
    then y).

    Parameters
    ----------
    n_terminals : int
        Number of terminals to generate; ``TERMINAL_ID`` runs from 0 to
        ``n_terminals - 1``.
    random_state : int, default 0
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        *global* generator as a side effect.

    Returns
    -------
    pandas.DataFrame
        One row per terminal with columns ``TERMINAL_ID``, ``x_terminal_id``
        and ``y_terminal_id``.
    """
    terminal_columns = ['TERMINAL_ID',
                        'x_terminal_id', 'y_terminal_id']

    np.random.seed(random_state)

    def draw_terminal(terminal_id):
        # x is drawn before y; keep this order so a given seed reproduces
        # the same table.
        x_coord = np.random.uniform(0, 100)
        y_coord = np.random.uniform(0, 100)
        return [terminal_id, x_coord, y_coord]

    terminals = [draw_terminal(terminal_id) for terminal_id in range(n_terminals)]

    return pd.DataFrame(terminals, columns=terminal_columns)

In [5]:
def get_list_terminals_within_radius(customer_profile, x_y_terminals, r):
    """Find the terminals located strictly closer than ``r`` to a customer.

    Parameters
    ----------
    customer_profile : pandas.Series
        Customer row; only ``x_customer_id`` and ``y_customer_id`` are read.
    x_y_terminals : numpy.ndarray
        Array with one ``(x, y)`` row per terminal.
    r : float
        Radius; a terminal qualifies when its Euclidean distance to the
        customer is strictly less than ``r``.

    Returns
    -------
    list
        Row indices into ``x_y_terminals`` of the qualifying terminals, in
        increasing order. These coincide with terminal IDs only when the
        IDs are the row positions of ``x_y_terminals``.
    """
    # Work on plain numpy arrays to keep the computation vectorised
    customer_xy = customer_profile[['x_customer_id', 'y_customer_id']].values.astype(float)

    # Per-terminal coordinate offsets from the customer (broadcast over rows)
    offsets = customer_xy - x_y_terminals

    # Euclidean distance: square root of the row-wise sum of squared offsets
    distances = np.sqrt(np.sum(offsets * offsets, axis=1))

    # Row positions of the terminals lying strictly inside the radius
    (nearby_rows,) = np.where(distances < r)

    return list(nearby_rows)

In [6]:
def generate_transactions_table(customer_profile, start_date, nb_days = 10):
    """Simulate the transactions of one customer over ``nb_days`` days.

    Both ``random`` and ``np.random`` are reseeded with the customer's ID, so
    the output for a given profile is reproducible (and the global generators
    are modified as a side effect).

    For each day the number of transactions is Poisson distributed with mean
    ``mean_nb_tx_per_day``. Each transaction gets a time of day drawn from a
    normal distribution centred on noon (std 20000 s); draws that fall outside
    the day are discarded. The amount is drawn from
    ``Normal(mean_amount, std_amount)``; a negative draw is replaced by a
    uniform draw on ``[0, 2 * mean_amount)``. Amounts are rounded to 2
    decimals. The terminal is picked uniformly from ``available_terminals``.

    Parameters
    ----------
    customer_profile : pandas.Series
        Must expose ``CUSTOMER_ID``, ``mean_amount``, ``std_amount``,
        ``mean_nb_tx_per_day`` and ``available_terminals``.
    start_date : str or datetime-like
        Origin passed to ``pd.to_datetime``; day 0 starts at this date.
    nb_days : int, default 10
        Number of consecutive days to simulate.

    Returns
    -------
    pandas.DataFrame
        Columns ``TX_DATETIME``, ``CUSTOMER_ID``, ``TERMINAL_ID``,
        ``TX_AMOUNT``, ``TX_TIME_SECONDS``, ``TX_TIME_DAYS``. The same columns
        are returned when no transaction is generated (for instance when the
        customer has no available terminal), so callers can always rely on
        ``TX_DATETIME`` being present.
    """
    raw_columns = ['TX_TIME_SECONDS', 'TX_TIME_DAYS', 'CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT']
    output_columns = ['TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS']
    seconds_per_day = 86400

    seed = int(customer_profile.CUSTOMER_ID)
    random.seed(seed)
    np.random.seed(seed)

    available_terminals = customer_profile.available_terminals
    rows = []

    # Without any reachable terminal no transaction can be recorded, so the
    # whole simulation loop is skipped (the check does not depend on the day).
    if len(available_terminals) > 0:

        for day in range(nb_days):

            # Random number of transactions for that day
            nb_tx = np.random.poisson(customer_profile.mean_nb_tx_per_day)

            for _ in range(nb_tx):

                # Time of transaction: around noon, std 20000 seconds, to
                # simulate the fact that most transactions occur during the day.
                time_tx = int(np.random.normal(seconds_per_day/2, 20000))

                # Discard draws that do not fall inside the day
                if not (0 < time_tx < seconds_per_day):
                    continue

                # Amount is drawn from a normal distribution
                amount = np.random.normal(customer_profile.mean_amount, customer_profile.std_amount)

                # If amount negative, draw from a uniform distribution instead
                if amount < 0:
                    amount = np.random.uniform(0, customer_profile.mean_amount*2)

                amount = np.round(amount, decimals=2)

                terminal_id = random.choice(available_terminals)

                rows.append([time_tx + day*seconds_per_day, day,
                             customer_profile.CUSTOMER_ID,
                             terminal_id, amount])

    customer_transactions = pd.DataFrame(rows, columns=raw_columns)

    if customer_transactions.empty:
        # An empty frame built from a list has object-typed columns, which
        # pd.to_datetime(..., origin=...) rejects. Give it the numeric dtypes
        # a populated frame has so the schema is the same in both cases.
        customer_transactions = customer_transactions.astype({'TX_TIME_SECONDS': 'int64',
                                                              'TX_TIME_DAYS': 'int64',
                                                              'CUSTOMER_ID': 'int64',
                                                              'TERMINAL_ID': 'int64',
                                                              'TX_AMOUNT': 'float64'})

    customer_transactions['TX_DATETIME'] = pd.to_datetime(customer_transactions["TX_TIME_SECONDS"], unit='s', origin=start_date)

    return customer_transactions[output_columns]

In [7]:
def add_frauds(customer_profiles_table, terminal_profiles_table, transactions_df):
    """Label transactions as fraudulent according to three simulated scenarios.

    ``transactions_df`` is modified in place (two columns are added and some
    amounts are changed) and is also returned.

    * Scenario 1: every transaction with ``TX_AMOUNT`` above 220 is a fraud.
    * Scenario 2: for each start day, one terminal is sampled (seeded by the
      day); all its transactions from that day up to 13 days later are frauds.
    * Scenario 3: for each start day, up to three customers are sampled
      (seeded by the day); one third of their transactions in the same 14-day
      window are picked at random, their amount is multiplied by 5 and they
      are marked as frauds.

    Start days run over ``range(TX_TIME_DAYS.max())``. Later scenarios
    overwrite ``TX_FRAUD_SCENARIO`` of rows already flagged by earlier ones;
    the printed per-scenario counts only report newly flagged rows. The
    global ``random`` generator is reseeded as a side effect.

    Parameters
    ----------
    customer_profiles_table : pandas.DataFrame
        Must contain a ``CUSTOMER_ID`` column.
    terminal_profiles_table : pandas.DataFrame
        Must contain a ``TERMINAL_ID`` column.
    transactions_df : pandas.DataFrame
        Must contain ``TX_AMOUNT``, ``TX_TIME_DAYS``, ``TERMINAL_ID`` and
        ``CUSTOMER_ID``. May be empty.

    Returns
    -------
    pandas.DataFrame
        ``transactions_df`` with the added ``TX_FRAUD`` (0/1) and
        ``TX_FRAUD_SCENARIO`` (0-3) columns.
    """
    # By default, all transactions are genuine
    transactions_df['TX_FRAUD'] = 0
    transactions_df['TX_FRAUD_SCENARIO'] = 0

    # Scenario 1
    high_amount = transactions_df.TX_AMOUNT > 220
    transactions_df.loc[high_amount, 'TX_FRAUD'] = 1
    transactions_df.loc[high_amount, 'TX_FRAUD_SCENARIO'] = 1
    nb_frauds_scenario_1 = transactions_df.TX_FRAUD.sum()
    print("Number of frauds from scenario 1: "+str(nb_frauds_scenario_1))

    # On an empty table max() is NaN, which range() rejects; in that case
    # there is simply no day to iterate over.
    if len(transactions_df) == 0:
        nb_start_days = 0
    else:
        nb_start_days = int(transactions_df.TX_TIME_DAYS.max())

    # Scenario 2
    for day in range(nb_start_days):

        compromised_terminals = terminal_profiles_table.TERMINAL_ID.sample(n=1, random_state=day)

        compromised_transactions = transactions_df[(transactions_df.TX_TIME_DAYS >= day) &
                                                   (transactions_df.TX_TIME_DAYS < day+14) &
                                                   (transactions_df.TERMINAL_ID.isin(compromised_terminals))]

        transactions_df.loc[compromised_transactions.index, 'TX_FRAUD'] = 1
        transactions_df.loc[compromised_transactions.index, 'TX_FRAUD_SCENARIO'] = 2

    nb_frauds_scenario_2 = transactions_df.TX_FRAUD.sum()-nb_frauds_scenario_1
    print("Number of frauds from scenario 2: "+str(nb_frauds_scenario_2))

    # Scenario 3
    # sample() refuses to draw more items than exist, so cap the number of
    # compromised customers for very small customer tables.
    nb_compromised_customers = min(3, len(customer_profiles_table))

    for day in range(nb_start_days):

        compromised_customers = customer_profiles_table.CUSTOMER_ID.sample(n=nb_compromised_customers, random_state=day).values

        compromised_transactions = transactions_df[(transactions_df.TX_TIME_DAYS >= day) &
                                                   (transactions_df.TX_TIME_DAYS < day+14) &
                                                   (transactions_df.CUSTOMER_ID.isin(compromised_customers))]

        nb_compromised_transactions = len(compromised_transactions)

        random.seed(day)
        index_frauds = random.sample(list(compromised_transactions.index.values), k=nb_compromised_transactions//3)

        transactions_df.loc[index_frauds, 'TX_AMOUNT'] = transactions_df.loc[index_frauds, 'TX_AMOUNT']*5
        transactions_df.loc[index_frauds, 'TX_FRAUD'] = 1
        transactions_df.loc[index_frauds, 'TX_FRAUD_SCENARIO'] = 3

    nb_frauds_scenario_3 = transactions_df.TX_FRAUD.sum()-nb_frauds_scenario_2-nb_frauds_scenario_1
    print("Number of frauds from scenario 3: "+str(nb_frauds_scenario_3))

    return transactions_df

In [8]:
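# Note: generate_dataset below uses pandarallel's parallel_apply, which spreads the work over the local CPU cores (it does not target a Spark cluster); the plain pandas .apply equivalents are kept as comments.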

In [9]:
def generate_dataset(customer_profiles_table, terminal_profiles_table, nb_days=5, start_date="2022-01-01", r=5, start_index = 0):
    """Generate the labelled transactions table for a set of customers and terminals.

    Steps:

    1. For each customer, list the terminals within distance ``r``.
       ``customer_profiles_table`` is modified in place: the columns
       ``available_terminals`` and ``nb_terminals`` are added to it.
    2. Simulate the transactions of every customer.
    3. Sort the transactions chronologically and number them in a
       ``TRANSACTION_ID`` column starting at ``start_index``.
    4. Add the fraud labels with ``add_frauds``.

    The two heavy steps use ``parallel_apply``, which is only available once
    ``pandarallel.initialize(...)`` has been called; the plain pandas
    equivalents are ``.apply(...)`` with the same arguments.

    Parameters
    ----------
    customer_profiles_table : pandas.DataFrame
        Customer profiles (see ``generate_customer_profiles_table``).
    terminal_profiles_table : pandas.DataFrame
        Terminal profiles (see ``generate_terminal_profiles_table``).
    nb_days : int, default 5
        Number of days to simulate.
    start_date : str, default "2022-01-01"
        Date of day 0.
    r : float, default 5
        Radius within which a terminal is available to a customer.
    start_index : int, default 0
        Value of the first ``TRANSACTION_ID``.

    Returns
    -------
    pandas.DataFrame
        Chronologically sorted transactions with fraud labels.
    """
    start_time = time.time()

    x_y_terminals = terminal_profiles_table[['x_terminal_id','y_terminal_id']].values.astype(float)
    # Without pandarallel: replace parallel_apply by apply
    customer_profiles_table['available_terminals'] = customer_profiles_table.parallel_apply(lambda x : get_list_terminals_within_radius(x, x_y_terminals=x_y_terminals, r=r), axis=1)
    customer_profiles_table['nb_terminals'] = customer_profiles_table.available_terminals.apply(len)
    # Fixed-point format: '.2' alone means two significant digits and prints
    # e.g. 67 seconds as '6.7e+01'.
    print("Time to associate terminals to customers: {0:.2f}s".format(time.time()-start_time))

    start_time = time.time()

    # Without pandarallel: replace parallel_apply by apply
    transactions_df = customer_profiles_table.groupby('CUSTOMER_ID').parallel_apply(lambda x : generate_transactions_table(x.iloc[0], start_date, nb_days=nb_days)).reset_index(drop=True)
    print("Time to generate transactions: {0:.2f}s".format(time.time()-start_time))

    # Sort transactions chronologically and renumber the rows from 0
    transactions_df = transactions_df.sort_values('TX_DATETIME').reset_index(drop=True)
    # TRANSACTION_ID is the row position, shifted by start_index
    transactions_df = transactions_df.reset_index().rename(columns={'index': 'TRANSACTION_ID'})
    transactions_df['TRANSACTION_ID'] += start_index

    transactions_df = add_frauds(customer_profiles_table, terminal_profiles_table, transactions_df)

    return transactions_df

In [10]:
# Directory receiving one pickle file per simulated day
DIR_OUTPUT = "/home/ubuntu/Fraud-detection/simulated-data-raw/"

# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(DIR_OUTPUT, exist_ok=True)

In [11]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
udev            7.8G     0  7.8G   0% /dev
tmpfs           1.6G  804K  1.6G   1% /run
/dev/vda2        99G   12G   83G  13% /
tmpfs           7.9G     0  7.9G   0% /dev/shm
tmpfs           5.0M     0  5.0M   0% /run/lock
tmpfs           7.9G     0  7.9G   0% /sys/fs/cgroup
tmpfs           1.6G     0  1.6G   0% /run/user/1000


In [12]:
# Simulation parameters
N_CUSTOMERS = 5000 #20000 #200000
N_TERMINALS = 10000 #2000 #20000
NB_DAYS = 183
RADIUS = 5  # distance within which a terminal is available to a customer
# One dataset is generated per (seed, year) pair
SEEDS = [0] #[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
YEARS = [2022] #[2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

# zip() silently truncates to the shorter list, so fail loudly instead
assert len(SEEDS) == len(YEARS), "SEEDS and YEARS must have the same length"

# TRANSACTION_ID keeps increasing across the generated years
start_index = 0

for seed, year in zip(SEEDS, YEARS):

    customer_profiles_table = generate_customer_profiles_table(n_customers = N_CUSTOMERS, random_state = seed)
    terminal_profiles_table = generate_terminal_profiles_table(n_terminals = N_TERMINALS, random_state = seed)

    print('Generating data for year {}'.format(year))
    print()

    start_date = "{}-01-01".format(year)

    transactions_df = generate_dataset(customer_profiles_table,
                                       terminal_profiles_table,
                                       nb_days=NB_DAYS,
                                       start_date = start_date,
                                       r=RADIUS,
                                       start_index = start_index)

    print('Size MB: {}'.format(round(sys.getsizeof(transactions_df) / 1e6)))

    start_index += len(transactions_df)

    print(transactions_df.shape)

    print(transactions_df.head(3))

    # Keep the string and the datetime under different names
    first_day = datetime.datetime.strptime(start_date, "%Y-%m-%d")

    # One file per day, named after the calendar date; a day without any
    # transaction still gets an (empty) file.
    for day in range(transactions_df.TX_TIME_DAYS.max()+1):

        transactions_day = transactions_df[transactions_df.TX_TIME_DAYS==day].sort_values('TX_TIME_SECONDS')

        date = first_day + datetime.timedelta(days=day)
        filename_output = date.strftime("%Y-%m-%d")+'.pkl'

        #transactions_day.to_csv(os.path.join(DIR_OUTPUT, filename_output), index=False)

        transactions_day.to_pickle(os.path.join(DIR_OUTPUT, filename_output))

    print()

    # Release the large tables before generating the next year
    del transactions_df
    del customer_profiles_table
    del terminal_profiles_table

Generating data for year 2022



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Time to associate terminals to customers: 2.0s


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Time to generate transactions: 6.7e+01s
Number of frauds from scenario 1: 978
Number of frauds from scenario 2: 2371
Number of frauds from scenario 3: 4624
Size MB: 323
(1754155, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0               0 2022-01-01 00:00:31         596        1997      57.16   
1               1 2022-01-01 00:02:10        4961        4448      81.51   
2               2 2022-01-01 00:07:56           2        1338     146.00   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0              31            0         0                  0  
1             130            0         0                  0  
2             476            0         0                  0  



In [None]:
# from pathlib import Path

# root_directory = Path('/home/ubuntu/')
# print('Folder size (MB): ', round(sum(f.stat().st_size for f in root_directory.glob('My projects/Fraud-detection/simulated-data-raw/*') if f.is_file()) / 1e6))

In [None]:
# with open(r"/home/ubuntu/My projects/Fraud-detection/simulated-data-raw/2020-12.pkl", "rb") as input_file:
#     data = pickle.load(input_file)

In [None]:
# data.head()

In [None]:
# print('Total number of transactions: ', data.iloc[-1]['TRANSACTION_ID'])

In [None]:
# !df -h