In [1]:
#!pip install pandarallel

In [2]:
import os
import sys
import pickle

import numpy as np
import pandas as pd

import datetime
import time

import random

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

from tqdm import tqdm
tqdm.pandas()

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
def generate_customer_profiles_table(n_customers, random_state=0):
    
    np.random.seed(random_state)
        
    customer_id_properties=[]
    
    # Generate customer properties from random distributions 
    for customer_id in range(n_customers):
        
        x_customer_id = np.random.uniform(0,100)
        y_customer_id = np.random.uniform(0,100)
        
        mean_amount = np.random.uniform(5,100) # Arbitrary (but sensible) value 
        std_amount = mean_amount/2 # Arbitrary (but sensible) value
        
        mean_nb_tx_per_day = np.random.uniform(0,4) # Arbitrary (but sensible) value 
        
        customer_id_properties.append([customer_id,
                                      x_customer_id, y_customer_id,
                                      mean_amount, std_amount,
                                      mean_nb_tx_per_day])
        
    customer_profiles_table = pd.DataFrame(customer_id_properties, columns=['CUSTOMER_ID',
                                                                      'x_customer_id', 'y_customer_id',
                                                                      'mean_amount', 'std_amount',
                                                                      'mean_nb_tx_per_day'])
    
    return customer_profiles_table

In [4]:
def generate_terminal_profiles_table(n_terminals, random_state=0):
    
    np.random.seed(random_state)
        
    terminal_id_properties=[]
    
    # Generate terminal properties from random distributions 
    for terminal_id in range(n_terminals):
        
        x_terminal_id = np.random.uniform(0,100)
        y_terminal_id = np.random.uniform(0,100)
        
        terminal_id_properties.append([terminal_id,
                                      x_terminal_id, y_terminal_id])
                                       
    terminal_profiles_table = pd.DataFrame(terminal_id_properties, columns=['TERMINAL_ID',
                                                                      'x_terminal_id', 'y_terminal_id'])
    
    return terminal_profiles_table

In [5]:
def get_list_terminals_within_radius(customer_profile, x_y_terminals, r):
    
    # Use numpy arrays in the following to speed up computations
    
    # Location (x,y) of customer as numpy array
    x_y_customer = customer_profile[['x_customer_id','y_customer_id']].values.astype(float)
    
    # Squared difference in coordinates between customer and terminal locations
    squared_diff_x_y = np.square(x_y_customer - x_y_terminals)
    
    # Sum along rows and compute suared root to get distance
    dist_x_y = np.sqrt(np.sum(squared_diff_x_y, axis=1))
    
    # Get the indices of terminals which are at a distance less than r
    available_terminals = list(np.where(dist_x_y<r)[0])
    
    # Return the list of terminal IDs
    return available_terminals

In [6]:
def generate_transactions_table(customer_profile, start_date, nb_days = 10):
    
    customer_transactions = []
    
    random.seed(int(customer_profile.CUSTOMER_ID))
    np.random.seed(int(customer_profile.CUSTOMER_ID))
    
    # For all days
    for day in range(nb_days):
        
        # Random number of transactions for that day 
        nb_tx = np.random.poisson(customer_profile.mean_nb_tx_per_day)
        
        # If nb_tx positive, let us generate transactions
        if nb_tx>0:
            
            for tx in range(nb_tx):
                
                # Time of transaction: Around noon, std 20000 seconds. This choice aims at simulating the fact that 
                # most transactions occur during the day.
                time_tx = int(np.random.normal(86400/2, 20000))
                
                # If transaction time between 0 and 86400, let us keep it, otherwise, let us discard it
                if (time_tx>0) and (time_tx<86400):
                    
                    # Amount is drawn from a normal distribution  
                    amount = np.random.normal(customer_profile.mean_amount, customer_profile.std_amount)
                    
                    # If amount negative, draw from a uniform distribution
                    if amount<0:
                        amount = np.random.uniform(0,customer_profile.mean_amount*2)
                    
                    amount=np.round(amount,decimals=2)
                    
                    if len(customer_profile.available_terminals)>0:
                        
                        terminal_id = random.choice(customer_profile.available_terminals)
                    
                        customer_transactions.append([time_tx+day*86400, day,
                                                      customer_profile.CUSTOMER_ID, 
                                                      terminal_id, amount])
            
    customer_transactions = pd.DataFrame(customer_transactions, columns=['TX_TIME_SECONDS', 'TX_TIME_DAYS', 'CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT'])
    
    if len(customer_transactions)>0:
        customer_transactions['TX_DATETIME'] = pd.to_datetime(customer_transactions["TX_TIME_SECONDS"], unit='s', origin=start_date)
        customer_transactions=customer_transactions[['TX_DATETIME','CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT','TX_TIME_SECONDS', 'TX_TIME_DAYS']]
    
    return customer_transactions

In [7]:
def add_frauds(customer_profiles_table, terminal_profiles_table, transactions_df):
    
    # By default, all transactions are genuine
    transactions_df['TX_FRAUD']=0
    transactions_df['TX_FRAUD_SCENARIO']=0
    
    # Scenario 1
    transactions_df.loc[transactions_df.TX_AMOUNT>220, 'TX_FRAUD']=1
    transactions_df.loc[transactions_df.TX_AMOUNT>220, 'TX_FRAUD_SCENARIO']=1
    nb_frauds_scenario_1=transactions_df.TX_FRAUD.sum()
    print("Number of frauds from scenario 1: "+str(nb_frauds_scenario_1))
    
    # Scenario 2
#     for day in range(transactions_df.TX_TIME_DAYS.max()):
        
#         compromised_terminals = terminal_profiles_table.TERMINAL_ID.sample(n=1, random_state=day)
        
#         compromised_transactions=transactions_df[(transactions_df.TX_TIME_DAYS>=day) & 
#                                                     (transactions_df.TX_TIME_DAYS<day+14) & 
#                                                     (transactions_df.TERMINAL_ID.isin(compromised_terminals))]
                            
#         transactions_df.loc[compromised_transactions.index,'TX_FRAUD']=1
#         transactions_df.loc[compromised_transactions.index,'TX_FRAUD_SCENARIO']=2
    
#     nb_frauds_scenario_2=transactions_df.TX_FRAUD.sum()-nb_frauds_scenario_1
#     print("Number of frauds from scenario 2: "+str(nb_frauds_scenario_2))
    
    # Scenario 3
#     for day in range(transactions_df.TX_TIME_DAYS.max()):
        
#         compromised_customers = customer_profiles_table.CUSTOMER_ID.sample(n=3, random_state=day).values
        
#         compromised_transactions=transactions_df[(transactions_df.TX_TIME_DAYS>=day) & 
#                                                     (transactions_df.TX_TIME_DAYS<day+14) & 
#                                                     (transactions_df.CUSTOMER_ID.isin(compromised_customers))]
        
#         nb_compromised_transactions=len(compromised_transactions)
        
        
#         random.seed(day)
#         index_fauds = random.sample(list(compromised_transactions.index.values),k=int(nb_compromised_transactions/3))
        
#         transactions_df.loc[index_fauds,'TX_AMOUNT']=transactions_df.loc[index_fauds,'TX_AMOUNT']*5
#         transactions_df.loc[index_fauds,'TX_FRAUD']=1
#         transactions_df.loc[index_fauds,'TX_FRAUD_SCENARIO']=3
        
                             
#     nb_frauds_scenario_3=transactions_df.TX_FRAUD.sum()-nb_frauds_scenario_2-nb_frauds_scenario_1
#     print("Number of frauds from scenario 3: "+str(nb_frauds_scenario_3))
    
    return transactions_df

In [8]:
def generate_dataset(customer_profiles_table, terminal_profiles_table, nb_days=10, start_date="2018-01-01", r=5, start_index = 0):
    
    start_time=time.time()
    x_y_terminals = terminal_profiles_table[['x_terminal_id','y_terminal_id']].values.astype(float)
    #customer_profiles_table['available_terminals'] = customer_profiles_table.apply(lambda x : get_list_terminals_within_radius(x, x_y_terminals=x_y_terminals, r=r), axis=1)
    # With Pandarallel
    customer_profiles_table['available_terminals'] = customer_profiles_table.parallel_apply(lambda x : get_list_terminals_within_radius(x, x_y_terminals=x_y_terminals, r=r), axis=1)
    customer_profiles_table['nb_terminals']=customer_profiles_table.available_terminals.apply(len)
    print("Time to associate terminals to customers: {0:.2}s".format(time.time()-start_time))
    
    start_time=time.time()
    # transactions_df=customer_profiles_table.groupby('CUSTOMER_ID').apply(lambda x : generate_transactions_table(x.iloc[0], nb_days=nb_days)).reset_index(drop=True)
    # With Pandarallel
    transactions_df=customer_profiles_table.groupby('CUSTOMER_ID').parallel_apply(lambda x : generate_transactions_table(x.iloc[0], start_date, nb_days=nb_days)).reset_index(drop=True)
    print("Time to generate transactions: {0:.2}s".format(time.time()-start_time))
    
    # Sort transactions chronologically
    transactions_df=transactions_df.sort_values('TX_DATETIME')
    # Reset indices, starting from 0
    transactions_df.reset_index(inplace=True,drop=True)
    transactions_df.reset_index(inplace=True)
    # TRANSACTION_ID are the dataframe indices, starting from 0
    transactions_df.rename(columns = {'index':'TRANSACTION_ID'}, inplace = True)
    transactions_df.TRANSACTION_ID += start_index

    transactions_df = add_frauds(customer_profiles_table, terminal_profiles_table, transactions_df)

    return transactions_df

In [9]:
DIR_OUTPUT = "simulated-data-raw/"

if not os.path.exists(DIR_OUTPUT):
    os.makedirs(DIR_OUTPUT)

In [10]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
udev             63G     0   63G   0% /dev
tmpfs            13G  988K   13G   1% /run
/dev/vda2       126G   13G  109G  11% /
tmpfs            63G     0   63G   0% /dev/shm
tmpfs           5.0M     0  5.0M   0% /run/lock
tmpfs            63G     0   63G   0% /sys/fs/cgroup
tmpfs            13G     0   13G   0% /run/user/1000


In [11]:
start_index = 0

for seed, year in zip([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 
                      [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]):

    customer_profiles_table = generate_customer_profiles_table(n_customers = 200000, random_state = seed)
    terminal_profiles_table = generate_terminal_profiles_table(n_terminals = 20000, random_state = seed)

    print('Generating data for year {}'.format(year))
    print()

    start_date = "{}-01-01".format(year)

    transactions_df = generate_dataset(customer_profiles_table, 
                                         terminal_profiles_table, 
                                         nb_days=365, 
                                         start_date = start_date, 
                                         r=5,
                                         start_index = start_index)

    print('Size MB: {}'.format(round(sys.getsizeof(transactions_df) / 1e6)))

    start_index += len(transactions_df)

    print(transactions_df.shape)

    print(transactions_df.head(3))

    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")    
    
#     for day in range(transactions_df.TX_TIME_DAYS.max()+1):
    
#         transactions_day = transactions_df[transactions_df.TX_TIME_DAYS==day].sort_values('TX_TIME_SECONDS')

#         date = start_date + datetime.timedelta(days=day)
#         filename_output = date.strftime("%Y-%m-%d")+'.pkl' / .csv

#         #transactions_day.to_csv(DIR_OUTPUT+filename_output, index=False)

#     # Protocol=4 required for Google Colab
#         transactions_day.to_pickle(DIR_OUTPUT+filename_output)

#     print()
    
    transactions_df['TX_TIME_MONTHS'] = transactions_df['TX_DATETIME'].dt.month
    
    for month in range(transactions_df.TX_TIME_MONTHS.max()):
    
        transactions_month = transactions_df[transactions_df.TX_TIME_MONTHS==month+1].sort_values('TX_TIME_SECONDS')
        
        if month <= 8:
        
            filename_output = "{}-0{}".format(year, month+1) +'.pkl'
        
        else:
            
            filename_output = "{}-{}".format(year, month+1) +'.pkl'

        #transactions_day.to_csv(DIR_OUTPUT+filename_output, index=False)

    # Protocol=4 required for Google Colab
        transactions_month.to_pickle(DIR_OUTPUT+filename_output)

    print()
    
    del transactions_df
    del customer_profiles_table
    del terminal_profiles_table 

Generating data for year 2011



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to associate terminals to customers: 2.8e+01s


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to generate transactions: 1e+03s
Number of frauds from scenario 1: 80511
Size MB: 26015
(141396207, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0               0 2011-01-01 00:00:01      102538       10931     126.90   
1               1 2011-01-01 00:00:02      150923       16325      10.47   
2               2 2011-01-01 00:00:02       48113        4197     108.66   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               1            0         0                  0  
1               2            0         0                  0  
2               2            0         0                  0  

Generating data for year 2012



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to associate terminals to customers: 3.6e+01s


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to generate transactions: 1.1e+03s
Number of frauds from scenario 1: 80835
Size MB: 26071
(141698220, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0       141396207 2012-01-01 00:00:02      150923       14557      12.13   
1       141396208 2012-01-01 00:00:02       48113        4571      87.37   
2       141396209 2012-01-01 00:00:04      100490       15708      15.51   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               2            0         0                  0  
1               2            0         0                  0  
2               4            0         0                  0  

Generating data for year 2013



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to associate terminals to customers: 4.4e+01s


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to generate transactions: 1.1e+03s
Number of frauds from scenario 1: 81764
Size MB: 26039
(141525528, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0       283094427 2013-01-01 00:00:01      130737       15541      15.54   
1       283094428 2013-01-01 00:00:02       48113       18424      21.01   
2       283094429 2013-01-01 00:00:05       46215       15624      80.49   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               1            0         0                  0  
1               2            0         0                  0  
2               5            0         0                  0  

Generating data for year 2014



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to associate terminals to customers: 4.5e+01s


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to generate transactions: 1.2e+03s
Number of frauds from scenario 1: 80743
Size MB: 26064
(141660804, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0       424619955 2014-01-01 00:00:01      130737       16589      15.67   
1       424619956 2014-01-01 00:00:02       48113       17322      63.65   
2       424619957 2014-01-01 00:00:03      109802       10581      14.12   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               1            0         0                  0  
1               2            0         0                  0  
2               3            0         0                  0  

Generating data for year 2015



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to associate terminals to customers: 4.8e+01s


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to generate transactions: 1.2e+03s
Number of frauds from scenario 1: 82712
Size MB: 26069
(141687222, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0       566280759 2015-01-01 00:00:03      109802       11627      35.79   
1       566280760 2015-01-01 00:00:04      152990        5139      56.46   
2       566280761 2015-01-01 00:00:04      100490       13948      40.57   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               3            0         0                  0  
1               4            0         0                  0  
2               4            0         0                  0  

Generating data for year 2016



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to associate terminals to customers: 4.9e+01s


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to generate transactions: 1.2e+03s
Number of frauds from scenario 1: 82629
Size MB: 26049
(141577161, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0       707967981 2016-01-01 00:00:01      102538        9872      66.84   
1       707967982 2016-01-01 00:00:02      150923       16989      21.80   
2       707967983 2016-01-01 00:00:03      179129       14836      65.08   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               1            0         0                  0  
1               2            0         0                  0  
2               3            0         0                  0  

Generating data for year 2017



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to associate terminals to customers: 4.9e+01s


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to generate transactions: 1.2e+03s
Number of frauds from scenario 1: 81301
Size MB: 26036
(141509089, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0       849545142 2017-01-01 00:00:02       48113        5754     146.89   
1       849545143 2017-01-01 00:00:02      150923       17663     111.55   
2       849545144 2017-01-01 00:00:03      105863        4273      94.58   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               2            0         0                  0  
1               2            0         0                  0  
2               3            0         0                  0  

Generating data for year 2018



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to associate terminals to customers: 4.9e+01s


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to generate transactions: 1.2e+03s
Number of frauds from scenario 1: 80795
Size MB: 26025
(141447938, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0       991054231 2018-01-01 00:00:02       48113        3959      85.45   
1       991054232 2018-01-01 00:00:06      148618        1057     108.90   
2       991054233 2018-01-01 00:00:07      119062       14198     117.18   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               2            0         0                  0  
1               6            0         0                  0  
2               7            0         0                  0  

Generating data for year 2019



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to associate terminals to customers: 4.7e+01s


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to generate transactions: 1.2e+03s
Number of frauds from scenario 1: 81089
Size MB: 25987
(141241028, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0      1132502169 2019-01-01 00:00:02       48113        4142      12.34   
1      1132502170 2019-01-01 00:00:04      152990       19109      72.23   
2      1132502171 2019-01-01 00:00:05      112421       17172       7.18   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               2            0         0                  0  
1               4            0         0                  0  
2               5            0         0                  0  

Generating data for year 2020



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to associate terminals to customers: 4.9e+01s


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12500), Label(value='0 / 12500')))…

Time to generate transactions: 1.2e+03s
Number of frauds from scenario 1: 81198
Size MB: 26043
(141548505, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0      1273743197 2020-01-01 00:00:01      102538       11649      64.90   
1      1273743198 2020-01-01 00:00:01      130737       17693       7.72   
2      1273743199 2020-01-01 00:00:02      150923       12159      97.64   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               1            0         0                  0  
1               1            0         0                  0  
2               2            0         0                  0  



In [14]:
from pathlib import Path

root_directory = Path('/home/ubuntu/')
print('Folder size (MB): ', round(sum(f.stat().st_size for f in root_directory.glob('My projects/Fraud detection/simulated-data-raw/*') if f.is_file()) / 1e6))

Folder size (MB):  99968


In [21]:
with open(r"/home/ubuntu/My projects/Fraud detection/simulated-data-raw/2020-12.pkl", "rb") as input_file:
    data = pickle.load(input_file)

In [28]:
data.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,TX_TIME_MONTHS
129913199,1403656396,2020-12-01 00:00:03,169670,18277,28.41,28944003,335,0,0,12
129913200,1403656397,2020-12-01 00:00:03,87565,12968,37.77,28944003,335,0,0,12
129913201,1403656398,2020-12-01 00:00:05,60653,17052,41.93,28944005,335,0,0,12
129913202,1403656399,2020-12-01 00:00:08,68692,19878,31.42,28944008,335,0,0,12
129913203,1403656400,2020-12-01 00:00:09,73717,5661,104.52,28944009,335,0,0,12


In [27]:
print('Total number of transactions: ', data.iloc[-1]['TRANSACTION_ID'])

Total number of transactions:  1415291701


In [23]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
udev             63G     0   63G   0% /dev
tmpfs            13G  988K   13G   1% /run
/dev/vda2       126G  106G   16G  88% /
tmpfs            63G     0   63G   0% /dev/shm
tmpfs           5.0M     0  5.0M   0% /run/lock
tmpfs            63G     0   63G   0% /sys/fs/cgroup
tmpfs            13G     0   13G   0% /run/user/1000
