We will attempt to use the dataset from Kaggle and data clean it here for us to be able to finally test the ASA-GNN framework. Thanks to the notebook provided by https://www.kaggle.com/code/tuttifrutti/isolating-a-cardid

In [35]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [36]:
train_transaction = pd.read_csv('../data/train_transaction.csv')
train_identity = pd.read_csv('../data/train_identity.csv')
train_transaction = train_transaction.merge(train_identity, how='left', left_on='TransactionID',right_on='TransactionID')
del train_identity

In [37]:
train_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [38]:
import itertools
import math
import networkx as nx

#function to create keys based on multiple columns
def create_key(df, cols, name_new_col):
    '''
    df: pandas dataframe
    cols: list of columns composing the key
    name_new_col: name given to the new column
    '''
    df.loc[:,name_new_col] = ''
    for col in cols:
        df.loc[:,name_new_col] = df.loc[:,name_new_col] + df.loc[:,col].astype(str)
    return df  

def truncate(f, n):
    return math.floor(f * 10 ** n) / 10 ** n  

def merge(list1, list2): 
    merged_list = [[p1, p2] for idx1, p1 in enumerate(list1)  
    for idx2, p2 in enumerate(list2) if idx1 == idx2] 
    return merged_list   



In [39]:
train_transaction['day'] = train_transaction['TransactionDT']/(3600*24)
train_transaction['D1minusday'] = (train_transaction['D1']-train_transaction['day']).replace(np.nan, -9999).map(int)
colsID = ['card1','card2','card3','card4','card5','card6','D1minusday','ProductCD']
train_transaction = create_key(train_transaction, colsID, 'cardID_D1')

In [40]:
train_transaction['cardID_D1'].value_counts()

cardID_D1
15775481.0150.0mastercard102.0credit-129S         1414
9500321.0150.0visa226.0debit84W                    480
7919194.0150.0mastercard166.0debit-92W             439
7919194.0150.0mastercard166.0debit-124W            282
7919194.0150.0mastercard202.0debit-34W             242
                                                  ... 
16452514.0150.0mastercard224.0credit-57H             1
16426399.0150.0american express118.0credit-57R       1
3602119.0150.0visa195.0credit-57R                    1
4029594.0150.0visa226.0debit-57W                     1
12037595.0150.0mastercard224.0debit-182W             1
Name: count, Length: 171059, dtype: int64

In [41]:
train_transaction['V307'] = train_transaction['V307'].fillna(0)
train_transaction['V307plus'] = train_transaction['V307']+train_transaction['TransactionAmt']

In [42]:
train_transaction['V307rtrunc'] = train_transaction['V307'].apply(lambda x: truncate(x,3))
train_transaction['V307round'] = train_transaction['V307'].apply(lambda x: round(x,3))
train_transaction['V307plusround'] = train_transaction['V307plus'].apply(lambda x: round(x,4))
train_transaction['V307plusroundtrunc'] = train_transaction['V307plusround'].apply(lambda x: truncate(x,3))
train_transaction['V307plusround'] = train_transaction['V307plus'].apply(lambda x: round(x,3))
train_transaction['V307trunc2'] = train_transaction['V307'].apply(lambda x: truncate(x,2))
train_transaction['V307plustrunc2'] = train_transaction['V307plus'].apply(lambda x: truncate(x,2))
train_transaction['TransactionAmttrunq'] = train_transaction['TransactionAmt'].apply(lambda x: round(x,3))

In [43]:
def find_groups(aa):
    group_list = []
    
    #get the couples by existstrun
    list1 = aa['V307plusroundtrunc'].tolist()
    list2 = aa['V307rtrunc'].tolist()
    kv = []
    res = [[list(filter(lambda z: list1[z]==x, range(len(list1)))),list(filter(lambda z: list2[z]==x, range(len(list2))))] for x in list1 if x in list2] #find the pairs
    res= [list(map(kv.append,map(list,(itertools.product(*sublist))))) for sublist in res] #drop duplicates from list of list
    res = list(map(list, set(map(lambda i: tuple(i), kv)))) #create list of couple indexes
    list1 = aa.iloc[[i[0] for i in res]]['TransactionID'].tolist()
    list2 = aa.iloc[[i[1] for i in res]]['TransactionID'].tolist()
    liste_existstrun = merge(list1, list2)


    #get the couples by existsroundtrunc
    list1 = aa['V307plusroundtrunc'].tolist()
    list2 = aa['V307round'].tolist()
    kv = []
    res = [[list(filter(lambda z: list1[z]==x, range(len(list1)))),list(filter(lambda z: list2[z]==x, range(len(list2))))] for x in list1 if x in list2] #find the pairs
    res= [list(map(kv.append,map(list,(itertools.product(*sublist))))) for sublist in res] #drop duplicates from list of list
    res = list(map(list, set(map(lambda i: tuple(i), kv)))) #create list of couple indexes
    list1 = aa.iloc[[i[0] for i in res]]['TransactionID'].tolist()
    list2 = aa.iloc[[i[1] for i in res]]['TransactionID'].tolist()
    liste_existsroundtrunc = merge(list1, list2)

    #get the couples by existsroundtrunc
    list1 = aa['V307plusround'].tolist()
    list2 = aa['V307round'].tolist()
    kv = []
    res = [[list(filter(lambda z: list1[z]==x, range(len(list1)))),list(filter(lambda z: list2[z]==x, range(len(list2))))] for x in list1 if x in list2] #find the pairs
    res= [list(map(kv.append,map(list,(itertools.product(*sublist))))) for sublist in res] #drop duplicates from list of list
    res = list(map(list, set(map(lambda i: tuple(i), kv)))) #create list of couple indexes
    list1 = aa.iloc[[i[0] for i in res]]['TransactionID'].tolist()
    list2 = aa.iloc[[i[1] for i in res]]['TransactionID'].tolist()
    liste_existsroundround = merge(list1, list2)


    #get the couples by existsroundtrunc
    list1 = aa['V307trunc2'].tolist()
    list2 = aa['V307plustrunc2'].tolist()
    kv = []
    res = [[list(filter(lambda z: list1[z]==x, range(len(list1)))),list(filter(lambda z: list2[z]==x, range(len(list2))))] for x in list1 if x in list2] #find the pairs
    res= [list(map(kv.append,map(list,(itertools.product(*sublist))))) for sublist in res] #drop duplicates from list of list
    res = list(map(list, set(map(lambda i: tuple(i), kv)))) #create list of couple indexes
    list1 = aa.iloc[[i[0] for i in res]]['TransactionID'].tolist()
    list2 = aa.iloc[[i[1] for i in res]]['TransactionID'].tolist()
    liste_existstrunc2 = merge(list1, list2)


    #get the couples by existsamount
    list1 = aa['TransactionAmttrunq'].tolist()
    list2 = aa['V307round'].tolist()
    kv = []
    res = [[list(filter(lambda z: list1[z]==x, range(len(list1)))),list(filter(lambda z: list2[z]==x, range(len(list2))))] for x in list1 if x in list2] #find the pairs
    res= [list(map(kv.append,map(list,(itertools.product(*sublist))))) for sublist in res] #drop duplicates from list of list
    res = list(map(list, set(map(lambda i: tuple(i), kv)))) #create list of couple indexes
    list1 = aa.iloc[[i[0] for i in res]]['TransactionID'].tolist()
    list2 = aa.iloc[[i[1] for i in res]]['TransactionID'].tolist()
    liste_existsamount = merge(list1, list2)

    #get by exact same amount
    a=[]
    liste_sameamount = aa.groupby('TransactionAmt')['TransactionID'].apply(list).tolist()
    res = [list(map(a.append, map(list,zip(i, i[1:] + i[:1])))) for i in liste_sameamount]

    group_list.extend(liste_existstrun)
    group_list.extend(liste_existsroundtrunc)
    group_list.extend(liste_existsamount)
    group_list.extend(liste_existsroundround)
    group_list.extend(liste_existstrunc2)

    group_list.extend(a)

    L = group_list
    G = nx.Graph()
    G.add_nodes_from(sum(L, []))
    q = [[(s[i],s[i+1]) for i in range(len(s)-1)] for s in L]
    for i in q:
        G.add_edges_from(i)
    group_list = [list(i) for i in nx.connected_components(G)]
    return group_list

In [44]:
def find_groups_optimized(aa):
    """
    Optimized version of find_groups using pandas operations instead of O(n²) loops.
    Uses V307 columns for card identification.
    """
    if len(aa) == 0:
        return []
    
    if len(aa) == 1:
        return [[aa['TransactionID'].iloc[0]]]
    
    edges = []
    transaction_ids = aa['TransactionID'].values
    
    def find_pairs_fast(col_from, col_to):
        """Find pairs where col_from value matches col_to value using pandas merge."""
        df_from = aa[['TransactionID', col_from]].copy()
        df_from.columns = ['tid_from', 'key']
        df_to = aa[['TransactionID', col_to]].copy()
        df_to.columns = ['tid_to', 'key']
        
        # Remove rows where key is 0 (no meaningful link)
        df_to = df_to[df_to['key'] != 0]
        
        if len(df_to) == 0:
            return []
        
        merged = df_from.merge(df_to, on='key', how='inner')
        merged = merged[merged['tid_from'] != merged['tid_to']]
        
        return list(zip(merged['tid_from'].values, merged['tid_to'].values))
    
    # Find pairs using different rounding strategies (same logic as original)
    edges.extend(find_pairs_fast('V307plusroundtrunc', 'V307rtrunc'))
    edges.extend(find_pairs_fast('V307plusroundtrunc', 'V307round'))
    edges.extend(find_pairs_fast('V307plusround', 'V307round'))
    edges.extend(find_pairs_fast('V307trunc2', 'V307plustrunc2'))
    edges.extend(find_pairs_fast('TransactionAmttrunq', 'V307round'))
    
    # Get pairs by exact same transaction amount
    for amt, group in aa.groupby('TransactionAmt'):
        if len(group) > 1:
            tids = group['TransactionID'].values
            for i in range(len(tids) - 1):
                edges.append((tids[i], tids[i + 1]))
    
    if not edges:
        return [[tid] for tid in transaction_ids]
    
    G = nx.Graph()
    G.add_nodes_from(transaction_ids)
    G.add_edges_from(edges)
    
    return [list(component) for component in nx.connected_components(G)]

In [45]:
card_group = train_transaction[train_transaction.cardID_D1=='16136204.0185.0visa138.0debit108C']
groups_found = find_groups(card_group)
groups_found

[[3030465, 3009027, 3026025, 3008981, 3008631, 3025980],
 [3271808, 3271837, 3271815],
 [3173544, 3173773, 3173777, 3173458, 3173468],
 [3437224, 3537388, 3518333, 3437190],
 [3445490, 3445471]]

In [46]:
def create_card_ids(train_transaction, find_groups):
    """
    Creates a unique card_ID for each transaction based on cardID_D1 groups
    and V307 patterns identified by find_groups().
    """
    # Initialize the card_ID column with None
    train_transaction['card_ID'] = None
    
    # Counter for unique card IDs
    card_id_counter = 0
    
    # Get all unique ccardID_D1 values
    unique_card_id_d1 = train_transaction['cardID_D1'].unique()
    
    # Process each cardID_D1 group
    for card_d1 in unique_card_id_d1:
        # Filter dataframe for this specific cardID_D1
        mask = train_transaction['cardID_D1'] == card_d1
        df_subset = train_transaction[mask]
        
        # Get the groups (list of lists of indices)
        groups = find_groups(df_subset)
        
        # Assign a unique card_ID to each group
        for group_transaction_ids in groups:

            transaction_mask = train_transaction['TransactionID'].isin(group_transaction_ids)
            train_transaction.loc[transaction_mask, 'card_ID'] = card_id_counter
            card_id_counter += 1
    
    return train_transaction


In [47]:
def create_card_ids_optimized(train_transaction):
    """
    Optimized version of create_card_ids using find_groups_optimized.
    Creates unique card_ID for each transaction based on cardID_D1 groups
    and V307 patterns.
    """
    train_transaction['card_ID'] = None
    card_id_counter = 0
    
    unique_card_id_d1 = train_transaction['cardID_D1'].unique()
    print(f"Processing {len(unique_card_id_d1)} unique cardID_D1 groups...")
    
    for card_d1 in tqdm(unique_card_id_d1, desc="Card groups"):
        mask = train_transaction['cardID_D1'] == card_d1
        df_subset = train_transaction[mask]
        
        groups = find_groups_optimized(df_subset)
        
        for group_transaction_ids in groups:
            transaction_mask = train_transaction['TransactionID'].isin(group_transaction_ids)
            train_transaction.loc[transaction_mask, 'card_ID'] = card_id_counter
            card_id_counter += 1
    
    print(f"Total unique card_IDs created: {card_id_counter}")
    return train_transaction

In [48]:
# Use the optimized version
train_transaction = create_card_ids_optimized(train_transaction)

Processing 171059 unique cardID_D1 groups...


Card groups: 100%|██████████| 171059/171059 [1:02:01<00:00, 45.96it/s]

Total unique card_IDs created: 304711





We have now gotten a column to unqiuely identify a card called "card_ID". Now, we need to create a way to uniquely identify a user called "user_ID".

In [49]:
colsID = ['id_19','id_20','id_31','DeviceInfo']
train_transaction = create_key(train_transaction, colsID, 'user_ID_mock')

In [50]:
#check the number of unique users first
train_transaction['user_ID_mock'].value_counts()

user_ID_mock
nannannannan                                                    449851
633.0533.0edge 16.0Windows                                        1102
216.0214.0ie 11.0 for desktoprv:11.0                              1044
266.0507.0chrome 63.0Windows                                       758
266.0325.0chrome 63.0Windows                                       486
                                                                 ...  
574.0507.0chrome 64.0 for androidSM-G900M Build/LRX21T               1
548.0521.0chrome 63.0 for androidVenue                               1
256.0324.0chrome 63.0Windows                                         1
391.0600.0chrome 63.0 for androidSM-G935P Build/NRD90M               1
417.0595.0chrome 66.0 for androidRNE-L03 Build/HUAWEIRNE-L03         1
Name: count, Length: 33650, dtype: int64

In [51]:
counts = train_transaction['user_ID_mock'].dropna().value_counts()

# users that appear between 10 and 25 times (inclusive)
users_10_25 = counts[(counts >= 10) & (counts <= 25)].index

# filtered dataframe containing only those users
filtered = train_transaction[train_transaction['user_ID_mock'].isin(users_10_25)]

filtered['user_ID_mock'].value_counts()

user_ID_mock
100.0533.0chrome 66.0Windows                 25
529.0299.0mobile safari genericiOS Device    25
633.0391.0mobile safari genericiOS Device    25
100.0549.0chrome 62.0MacOS                   25
100.0533.0chrome 65.0Windows                 25
                                             ..
202.0107.0chrome 64.0Windows                 10
410.0278.0chrome 63.0Windows                 10
193.0333.0edge 16.0Windows                   10
542.0277.0chrome 63.0Windows                 10
410.0142.0chrome 62.0Windows                 10
Name: count, Length: 1640, dtype: int64

In [52]:
train_transaction[train_transaction['user_ID_mock']=='633.0533.0edge 16.0Windows']

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V307plus,V307rtrunc,V307round,V307plusround,V307plusroundtrunc,V307trunc2,V307plustrunc2,TransactionAmttrunq,card_ID,user_ID_mock
106634,3093634,0,2128047,100.0,R,2616,327.0,150.0,discover,102.0,...,100.0,0.0,0.0,100.0,100.0,0.0,100.0,100.0,89424,633.0533.0edge 16.0Windows
216910,3203910,0,5069345,200.0,H,7918,555.0,150.0,visa,226.0,...,200.0,0.0,0.0,200.0,200.0,0.0,200.0,200.0,148656,633.0533.0edge 16.0Windows
368553,3355553,0,9156842,100.0,R,7309,399.0,150.0,american express,118.0,...,100.0,0.0,0.0,100.0,100.0,0.0,100.0,100.0,219237,633.0533.0edge 16.0Windows
368556,3355556,0,9156897,100.0,R,7309,399.0,150.0,american express,118.0,...,200.0,100.0,100.0,200.0,200.0,100.0,200.0,100.0,219237,633.0533.0edge 16.0Windows
368558,3355558,0,9156948,100.0,R,7309,399.0,150.0,american express,118.0,...,300.0,200.0,200.0,300.0,300.0,200.0,300.0,100.0,219237,633.0533.0edge 16.0Windows
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574505,3561505,0,15271575,30.0,S,15775,481.0,150.0,mastercard,102.0,...,145795.0,145765.0,145765.0,145795.0,145795.0,145765.0,145795.0,30.0,249289,633.0533.0edge 16.0Windows
577530,3564530,0,15364252,120.0,S,15775,481.0,150.0,mastercard,102.0,...,145730.0,145610.0,145610.0,145730.0,145730.0,145610.0,145730.0,120.0,249290,633.0533.0edge 16.0Windows
579667,3566667,0,15451381,85.0,S,15775,481.0,150.0,mastercard,102.0,...,145695.0,145610.0,145610.0,145695.0,145695.0,145610.0,145695.0,85.0,249291,633.0533.0edge 16.0Windows
581436,3568436,0,15527523,30.0,S,15775,481.0,150.0,mastercard,102.0,...,145670.0,145640.0,145640.0,145670.0,145670.0,145640.0,145670.0,30.0,249289,633.0533.0edge 16.0Windows


In [53]:
train_transaction['V264'] = train_transaction['V264'].fillna(0)
train_transaction['V264plus'] = train_transaction['V264']+train_transaction['TransactionAmt']

In [54]:
train_transaction['V264'].value_counts()

V264
0.000000      550444
100.000000      2462
50.000000       1660
200.000000      1613
150.000000      1363
               ...  
179.623398         1
18.764099          1
82.961403          1
365.391693         1
499.955414         1
Name: count, Length: 13358, dtype: int64

In [55]:
train_transaction['V264plus'].value_counts()

V264plus
59.000000     30582
117.000000    28935
107.950000    23954
57.950000     23600
100.000000    17939
              ...  
640.460302        1
169.193404        1
148.995399        1
99.330199         1
400.780000        1
Name: count, Length: 38283, dtype: int64

In [56]:
train_transaction['V264rtrunc'] = train_transaction['V264'].apply(lambda x: truncate(x,3))
train_transaction['V264round'] = train_transaction['V264'].apply(lambda x: round(x,3))
train_transaction['V264plusround'] = train_transaction['V264plus'].apply(lambda x: round(x,4))
train_transaction['V264plusroundtrunc'] = train_transaction['V264plusround'].apply(lambda x: truncate(x,3))
train_transaction['V264plusround'] = train_transaction['V264plus'].apply(lambda x: round(x,3))
train_transaction['V264trunc2'] = train_transaction['V264'].apply(lambda x: truncate(x,2))
train_transaction['V264plustrunc2'] = train_transaction['V264plus'].apply(lambda x: truncate(x,2))
train_transaction['TransactionAmttrunq'] = train_transaction['TransactionAmt'].apply(lambda x: round(x,3))

In [57]:
def find_user_groups(aa):
    group_list = []
    
    #get the couples by existstrun
    list1 = aa['V264plusroundtrunc'].tolist()
    list2 = aa['V264rtrunc'].tolist()
    kv = []
    res = [[list(filter(lambda z: list1[z]==x, range(len(list1)))),list(filter(lambda z: list2[z]==x, range(len(list2))))] for x in list1 if x in list2] #find the pairs
    res= [list(map(kv.append,map(list,(itertools.product(*sublist))))) for sublist in res] #drop duplicates from list of list
    res = list(map(list, set(map(lambda i: tuple(i), kv)))) #create list of couple indexes
    list1 = aa.iloc[[i[0] for i in res]]['TransactionID'].tolist()
    list2 = aa.iloc[[i[1] for i in res]]['TransactionID'].tolist()
    liste_existstrun = merge(list1, list2)


    #get the couples by existsroundtrunc
    list1 = aa['V264plusroundtrunc'].tolist()
    list2 = aa['V264round'].tolist()
    kv = []
    res = [[list(filter(lambda z: list1[z]==x, range(len(list1)))),list(filter(lambda z: list2[z]==x, range(len(list2))))] for x in list1 if x in list2] #find the pairs
    res= [list(map(kv.append,map(list,(itertools.product(*sublist))))) for sublist in res] #drop duplicates from list of list
    res = list(map(list, set(map(lambda i: tuple(i), kv)))) #create list of couple indexes
    list1 = aa.iloc[[i[0] for i in res]]['TransactionID'].tolist()
    list2 = aa.iloc[[i[1] for i in res]]['TransactionID'].tolist()
    liste_existsroundtrunc = merge(list1, list2)

    #get the couples by existsroundtrunc
    list1 = aa['V264plusround'].tolist()
    list2 = aa['V264round'].tolist()
    kv = []
    res = [[list(filter(lambda z: list1[z]==x, range(len(list1)))),list(filter(lambda z: list2[z]==x, range(len(list2))))] for x in list1 if x in list2] #find the pairs
    res= [list(map(kv.append,map(list,(itertools.product(*sublist))))) for sublist in res] #drop duplicates from list of list
    res = list(map(list, set(map(lambda i: tuple(i), kv)))) #create list of couple indexes
    list1 = aa.iloc[[i[0] for i in res]]['TransactionID'].tolist()
    list2 = aa.iloc[[i[1] for i in res]]['TransactionID'].tolist()
    liste_existsroundround = merge(list1, list2)


    #get the couples by existsroundtrunc
    list1 = aa['V264trunc2'].tolist()
    list2 = aa['V264plustrunc2'].tolist()
    kv = []
    res = [[list(filter(lambda z: list1[z]==x, range(len(list1)))),list(filter(lambda z: list2[z]==x, range(len(list2))))] for x in list1 if x in list2] #find the pairs
    res= [list(map(kv.append,map(list,(itertools.product(*sublist))))) for sublist in res] #drop duplicates from list of list
    res = list(map(list, set(map(lambda i: tuple(i), kv)))) #create list of couple indexes
    list1 = aa.iloc[[i[0] for i in res]]['TransactionID'].tolist()
    list2 = aa.iloc[[i[1] for i in res]]['TransactionID'].tolist()
    liste_existstrunc2 = merge(list1, list2)


    #get the couples by existsamount
    list1 = aa['TransactionAmttrunq'].tolist()
    list2 = aa['V264round'].tolist()
    kv = []
    res = [[list(filter(lambda z: list1[z]==x, range(len(list1)))),list(filter(lambda z: list2[z]==x, range(len(list2))))] for x in list1 if x in list2] #find the pairs
    res= [list(map(kv.append,map(list,(itertools.product(*sublist))))) for sublist in res] #drop duplicates from list of list
    res = list(map(list, set(map(lambda i: tuple(i), kv)))) #create list of couple indexes
    list1 = aa.iloc[[i[0] for i in res]]['TransactionID'].tolist()
    list2 = aa.iloc[[i[1] for i in res]]['TransactionID'].tolist()
    liste_existsamount = merge(list1, list2)

    #get by exact same amount
    a=[]
    liste_sameamount = aa.groupby('TransactionAmt')['TransactionID'].apply(list).tolist()
    res = [list(map(a.append, map(list,zip(i, i[1:] + i[:1])))) for i in liste_sameamount]

    group_list.extend(liste_existstrun)
    group_list.extend(liste_existsroundtrunc)
    group_list.extend(liste_existsamount)
    group_list.extend(liste_existsroundround)
    group_list.extend(liste_existstrunc2)

    group_list.extend(a)

    L = group_list
    G = nx.Graph()
    G.add_nodes_from(sum(L, []))
    q = [[(s[i],s[i+1]) for i in range(len(s)-1)] for s in L]
    for i in q:
        G.add_edges_from(i)
    group_list = [list(i) for i in nx.connected_components(G)]
    return group_list

In [58]:
def find_user_groups_optimized(aa):
    """
    Optimized version of find_user_groups using pandas operations instead of O(n²) loops.
    Uses pandas merge for pair finding and dictionaries for O(1) lookups.
    """
    if len(aa) == 0:
        return []
    
    if len(aa) == 1:
        return [[aa['TransactionID'].iloc[0]]]
    
    edges = []
    transaction_ids = aa['TransactionID'].values
    
    # Helper function to find pairs using pandas merge (vectorized, much faster)
    def find_pairs_fast(col_from, col_to):
        """Find pairs where col_from value matches col_to value using pandas merge."""
        df_from = aa[['TransactionID', col_from]].copy()
        df_from.columns = ['tid_from', 'key']
        df_to = aa[['TransactionID', col_to]].copy()
        df_to.columns = ['tid_to', 'key']
        
        # Remove rows where key is 0 (no meaningful link for cumulative amounts starting at 0)
        df_to = df_to[df_to['key'] != 0]
        
        if len(df_to) == 0:
            return []
        
        # Merge to find matching pairs
        merged = df_from.merge(df_to, on='key', how='inner')
        # Remove self-loops
        merged = merged[merged['tid_from'] != merged['tid_to']]
        
        return list(zip(merged['tid_from'].values, merged['tid_to'].values))
    
    # Find pairs using different rounding strategies (same logic as original)
    edges.extend(find_pairs_fast('V264plusroundtrunc', 'V264rtrunc'))
    edges.extend(find_pairs_fast('V264plusroundtrunc', 'V264round'))
    edges.extend(find_pairs_fast('V264plusround', 'V264round'))
    edges.extend(find_pairs_fast('V264trunc2', 'V264plustrunc2'))
    edges.extend(find_pairs_fast('TransactionAmttrunq', 'V264round'))
    
    # Get pairs by exact same transaction amount (link consecutive transactions with same amount)
    for amt, group in aa.groupby('TransactionAmt'):
        if len(group) > 1:
            tids = group['TransactionID'].values
            # Create edges between consecutive transactions with same amount
            for i in range(len(tids) - 1):
                edges.append((tids[i], tids[i + 1]))
    
    # Build graph and find connected components
    if not edges:
        # No edges found, each transaction is its own group
        return [[tid] for tid in transaction_ids]
    
    G = nx.Graph()
    G.add_nodes_from(transaction_ids)
    G.add_edges_from(edges)
    
    return [list(component) for component in nx.connected_components(G)]

In [59]:
def create_user_ids(train_transaction, find_groups):
    """
    Creates a unique user_ID for each transaction based on user_ID_mock groups
    and V264 patterns identified by find_groups().
    """
    # Initialize the user_ID column with None
    train_transaction['user_ID'] = None
    # Counter for unique user IDs
    user_id_counter = 0
    # Get all unique user_ID_mock values (excluding NaN)
    unique_user_id_mock = train_transaction['user_ID_mock'].dropna().unique()
    
    # Process each user_ID_mock group
    for user_mock in unique_user_id_mock:
        # Filter dataframe for this specific user_ID_mock
        mask = train_transaction['user_ID_mock'] == user_mock
        df_subset = train_transaction[mask].copy()
        
        # Get the groups (list of lists of TransactionIDs)
        groups = find_groups(df_subset)
        
        # Assign a unique user_ID to each group
        for group_transaction_ids in groups:
            transaction_mask = train_transaction['TransactionID'].isin(group_transaction_ids)
            train_transaction.loc[transaction_mask, 'user_ID'] = user_id_counter
            user_id_counter += 1
    
    return train_transaction

In [60]:
def create_user_ids_optimized(train_transaction):
    """
    Creates unique user_IDs with optimizations for the large 'nannannannan' group.
    
    Strategy:
    1. For non-nannannannan users: Group by user_ID_mock, then apply V264 linking
    2. For nannannannan users: First partition by cardID_D1, then apply V264 linking
       (This breaks the 449k row problem into many smaller groups)
    """
    # Initialize the user_ID column
    train_transaction['user_ID'] = None
    user_id_counter = 0
    
    # Split into identified users and guest users (nannannannan)
    nan_mask = train_transaction['user_ID_mock'] == 'nannannannan'
    identified_users = train_transaction[~nan_mask]['user_ID_mock'].unique()
    
    print(f"Processing {len(identified_users)} identified user groups...")
    
    # Process identified users (non-nannannannan)
    for user_mock in tqdm(identified_users, desc="Identified users"):
        mask = train_transaction['user_ID_mock'] == user_mock
        df_subset = train_transaction[mask]
        
        groups = find_user_groups_optimized(df_subset)
        
        for group_tids in groups:
            train_transaction.loc[
                train_transaction['TransactionID'].isin(group_tids), 
                'user_ID'
            ] = user_id_counter
            user_id_counter += 1
    
    # Process nannannannan users by partitioning on cardID_D1
    nan_transactions = train_transaction[nan_mask]
    unique_cards = nan_transactions['cardID_D1'].unique()
    
    print(f"\nProcessing {len(nan_transactions)} guest transactions across {len(unique_cards)} card groups...")
    
    for card_d1 in tqdm(unique_cards, desc="Guest user cards"):
        mask = (train_transaction['user_ID_mock'] == 'nannannannan') & \
               (train_transaction['cardID_D1'] == card_d1)
        df_subset = train_transaction[mask]
        
        if len(df_subset) == 0:
            continue
            
        groups = find_user_groups_optimized(df_subset)
        
        for group_tids in groups:
            train_transaction.loc[
                train_transaction['TransactionID'].isin(group_tids), 
                'user_ID'
            ] = user_id_counter
            user_id_counter += 1
    
    print(f"\nTotal unique user_IDs created: {user_id_counter}")
    return train_transaction

In [61]:
filtered_train = train_transaction[train_transaction["user_ID_mock"]=='nannannannan']
filtered_train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,TransactionAmttrunq,card_ID,user_ID_mock,V264plus,V264rtrunc,V264round,V264plusround,V264plusroundtrunc,V264trunc2,V264plustrunc2
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,68.5,0,nannannannan,68.5,0.0,0.0,68.5,68.5,0.0,68.5
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,29.0,1,nannannannan,29.0,0.0,0.0,29.0,29.0,0.0,29.0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,59.0,2,nannannannan,59.0,0.0,0.0,59.0,59.0,0.0,59.0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,50.0,10,nannannannan,50.0,0.0,0.0,50.0,50.0,0.0,50.0
5,2987005,0,86510,49.0,W,5937,555.0,150.0,visa,226.0,...,49.0,15,nannannannan,49.0,0.0,0.0,49.0,49.0,0.0,49.0


In [62]:
filtered_train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,TransactionAmttrunq,card_ID,user_ID_mock,V264plus,V264rtrunc,V264round,V264plusround,V264plusroundtrunc,V264trunc2,V264plustrunc2
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,68.5,0,nannannannan,68.5,0.0,0.0,68.5,68.5,0.0,68.5
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,29.0,1,nannannannan,29.0,0.0,0.0,29.0,29.0,0.0,29.0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,59.0,2,nannannannan,59.0,0.0,0.0,59.0,59.0,0.0,59.0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,50.0,10,nannannannan,50.0,0.0,0.0,50.0,50.0,0.0,50.0
5,2987005,0,86510,49.0,W,5937,555.0,150.0,visa,226.0,...,49.0,15,nannannannan,49.0,0.0,0.0,49.0,49.0,0.0,49.0


In [64]:
user_group = train_transaction[train_transaction["user_ID_mock"]=='nannannannan']
groups_found = find_user_groups_optimized(user_group)
groups_found

[[np.int64(3534856),
  np.int64(3139596),
  np.int64(3063823),
  np.int64(3254288),
  np.int64(3164177),
  np.int64(3141650),
  np.int64(3297297),
  np.int64(3330067),
  np.int64(3407893),
  np.int64(3430423),
  np.int64(3399711),
  np.int64(3037216),
  np.int64(3229730),
  np.int64(3301413),
  np.int64(3037222),
  np.int64(3336231),
  np.int64(3338279),
  np.int64(3387431),
  np.int64(3518503),
  np.int64(3010605),
  np.int64(3289138),
  np.int64(3180603),
  np.int64(3170365),
  np.int64(3147838),
  np.int64(3450944),
  np.int64(3176514),
  np.int64(3285059),
  np.int64(3442755),
  np.int64(3502148),
  np.int64(3016775),
  np.int64(3295303),
  np.int64(3205196),
  np.int64(3246158),
  np.int64(3133521),
  np.int64(3450974),
  np.int64(3571808),
  np.int64(3328098),
  np.int64(3389542),
  np.int64(3127399),
  np.int64(3258472),
  np.int64(3223658),
  np.int64(3225708),
  np.int64(3311724),
  np.int64(2990190),
  np.int64(3262574),
  np.int64(3035249),
  np.int64(3496050),
  np.int64(30

In [65]:
# Run the optimized user ID creation
train_transaction = create_user_ids_optimized(train_transaction)

Processing 33649 identified user groups...


Identified users: 100%|██████████| 33649/33649 [10:28<00:00, 53.55it/s]



Processing 449851 guest transactions across 116518 card groups...


Guest user cards: 100%|██████████| 116518/116518 [1:10:51<00:00, 27.41it/s]



Total unique user_IDs created: 392806


In [66]:
# Verify the results
print("User ID distribution:")
print(f"Total transactions: {len(train_transaction)}")
print(f"Unique user_IDs: {train_transaction['user_ID'].nunique()}")
print(f"Transactions with user_ID: {train_transaction['user_ID'].notna().sum()}")
print(f"\nSample of user_ID value counts:")
print(train_transaction['user_ID'].value_counts().head(10))

User ID distribution:
Total transactions: 590540
Unique user_IDs: 392806
Transactions with user_ID: 590540

Sample of user_ID value counts:
user_ID
38835     1056
3156       322
2646       255
14866      234
9021       222
15718      211
651        198
15411      196
970        191
115573     184
Name: count, dtype: int64
