In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import random
import functools

TODO: Review the Programming Historian lesson on exploring and analyzing network data with Python: https://programminghistorian.org/en/lessons/exploring-and-analyzing-network-data-with-python
        

In [2]:
# Load the edge list: each row is one hop (start -> end) of a transaction.
df=pd.read_excel("./input/data.xlsx")
df.info()
# weight: number of days taken to go from the `start` node to the `end` node
# seq: position (1, 2, 3, ...) of the edge/jump within its transaction


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4242 entries, 0 to 4241
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  4242 non-null   int64         
 1   start       4242 non-null   int64         
 2   end         4242 non-null   int64         
 3   weight      4242 non-null   float64       
 4   supplier    4242 non-null   object        
 5   trans_id    4242 non-null   int64         
 6   start_date  4242 non-null   datetime64[ns]
 7   userid      4242 non-null   object        
 8   end_date    4242 non-null   datetime64[ns]
 9   seq         4242 non-null   int64         
dtypes: datetime64[ns](2), float64(1), int64(5), object(2)
memory usage: 331.5+ KB


In [3]:
# One row per (transaction, supplier): first start date, last end date and
# the number of hops recorded for that transaction.
dft = (
    df[['trans_id', 'supplier', "start_date", "end_date"]]
    .groupby(['trans_id', 'supplier'])
    .agg(
        date_start=('start_date', 'min'),
        date_end=('end_date', 'max'),
        steps=('trans_id', 'count'),
    )
    .reset_index()
)
dft.head()

Unnamed: 0,trans_id,supplier,date_start,date_end,steps
0,1,Parker LLC,2021-04-20 01:40:07,2021-06-02 07:43:20.103,4
1,2,"Larsen, Brown and Pena",2020-09-11 23:08:44,2020-10-28 16:31:31.124,3
2,3,Cox-Sheppard,2021-03-20 20:28:37,2021-04-20 00:17:50.770,3
3,4,Young and Sons,2020-10-06 22:43:13,2020-10-15 18:37:41.806,1
4,5,"Larsen, Brown and Pena",2020-08-13 17:52:39,2020-11-05 11:42:54.059,6


In [4]:
# Show every hop of transaction 5 in the order in which it happened.
df.loc[df["trans_id"] == 5].sort_values(by="seq")

Unnamed: 0.1,Unnamed: 0,start,end,weight,supplier,trans_id,start_date,userid,end_date,seq
15,2126,3,6,10.011397,"Larsen, Brown and Pena",5,2020-08-13 17:52:39.000,Melanie Green,2020-08-23 18:09:03.736,1
12,3648,6,9,13.024583,"Larsen, Brown and Pena",5,2020-08-23 18:09:03.736,Mr. Daniel Ellis,2020-09-05 18:44:27.691,2
16,3943,9,10,19.498247,"Larsen, Brown and Pena",5,2020-09-05 18:44:27.691,Maria Jones,2020-09-25 06:41:56.262,3
11,1790,10,9,21.731866,"Larsen, Brown and Pena",5,2020-09-25 06:41:56.262,Vincent Henry,2020-10-17 00:15:49.509,4
14,4162,9,8,11.181985,"Larsen, Brown and Pena",5,2020-10-17 00:15:49.509,Leslie Ramirez,2020-10-28 04:37:53.038,5
13,1620,8,7,8.295151,"Larsen, Brown and Pena",5,2020-10-28 04:37:53.038,Mr. Daniel Ellis,2020-11-05 11:42:54.059,6


In [5]:
# Build one directed multigraph holding every hop of every transaction.
# Parallel edges are kept (MultiDiGraph) because different transactions, or
# the same transaction at different times, can travel between the same nodes.
G = nx.MultiDiGraph()
for _, row in df.iterrows():
    edge_attrs = {
        column: row[column]
        for column in ('trans_id', 'seq', 'weight', 'start_date', 'end_date', 'userid', 'supplier')
    }
    G.add_edge(row['start'], row['end'], **edge_attrs)
    

In [10]:
def filter_edge(n1, n2, key, id):
    """Edge predicate: keep edge (n1, n2, key) of the global graph ``G`` for transaction ``id``.

    Returns True only when the edge's ``trans_id`` attribute equals ``id``
    and the edge is not a self-loop (n1 != n2); otherwise returns False.
    """
    belongs_to_transaction = G[n1][n2][key].get("trans_id") == id
    if not belongs_to_transaction:
        return False
    # Self-loops are dropped even when they belong to the transaction.
    return True if n1 != n2 else False

def filter_node(n, id):
    """Node predicate: True when node ``n`` of the global graph ``G`` touches transaction ``id``.

    A node qualifies if at least one of its incoming or outgoing edges has a
    ``trans_id`` attribute equal to ``id``. Incoming edges are examined first.
    """
    def carries_transaction(edges):
        return any(attrs["trans_id"] == id for _, _, attrs in edges)

    if carries_transaction(G.in_edges(n, data=True)):
        return True
    return carries_transaction(G.out_edges(n, data=True))
    
def find_root(G, child, n=0, max_depth=100):
    """Walk upstream from ``child`` until a node without predecessors is reached.

    At every step only the FIRST predecessor reported by ``G.predecessors``
    is followed, so in a graph with several parents per node this explores a
    single upstream path.

    Parameters
    ----------
    G : graph exposing ``predecessors(node)``
    child : node at which the walk starts
    n : int, optional
        Number of hops already taken (default 0).
    max_depth : int, optional
        Largest hop count at which the walk is still allowed to continue.
        Defaults to 100, the limit that used to be hard-coded.

    Returns
    -------
    The first node found that has no predecessors, or ``False`` when a node
    that still has predecessors is reached after more than ``max_depth`` hops
    (which is what happens when the path runs into a cycle).

    Notes
    -----
    ``False`` compares equal to a node labelled ``0``; callers should test the
    result with ``is False`` rather than ``== False``.
    """
    current, hops = child, n
    # Iterative on purpose: no recursion depth to worry about for large limits.
    while True:
        parents = list(G.predecessors(current))
        if not parents:
            return current
        if hops > max_depth:
            return False
        current, hops = parents[0], hops + 1


def find_subgraph(G, id):
    """Return a ``subgraph_view`` of ``G`` limited to transaction ``id``.

    Nodes are selected with ``filter_node`` and edges with ``filter_edge``,
    each with ``id`` pre-bound as a keyword argument.
    """
    return nx.subgraph_view(
        G,
        filter_node=functools.partial(filter_node, id=id),
        filter_edge=functools.partial(filter_edge, id=id),
    )
def check_transact(id):
    """Summarise transaction ``id`` as an ordered walk through the global graph ``G``.

    The transaction's edges are taken from ``find_subgraph(G, id)`` (self-loop
    edges are excluded there by ``filter_edge``) and ordered by their ``seq``
    attribute.

    Returns
    -------
    dict with the keys
        steps_count  : number of node visits in the walk (edges + 1; 0 when
                       the transaction has no usable edges)
        raw_nodes    : sorted list of the distinct nodes in the subgraph
        root_node    : origin of the first edge (None when there are no edges)
        last_node    : destination of the last edge (None when there are no edges)
        edges        : [origin, destination, weight] per edge, in ``seq`` order
        nodes        : visited nodes in ``seq`` order, starting with the root
        total_weight : sum of the edge weights
        footprint    : the visited nodes joined with '_' (identifies a variant)
        has_cycle    : True when the transaction's subgraph contains a directed cycle
    """
    result = {}
    J = find_subgraph(G, id)

    node_flow = []
    total_weight = 0
    for ori, end, data in J.edges(data=True):
        node_flow.append([data['seq'], ori, end, data['weight']])
        total_weight = total_weight + data['weight']

    # Root and last node are found by following the recorded sequence rather
    # than with graph algorithms, because a transaction may revisit nodes and
    # root/path algorithms only work on acyclic directed graphs.
    node_flow.sort(key=lambda hop: hop[0])

    if node_flow:
        root_node = node_flow[0][1]
        node_seq = [root_node] + [hop[2] for hop in node_flow]
        last_node = node_seq[-1]
    else:
        # No usable edge (unknown id, or only self-loops): report an empty
        # walk instead of failing with an IndexError.
        root_node = last_node = None
        node_seq = []

    result["steps_count"] = len(node_seq)
    result["raw_nodes"] = sorted(J.nodes())
    result["root_node"] = root_node
    result["last_node"] = last_node
    result['edges'] = [[hop[1], hop[2], hop[3]] for hop in node_flow]
    result['nodes'] = node_seq
    result['total_weight'] = total_weight
    result['footprint'] = '_'.join([str(x) for x in node_seq])

    # Only "no cycle found" means has_cycle=False; any other error is a real
    # problem and must surface instead of being reported as "no cycle".
    try:
        nx.find_cycle(J)
    except nx.NetworkXNoCycle:
        result['has_cycle'] = False
    else:
        result['has_cycle'] = True
    return result
# Example: summarise transaction 5 (its hops go 9 -> 10 -> 9, so a cycle is expected).
check_transact(5)

{'steps_count': 7,
 'raw_nodes': [3, 6, 7, 8, 9, 10],
 'root_node': 3,
 'last_node': 7,
 'edges': [[3, 6, 10.01139741117076],
  [6, 9, 13.02458281297999],
  [9, 10, 19.49824734465509],
  [10, 9, 21.7318662865112],
  [9, 8, 11.18198529013193],
  [8, 7, 8.295150707378669]],
 'nodes': [3, 6, 9, 10, 9, 8, 7],
 'total_weight': 83.74322985282765,
 'footprint': '3_6_9_10_9_8_7',
 'has_cycle': True}

In [12]:
# Group transactions into "variants": transactions sharing the same footprint
# (the same ordered sequence of visited nodes) belong to the same variant.
variant_dict = {}          # variant index -> trans_ids of ALL transactions of that variant
tvariant_id = []           # variant index of each row of dft, in row order
variant_footprints = []    # variant index -> footprint string
variant_counts = []        # variant index -> number of transactions
variant_has_cycle = []     # variant index -> has_cycle flag of the variant
footprint_to_variant = {}  # footprint -> variant index (constant-time lookup)

for trans_id in dft['trans_id']:
    td = check_transact(trans_id)
    footprint = td['footprint']
    variant_id = footprint_to_variant.get(footprint)
    if variant_id is None:
        # First transaction with this footprint: register a new variant.
        variant_id = len(variant_footprints)
        footprint_to_variant[footprint] = variant_id
        variant_footprints.append(footprint)
        variant_counts.append(1)
        variant_has_cycle.append(td['has_cycle'])
    else:
        variant_counts[variant_id] += 1
    # Record the transaction for new and already-known variants alike, so
    # variant_dict lists every member and not only the first one seen.
    variant_dict.setdefault(variant_id, []).append(trans_id)
    tvariant_id.append(variant_id)
  

In [19]:
# Attach to every transaction the variant it belongs to, together with that
# variant's footprint and cycle flag.
dft = dft.assign(
    variant_id=tvariant_id,
    footprint=[variant_footprints[v] for v in tvariant_id],
    has_cycle=[variant_has_cycle[v] for v in tvariant_id],
)
dft

Unnamed: 0,trans_id,supplier,date_start,date_end,steps,variant_id,footprint,has_cycle
0,1,Parker LLC,2021-04-20 01:40:07,2021-06-02 07:43:20.103,4,0,2_4_7_8_10,False
1,2,"Larsen, Brown and Pena",2020-09-11 23:08:44,2020-10-28 16:31:31.124,3,1,3_5_7_10,False
2,3,Cox-Sheppard,2021-03-20 20:28:37,2021-04-20 00:17:50.770,3,2,1_3_6_7,False
3,4,Young and Sons,2020-10-06 22:43:13,2020-10-15 18:37:41.806,1,3,3_4,False
4,5,"Larsen, Brown and Pena",2020-08-13 17:52:39,2020-11-05 11:42:54.059,6,4,3_6_9_10_9_8_7,True
...,...,...,...,...,...,...,...,...
953,996,Jarvis Ltd,2020-09-16 18:58:40,2020-12-18 15:36:18.119,6,678,2_5_8_9_10_9_8,True
954,997,"Chung, Bonilla and Lawson",2020-12-27 00:57:59,2021-04-11 15:57:25.725,7,679,1_4_6_5_7_6_9_10,True
955,998,Ford Group,2021-05-22 18:03:51,2021-07-08 04:02:23.056,3,680,1_3_6_5,False
956,999,Ramsey LLC,2021-05-02 16:08:54,2021-05-21 16:14:03.559,2,149,1_2_5,False


In [17]:
# One row per variant: its index, its footprint and how many transactions follow it.
# Built from variant_footprints; the previously referenced name
# `variant_sequence` is not defined anywhere in the notebook and made this
# cell fail on a fresh kernel.
dfv = pd.DataFrame(
    list(zip(range(len(variant_footprints)), variant_footprints, variant_counts)),
    columns=['variant_id', 'footprint', 'variant_count'],
)
dfv

Unnamed: 0,variant_id,footprint,variant_count
0,0,2_4_7_8_10,3
1,1,3_5_7_10,1
2,2,1_3_6_7,2
3,3,3_4,12
4,4,3_6_9_10_9_8_7,1
...,...,...,...
677,677,1_3_4_6_5_4_7,1
678,678,2_5_8_9_10_9_8,1
679,679,1_4_6_5_7_6_9_10,1
680,680,1_3_6_5,1


Reference: networkx `subgraph_view` documentation: https://networkx.org/documentation/stable/reference/generated/networkx.classes.function.subgraph_view.html

In [20]:
# Persist both result tables as Excel workbooks.
for frame, target in ((dft, "./output/transactions.xlsx"), (dfv, "./output/variants.xlsx")):
    frame.to_excel(target)

