In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import networkx as nx
import matplotlib.pyplot as plt
import copy

In [2]:
# Load the award records from the CSV file and preview the first five rows.
df1 = pd.read_csv(filepath_or_buffer='data_irregularities_yes_2.csv')
df1.head(n=5)

Unnamed: 0,requestor,goods_services,supplier,award_date,award_amount
0,requestor_a,maintain_facilities_a,supplier_aaa,1-Jan-24,150000.0
1,requestor_a,maintain_facilities_a,supplier_bbb,1-Jan-24,150000.0
2,requestor_a,maintain_facilities_a,supplier_c,1-Jan-24,109847.7126
3,requestor_a,maintain_facilities_a,supplier_aaa,1-Apr-24,150000.0
4,requestor_a,maintain_facilities_a,supplier_bbb,1-Apr-24,150000.0


In [3]:
# Print the frame summary (row count, columns, non-null counts, dtypes) for the
# freshly loaded data; award_date is still a plain object (string) column here.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   requestor       48 non-null     object 
 1   goods_services  48 non-null     object 
 2   supplier        48 non-null     object 
 3   award_date      48 non-null     object 
 4   award_amount    48 non-null     float64
dtypes: float64(1), object(4)
memory usage: 2.0+ KB


In [4]:
# Parse the day-month-year strings (e.g. '1-Jan-24') into real datetimes and
# store them next to the original text column. With errors='coerce', values
# that do not match the format become NaT instead of raising.
df1['award_date_formatted'] = pd.to_datetime(
    df1['award_date'],
    errors='coerce',
    format='%d-%b-%y',
)
df1.head(n=5)

Unnamed: 0,requestor,goods_services,supplier,award_date,award_amount,award_date_formatted
0,requestor_a,maintain_facilities_a,supplier_aaa,1-Jan-24,150000.0,2024-01-01
1,requestor_a,maintain_facilities_a,supplier_bbb,1-Jan-24,150000.0,2024-01-01
2,requestor_a,maintain_facilities_a,supplier_c,1-Jan-24,109847.7126,2024-01-01
3,requestor_a,maintain_facilities_a,supplier_aaa,1-Apr-24,150000.0,2024-04-01
4,requestor_a,maintain_facilities_a,supplier_bbb,1-Apr-24,150000.0,2024-04-01


In [5]:
# Re-check the frame summary after adding award_date_formatted: because the
# parse uses errors='coerce', a non-null count below the row count on that
# column would mean some award_date strings failed to parse.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   requestor             48 non-null     object        
 1   goods_services        48 non-null     object        
 2   supplier              48 non-null     object        
 3   award_date            48 non-null     object        
 4   award_amount          48 non-null     float64       
 5   award_date_formatted  48 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 2.4+ KB


In [6]:
# Reshape to one row per (requestor, goods_services, award_date) "basket" and
# one column per supplier. 'supplier' is used both as the column key and as the
# value, so a cell holds the supplier's own name when that supplier appears in
# the basket and NaN otherwise.
df2 = df1.pivot(
    index=['requestor', 'goods_services', 'award_date'],
    columns='supplier',
    values='supplier',
)
df2.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,supplier,supplier_a,supplier_aaa,supplier_b,supplier_bbb,supplier_c,supplier_d,supplier_e,supplier_f,supplier_g,supplier_h,supplier_i,supplier_j,supplier_k,supplier_l
requestor,goods_services,award_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
requestor_a,maintain_facilities_a,1-Apr-24,,supplier_aaa,,supplier_bbb,,,,supplier_f,,,,,,
requestor_a,maintain_facilities_a,1-Jan-24,,supplier_aaa,,supplier_bbb,supplier_c,,,,,,,,,
requestor_a,maintain_facilities_a,1-Jul-24,,supplier_aaa,,supplier_bbb,,,,,,,supplier_i,,,
requestor_a,maintain_facilities_a,1-Oct-24,,supplier_aaa,,supplier_bbb,,,,,,,,,,supplier_l
requestor_b,maintain_facilities_b,1-Apr-24,,,,,,supplier_d,supplier_e,supplier_f,,,,,,


In [7]:
# Flatten the (requestor, goods_services, award_date) MultiIndex back into
# ordinary leading columns with a fresh 0..n-1 row index.
# reset_index() does this directly, without the detour through a NumPy record
# array; rename_axis(columns=None) removes the 'supplier' label that the pivot
# leaves on the column axis, so the column axis is unnamed as before.
df3 = df2.reset_index().rename_axis(columns=None)
df3.head()

Unnamed: 0,requestor,goods_services,award_date,supplier_a,supplier_aaa,supplier_b,supplier_bbb,supplier_c,supplier_d,supplier_e,supplier_f,supplier_g,supplier_h,supplier_i,supplier_j,supplier_k,supplier_l
0,requestor_a,maintain_facilities_a,1-Apr-24,,supplier_aaa,,supplier_bbb,,,,supplier_f,,,,,,
1,requestor_a,maintain_facilities_a,1-Jan-24,,supplier_aaa,,supplier_bbb,supplier_c,,,,,,,,,
2,requestor_a,maintain_facilities_a,1-Jul-24,,supplier_aaa,,supplier_bbb,,,,,,,supplier_i,,,
3,requestor_a,maintain_facilities_a,1-Oct-24,,supplier_aaa,,supplier_bbb,,,,,,,,,,supplier_l
4,requestor_b,maintain_facilities_b,1-Apr-24,,,,,,supplier_d,supplier_e,supplier_f,,,,,,


In [8]:
# The first three columns of df3 are the basket keys; everything from position
# 3 onward is a supplier column. Convert those to presence flags: True where a
# supplier name is present, False where the cell is NaN.
df4 = ~df3.iloc[:, 3:].isna()
df4.head(n=5)

Unnamed: 0,supplier_a,supplier_aaa,supplier_b,supplier_bbb,supplier_c,supplier_d,supplier_e,supplier_f,supplier_g,supplier_h,supplier_i,supplier_j,supplier_k,supplier_l
0,False,True,False,True,False,False,False,True,False,False,False,False,False,False
1,False,True,False,True,True,False,False,False,False,False,False,False,False,False
2,False,True,False,True,False,False,False,False,False,False,True,False,False,False
3,False,True,False,True,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,True,True,True,False,False,False,False,False,False


In [9]:
# Put the three basket-key columns back in front of the boolean supplier flags
# so each row is identifiable again.
df5 = pd.concat(
    [
        df3.iloc[:, :3],  # requestor, goods_services, award_date
        df4,              # one True/False column per supplier
    ],
    axis='columns',
)
df5.head(n=5)

Unnamed: 0,requestor,goods_services,award_date,supplier_a,supplier_aaa,supplier_b,supplier_bbb,supplier_c,supplier_d,supplier_e,supplier_f,supplier_g,supplier_h,supplier_i,supplier_j,supplier_k,supplier_l
0,requestor_a,maintain_facilities_a,1-Apr-24,False,True,False,True,False,False,False,True,False,False,False,False,False,False
1,requestor_a,maintain_facilities_a,1-Jan-24,False,True,False,True,True,False,False,False,False,False,False,False,False,False
2,requestor_a,maintain_facilities_a,1-Jul-24,False,True,False,True,False,False,False,False,False,False,True,False,False,False
3,requestor_a,maintain_facilities_a,1-Oct-24,False,True,False,True,False,False,False,False,False,False,False,False,False,True
4,requestor_b,maintain_facilities_b,1-Apr-24,False,False,False,False,False,True,True,True,False,False,False,False,False,False


In [10]:
# Mine frequent supplier itemsets from the boolean basket matrix (the key
# columns in positions 0-2 are excluded). min_support=0.1 is the support
# threshold passed to apriori; use_colnames=True reports itemsets by supplier
# name rather than by column position.
df6 = apriori(
    df5.iloc[:, 3:],
    use_colnames=True,
    min_support=0.1,
)
df6

Unnamed: 0,support,itemsets
0,0.1875,(supplier_a)
1,0.25,(supplier_aaa)
2,0.1875,(supplier_b)
3,0.25,(supplier_bbb)
4,0.25,(supplier_c)
5,0.1875,(supplier_d)
6,0.1875,(supplier_e)
7,0.25,(supplier_f)
8,0.1875,(supplier_g)
9,0.1875,(supplier_h)


In [11]:
# Derive association rules from the frequent itemsets, filtering on the
# 'confidence' metric with a threshold of 0.1, then rank them so the highest
# confidence comes first and ties are ordered by highest lift.
df7 = (
    association_rules(df6, metric='confidence', min_threshold=0.1, support_only=False)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
df7

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(supplier_a),(supplier_b),0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
1,(supplier_b),(supplier_a),0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
8,(supplier_e),(supplier_d),0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
9,(supplier_d),(supplier_e),0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
14,(supplier_h),(supplier_g),0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
15,(supplier_g),(supplier_h),0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
20,(supplier_k),(supplier_j),0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
21,(supplier_j),(supplier_k),0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
26,"(supplier_c, supplier_a)",(supplier_b),0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
27,"(supplier_c, supplier_b)",(supplier_a),0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0


In [12]:
# Convert the set-like itemsets in 'antecedents' and 'consequents' into plain
# Python lists.
# sorted() is used instead of list(): iterating a set of strings follows hash
# order, which can change from one kernel session to the next, so list() could
# yield e.g. ['supplier_c', 'supplier_a'] in one run and the reverse in another.
# Sorting gives the same element order on every run.
# assign() builds a new frame (columns keep their positions) rather than
# overwriting the columns of the frame displayed by the previous cell.
df7 = df7.assign(
    antecedents=df7['antecedents'].apply(sorted),
    consequents=df7['consequents'].apply(sorted),
)
df7

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,[supplier_a],[supplier_b],0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
1,[supplier_b],[supplier_a],0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
8,[supplier_e],[supplier_d],0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
9,[supplier_d],[supplier_e],0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
14,[supplier_h],[supplier_g],0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
15,[supplier_g],[supplier_h],0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
20,[supplier_k],[supplier_j],0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
21,[supplier_j],[supplier_k],0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
26,"[supplier_c, supplier_a]",[supplier_b],0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0
27,"[supplier_c, supplier_b]",[supplier_a],0.1875,0.1875,0.1875,1.0,5.333333,0.152344,inf,1.0


In [13]:
# Print the rules frame summary: number of rules, the metric columns and their
# dtypes (antecedents/consequents are object columns holding Python lists).
df7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 0 to 49
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         50 non-null     object 
 1   consequents         50 non-null     object 
 2   antecedent support  50 non-null     float64
 3   consequent support  50 non-null     float64
 4   support             50 non-null     float64
 5   confidence          50 non-null     float64
 6   lift                50 non-null     float64
 7   leverage            50 non-null     float64
 8   conviction          50 non-null     float64
 9   zhangs_metric       50 non-null     float64
dtypes: float64(8), object(2)
memory usage: 4.3+ KB
