In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import networkx as nx
import matplotlib.pyplot as plt
import copy

In [2]:
# Read the award records from a CSV path that is relative to the current
# working directory, then preview the first five rows.
df1 = pd.read_csv(filepath_or_buffer='data_irregularities_no.csv')
df1.head(n=5)

Unnamed: 0,requestor,goods_services,supplier,award_date,award_amount
0,requestor_a,maintain_facilities_a,supplier_a,1-Jan-24,100128.7881
1,requestor_a,maintain_facilities_a,supplier_b,1-Jan-24,100228.2785
2,requestor_a,maintain_facilities_a,supplier_c,1-Jan-24,109847.7126
3,requestor_a,maintain_facilities_a,supplier_d,1-Apr-24,109727.4697
4,requestor_a,maintain_facilities_a,supplier_e,1-Apr-24,103403.4242


In [3]:
# Summarise the freshly loaded frame: row count, column dtypes and non-null counts.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   requestor       48 non-null     object 
 1   goods_services  48 non-null     object 
 2   supplier        48 non-null     object 
 3   award_date      48 non-null     object 
 4   award_amount    48 non-null     float64
dtypes: float64(1), object(4)
memory usage: 2.0+ KB


In [4]:
# Parse the day-month-year strings (e.g. '1-Jan-24') into datetimes.
# With errors='coerce', any value that does not match the format becomes NaT
# instead of raising.
parsed_award_dates = pd.to_datetime(
    df1['award_date'],
    format='%d-%b-%y',
    errors='coerce',
)
df1['award_date_formatted'] = parsed_award_dates
df1.head()

Unnamed: 0,requestor,goods_services,supplier,award_date,award_amount,award_date_formatted
0,requestor_a,maintain_facilities_a,supplier_a,1-Jan-24,100128.7881,2024-01-01
1,requestor_a,maintain_facilities_a,supplier_b,1-Jan-24,100228.2785,2024-01-01
2,requestor_a,maintain_facilities_a,supplier_c,1-Jan-24,109847.7126,2024-01-01
3,requestor_a,maintain_facilities_a,supplier_d,1-Apr-24,109727.4697,2024-04-01
4,requestor_a,maintain_facilities_a,supplier_e,1-Apr-24,103403.4242,2024-04-01


In [5]:
# Re-check dtypes and non-null counts now that 'award_date_formatted' exists;
# a non-null count below the row count would reveal dates coerced to NaT.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   requestor             48 non-null     object        
 1   goods_services        48 non-null     object        
 2   supplier              48 non-null     object        
 3   award_date            48 non-null     object        
 4   award_amount          48 non-null     float64       
 5   award_date_formatted  48 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 2.4+ KB


In [6]:
# Reshape to one row per (requestor, goods_services, award_date) group and one
# column per supplier. A cell holds the supplier's own name when that supplier
# appears in the group and NaN otherwise.
transaction_keys = ['requestor', 'goods_services', 'award_date']
df2 = df1.pivot(index=transaction_keys, columns='supplier', values='supplier')
df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,supplier,supplier_a,supplier_b,supplier_c,supplier_d,supplier_e,supplier_f,supplier_g,supplier_h,supplier_i,supplier_j,supplier_k,supplier_l
requestor,goods_services,award_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
requestor_a,maintain_facilities_a,1-Apr-24,,,,supplier_d,supplier_e,supplier_f,,,,,,
requestor_a,maintain_facilities_a,1-Jan-24,supplier_a,supplier_b,supplier_c,,,,,,,,,
requestor_a,maintain_facilities_a,1-Jul-24,,,,,,,supplier_g,supplier_h,supplier_i,,,
requestor_a,maintain_facilities_a,1-Oct-24,,,,,,,,,,supplier_j,supplier_k,supplier_l
requestor_b,maintain_facilities_b,1-Apr-24,,,,supplier_d,supplier_e,supplier_f,,,,,,


In [7]:
# Flatten the (requestor, goods_services, award_date) MultiIndex back into
# ordinary columns on a fresh 0..n-1 index.
# reset_index() does this directly; the previous round trip through a NumPy
# record array (to_records) is unnecessary and would not preserve pandas
# extension dtypes.
# rename_axis(columns=None) removes the 'supplier' label that the pivot leaves
# on the column axis, so the resulting frame has plain, unnamed column labels.
df3 = df2.reset_index().rename_axis(columns=None)
df3.head()

Unnamed: 0,requestor,goods_services,award_date,supplier_a,supplier_b,supplier_c,supplier_d,supplier_e,supplier_f,supplier_g,supplier_h,supplier_i,supplier_j,supplier_k,supplier_l
0,requestor_a,maintain_facilities_a,1-Apr-24,,,,supplier_d,supplier_e,supplier_f,,,,,,
1,requestor_a,maintain_facilities_a,1-Jan-24,supplier_a,supplier_b,supplier_c,,,,,,,,,
2,requestor_a,maintain_facilities_a,1-Jul-24,,,,,,,supplier_g,supplier_h,supplier_i,,,
3,requestor_a,maintain_facilities_a,1-Oct-24,,,,,,,,,,supplier_j,supplier_k,supplier_l
4,requestor_b,maintain_facilities_b,1-Apr-24,,,,supplier_d,supplier_e,supplier_f,,,,,,


In [8]:
# Build the boolean supplier matrix: True where a supplier name is present in
# the group, False where the pivot left NaN. The first three columns are the
# group keys and are skipped.
supplier_cells = df3.iloc[:, 3:]
df4 = ~supplier_cells.isna()
df4.head()

Unnamed: 0,supplier_a,supplier_b,supplier_c,supplier_d,supplier_e,supplier_f,supplier_g,supplier_h,supplier_i,supplier_j,supplier_k,supplier_l
0,False,False,False,True,True,True,False,False,False,False,False,False
1,True,True,True,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,True,True,True,False,False,False
3,False,False,False,False,False,False,False,False,False,True,True,True
4,False,False,False,True,True,True,False,False,False,False,False,False


In [9]:
# Put the three group-key columns back in front of the boolean supplier
# matrix, aligning the two pieces side by side on their shared row index.
group_keys = df3.iloc[:, :3]
df5 = pd.concat(objs=[group_keys, df4], axis='columns')
df5.head()

Unnamed: 0,requestor,goods_services,award_date,supplier_a,supplier_b,supplier_c,supplier_d,supplier_e,supplier_f,supplier_g,supplier_h,supplier_i,supplier_j,supplier_k,supplier_l
0,requestor_a,maintain_facilities_a,1-Apr-24,False,False,False,True,True,True,False,False,False,False,False,False
1,requestor_a,maintain_facilities_a,1-Jan-24,True,True,True,False,False,False,False,False,False,False,False,False
2,requestor_a,maintain_facilities_a,1-Jul-24,False,False,False,False,False,False,True,True,True,False,False,False
3,requestor_a,maintain_facilities_a,1-Oct-24,False,False,False,False,False,False,False,False,False,True,True,True
4,requestor_b,maintain_facilities_b,1-Apr-24,False,False,False,True,True,True,False,False,False,False,False,False


In [10]:
# Mine frequent supplier itemsets from the boolean columns only (the three
# leading key columns are excluded). Itemsets must occur in at least 10% of
# the rows, and are reported by column name rather than by column position.
supplier_flags = df5.iloc[:, 3:]
df6 = apriori(supplier_flags, min_support=0.1, use_colnames=True)
df6

Unnamed: 0,support,itemsets
0,0.25,(supplier_a)
1,0.25,(supplier_b)
2,0.25,(supplier_c)
3,0.25,(supplier_d)
4,0.25,(supplier_e)
5,0.25,(supplier_f)
6,0.25,(supplier_g)
7,0.25,(supplier_h)
8,0.25,(supplier_i)
9,0.25,(supplier_j)


In [11]:
# Derive association rules with confidence >= 0.1 from the frequent itemsets,
# then rank them by confidence and, within equal confidence, by lift
# (both descending).
df7 = (
    association_rules(df6, metric='confidence', min_threshold=0.1, support_only=False)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
df7

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(supplier_b),(supplier_a),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
1,(supplier_a),(supplier_b),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
2,(supplier_a),(supplier_c),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
3,(supplier_c),(supplier_a),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
4,(supplier_b),(supplier_c),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
5,(supplier_c),(supplier_b),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
6,(supplier_d),(supplier_e),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
7,(supplier_e),(supplier_d),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
8,(supplier_f),(supplier_d),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
9,(supplier_d),(supplier_f),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0


In [12]:
# Work on an independent deep copy so the columns added to df8 in later cells
# do not appear on df7.
df8 = df7.copy(deep=True)
df8

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(supplier_b),(supplier_a),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
1,(supplier_a),(supplier_b),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
2,(supplier_a),(supplier_c),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
3,(supplier_c),(supplier_a),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
4,(supplier_b),(supplier_c),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
5,(supplier_c),(supplier_b),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
6,(supplier_d),(supplier_e),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
7,(supplier_e),(supplier_d),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
8,(supplier_f),(supplier_d),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
9,(supplier_d),(supplier_f),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0


In [13]:
# Reduce each rule side (a frozenset of supplier names) to a single
# representative supplier name stored as a plain string.
#
# min() replaces list(x)[0]: a frozenset has no defined order, and for strings
# its iteration order can change between interpreter runs (hash randomisation),
# so "first element" was not reproducible when a side holds several suppliers.
# min() always picks the alphabetically smallest name and returns exactly the
# same value as before for single-item sides.
# NOTE(review): a multi-item side is still collapsed to one name here.
df8['antecedents_values'] = df8['antecedents'].apply(lambda items: min(items))
df8['consequents_values'] = df8['consequents'].apply(lambda items: min(items))
df8

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedents_values,consequents_values
0,(supplier_b),(supplier_a),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_b,supplier_a
1,(supplier_a),(supplier_b),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_a,supplier_b
2,(supplier_a),(supplier_c),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_a,supplier_c
3,(supplier_c),(supplier_a),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_c,supplier_a
4,(supplier_b),(supplier_c),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_b,supplier_c
5,(supplier_c),(supplier_b),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_c,supplier_b
6,(supplier_d),(supplier_e),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_d,supplier_e
7,(supplier_e),(supplier_d),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_e,supplier_d
8,(supplier_f),(supplier_d),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_f,supplier_d
9,(supplier_d),(supplier_f),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_d,supplier_f


In [14]:
# Confirm the two representative-value columns were added and are fully populated.
df8.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48 entries, 0 to 47
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         48 non-null     object 
 1   consequents         48 non-null     object 
 2   antecedent support  48 non-null     float64
 3   consequent support  48 non-null     float64
 4   support             48 non-null     float64
 5   confidence          48 non-null     float64
 6   lift                48 non-null     float64
 7   leverage            48 non-null     float64
 8   conviction          48 non-null     float64
 9   zhangs_metric       48 non-null     float64
 10  antecedents_values  48 non-null     object 
 11  consequents_values  48 non-null     object 
dtypes: float64(8), object(4)
memory usage: 4.9+ KB


In [15]:
# Build an order-independent key for each rule: the antecedent and consequent
# names sorted alphabetically and joined with ', '. A rule and its mirror
# image (A -> B and B -> A) therefore share the same key.
df8['redundant'] = [
    ', '.join(sorted(name_pair))
    for name_pair in zip(df8['antecedents_values'], df8['consequents_values'])
]
df8

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedents_values,consequents_values,redundant
0,(supplier_b),(supplier_a),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_b,supplier_a,"supplier_a, supplier_b"
1,(supplier_a),(supplier_b),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_a,supplier_b,"supplier_a, supplier_b"
2,(supplier_a),(supplier_c),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_a,supplier_c,"supplier_a, supplier_c"
3,(supplier_c),(supplier_a),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_c,supplier_a,"supplier_a, supplier_c"
4,(supplier_b),(supplier_c),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_b,supplier_c,"supplier_b, supplier_c"
5,(supplier_c),(supplier_b),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_c,supplier_b,"supplier_b, supplier_c"
6,(supplier_d),(supplier_e),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_d,supplier_e,"supplier_d, supplier_e"
7,(supplier_e),(supplier_d),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_e,supplier_d,"supplier_d, supplier_e"
8,(supplier_f),(supplier_d),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_f,supplier_d,"supplier_d, supplier_f"
9,(supplier_d),(supplier_f),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_d,supplier_f,"supplier_d, supplier_f"


In [16]:
# Confirm the 'redundant' key column was added and has no missing values.
df8.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48 entries, 0 to 47
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         48 non-null     object 
 1   consequents         48 non-null     object 
 2   antecedent support  48 non-null     float64
 3   consequent support  48 non-null     float64
 4   support             48 non-null     float64
 5   confidence          48 non-null     float64
 6   lift                48 non-null     float64
 7   leverage            48 non-null     float64
 8   conviction          48 non-null     float64
 9   zhangs_metric       48 non-null     float64
 10  antecedents_values  48 non-null     object 
 11  consequents_values  48 non-null     object 
 12  redundant           48 non-null     object 
dtypes: float64(8), object(5)
memory usage: 5.2+ KB


In [17]:
# Keep only the first rule of every group of rows that share identical
# support, confidence, lift and unordered supplier key, and renumber the
# surviving rows from 0.
dedup_columns = ['support', 'confidence', 'lift', 'redundant']
df8 = df8.drop_duplicates(subset=dedup_columns, ignore_index=True)
df8

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedents_values,consequents_values,redundant
0,(supplier_b),(supplier_a),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_b,supplier_a,"supplier_a, supplier_b"
1,(supplier_a),(supplier_c),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_a,supplier_c,"supplier_a, supplier_c"
2,(supplier_b),(supplier_c),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_b,supplier_c,"supplier_b, supplier_c"
3,(supplier_d),(supplier_e),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_d,supplier_e,"supplier_d, supplier_e"
4,(supplier_f),(supplier_d),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_f,supplier_d,"supplier_d, supplier_f"
5,(supplier_f),(supplier_e),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_f,supplier_e,"supplier_e, supplier_f"
6,(supplier_h),(supplier_g),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_h,supplier_g,"supplier_g, supplier_h"
7,(supplier_i),(supplier_g),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_i,supplier_g,"supplier_g, supplier_i"
8,(supplier_h),(supplier_i),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_h,supplier_i,"supplier_h, supplier_i"
9,(supplier_k),(supplier_j),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0,supplier_k,supplier_j,"supplier_j, supplier_k"
