In [2]:
"""
Association rule mining
"""
import os
import numpy as np
import pandas as pd

In [3]:
path = os.getcwd()
files = os.listdir(path)

In [4]:
"""

Key ideas to report in the paper
1. General statistics: Number of pipes with defects, most frequent defects 
2. Number of instances of two defects being within the same vicinity, probabilities of these occurrences
3. Support, lift, etc.

"""

'\n\nKey ideas to report in the paper\n1. General statistics: Number of pipes with defects, most frequent defects \n2. Number of instances of two defects being within the same vicinity, probabilities of these occurrences\n3. Support, lift, etc.\n\n'

In [5]:
"""
Helper functions
"""

def change_defect_names(df_cond):
    """ 
    Change the names of defects: e.g., FL => Fracture, TB => Tap
    Takes df_cond as input. 
    """
    deposit_codes = ['DAE', 'DAGS', 'DAR', 'DAZ', 'DSV', 'DSGV', 'DSC', 'DSZ', 'DNF', 'DNGV', 'DNZ']
    deformed_codes = ['DR', 'DFBR', 'DFBI', 'DFC', 'DFE', 'DTBR', 'DTBI']
    infiltration_codes = ['IS', 'ISB', 'ISJ', 'ISC', 'ISL', 'IW', 'IWB', 'IWC', 'IWJ', 'IWL', 'ID', 'IDB', 'IDC', 'IDJ', 'IDL', 'IR', 'IRB', 'IRC', 'IRJ', 'IRL', 'IG', 'IGB', 'IGC', 'IGL', 'IGJ' ]
    hole_codes = ['HSV', 'HVV']
    fracture_codes = ['FL', 'FC', 'FM', 'FS', 'FH']
    crack_codes = ['CL', 'CC', 'CM', 'CS', 'CH']
    broken_codes = ['BSV', 'BVV']
    
    df_cond.loc[df_cond['PACP_Code'].isin(deposit_codes), 'PACP_Code'] = 'Deposit'
    df_cond.loc[df_cond['PACP_Code'].isin(deformed_codes), 'PACP_Code'] = 'Deformed'
    df_cond.loc[df_cond['PACP_Code'].isin(infiltration_codes), 'PACP_Code'] = 'Infiltration'
    df_cond.loc[df_cond['PACP_Code'].isin(hole_codes), 'PACP_Code'] = 'Hole'
    df_cond.loc[df_cond['PACP_Code'].isin(fracture_codes), 'PACP_Code'] = 'Fracture'
    df_cond.loc[df_cond['PACP_Code'].isin(crack_codes), 'PACP_Code'] = 'Crack'
    df_cond.loc[df_cond['PACP_Code'].isin(broken_codes), 'PACP_Code'] = 'Broken'
    
    df_cond.loc[df_cond['PACP_Code'].str.startswith('T'), 'PACP_Code'] = 'Tap'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('A'), 'PACP_Code'] = 'Manhole'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('J'), 'PACP_Code'] = 'Joint Offset'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('R'), 'PACP_Code'] = 'Root'
    
    return df_cond

def delete_rows(df_cond, keep_defects):
    """
    Delete rows where the defects that we want to consider are not present
    """
    df_cond = df_cond[df_cond['PACP_Code'].isin(keep_defects)]
    return df_cond
    

def select_df(df, insp_id, defect_list):
    """
    Helper function used by count_defect_pairs()
    This function only keeps the rows where ['defect1', 'defect2'] are present
    It helps speed up the computation
    """
    df = df[(df['InspectionID'] == insp_id)]
    return  df[df['PACP_Code'].isin(defect_list)]

def delete_inspections_with_no_defects(df_cond, defects):
    """
    Delete those inspections where there aren't any defects at all. 
    By defects we mean fractures, cracks, etc. Not tap and manhole
    """
    insp_ids = np.unique(list(df_cond['InspectionID']))

    keep_ids = [] # Inspection IDs that we want to keep

    for insp_id in insp_ids:
        df_temp = df_cond[(df_cond['InspectionID'] == insp_id)]
        if df_temp['PACP_Code'].isin(defects).sum() > 0:
            keep_ids.append(insp_id)
    
    return df_cond[df_cond['InspectionID'].isin(keep_ids)]

def count_defect_pairs(df_cond, keep_defects, distance_threshold):
    """
    Count the number of defect pairs that are < distance_threshold within each other
    """
    defect_pair_counts = []
    
    for a in range(0, len(keep_defects)): 
        for b in range(a+1, len(keep_defects)):

            pair_count = 0
            defect_pair = [keep_defects[a], keep_defects[b]] # Pair of defects whose no. of occurrences we wish to calculate

            for insp_id in df_cond['InspectionID'].unique():
            
            # Inspection id which we are counting for
                df = select_df(df_cond, insp_id, defect_pair)
                insp_id = list(df['InspectionID'])
                dist = list(df['Distance'])
                defects = list(df['PACP_Code']) # Defects which are selected from a particular inspection

                for i in range (0, len(defects) - 1):
                    for j in range (i+1, len(defects)):
                        defect_1, defect_2 = defects[i], defects[j]
                        if (defect_1 == defect_pair[0] and defect_2 == defect_pair[1]) or (defect_2 == defect_pair[0] and defect_1 == defect_pair[1]):
                            if abs(dist[i] - dist[j]) <= distance_threshold:
                                pair_count = pair_count + 1

            defect_pair_counts.append([keep_defects[a], keep_defects[b], pair_count])
            
    return defect_pair_counts

def calculate_length_of_pipeline(df_cond):
    # Find total length of pipeline
    insps = df_cond['InspectionID'].unique()
    length = 0.0
    for insp in insps:
        df_temp = df_cond[df_cond['InspectionID'] == insp]
        length += df_temp['Distance'].max()
    return length
   

def create_econometric_database(df_insp, cols, defects):
    df_insp = df_insp[cols]

    f, c, t, m, j, r, d, b = [], [], [], [], [], [], [], []

    insp_ids = list(df_insp['InspectionID'])

    for insp_id in insp_ids:

        df_temp = df_cond.loc[df_cond['InspectionID'] == insp_id]

        f.append(len(df_temp[df_temp['PACP_Code'] == 'Fracture']))
        c.append(len(df_temp[df_temp['PACP_Code'] == 'Crack']))
        t.append(len(df_temp[df_temp['PACP_Code'] == 'Tap']))
        m.append(len(df_temp[df_temp['PACP_Code'] == 'Manhole']))
        j.append(len(df_temp[df_temp['PACP_Code'] == 'Joint Offset']))
        r.append(len(df_temp[df_temp['PACP_Code'] == 'Root']))
        d.append(len(df_temp[df_temp['PACP_Code'] == 'Deposit']))
        b.append(len(df_temp[df_temp['PACP_Code'] == 'Broken']))

    df_insp['Fracture'] = f
    df_insp['Crack'] = c
    df_insp['Tap'] = t
    df_insp['Manhole'] = m
    df_insp['Joint Offset'] = j
    df_insp['Root'] = r
    df_insp['Deposit'] = d
    df_insp['Broken'] = b
    
    return df_insp

In [255]:
cluster_size = 10

# Read the database of condition information from CSV
df_cond = pd.read_csv('Conditions_Combined.csv', sep=',')
df_cond.head();
print("Total number of inspections to begin with are: {}".format(df_cond['InspectionID'].max()))

# Change names of defects
df_cond = change_defect_names(df_cond)

# Keep only the rows which contain these defects. Delete all else
keep_defects = ['Fracture', 'Crack', 'Tap', 'Manhole', 'Joint Offset', 'Root', 'Deposit', 'Broken']
df_cond = delete_rows(df_cond, keep_defects)

# Delete inspections which have no 'defects', i.e, fracture, crack, root, joint offset, deposit, broken
defects = ['Fracture', 'Crack', 'Joint Offset', 'Root', 'Deposit', 'Broken']
# df_cond = delete_inspections_with_no_defects(df_cond, defects)
# print("Length : {}".format(calculate_length_of_pipeline(df_cond)))


# Count the number of inspections that remain after the deletion
# print("\nNumber of inspections which contain defects is: {}".format(df_cond['InspectionID'].nunique()))

# List the number of defects
print("\nNumber of defects is as follows: \n{}".format(df_cond['PACP_Code'].value_counts()))

"""
Counting only Fracture and Tap
Delete all inspections which do not have both Fracture and Tap
"""

keep_defects = ['Joint Offset', 'Fracture']

# Deletes those inspections which don't have defect1 or defect2
df_cond = delete_inspections_with_no_defects(df_cond, [keep_defects[0], keep_defects[1]])

# Count the number of inspections that remain after the deletion
print("\nNumber of inspections with {} and {} is: {}".format(keep_defects[0], keep_defects[1], df_cond['InspectionID'].nunique()))

# List the number of defects
print("\nNumber of defects in these inspections is as follows: \n{}".format(df_cond['PACP_Code'].value_counts()))

# Counting the number of defect clusters within 5 feet
defect_pair_counts = count_defect_pairs(df_cond, keep_defects, cluster_size)

print('\nNumber of ({}, {}) clusters is: {}'.format(keep_defects[0], keep_defects[1], defect_pair_counts[0][2]))

# Count the total length of pipe in this dataframe
length = calculate_length_of_pipeline(df_cond)

print("Length : {}".format(length))

Total number of inspections to begin with are: 8891

Number of defects is as follows: 
Tap             29688
Manhole         17167
Crack            9197
Fracture         8558
Deposit          8003
Root             7943
Joint Offset     1943
Broken            458
Name: PACP_Code, dtype: int64

Number of inspections with Joint Offset and Fracture is: 2039

Number of defects in these inspections is as follows: 
Tap             10244
Fracture         8558
Crack            7291
Deposit          4008
Root             3983
Manhole          3793
Joint Offset     1943
Broken            336
Name: PACP_Code, dtype: int64

Number of (Joint Offset, Fracture) clusters is: 718
Length : 419601.8005263697


In [256]:
num_defect1 = df_cond['PACP_Code'].value_counts()[keep_defects[0]]
num_defect2 = df_cond['PACP_Code'].value_counts()[keep_defects[1]]
print("Number of {} is {}".format(keep_defects[0], num_defect1))
print("Number of {} is {}".format(keep_defects[1], num_defect2))

spacing1 = length/num_defect1
spacing2 = length/num_defect2

print("Length: {}".format(length))
print("Average {} spacing: {} feet".format(keep_defects[0], spacing1))
print("Average {} spacing: {} feet".format(keep_defects[1], spacing2))

Number of Joint Offset is 1943
Number of Fracture is 8558
Length: 419601.8005263697
Average Joint Offset spacing: 215.955635885934 feet
Average Fracture spacing: 49.03035762168377 feet


In [257]:
# If Taps and Fractures were uniformly distributed, calculate the number of co-occurrences
import numpy as np

defect1 = [[keep_defects[0], i, 1] for i in np.arange(0, length, spacing1)]
defect2 = [[keep_defects[1], i, 1] for i in np.arange(0, length, spacing2)]
df_cond_fake = pd.DataFrame(defect1+defect2, columns = ['PACP_Code', 'Distance', 'InspectionID'])
defect_pair_counts = count_defect_pairs(df_cond_fake, [keep_defects[0], keep_defects[1]], cluster_size)
print('\nNumber of ({}, {}) clusters is: {}'.format(keep_defects[0], keep_defects[1], defect_pair_counts[0][2]))


Number of (Joint Offset, Fracture) clusters is: 793


In [258]:
"""
Count number of defect hotspots

TODO: Compare with uiform distribution/Weibull distribution/Try fitting curve

"""

'\nCount number of defect hotspots\n\nTODO: Compare with uiform distribution/Weibull distribution/Try fitting curve\n\n'

In [50]:
# Defect Codes

deposit_codes = ['DAE', 'DAGS', 'DAR', 'DAZ', 'DSV', 'DSGV', 'DSC', 'DSZ', 'DNF', 'DNGV', 'DNZ']
deformed_codes = ['DR', 'DFBR', 'DFBI', 'DFC', 'DFE', 'DTBR', 'DTBI']
infiltration_codes = ['IS', 'ISB', 'ISJ', 'ISC', 'ISL', 'IW', 'IWB', 'IWC', 'IWJ', 'IWL', 'ID', 'IDB', 'IDC', 'IDJ', 'IDL', 'IR', 'IRB', 'IRC', 'IRJ', 'IRL', 'IG', 'IGB', 'IGC', 'IGL', 'IGJ' ]
hole_codes = ['HSV', 'HVV']
fracture_codes = ['FL', 'FC', 'FM', 'FS', 'FH', 'FH2', 'FH3', 'FH4']
crack_codes = ['CL', 'CC', 'CM', 'CS', 'CH', 'CH2', 'CH3', 'CH4']
broken_codes = ['BSV', 'BVV']
tap_codes = ['TB', 'TBI', 'TBD', 'TBC', 'TBA', 'TF', 'TFI', 'TFD', 'TFC', 'TFA', 'TFB', 'TR', 'TRI', 'TRD', 'TRC', 'TRA', 'TRB', 'TS', 'TSI', 'TSD', 'TSA', 'TSB']
root_codes = ['RFB', 'RFL', 'RFC', 'RFJ', 'RMB', 'RML', 'RMC', 'RMJ', 'RBB', 'RBL', 'RBC', 'RBJ', 'RTB', 'RTL', 'RTC', 'RTJ']
joint_offset_codes = ['JOS', 'JOM', 'JOL', 'JOSD', 'JOMD', 'JOLD', 'JSS', 'JSM', 'JSL', 'JAS', 'JAM', 'JAL']
keep_defects = deposit_codes+deformed_codes+infiltration_codes+hole_codes+fracture_codes+crack_codes+broken_codes+root_codes+joint_offset_codes


In [51]:
# Count number of defect zones
df_cond = pd.read_csv('Conditions_Combined.csv', sep=',')
df_cond.head();
print("Total number of inspections to begin with are: {}".format(df_cond['InspectionID'].nunique()))

# Change names of defects
# df_cond = change_defect_names(df_cond)

# Keep only the rows which contain these defects. Delete all else
# keep_defects = ['Fracture', 'Crack', 'Joint Offset', 'Root', 'Deposit', 'Broken', 'Hole', 'Infiltration', 'Deformed']


df_cond = delete_rows(df_cond, keep_defects)

# Delete inspections which have no 'defects', i.e, fracture, crack, root, joint offset, deposit, broken
df_cond = delete_inspections_with_no_defects(df_cond, keep_defects)

print("Number of inspections after deletion are: {}".format(df_cond['InspectionID'].nunique()))

Total number of inspections to begin with are: 8891
Number of inspections after deletion are: 4753


In [52]:
thresh = 10000

insps = df_cond['InspectionID'].unique()
zones = []

for insp in insps:
    
    df_temp = df_cond[df_cond['InspectionID'] == insp]
    df_temp = df_temp.sort_values(by=["Distance"])
    indices = df_temp.index
    defect_prev, defect_curr = "",""
    dist_prev, dist_curr = 0, 0
    zone_curr = []
    
    for index in indices:
        defect_curr = df_temp.at[index, 'PACP_Code'] # Defect code at current index
        dist_curr = float(df_temp.at[index, 'Distance']) # Distance of defect at current index        
    
        if abs(dist_curr - dist_prev) < thresh:
            zone_curr.append((insp, defect_curr, dist_curr))
        else:           
            zones.append(zone_curr)
            zone_curr = []
            zone_curr.append((insp, defect_curr, dist_curr))
                        
        dist_prev = dist_curr
        defect_prev = defect_curr
        
    zones.append(zone_curr)

In [53]:
len(zones)
zones[0:10]

[[(1, 'JOL', 11.10000038)],
 [(2, 'DAGS', 0.10000000099999999), (2, 'DAGS', 37.90000153)],
 [(4, 'DAGS', 1.0),
  (4, 'DAGS', 57.29999924),
  (4, 'DAE', 57.29999924),
  (4, 'DAGS', 77.09999847),
  (4, 'DAGS', 121.90000149999999),
  (4, 'DAGS', 129.5)],
 [(5, 'IR', 9.600000381000001),
  (5, 'DAGS', 12.80000019),
  (5, 'IR', 24.60000038),
  (5, 'DAE', 27.60000038),
  (5, 'IR', 93.59999847),
  (5, 'IR', 111.5),
  (5, 'DAE', 122.6999969)],
 [(6, 'DAE', 13.69999981),
  (6, 'FM', 21.10000038),
  (6, 'FL', 75.5),
  (6, 'IR', 168.1999969),
  (6, 'DAE', 236.60000609999997)],
 [(11, 'DAGS', 10.80000019), (11, 'DAGS', 127.8000031)],
 [(19, 'DAGS', 77.30000305), (19, 'DAE', 97.90000153)],
 [(22, 'DAE', 45.90000153),
  (22, 'IR', 45.90000153),
  (22, 'DAE', 78.90000153),
  (22, 'RFJ', 239.3000031)],
 [(23, 'RFJ', 36.40000153)],
 [(24, 'FL', 101.4000015)]]

In [54]:
# Delete empty zones
zones = list(filter(lambda a: a!=[], zones))

In [55]:
"""
Calculate number of colocated defects
"""

colocated = 0
total = 0
max_colocated = 0

for zone in zones:
    if len(zone) > 1:
        colocated += len(zone)
        if len(zone) > max_colocated:
            max_colocated = len(zone)
            max_zone = zone
    total += len(zone)
    
print(f'Number of colocated defects is {colocated}')
print(f'Total number of defects is {total}')
print(f'Max defects in a zone is {max_colocated}')

Number of colocated defects is 36637
Total number of defects is 37573
Max defects in a zone is 118


In [56]:
max_zone

[(3209, 'JOM', 2.0),
 (3209, 'DAGS', 25.10000038),
 (3209, 'CC', 30.20000076),
 (3209, 'CL', 41.90000153),
 (3209, 'FC', 54.09999847),
 (3209, 'DAGS', 62.90000153),
 (3209, 'DAGS', 66.80000305),
 (3209, 'DAGS', 73.0),
 (3209, 'CL', 76.80000305),
 (3209, 'DAGS', 78.40000153),
 (3209, 'CL', 83.0),
 (3209, 'IR', 83.0),
 (3209, 'CL', 83.0),
 (3209, 'DAGS', 83.09999847),
 (3209, 'DAGS', 85.30000305),
 (3209, 'DAGS', 87.40000153),
 (3209, 'CL', 99.19999695),
 (3209, 'DAGS', 99.19999695),
 (3209, 'CL', 100.3000031),
 (3209, 'CL', 101.8000031),
 (3209, 'DAGS', 105.09999850000001),
 (3209, 'DAGS', 117.8000031),
 (3209, 'CL', 126.5),
 (3209, 'CL', 126.5),
 (3209, 'DAGS', 154.0),
 (3209, 'FC', 163.1999969),
 (3209, 'DAGS', 172.3000031),
 (3209, 'DAGS', 178.1999969),
 (3209, 'CL', 184.6999969),
 (3209, 'IS', 184.6999969),
 (3209, 'CL', 190.8000031),
 (3209, 'DAGS', 199.5),
 (3209, 'CL', 201.3999939),
 (3209, 'CL', 205.6000061),
 (3209, 'IS', 205.6000061),
 (3209, 'CL', 206.0),
 (3209, 'CL', 209.19

In [13]:
"""
Calculate the number of zones of various lengths
"""

num_zones = {i:0 for i in range(1, max_colocated + 1)}
for zone in zones:
    if len(zone) >= 1:
        num_zones[len(zone)] += 1

In [14]:
num_zones

{1: 806,
 2: 802,
 3: 495,
 4: 391,
 5: 293,
 6: 243,
 7: 198,
 8: 133,
 9: 150,
 10: 114,
 11: 82,
 12: 89,
 13: 63,
 14: 63,
 15: 46,
 16: 48,
 17: 34,
 18: 45,
 19: 32,
 20: 29,
 21: 39,
 22: 38,
 23: 32,
 24: 24,
 25: 32,
 26: 19,
 27: 13,
 28: 15,
 29: 14,
 30: 22,
 31: 8,
 32: 18,
 33: 10,
 34: 12,
 35: 11,
 36: 15,
 37: 18,
 38: 8,
 39: 13,
 40: 7,
 41: 1,
 42: 8,
 43: 8,
 44: 4,
 45: 5,
 46: 8,
 47: 9,
 48: 1,
 49: 7,
 50: 3,
 51: 5,
 52: 6,
 53: 3,
 54: 7,
 55: 3,
 56: 1,
 57: 3,
 58: 1,
 59: 0,
 60: 1,
 61: 2,
 62: 0,
 63: 0,
 64: 0,
 65: 0,
 66: 0,
 67: 4,
 68: 1,
 69: 0,
 70: 2,
 71: 3,
 72: 2,
 73: 0,
 74: 1,
 75: 0,
 76: 2,
 77: 0,
 78: 0,
 79: 0,
 80: 0,
 81: 2,
 82: 0,
 83: 2,
 84: 0,
 85: 0,
 86: 0,
 87: 1,
 88: 0,
 89: 0,
 90: 0,
 91: 1,
 92: 0,
 93: 0,
 94: 2,
 95: 0,
 96: 0,
 97: 0,
 98: 0,
 99: 0,
 100: 0,
 101: 0,
 102: 0,
 103: 0,
 104: 0,
 105: 2,
 106: 0,
 107: 0,
 108: 0,
 109: 0,
 110: 0,
 111: 0,
 112: 0,
 113: 0,
 114: 0,
 115: 0,
 116: 0,
 117: 0,
 118: 2}

In [57]:
"""
Most frequent defect in colocated zone
"""
defects = {defect_code:0 for defect_code in keep_defects}
for zone in zones:
    if len(zone) > 1:
        for defect in zone:
            defects[defect[1]] += 1 

In [58]:
defects

{'DAE': 3084,
 'DAGS': 4325,
 'DAR': 11,
 'DAZ': 153,
 'DSV': 0,
 'DSGV': 56,
 'DSC': 106,
 'DSZ': 64,
 'DNF': 9,
 'DNGV': 8,
 'DNZ': 2,
 'DR': 0,
 'DFBR': 0,
 'DFBI': 0,
 'DFC': 0,
 'DFE': 0,
 'DTBR': 0,
 'DTBI': 0,
 'IS': 486,
 'ISB': 0,
 'ISJ': 0,
 'ISC': 0,
 'ISL': 0,
 'IW': 249,
 'IWB': 0,
 'IWC': 0,
 'IWJ': 0,
 'IWL': 0,
 'ID': 169,
 'IDB': 0,
 'IDC': 0,
 'IDJ': 0,
 'IDL': 0,
 'IR': 405,
 'IRB': 0,
 'IRC': 0,
 'IRJ': 0,
 'IRL': 0,
 'IG': 103,
 'IGB': 0,
 'IGC': 0,
 'IGL': 0,
 'IGJ': 0,
 'HSV': 303,
 'HVV': 124,
 'FL': 1332,
 'FC': 1372,
 'FM': 4234,
 'FS': 1506,
 'FH': 0,
 'FH2': 13,
 'FH3': 27,
 'FH4': 26,
 'CL': 3268,
 'CC': 1815,
 'CM': 3182,
 'CS': 751,
 'CH': 0,
 'CH2': 12,
 'CH3': 14,
 'CH4': 10,
 'BSV': 305,
 'BVV': 125,
 'RFB': 242,
 'RFL': 365,
 'RFC': 285,
 'RFJ': 4841,
 'RMB': 32,
 'RML': 76,
 'RMC': 94,
 'RMJ': 612,
 'RBB': 25,
 'RBL': 119,
 'RBC': 53,
 'RBJ': 116,
 'RTB': 3,
 'RTL': 21,
 'RTC': 31,
 'RTJ': 200,
 'JOS': 0,
 'JOM': 1590,
 'JOL': 180,
 'JOSD': 0,
 'JOMD

In [17]:
"""
Finding severity of defect clusters and finding length of these clusters.
Will allow for the best patch repair

"""

'\nFinding severity of defect clusters and finding length of these clusters.\nWill allow for the best patch repair\n\n'

In [18]:
"""
Interdefect association rule mining
"""

'\nInterdefect association rule mining\n'

In [59]:
# Create matrix: https://towardsdatascience.com/association-rule-mining-be4122fc1793
import numpy as np

co_matrix = []

for zone in zones:
    row = np.zeros(len(keep_defects))
    for _, defect, _ in zone:
        defect_index = keep_defects.index(defect)
        row[defect_index] = 1
    co_matrix.append(row)

In [60]:
zones[:5]

[[(1, 'JOL', 11.10000038)],
 [(2, 'DAGS', 0.10000000099999999), (2, 'DAGS', 37.90000153)],
 [(4, 'DAGS', 1.0),
  (4, 'DAGS', 57.29999924),
  (4, 'DAE', 57.29999924),
  (4, 'DAGS', 77.09999847),
  (4, 'DAGS', 121.90000149999999),
  (4, 'DAGS', 129.5)],
 [(5, 'IR', 9.600000381000001),
  (5, 'DAGS', 12.80000019),
  (5, 'IR', 24.60000038),
  (5, 'DAE', 27.60000038),
  (5, 'IR', 93.59999847),
  (5, 'IR', 111.5),
  (5, 'DAE', 122.6999969)],
 [(6, 'DAE', 13.69999981),
  (6, 'FM', 21.10000038),
  (6, 'FL', 75.5),
  (6, 'IR', 168.1999969),
  (6, 'DAE', 236.60000609999997)]]

In [61]:
len(co_matrix)

4753

In [68]:
co_matrix[:2]

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.])]

In [67]:
len(co_matrix[0]) == len(keep_defects)

True

In [69]:
def count_pairs(x, y, keep_defects, co_matrix):
    count_x, count_y, count_xy = 0, 0, 0
    for row in co_matrix:
        if row[x] == 1 and row[y] == 1.0:
            count_xy += 1
        if row[x] == 1.0:
            count_x += 1
        if row[y] == 1.0:
            count_y += 1
            
    return count_x, count_y, count_xy

In [72]:
def support_confidence_xy(x, y, keep_defects, co_matrix):
    count_x, count_y, count_xy = count_pairs(x, y, keep_defects, co_matrix)
    support_xy = count_xy/len(co_matrix)
    support_x = count_x/len(co_matrix)
    support_y = count_y/len(co_matrix)
    if count_x ==0 or count_y == 0 or count_xy ==0:
        confidence_xy = 0
    else:
        confidence_xy = count_xy/count_x
    return support_x, support_y, support_xy, confidence_xy

In [80]:
min_support = 0.01
min_confidence = 0.50
rules = []

for x in range(0, len(keep_defects)):
    for y in range(0, len(keep_defects)):
        if x!=y:
            support_x, support_y, support_xy, confidence_xy = support_confidence_xy(x, y, keep_defects, co_matrix)
            if support_xy > min_support and confidence_xy > min_confidence:
                lift_xy = support_xy/(support_x*support_y)
                rules.append([keep_defects[x], keep_defects[y], support_xy, confidence_xy, lift_xy])


In [81]:
rules.sort(key=lambda tup: tup[4], reverse=True)  # sorts in place by confidence
rules[:50]

[['RBJ', 'RMJ', 0.011150852093414685, 0.5047619047619047, 6.59102564102564],
 ['CS', 'FS', 0.04376183463075952, 0.5842696629213483, 5.679005537556582],
 ['JOL', 'JOM', 0.013044393014937934, 0.5961538461538461, 4.6835028607755875],
 ['FL', 'FM', 0.08920681674731748, 0.7464788732394366, 3.0692163360787563],
 ['HSV', 'FM', 0.024405638544077426, 0.7204968944099379, 2.962389047690688],
 ['FS', 'FM', 0.07237534188933305, 0.7034764826175869, 2.8924080639112373],
 ['RMC', 'RFJ', 0.01409636019356196, 0.7976190476190477, 2.8807624113475176],
 ['RFB', 'RFJ', 0.017462655165158845, 0.7757009345794392, 2.8016007158480805],
 ['RTJ', 'RFJ', 0.020618556701030927, 0.765625, 2.7652094414893615],
 ['HVV', 'CL', 0.010730065221965075, 0.6219512195121951, 2.755017843747869],
 ['RMJ', 'RFJ', 0.057647801388596674, 0.7527472527472527, 2.718698854337152],
 ['CM', 'CL', 0.11129812749842205, 0.6101499423298731, 2.702742475203995],
 ['RFB', 'FM', 0.014727540500736377, 0.6542056074766355, 2.689826342851599],
 ['HVV'

In [182]:
""" 

Interesting Rules from Association Rules on Pipe-Level 

['JOL', 'FM', 0.011782032400589101, 0.5384615384615384, 2.213933989885547]
['RFB', 'FM', 0.014727540500736377, 0.6542056074766355, 2.689826342851599],
['RFB', 'CL', 0.011782032400589101, 0.5233644859813084, 2.318314447221956],

"""

(0.2755712636035973,
 0.25363054204430135,
 0.02834447282779858,
 0.10285714285714286)

In [78]:
num_rows = 0
for row in co_matrix:
    if np.count_nonzero(row == 1) > 1:
        num_rows += 1
        
print(f'Number of rows with more than 1 defect is {num_rows}')

Number of rows with more than 1 defect is 3995


In [102]:
lift = support/(support_x*support_y)
lift

1.0530775826485135

In [181]:
support

0.07973102785782901

In [36]:
"""
Make a database with external factors for econometric analysis
"""
# Read the inspections database
df_insp = pd.read_csv('Inspections_Hazen_Sawyer.csv')

# Columns that we want to keep
cols = ['InspectionID', 'Height', 'Down_Rim_to_Invert', 'Up_Rim_to_Invert', 'Height', 'Total_Length', 'Location_Code', 'Location_Details', 'Material', 'Weather', 'Street']

# Defect pairs to keep
keep_defects = ['Fracture', 'Crack', 'Tap', 'Manhole', 'Joint Offset', 'Root', 'Deposit', 'Broken']

df_insp = create_econometric_database(df_insp, cols, keep_defects)

df_insp.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin