In [1]:
"""
Association rule mining
"""
import os
import numpy as np
import pandas as pd

In [2]:
path = os.getcwd()
files = os.listdir(path)

In [3]:
"""

Key ideas to report in the paper
1. General statistics: Number of pipes with defects, most frequent defects 
2. Number of instances of two defects being within the same vicinity, probabilities of these occurrences
3. Support, lift, etc.

"""

'\n\nKey ideas to report in the paper\n1. General statistics: Number of pipes with defects, most frequent defects \n2. Number of instances of two defects being within the same vicinity, probabilities of these occurrences\n3. Support, lift, etc.\n\n'

In [3]:
"""
Helper functions
"""

def change_defect_names(df_cond):
    """ 
    Change the names of defects: e.g., FL => Fracture, TB => Tap
    Takes df_cond as input. 
    """
    deposit_codes = ['DAE', 'DAGS', 'DAR', 'DAZ', 'DSV', 'DSGV', 'DSC', 'DSZ', 'DNF', 'DNGV', 'DNZ']
    deformed_codes = ['DR', 'DFBR', 'DFBI', 'DFC', 'DFE', 'DTBR', 'DTBI']
    infiltration_codes = ['IS', 'ISB', 'ISJ', 'ISC', 'ISL', 'IW', 'IWB', 'IWC', 'IWJ', 'IWL', 'ID', 'IDB', 'IDC', 'IDJ', 'IDL', 'IR', 'IRB', 'IRC', 'IRJ', 'IRL', 'IG', 'IGB', 'IGC', 'IGL', 'IGJ' ]
    hole_codes = ['HSV', 'HVV']
    fracture_codes = ['FL', 'FC', 'FM', 'FS', 'FH']
    crack_codes = ['CL', 'CC', 'CM', 'CS', 'CH']
    broken_codes = ['BSV', 'BVV']
    collapse_codes = ['X']
    
    df_cond.loc[df_cond['PACP_Code'].isin(deposit_codes), 'PACP_Code'] = 'Deposit'
    df_cond.loc[df_cond['PACP_Code'].isin(deformed_codes), 'PACP_Code'] = 'Deformed'
    df_cond.loc[df_cond['PACP_Code'].isin(infiltration_codes), 'PACP_Code'] = 'Infiltration'
    df_cond.loc[df_cond['PACP_Code'].isin(hole_codes), 'PACP_Code'] = 'Hole'
    df_cond.loc[df_cond['PACP_Code'].isin(fracture_codes), 'PACP_Code'] = 'Fracture'
    df_cond.loc[df_cond['PACP_Code'].isin(crack_codes), 'PACP_Code'] = 'Crack'
    df_cond.loc[df_cond['PACP_Code'].isin(broken_codes), 'PACP_Code'] = 'Broken'
    df_cond.loc[df_cond['PACP_Code'].isin(collapse_codes), 'PACP_Code'] = 'Collapse'
    
    df_cond.loc[df_cond['PACP_Code'].str.startswith('T'), 'PACP_Code'] = 'Tap'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('A'), 'PACP_Code'] = 'Manhole'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('J'), 'PACP_Code'] = 'Joint Offset'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('R'), 'PACP_Code'] = 'Root'
    
    return df_cond

def delete_rows(df_cond, keep_defects):
    """
    Delete rows where the defects that we want to consider are not present
    """
    df_cond = df_cond[df_cond['PACP_Code'].isin(keep_defects)]
    return df_cond
    

def select_df(df, insp_id, defect_list):
    """
    Helper function used by count_defect_pairs()
    This function only keeps the rows where ['defect1', 'defect2'] are present
    It helps speed up the computation
    """
    df = df[(df['InspectionID'] == insp_id)]
    return  df[df['PACP_Code'].isin(defect_list)]

def delete_inspections_with_no_defects(df_cond, defects):
    """
    Delete those inspections where there aren't any defects at all. 
    By defects we mean fractures, cracks, etc. Not tap and manhole
    """
    insp_ids = np.unique(list(df_cond['InspectionID']))

    keep_ids = [] # Inspection IDs that we want to keep

    for insp_id in insp_ids:
        df_temp = df_cond[(df_cond['InspectionID'] == insp_id)]
        if df_temp['PACP_Code'].isin(defects).sum() > 0:
            keep_ids.append(insp_id)
    
    return df_cond[df_cond['InspectionID'].isin(keep_ids)]

def count_defect_pairs(df_cond, keep_defects, distance_threshold):
    """
    Count the number of defect pairs that are < distance_threshold within each other
    """
    defect_pair_counts = []
    
    for a in range(0, len(keep_defects)): 
        for b in range(a+1, len(keep_defects)):

            pair_count = 0
            defect_pair = [keep_defects[a], keep_defects[b]] # Pair of defects whose no. of occurrences we wish to calculate

            for insp_id in df_cond['InspectionID'].unique():
            
            # Inspection id which we are counting for
                df = select_df(df_cond, insp_id, defect_pair)
                insp_id = list(df['InspectionID'])
                dist = list(df['Distance'])
                defects = list(df['PACP_Code']) # Defects which are selected from a particular inspection

                for i in range (0, len(defects) - 1):
                    for j in range (i+1, len(defects)):
                        defect_1, defect_2 = defects[i], defects[j]
                        if (defect_1 == defect_pair[0] and defect_2 == defect_pair[1]) or (defect_2 == defect_pair[0] and defect_1 == defect_pair[1]):
                            if abs(dist[i] - dist[j]) <= distance_threshold:
                                pair_count = pair_count + 1

            defect_pair_counts.append([keep_defects[a], keep_defects[b], pair_count])
            
    return defect_pair_counts


def calculate_length_of_pipeline(df_cond):
    # Find total length of pipeline
    insps = df_cond['InspectionID'].unique()
    length = 0.0
    for insp in insps:
        df_temp = df_cond[df_cond['InspectionID'] == insp]
        length += df_temp['Distance'].max()
    return length


def create_econometric_database(df_insp, cols, defects):
    df_insp = df_insp[cols]

    f, c, t, m, j, r, d, b, h, cp = [], [], [], [], [], [], [], [], [], []

    insp_ids = list(df_insp['InspectionID'])

    for insp_id in insp_ids:

        df_temp = df_cond.loc[df_cond['InspectionID'] == insp_id]

        f.append(len(df_temp[df_temp['PACP_Code'] == 'Fracture']))
        c.append(len(df_temp[df_temp['PACP_Code'] == 'Crack']))
        t.append(len(df_temp[df_temp['PACP_Code'] == 'Tap']))
        m.append(len(df_temp[df_temp['PACP_Code'] == 'Manhole']))
        j.append(len(df_temp[df_temp['PACP_Code'] == 'Joint Offset']))
        r.append(len(df_temp[df_temp['PACP_Code'] == 'Root']))
        d.append(len(df_temp[df_temp['PACP_Code'] == 'Deposit']))
        b.append(len(df_temp[df_temp['PACP_Code'] == 'Broken']))
        h.append(len(df_temp[df_temp['PACP_Code'] == 'Hole']))
        cp.append(len(df_temp[df_temp['PACP_Code'] == 'Collapse']))

    df_insp['Fracture'] = f
    df_insp['Crack'] = c
    df_insp['Tap'] = t
    df_insp['Manhole'] = m
    df_insp['Joint Offset'] = j
    df_insp['Root'] = r
    df_insp['Deposit'] = d
    df_insp['Broken'] = b
    df_insp['Hole'] = h
    df_insp['Collapse'] = cp
    
    return df_insp

In [4]:
cluster_size = 1

# Read the database of condition information from CSV
df_cond = pd.read_csv('Conditions_Taylors.csv', sep=',')
df_cond.head();
print("Total number of inspections to begin with are: {}".format(df_cond['InspectionID'].max()))

# Change names of defects
df_cond = change_defect_names(df_cond)

# Keep only the rows which contain these defects. Delete all else
keep_defects_struct = ['Fracture', 'Crack', 'Joint Offset', 'Broken', 'Hole', 'Collapse']
keep_defects_operat = ['Root', 'Deposit']

df_cond = delete_rows(df_cond, keep_defects_struct)

df_cond = delete_inspections_with_no_defects(df_cond, keep_defects_struct)
print("Length : {}".format(calculate_length_of_pipeline(df_cond)))


# Count the number of inspections that remain after the deletion
print("\nNumber of inspections which contain defects is: {}".format(df_cond['InspectionID'].nunique()))

# List the number of defects
print("\nNumber of defects is as follows: \n{}".format(df_cond['PACP_Code'].value_counts()))

"""
Counting only Fracture and Tap
Delete all inspections which do not have both Fracture and Tap
"""

keep_defects = ['Joint Offset', 'Fracture']

# Deletes those inspections which don't have defect1 or defect2
df_cond = delete_inspections_with_no_defects(df_cond, [keep_defects[0], keep_defects[1]])

# Count the number of inspections that remain after the deletion
print("\nNumber of inspections with {} and {} is: {}".format(keep_defects[0], keep_defects[1], df_cond['InspectionID'].nunique()))

# List the number of defects
print("\nNumber of defects in these inspections is as follows: \n{}".format(df_cond['PACP_Code'].value_counts()))

# Counting the number of defect clusters within 5 feet
defect_pair_counts = count_defect_pairs(df_cond, keep_defects, cluster_size)

print('\nNumber of ({}, {}) clusters is: {}'.format(keep_defects[0], keep_defects[1], defect_pair_counts[0][2]))

# Count the total length of pipe in this dataframe
length = calculate_length_of_pipeline(df_cond)

print("Length : {}".format(length))

Total number of inspections to begin with are: 3618
Length : 108581.10015941903

Number of inspections which contain defects is: 768

Number of defects is as follows: 
Crack           789
Fracture        354
Broken          172
Joint Offset    144
Hole             34
Name: PACP_Code, dtype: int64

Number of inspections with Joint Offset and Fracture is: 352

Number of defects in these inspections is as follows: 
Fracture        354
Crack           225
Joint Offset    144
Broken           64
Hole             10
Name: PACP_Code, dtype: int64

Number of (Joint Offset, Fracture) clusters is: 4
Length : 52995.300032654


In [6]:
num_defect1 = df_cond['PACP_Code'].value_counts()[keep_defects[0]]
num_defect2 = df_cond['PACP_Code'].value_counts()[keep_defects[1]]
print("Number of {} is {}".format(keep_defects[0], num_defect1))
print("Number of {} is {}".format(keep_defects[1], num_defect2))

spacing1 = length/num_defect1
spacing2 = length/num_defect2

print("Length: {}".format(length))
print("Average {} spacing: {} feet".format(keep_defects[0], spacing1))
print("Average {} spacing: {} feet".format(keep_defects[1], spacing2))

Number of Joint Offset is 1943
Number of Fracture is 8558
Length: 419601.8005263697
Average Joint Offset spacing: 215.955635885934 feet
Average Fracture spacing: 49.03035762168377 feet


In [7]:
# If Taps and Fractures were uniformly distributed, calculate the number of co-occurrences
import numpy as np

defect1 = [[keep_defects[0], i, 1] for i in np.arange(0, length, spacing1)]
defect2 = [[keep_defects[1], i, 1] for i in np.arange(0, length, spacing2)]
df_cond_fake = pd.DataFrame(defect1+defect2, columns = ['PACP_Code', 'Distance', 'InspectionID'])
defect_pair_counts = count_defect_pairs(df_cond_fake, [keep_defects[0], keep_defects[1]], cluster_size)
print('\nNumber of ({}, {}) clusters is: {}'.format(keep_defects[0], keep_defects[1], defect_pair_counts[0][2]))


Number of (Joint Offset, Fracture) clusters is: 793


In [5]:
"""
Count number of defect hotspots

TODO: Compare with uiform distribution/Weibull distribution/Try fitting curve

"""

'\nCount number of defect hotspots\n\nTODO: Compare with uiform distribution/Weibull distribution/Try fitting curve\n\n'

In [6]:
# Helper functions

def delete_rows(df_cond, keep_defects):
    """
    Delete rows where the defects that we want to consider are not present
    """
    df_cond = df_cond[df_cond['PACP_Code'].isin(keep_defects)]
    return df_cond
    

def select_df(df, insp_id, defect_list):
    """
    Helper function used by count_defect_pairs()
    This function only keeps the rows where ['defect1', 'defect2'] are present
    It helps speed up the computation
    """
    df = df[(df['InspectionID'] == insp_id)]
    return  df[df['PACP_Code'].isin(defect_list)]

In [7]:
# Defect Codes

deposit_codes = ['DAE', 'DAGS', 'DAR', 'DAZ', 'DSV', 'DSGV', 'DSC', 'DSZ', 'DNF', 'DNGV', 'DNZ']
deformed_codes = ['DR', 'DFBR', 'DFBI', 'DFC', 'DFE', 'DTBR', 'DTBI']
infiltration_codes = ['IS', 'ISB', 'ISJ', 'ISC', 'ISL', 'IW', 'IWB', 'IWC', 'IWJ', 'IWL', 'ID', 'IDB', 'IDC', 'IDJ', 'IDL', 'IR', 'IRB', 'IRC', 'IRJ', 'IRL', 'IG', 'IGB', 'IGC', 'IGL', 'IGJ' ]
hole_codes = ['HSV', 'HVV']
fracture_codes = ['FL', 'FC', 'FM', 'FS', 'FH', 'FH2', 'FH3', 'FH4']
crack_codes = ['CL', 'CC', 'CM', 'CS', 'CH', 'CH2', 'CH3', 'CH4']
broken_codes = ['BSV', 'BVV']
collapse_codes = ['X']

tap_codes = ['TB', 'TBI', 'TBD', 'TBC', 'TBA', 'TF', 'TFI', 'TFD', 'TFC', 'TFA', 'TFB', 'TR', 'TRI', 'TRD', 'TRC', 'TRA', 'TRB', 'TS', 'TSI', 'TSD', 'TSA', 'TSB']
root_codes = ['RFB', 'RFL', 'RFC', 'RFJ', 'RMB', 'RML', 'RMC', 'RMJ', 'RBB', 'RBL', 'RBC', 'RBJ', 'RTB', 'RTL', 'RTC', 'RTJ']
joint_offset_codes = ['JOS', 'JOM', 'JOL', 'JOSD', 'JOMD', 'JOLD', 'JSS', 'JSM', 'JSL', 'JAS', 'JAM', 'JAL']

defects_all = deposit_codes+deformed_codes+infiltration_codes+hole_codes+fracture_codes+crack_codes+broken_codes+root_codes+joint_offset_codes+collapse_codes
defects_struct = deformed_codes+hole_codes+fracture_codes+crack_codes+broken_codes+joint_offset_codes+collapse_codes
defects_operat = root_codes + deposit_codes

# keep_defects = defects_struct
keep_defects = defects_struct

In [48]:
# Count number of defect zones
df_cond = pd.read_csv('Conditions_Hazen_Sawyer.csv', sep=',')
df_cond.head();
print("Total number of inspections to begin with are: {}".format(df_cond['InspectionID'].nunique()))

# Change names of defects
# df_cond = change_defect_names(df_cond)

# Keep only the rows which contain these defects. Delete all else
# keep_defects = ['Fracture', 'Crack', 'Joint Offset', 'Root', 'Deposit', 'Broken', 'Hole', 'Infiltration', 'Deformed']


df_cond = delete_rows(df_cond, keep_defects)

# Delete inspections which have no 'defects', i.e, fracture, crack, root, joint offset, deposit, broken
df_cond = delete_inspections_with_no_defects(df_cond, keep_defects)

print("Number of inspections after deletion are: {}".format(df_cond['InspectionID'].nunique()))

Total number of inspections to begin with are: 1872
Number of inspections after deletion are: 938


In [49]:
# List the number of defects
df_counts = df_cond['PACP_Code'].value_counts()
print("\nNumber of defects in these inspections is as follows: \n{}".format(df_counts))
df_counts.to_csv('defect_counts.csv')

# Count the total length of pipe in this dataframe
length = calculate_length_of_pipeline(df_cond)
print("Length : {}".format(length))


Number of defects in these inspections is as follows: 
FM     2052
CM     1861
CL     1319
CC     1033
FC     1013
FL      559
JOM     161
FS      106
BSV      46
BVV      42
CS       29
HSV      25
JOL      24
HVV      21
JSM      10
CH2       7
JAM       7
CH3       4
FH2       4
JSL       2
FH3       1
JAL       1
Name: PACP_Code, dtype: int64
Length : 148927.5001701849


In [50]:
thresh = 3
insps = df_cond['InspectionID'].unique()
zones = []

for insp in insps:
    
    df_temp = df_cond[df_cond['InspectionID'] == insp]
    df_temp = df_temp.sort_values(by=["Distance"])
    indices = df_temp.index
    defect_prev, defect_curr = "",""
    dist_prev, dist_curr = 0, 0
    zone_curr = []
    
    for index in indices:
        defect_curr = df_temp.at[index, 'PACP_Code'] # Defect code at current index
        dist_curr = float(df_temp.at[index, 'Distance']) # Distance of defect at current index        
    
        if abs(dist_curr - dist_prev) < thresh:
            zone_curr.append((insp, defect_curr, dist_curr))
        else:           
            zones.append(zone_curr)
            zone_curr = []
            zone_curr.append((insp, defect_curr, dist_curr))
                        
        dist_prev = dist_curr
        defect_prev = defect_curr
        
    zones.append(zone_curr)

In [51]:
len(zones)
zones[0:10]

[[],
 [(1, 'JOL', 11.10000038)],
 [],
 [(6, 'FM', 21.10000038)],
 [(6, 'FL', 75.5)],
 [],
 [(24, 'FL', 101.4000015)],
 [],
 [(25, 'CC', 13.69999981)],
 [(25, 'CL', 19.5)]]

In [52]:
# Delete empty zones
zones = list(filter(lambda a: a!=[], zones))

In [53]:
"""
Calculate number of colocated defects
"""

colocated = 0
total = 0
max_colocated = 0
num_def_in_cluster = 5
filtered_zones = []

for zone in zones:
    if len(zone) >= num_def_in_cluster:
        colocated += len(zone)
        filtered_zones.append(zone)
        if len(zone) > max_colocated:
            max_colocated = len(zone)
            max_zone = zone
    total += len(zone)
    
print(f'Number of defects in clusters with min {num_def_in_cluster} defects is {colocated}')
print(f'Total number of defects is {total}')
print(f'Max defects in a zone is {max_colocated}')
print(f'Number of clusters {len(filtered_zones)}')

Number of defects in clusters with min 5 defects is 48
Total number of defects is 8327
Max defects in a zone is 6
Number of clusters 9


In [54]:
max_zone

[(917, 'CM', 21.70000076),
 (917, 'CM', 24.60000038),
 (917, 'CL', 27.39999962),
 (917, 'CM', 27.70000076),
 (917, 'FM', 27.70000076),
 (917, 'FM', 29.70000076)]

In [152]:
# Count number of zones which have fractures or cracks only
num_zones_with_major_defects = 0


for zone in filtered_zones:
    for _, defect_code, _ in zone:
        if defect_code in broken_codes + deformed_codes + hole_codes:
            num_zones_with_major_defects += 1
            break

print(num_zones_with_major_defects)

266


In [153]:
filtered_zones

import csv

with open("filered_zones.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(filtered_zones)

In [56]:
"""
Calculate the number of zones of various lengths
"""

num_zones = {i:0 for i in range(1, max_colocated + 1)}
for zone in zones:
    if len(zone) >= 1:
        num_zones[len(zone)] += 1
        
num_zones

{1: 6075, 2: 887, 3: 110, 4: 25, 5: 6, 6: 3}

In [155]:
"""
Count the number of zones with more than 3 fractures
"""
hotspots = []
count_hotspots = 0
for zone in zones:
    count_def = 0
    if len(zone) >= 5:
        for _, pacp_code, _ in zone:
            if pacp_code in fracture_codes:
                count_def += 1
        if count_def > 3:
            count_hotspots += 1
            hotspots.append(zone)

hotspots

[[(30, 'FL', 120.5),
  (30, 'FM', 125.6999969),
  (30, 'FM', 157.8999939),
  (30, 'CL', 179.8000031),
  (30, 'FL', 200.6000061),
  (30, 'CM', 264.3999939)],
 [(61, 'FM', 2.900000095),
  (61, 'CM', 42.29999924),
  (61, 'FM', 122.0),
  (61, 'CL', 179.6000061),
  (61, 'CL', 199.8999939),
  (61, 'CL', 201.3999939),
  (61, 'FM', 216.89999390000003),
  (61, 'CM', 247.0),
  (61, 'CL', 253.1999969),
  (61, 'CL', 258.8999939),
  (61, 'FC', 263.7999878)],
 [(65, 'FS', 0.0),
  (65, 'FL', 3.0),
  (65, 'FC', 61.0),
  (65, 'FM', 65.80000305),
  (65, 'CL', 86.0)],
 [(68, 'FM', 5.0),
  (68, 'FM', 15.0),
  (68, 'FM', 17.29999924),
  (68, 'FM', 24.79999924),
  (68, 'FM', 29.5),
  (68, 'CL', 32.09999847),
  (68, 'FM', 34.90000153),
  (68, 'FM', 39.40000153),
  (68, 'CL', 45.0),
  (68, 'FM', 96.59999847),
  (68, 'FL', 106.3000031)],
 [(94, 'FM', 56.70000076),
  (94, 'FM', 62.0),
  (94, 'FM', 130.3999939),
  (94, 'FM', 151.8000031),
  (94, 'FC', 201.6000061),
  (94, 'FC', 217.60000609999997),
  (94, 'FM', 

In [57]:
"""
Calculate severity scores

"""

grades = {'JOM':1, 'JOL':2, 'JOMD':1, 'JOLD':2, 'JSM':1, 'JSL':2, 'JAM':1, 'JAL':2, 'X':5, 'BSV':5, 'BVV':5, 'DR':5, 'DFBR':5, 'DFBI':5, 'DFC':5, 'DFE':5, 'DTBR':5, 'DTBI':5, 'HSV':5, 'HVV':5, 'FL':3, 'FC':2, 'FM':4, 'FS':3, 'FH2':4, 'FH3':5, 'FH4':5, 'CL':2, 'CC':1, 'CM':3, 'CS':2, 'CH2':4, 'CH3':5, 'CH4':5}

def calc_severity_zone(zone):
    zone_length = 0
    grade = 0
    
    for _, def_code, _ in zone:
        grade += grades[def_code]
    
    num_defects = len(zone)
    zone_length = zone[num_defects-1][2] - zone[0][2] # Length is distance of last - first
    if zone_length < thresh:
        zone_length = thresh
    return grade/zone_length


In [61]:
filtered_zones = [zone for zone in zones if len(zone) == 6]
tot_severity = 0
num_zones = 0
max_severity = 0
max_zone = []
min_severity = 1000
min_zone = []

for zone in filtered_zones:
    zone_severity = calc_severity_zone(zone)
    tot_severity  += zone_severity
    num_zones += 1 
    if zone_severity > max_severity:
        max_severity = zone_severity
        max_zone = zone
    if zone_severity < min_severity:
        min_severity = zone_severity
        min_zone = zone

print(f'Average severity is {tot_severity/num_zones}')
print(f'Maximum severity is {max_severity}')
# print(f'Maximum severity zone is {max_zone}')
print(f'Minimum severity is {min_severity}')
# print(f'Minimum severity zone is {min_zone}')
filtered_zones

Average severity is 2.0662877044715144
Maximum severity is 2.375
Minimum severity is 1.6363636363636365


[[(917, 'CM', 21.70000076),
  (917, 'CM', 24.60000038),
  (917, 'CL', 27.39999962),
  (917, 'CM', 27.70000076),
  (917, 'FM', 27.70000076),
  (917, 'FM', 29.70000076)],
 [(1150, 'CM', 15.39999962),
  (1150, 'CL', 18.29999924),
  (1150, 'CM', 20.5),
  (1150, 'CM', 23.39999962),
  (1150, 'CM', 26.20000076),
  (1150, 'FM', 26.39999962)],
 [(1238, 'CM', 57.0),
  (1238, 'CL', 58.5),
  (1238, 'CM', 61.0),
  (1238, 'CL', 62.29999924),
  (1238, 'CL', 63.40000153),
  (1238, 'CL', 63.40000153)]]

In [158]:
"""
Most frequent defect in colocated zone
"""
defects = {defect_code:0 for defect_code in keep_defects}
for zone in zones:
    if len(zone) >= 3:
        for defect in zone:
            defects[defect[1]] += 1 

In [159]:
defects

{'DR': 0,
 'DFBR': 0,
 'DFBI': 0,
 'DFC': 0,
 'DFE': 0,
 'DTBR': 0,
 'DTBI': 0,
 'HSV': 278,
 'HVV': 121,
 'FL': 1261,
 'FC': 1180,
 'FM': 4006,
 'FS': 1437,
 'FH': 0,
 'FH2': 13,
 'FH3': 27,
 'FH4': 26,
 'CL': 3074,
 'CC': 1612,
 'CM': 3074,
 'CS': 710,
 'CH': 0,
 'CH2': 11,
 'CH3': 14,
 'CH4': 10,
 'BSV': 265,
 'BVV': 87,
 'JOS': 0,
 'JOM': 1450,
 'JOL': 171,
 'JOSD': 0,
 'JOMD': 0,
 'JOLD': 0,
 'JSS': 0,
 'JSM': 53,
 'JSL': 16,
 'JAS': 0,
 'JAM': 14,
 'JAL': 3,
 'X': 0}

In [160]:
"""
Finding severity of defect clusters and finding length of these clusters.
Will allow for the best patch repair

"""

'\nFinding severity of defect clusters and finding length of these clusters.\nWill allow for the best patch repair\n\n'

In [161]:
"""
Interdefect association rule mining
"""

'\nInterdefect association rule mining\n'

In [162]:
# Create matrix: https://towardsdatascience.com/association-rule-mining-be4122fc1793
import numpy as np

co_matrix = []

for zone in zones:
    row = np.zeros(len(keep_defects))
    for _, defect, _ in zone:
        defect_index = keep_defects.index(defect)
        row[defect_index] = 1
    co_matrix.append(row)

In [163]:
zones[:5]

[[(1, 'JOL', 11.10000038)],
 [(6, 'FM', 21.10000038), (6, 'FL', 75.5)],
 [(24, 'FL', 101.4000015)],
 [(25, 'CC', 13.69999981),
  (25, 'CL', 19.5),
  (25, 'CC', 30.70000076),
  (25, 'CM', 40.29999924),
  (25, 'CM', 50.20000076),
  (25, 'FL', 64.90000153)],
 [(27, 'CM', 98.69999695), (27, 'FC', 113.5), (27, 'CL', 192.1000061)]]

In [164]:
len(co_matrix)

2825

In [165]:
co_matrix[:2]

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.])]

In [166]:
len(co_matrix[0]) == len(keep_defects)

True

In [167]:
len(co_matrix[0])

40

In [168]:
def count_pairs(x, y, keep_defects, co_matrix):
    count_x, count_y, count_xy = 0, 0, 0
    for row in co_matrix:
        if row[x] == 1 and row[y] == 1.0:
            count_xy += 1
        if row[x] == 1.0:
            count_x += 1
        if row[y] == 1.0:
            count_y += 1
            
    return count_x, count_y, count_xy

In [169]:
def support_confidence_xy(x, y, keep_defects, co_matrix):
    count_x, count_y, count_xy = count_pairs(x, y, keep_defects, co_matrix)
    support_xy = count_xy/len(co_matrix)
    support_x = count_x/len(co_matrix)
    support_y = count_y/len(co_matrix)
    if count_x ==0 or count_y == 0 or count_xy ==0:
        confidence_xy = 0
    else:
        confidence_xy = count_xy/count_x
    return support_x, support_y, support_xy, confidence_xy

In [170]:
min_support = 0.001
min_confidence = 0.1
rules = []

for x in range(0, len(keep_defects)):
    for y in range(0, len(keep_defects)):
        if x!=y:
            support_x, support_y, support_xy, confidence_xy = support_confidence_xy(x, y, keep_defects, co_matrix)
            if support_xy > min_support and confidence_xy > min_confidence:
                lift_xy = support_xy/(support_x*support_y)
                rules.append([keep_defects[x], keep_defects[y], support_xy, confidence_xy, lift_xy])


In [171]:
rules.sort(key=lambda tup: tup[3], reverse=True)  # sorts in place by confidence
rules

[['FH3', 'FM', 0.006017699115044248, 1.0, 2.443771626297578],
 ['FH4', 'FM', 0.005663716814159292, 1.0, 2.4437716262975777],
 ['FH4', 'CL', 0.005663716814159292, 1.0, 2.632805219012116],
 ['CH3', 'FM', 0.002831858407079646, 1.0, 2.4437716262975777],
 ['CH3', 'CL', 0.002831858407079646, 1.0, 2.632805219012116],
 ['CH4', 'FM', 0.001415929203539823, 1.0, 2.4437716262975777],
 ['CH4', 'FH3', 0.001415929203539823, 1.0, 166.17647058823528],
 ['CH4', 'CL', 0.001415929203539823, 1.0, 2.632805219012116],
 ['CH4', 'CM', 0.001415929203539823, 1.0, 3.2583621683967703],
 ['CH4', 'BSV', 0.001415929203539823, 1.0, 14.790575916230367],
 ['JAL', 'JAM', 0.0010619469026548673, 1.0, 156.94444444444443],
 ['FH2', 'FM', 0.0038938053097345134, 0.9166666666666666, 2.24012399077278],
 ['CH2', 'FM', 0.00247787610619469, 0.875, 2.1383001730103803],
 ['CH2', 'CL', 0.00247787610619469, 0.875, 2.3037045666356013],
 ['CH2', 'CC', 0.00247787610619469, 0.875, 2.722329295154185],
 ['CH3', 'CM', 0.00247787610619469, 0.8

In [143]:
""" 

Interesting Rules from Association Rules on Pipe-Level 

['JOL', 'FM', 0.011782032400589101, 0.5384615384615384, 2.213933989885547]
['RFB', 'FM', 0.014727540500736377, 0.6542056074766355, 2.689826342851599],
['RFB', 'CL', 0.011782032400589101, 0.5233644859813084, 2.318314447221956],

Interesting Rules from Association Rules on Cluster-Level (Threshold = 20) 
['JOM', 'FM', 0.019923103809856693, 0.24332977588046958, 1.5513459360312503],
['JOM', 'CL', 0.01896190143306536, 0.23159018143009605, 1.6111355843684008]
['JOM', 'CM', 0.01494232785739252, 0.1824973319103522, 1.4917853331300504]]

Interesting Rules from Association Rules on Cluster-Level (Threshold = 30) 
['HSV', 'FM', 0.01151047887935715, 0.6198830409356725, 3.8209524256871545],
['BSV', 'FM', 0.01205342599630796, 0.5522388059701493, 3.40399408579592],
['JOM', 'FM', 0.0253013356499077, 0.2757396449704142, 1.6996562185626136],
['JOM', 'CL', 0.023672494299055272, 0.2579881656804734, 1.7291215558598831],
['CS', 'JOM', 0.011619068302747313, 0.25235849056603776, 2.7502595735179187],
['JOM', 'CM', 0.021609295254642197, 0.23550295857988165, 1.9040796712573573]
['FS', 'JOM', 0.013465088500380064, 0.20261437908496732, 2.2081370615307265]


"""

" \n\nInteresting Rules from Association Rules on Pipe-Level \n\n['JOL', 'FM', 0.011782032400589101, 0.5384615384615384, 2.213933989885547]\n['RFB', 'FM', 0.014727540500736377, 0.6542056074766355, 2.689826342851599],\n['RFB', 'CL', 0.011782032400589101, 0.5233644859813084, 2.318314447221956],\n\nInteresting Rules from Association Rules on Cluster-Level (Threshold = 20) \n['JOM', 'FM', 0.019923103809856693, 0.24332977588046958, 1.5513459360312503],\n['JOM', 'CL', 0.01896190143306536, 0.23159018143009605, 1.6111355843684008]\n['JOM', 'CM', 0.01494232785739252, 0.1824973319103522, 1.4917853331300504]]\n\nInteresting Rules from Association Rules on Cluster-Level (Threshold = 30) \n['HSV', 'FM', 0.01151047887935715, 0.6198830409356725, 3.8209524256871545],\n['BSV', 'FM', 0.01205342599630796, 0.5522388059701493, 3.40399408579592],\n['JOM', 'FM', 0.0253013356499077, 0.2757396449704142, 1.6996562185626136],\n['JOM', 'CL', 0.023672494299055272, 0.2579881656804734, 1.7291215558598831],\n['CS', '

In [78]:
num_rows = 0
for row in co_matrix:
    if np.count_nonzero(row == 1) > 1:
        num_rows += 1
        
print(f'Number of rows with more than 1 defect is {num_rows}')

Number of rows with more than 1 defect is 3995


In [102]:
lift = support/(support_x*support_y)
lift

1.0530775826485135

In [181]:
support

0.07973102785782901

In [36]:
"""
Make a database with external factors for econometric analysis
"""
# Read the inspections database
df_insp = pd.read_csv('Inspections_Hazen_Sawyer.csv')

# Columns that we want to keep
cols = ['InspectionID', 'Height', 'Down_Rim_to_Invert', 'Up_Rim_to_Invert', 'Height', 'Total_Length', 'Location_Code', 'Location_Details', 'Material', 'Weather', 'Street']

# Defect pairs to keep
keep_defects = ['Fracture', 'Crack', 'Tap', 'Manhole', 'Joint Offset', 'Root', 'Deposit', 'Broken']

df_insp = create_econometric_database(df_insp, cols, keep_defects)

df_insp.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin