In [69]:
"""
Association rule mining
"""
import os
import numpy as np
import pandas as pd

In [70]:
path = os.getcwd()
files = os.listdir(path)

In [71]:
"""

Key ideas to report in the paper
1. General statistics: Number of pipes with defects, most frequent defects 
2. Number of instances of two defects being within the same vicinity, probabilities of these occurrences
3. Support, lift, etc.

"""

'\n\nKey ideas to report in the paper\n1. General statistics: Number of pipes with defects, most frequent defects \n2. Number of instances of two defects being within the same vicinity, probabilities of these occurrences\n3. Support, lift, etc.\n\n'

In [72]:
"""
Helper functions
"""

def change_defect_names(df_cond):
    """ 
    Change the names of defects: e.g., FL => Fracture, TB => Tap
    Takes df_cond as input. 
    """
    df_cond.loc[df_cond['PACP_Code'].str.startswith('F'), 'PACP_Code'] = 'Fracture'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('C'), 'PACP_Code'] = 'Crack'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('T'), 'PACP_Code'] = 'Tap'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('A'), 'PACP_Code'] = 'Manhole'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('J'), 'PACP_Code'] = 'Joint Offset'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('R'), 'PACP_Code'] = 'Root'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('D'), 'PACP_Code'] = 'Deposit'
    df_cond.loc[df_cond['PACP_Code'].str.startswith('B'), 'PACP_Code'] = 'Broken'  
    return df_cond

def delete_rows(df_cond, keep_defects):
    """
    Delete rows where the defects that we want to consider are not present
    """
    df_cond = df_cond[df_cond['PACP_Code'].isin(keep_defects)]
    return df_cond
    

def select_df(df, insp_id, defect_list):
    """
    Helper function used by count_defect_pairs()
    This function only keeps the rows where ['defect1', 'defect2'] are present
    It helps speed up the computation
    """
    df = df[(df['InspectionID'] == insp_id)]
    return  df[df['PACP_Code'].isin(defect_list)]

def delete_inspections_with_no_defects(df_cond, defects):
    """
    Delete those inspections where there aren't any defects at all. 
    By defects we mean fractures, cracks, etc. Not tap and manhole
    """
    insp_ids = np.unique(list(df_cond['InspectionID']))

    keep_ids = [] # Inspection IDs that we want to keep

    for insp_id in insp_ids:
        df_temp = df_cond[(df_cond['InspectionID'] == insp_id)]
        if df_temp['PACP_Code'].isin(defects).sum() > 0:
            keep_ids.append(insp_id)
    
    return df_cond[df_cond['InspectionID'].isin(keep_ids)]

def count_defect_pairs(df_cond, keep_defects, distance_threshold):
    """
    Count the number of defect pairs that are < distance_threshold within each other
    """
    defect_pair_counts = []
    
    for a in range(0, len(keep_defects)): 
        for b in range(a+1, len(keep_defects)):

            pair_count = 0
            defect_pair = [keep_defects[a], keep_defects[b]] # Pair of defects whose no. of occurrences we wish to calculate

            for insp_id in df_cond['InspectionID'].unique():
            
            # Inspection id which we are counting for
                df = select_df(df_cond, insp_id, defect_pair)
                insp_id = list(df['InspectionID'])
                dist = list(df['Distance'])
                defects = list(df['PACP_Code']) # Defects which are selected from a particular inspection

                for i in range (0, len(defects) - 1):
                    for j in range (i+1, len(defects)):
                        defect_1, defect_2 = defects[i], defects[j]
                        if (defect_1 == defect_pair[0] and defect_2 == defect_pair[1]) or (defect_2 == defect_pair[0] and defect_1 == defect_pair[1]):
                            if abs(dist[i] - dist[j]) <= distance_threshold:
                                pair_count = pair_count + 1

            defect_pair_counts.append([keep_defects[a], keep_defects[b], pair_count])
            
    return defect_pair_counts

def calculate_length_of_pipeline(df_cond):
    # Find total length of pipeline
    insps = df_cond['InspectionID'].unique()
    length = 0.0
    for insp in insps:
        df_temp = df_cond[df_cond['InspectionID'] == insp]
        length += df_temp['Distance'].max()
    return length
   

def create_econometric_database(df_insp, cols, defects):
    df_insp = df_insp[cols]

    f, c, t, m, j, r, d, b = [], [], [], [], [], [], [], []

    insp_ids = list(df_insp['InspectionID'])

    for insp_id in insp_ids:

        df_temp = df_cond.loc[df_cond['InspectionID'] == insp_id]

        f.append(len(df_temp[df_temp['PACP_Code'] == 'Fracture']))
        c.append(len(df_temp[df_temp['PACP_Code'] == 'Crack']))
        t.append(len(df_temp[df_temp['PACP_Code'] == 'Tap']))
        m.append(len(df_temp[df_temp['PACP_Code'] == 'Manhole']))
        j.append(len(df_temp[df_temp['PACP_Code'] == 'Joint Offset']))
        r.append(len(df_temp[df_temp['PACP_Code'] == 'Root']))
        d.append(len(df_temp[df_temp['PACP_Code'] == 'Deposit']))
        b.append(len(df_temp[df_temp['PACP_Code'] == 'Broken']))

    df_insp['Fracture'] = f
    df_insp['Crack'] = c
    df_insp['Tap'] = t
    df_insp['Manhole'] = m
    df_insp['Joint Offset'] = j
    df_insp['Root'] = r
    df_insp['Deposit'] = d
    df_insp['Broken'] = b
    
    return df_insp

In [73]:
cluster_size = 5.0

# Read the database of condition information from CSV
df_cond = pd.read_csv('Conditions_Combined.csv', sep=',')
df_cond.head();
print("Total number of inspections to begin with are: {}".format(df_cond['InspectionID'].max()))

# Change names of defects
df_cond = change_defect_names(df_cond)

# Keep only the rows which contain these defects. Delete all else
keep_defects = ['Fracture', 'Crack', 'Tap', 'Manhole', 'Joint Offset', 'Root', 'Deposit', 'Broken']
df_cond = delete_rows(df_cond, keep_defects)

# Delete inspections which have no 'defects', i.e, fracture, crack, root, joint offset, deposit, broken
defects = ['Fracture', 'Crack', 'Joint Offset', 'Root', 'Deposit', 'Broken']
# df_cond = delete_inspections_with_no_defects(df_cond, defects)
# print("Length : {}".format(calculate_length_of_pipeline(df_cond)))


# Count the number of inspections that remain after the deletion
# print("\nNumber of inspections which contain defects is: {}".format(df_cond['InspectionID'].nunique()))

# List the number of defects
print("\nNumber of defects is as follows: \n{}".format(df_cond['PACP_Code'].value_counts()))

"""
Counting only Fracture and Tap
Delete all inspections which do not have both Fracture and Tap
"""

keep_defects = ['Root', 'Fracture']

# Deletes those inspections which don't have defect1 or defect2
df_cond = delete_inspections_with_no_defects(df_cond, [keep_defects[0], keep_defects[1]])

# Count the number of inspections that remain after the deletion
print("\nNumber of inspections with {} and {} is: {}".format(keep_defects[0], keep_defects[1], df_cond['InspectionID'].nunique()))

# List the number of defects
print("\nNumber of defects in these inspections is as follows: \n{}".format(df_cond['PACP_Code'].value_counts()))

# Counting the number of defect clusters within 5 feet
defect_pair_counts = count_defect_pairs(df_cond, keep_defects, cluster_size)

print('\nNumber of ({}, {}) clusters is: {}'.format(keep_defects[0], keep_defects[1], defect_pair_counts[0][2]))

# Count the total length of pipe in this dataframe
length = calculate_length_of_pipeline(df_cond)

print("Length : {}".format(length))

Total number of inspections to begin with are: 8891

Number of defects is as follows: 
Tap             29688
Manhole         17167
Crack            9233
Fracture         8625
Deposit          8287
Root             7943
Broken           2058
Joint Offset     1943
Name: PACP_Code, dtype: int64

Number of inspections with Root and Fracture is: 2974

Number of defects in these inspections is as follows: 
Tap             14056
Fracture         8625
Root             7943
Crack            7468
Manhole          5592
Deposit          4665
Broken           1848
Joint Offset     1333
Name: PACP_Code, dtype: int64

Number of (Root, Fracture) clusters is: 1428
Length : 630738.5004255187


In [74]:
num_defect1 = df_cond['PACP_Code'].value_counts()[keep_defects[0]]
num_defect2 = df_cond['PACP_Code'].value_counts()[keep_defects[1]]
print("Number of {} is {}".format(keep_defects[0], num_defect1))
print("Number of {} is {}".format(keep_defects[1], num_defect2))

spacing1 = length/num_defect1
spacing2 = length/num_defect2

print("Length: {}".format(length))
print("Average {} spacing: {} feet".format(keep_defects[0], spacing1))
print("Average {} spacing: {} feet".format(keep_defects[1], spacing2))

Number of Root is 7943
Number of Fracture is 8625
Length: 630738.5004255187
Average Root spacing: 79.40809523171582 feet
Average Fracture spacing: 73.12910149861086 feet


In [75]:
# If Taps and Fractures were uniformly distributed, calculate the number of co-occurrences
import numpy as np

defect1 = [[keep_defects[0], i, 1] for i in np.arange(0, length, spacing1)]
defect2 = [[keep_defects[1], i, 1] for i in np.arange(0, length, spacing2)]
df_cond_fake = pd.DataFrame(defect1+defect2, columns = ['PACP_Code', 'Distance', 'InspectionID'])
defect_pair_counts = count_defect_pairs(df_cond_fake, [keep_defects[0], keep_defects[1]], cluster_size)
print('\nNumber of ({}, {}) clusters is: {}'.format(keep_defects[0], keep_defects[1], defect_pair_counts[0][2]))


Number of (Root, Fracture) clusters is: 1087


In [36]:
"""
Make a database with external factors for econometric analysis
"""
# Read the inspections database
df_insp = pd.read_csv('Inspections_Hazen_Sawyer.csv')

# Columns that we want to keep
cols = ['InspectionID', 'Height', 'Down_Rim_to_Invert', 'Up_Rim_to_Invert', 'Height', 'Total_Length', 'Location_Code', 'Location_Details', 'Material', 'Weather', 'Street']

# Defect pairs to keep
keep_defects = ['Fracture', 'Crack', 'Tap', 'Manhole', 'Joint Offset', 'Root', 'Deposit', 'Broken']

df_insp = create_econometric_database(df_insp, cols, keep_defects)

df_insp.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

[1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 83.0,
 84.0,
 85.0,
 86.0,
 87.0,
 88.0,
 89.0,
 90.0,
 91.0,
 92.0,
 93.0,
 94.0,
 95.0,
 96.0,
 97.0,
 98.0,
 99.0,
 100.0,
 101.0,
 102.0,
 103.0,
 104.0,
 105.0,
 106.0,
 107.0,
 108.0,
 109.0,
 110.0,
 111.0,
 112.0,
 113.0,
 114.0,
 115.0,
 116.0,
 117.0,
 118.0,
 119.0,
 120.0,
 121.0,
 122.0,
 123.0,
 124.0,
 125.0,
 126.0,
 127.0,
 128.0,
 129.0,
 130.0,
 131.0,
 132.0,
 133.0,
 134.0,
 135.0,
 136.0,
 137.0,
 138.0,
 139