# Association Pattern Mining

In [9]:
from pathlib import Path
import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import association_rules


In [10]:
# creating Path object for current working directory
directory = Path('./')
# creating Path object for additional data directory
additional_directory = directory / 'additional_data'
# create new directory for additional data
Path(additional_directory).mkdir(exist_ok=True)

# defining the directory to original data
directory = Path('./data/')
additional_directory = Path('./additional_data')

# list the .csv files for the project
for file in directory.glob('*.csv'):
    print(file)
    
# reading in .csv files to dataframes
vehicles = pd.read_csv(directory / 'vehicles2019.csv', dtype={'Accident_Index': str})
casualties = pd.read_csv(directory / 'casualties2019.csv', dtype={'Accident_Index': str})
# cleaned accidents DataFrame
accidents = pd.read_pickle('accidents_cleaned.pkl')

# convert column names to lowercase for ease of indexing
def lower_columns(df):
    """
    Defintion:
        convert column names to lower case
    """
    df.columns = map(str.lower, df.columns)
    
# converting all column names to lower case
lower_columns(vehicles)
lower_columns(casualties)

accidents.head(5)

data/vehicles2019.csv
data/accidents2019.csv
data/casualties2019.csv


Unnamed: 0,accident_index,longitude,latitude,police_force,accident_severity,number_of_vehicles,number_of_casualties,day_of_week,local_authority_(district),local_authority_(highway),...,carriageway_hazards,urban_or_rural_area,did_police_officer_attend_scene_of_accident,lsoa_of_accident_location,district,converted_date,converted_time,datetime,decimal_time,day_of_year
0,2019010152270,-0.127949,51.436208,1,3,2,1,3,9,E09000022,...,0,1,3,E01003117,lambeth,2019-01-15,21:45:00,2019-01-15 21:45:00,21.75,15
1,2019010157567,-0.123427,51.44931,1,3,2,2,3,9,E09000022,...,0,1,1,E01003023,lambeth,2019-01-15,08:42:00,2019-01-15 08:42:00,8.7,15
2,2019010157732,-0.145106,51.461256,1,2,1,2,3,9,E09000022,...,0,1,1,E01003026,lambeth,2019-01-15,07:08:00,2019-01-15 07:08:00,7.133333,15
3,2019010157896,-0.240823,51.533125,1,3,2,1,3,28,E09000005,...,0,1,1,E01000535,brent,2019-01-15,21:05:00,2019-01-15 21:05:00,21.083333,15
4,2019010157795,0.161736,51.550272,1,3,1,1,3,16,E09000002,...,0,1,1,E01000035,barking and dagenham,2019-01-15,16:10:00,2019-01-15 16:10:00,16.166667,15


In [11]:
class AssociationRule:
    def __init__(self, df):
        self.df = df
    
    def apriori(self,
                min_support=0.5,
                use_colnames=False,
                max_len=None,
                verbose=0):
        """
        Uses the mlxtend.frequent_patterns.apriori algorithm
        """
        df = self.df.iloc[:, 1:]
        return apriori(df, min_support,
                       use_colnames, max_len, verbose)
        
    def support(self, Y, X):
        """
        Determine support for two items.
        Inputs:
            X: antecedent
            Y: consequent
        Returns:
            support value
        """
        if X not in self.df.columns:
            raise TypeError("Invalid antecedent.")
        elif Y not in self.df.columns:
            raise TypeError("Invalid consequent.")
        else:
            freq_XY = self.df.groupby(X)[Y].value_counts()[1][1]
            return freq_XY / self.df.shape[0]

    def confidence(self, Y, X):
        """
        Determine confidence for two items.
        Inputs:
            X: antecedent
            Y: consequent
        Returns:
            confidence value
        """
        if X not in self.df.columns:
            raise TypeError("Invalid antecedent.")
        elif Y not in self.df.columns:
            raise TypeError("Invalid consequent.")
        else:
            freq_X = self.df[X].value_counts()[1] / self.df.shape[0]
            return self.support(X, Y) / freq_X
        
    def lift(self, Y, X):
        """
        Determine the confidence for two items.
        Inputs:
            X: antecedent
            Y: consequent
        Returns:
            lift value
        """
        if X not in self.df.columns:
            raise TypeError("Invalid antecenent.")
        elif Y not in self.df.columns:
            raise TypeError("Invalid consequent.")
        else:
            freq_X = self.df[X].value_counts()[1] / self.df.shape[0]
            freq_Y = self.df[Y].value_counts()[1] / self.df.shape[0]
            return self.support(X, Y) / (freq_X * freq_Y)
        
        
    def report(self, Y, X):
        """
        Prints a short summary report.
        Inputs:
            X: antecedent
            Y: consequent
        """
        if X not in self.df.columns:
            raise TypeError("Invalid antecedent.")
        elif Y not in self.df.columns:
            raise TypeError("Invalid consequent.")
        else:
            sup = self.support(X, Y)
            conf = self.confidence(X, Y)
            title = f'{X} -> {Y}'
            print(title)
            print('-' * len(title))
            print(f'Support: {sup:.2f}%')
            print(f'Confidence: {conf:.2f}%')

How speed limit affects casualty rates

$$ \text{Speed Limit} \rightarrow \text{Accident Severity}$$

In [12]:
apm_cols = ['accident_severity', 'speed_limit', 'weather_conditions']

one_hot = pd.DataFrame()

for col in te.columns:
    dummy = pd.get_dummies(test.loc[:, col], prefix=col)
    one_hot = pd.concat([one_hot, dummy], axis=1)
    
one_hot

NameError: name 'te' is not defined

In [13]:
ar = AssociationRule(dummies)

min_support = 0.2

frequent_itemsets = ar.apriori(min_support=min_support, use_colnames=True)

NameError: name 'dummies' is not defined

In [14]:

rules = association_rules(frequent_itemsets,
                          metric='lift',
                          min_threshold=0.5)
rules

NameError: name 'frequent_itemsets' is not defined