# Imports

In [1]:
import numpy as np
from sklearn.cluster import KMeans
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
from elbow import KElbow
import warnings
warnings.filterwarnings(action="ignore")
%load_ext autotime

# Binner Class

In [2]:
class Binner():
    """Turn continuous data into discrete data using KMeans and find patterns in a subset of data above a threshold using FP Growth

    Attributes
    ----------
    models_ : dictionary, (column name -> model)
        Dictionary mapping a column to the KMeans model fit for it by `create_bins`
    number_of_bins_range_ : tuple
        The range of values for number of bins to try, the algorithm will automatically
        select the best one from the range (default: (2,10)).
    minimal_support_rate_ : float
        A float between 0 and 1 for minimum support of the itemsets returned
    threshold_ : float
        The threshold to use for the condition: support_rate(full data) * threshold < support_rate(subset)
    full_df_ : pandas dataframe object
        the full dataset we are using for patterns (one-hot encoded after `get_best_subset_patterns`)
    subset_df_ : pandas dataframe object
        the subset dataset we are using for patterns (one-hot encoded after `get_best_subset_patterns`)
    patterns : pandas dataframe object
        the itemsets returned by the last call to `find_patterns` (empty until then)

    Notes
    -----
    This class has many helper functions, which can be used out of the box as well. You can create bins for a single column using `create_bins` and
    find patterns for a dataset using `find_patterns`. The main function is `get_best_subset_patterns`, which finds the best patterns in a subset
    of data when compared to the full dataset using support rates. The groups for the different binned columns can be accessed from self.models_

    `create_bins` (with `replace_in_df=True`), `find_patterns` and `transform_dataset` modify the dataframe they are given in place.
    Pass a copy if the original values must be preserved. `get_best_subset_patterns` works on its own copies.

    Examples
    --------
    >>> import seaborn as sns
    >>> iris = sns.load_dataset('iris')
    >>> titanic = sns.load_dataset('titanic')
    >>> binner = Binner()
    >>> binned_values_iris_petal_length = binner.create_bins('petal_length', iris, number_of_bins_range=(2,10), verbose=True)
    >>> subset_patterns_iris = binner.find_patterns(iris[iris['sepal_length'] > 4.7], minimal_support_rate=.25, number_of_bins_range=(2,10), verbose=True)
    >>> subset_best_patterns_titanic = binner.get_best_subset_patterns(full_df=titanic, subset_df=titanic[titanic['fare'] > 30],
    ...                                                   minimal_support_rate=.3, threshold=3.8, number_of_bins_range=(2,10))
    """

    def __init__(self):
        self.models_ = {}
        self.number_of_bins_range_ = (2,10)
        self.minimal_support_rate_ = .25
        self.threshold_ = 1.0
        self.full_df_ = pd.DataFrame()
        self.subset_df_ = pd.DataFrame()
        # Initialised here so the attribute exists before find_patterns is first called
        self.patterns = pd.DataFrame()

    def sort_model_labels(self, model, original_labels):
        """Sorts labels in ascending order (cluster means) using a fit KMeans model
        After sorting, the higher the label number, the higher the average values of the data with that label
        Parameters
        ----------
        model : a fit KMeans model (only `cluster_centers_` and `n_clusters` are read)
        original_labels : the labels we want to sort (integer cluster indices produced by `model`)
        Returns
        -------
        the sorted labels, as a numpy array with the same shape as `original_labels`
        """
        # Rank of every cluster when ordered by the sum of its center coordinates
        label_idx = np.argsort(model.cluster_centers_.sum(axis=1))

        # lookup_table[old_label] -> new_label (the rank of that cluster)
        lookup_table = np.zeros_like(label_idx)
        lookup_table[label_idx] = np.arange(model.n_clusters)
        return lookup_table[original_labels]

    def create_bins(self, column_name, df, number_of_bins_range=(2,10), replace_in_df=True, verbose=False):
        """An adaptive binning algorithm to convert a continuous pandas dataframe column to discrete. K means algorithm is used to create bins.
        Mean sum of squared distances to center is used for evaluation. Knee point detection algorithm is used to select the best number of bins.
        Parameters
        ----------
        column_name : the name of pandas dataframe column to convert
        df : pandas dataframe object
        number_of_bins_range : tuple, optional
            The range of values for number of bins to try, the algorithm will automatically
            select the best one from the range (default: (2,10)).
        replace_in_df : boolean, optional
            Whether to replace the column in the dataframe with the binned values (default: True).
            When True, `df` is modified in place.
        verbose : boolean, optional
            Whether to print out info
        Returns
        -------
        the binned values
        Raises
        ------
        ValueError
            If the elbow search does not report a number of bins.
        """
        # Assign value to class variable
        self.number_of_bins_range_ = number_of_bins_range

        # Format data as a single-feature 2D array, as expected by KMeans
        data_to_bin = np.array(df[column_name]).reshape(-1, 1)

        # Initialize model. `n_jobs` is intentionally not passed: it was removed from
        # KMeans in scikit-learn 1.0 and passing it there raises a TypeError.
        model = KMeans(random_state=100, n_init=10)

        # Fit the model, trying different number of bins (clusters), selecting the best number
        elbow = KElbow(model, k=number_of_bins_range)
        elbow.fit(data_to_bin)
        optimal_number_of_bins = elbow.elbow_value_
        # NOTE(review): assumes KElbow leaves elbow_value_ as None when no knee is found - confirm
        if optimal_number_of_bins is None:
            raise ValueError("No optimal number of bins found for column: {}".format(column_name))
        model.set_params(n_clusters=optimal_number_of_bins)
        model.fit(data_to_bin)

        # Get the binned value (labels), renumbered so that a higher label means a higher cluster center.
        # model.predict() still returns the unsorted labels, which is why transform_dataset re-sorts them.
        model.labels_ = self.sort_model_labels(model, model.labels_)

        # Add model to class variable for all models
        self.models_[column_name] = model

        # Replace the numeric column with the discrete values
        if replace_in_df:
            df[column_name] = model.labels_

        # Print out message if verbose
        if verbose:
            print("For column: {}, optimal number of bins: {}".format(column_name, optimal_number_of_bins))

        # Return discrete values (labels)
        return model.labels_


    def find_patterns(self, df, columns_to_drop=None, minimal_support_rate=.33, number_of_bins_range=(2,10), verbose=False):
        """FP-growth algorithm to find patterns in the dataframe with a minimal support rate, after converting continuous features to discrete
        Parameters
        ----------
        df : pandas dataframe object
            Modified in place: excluded columns are deleted and numeric columns are replaced by their bin labels.
        columns_to_drop : list of column names to be excluding when finding patterns (default: no columns)
        minimal_support_rate : a float between 0 and 1 for minimum support of the itemsets returned
        number_of_bins_range : tuple, optional
            The range of values for number of bins to try, the algorithm will automatically
            select the best one from the range (default: (2,10)).
        verbose : boolean, optional
            Whether to print out info
        Returns
        -------
        all patterns found above minimal support rate, sorted by descending support
        """
        if columns_to_drop is None:
            columns_to_drop = []

        # Reinitialize class models
        self.models_ = {}

        # Assign value to class variable
        self.minimal_support_rate_ = minimal_support_rate

        # Drop columns that are to be excluded
        for column in columns_to_drop:
            if column in df:
                del df[column]

        # Iterate over each column in the dataset
        for column in df.columns:

            # Try to see if column is numeric (continuous)
            try:
                data_is_numeric = np.issubdtype(df[column].dtype, np.number)
            except Exception as e:
                if verbose:
                    print("Warning: cannot create bins for column: {}\n{}\n".format(column, e))
                data_is_numeric = False

            # If column is continuous, get discrete values from binning algorithm (KMeans)
            if data_is_numeric:
                try:
                    self.create_bins(column, df, number_of_bins_range=number_of_bins_range, verbose=verbose)
                except Exception as e:
                    # Best effort: a column that cannot be binned keeps its raw values
                    # and is one-hot encoded per distinct value below.
                    if verbose:
                        print("Warning: cannot create bins for column: {}\n{}\n".format(column, e))

        # Convert dataset into all discrete valued columns
        df = pd.get_dummies(df, columns=df.columns)

        # Use FP Growth algorithm to find patterns above support rate
        self.patterns = fpgrowth(df, min_support=minimal_support_rate, use_colnames=True)

        # Return found patterns
        return self.patterns.sort_values('support', ascending=False)

    def transform_dataset(self, df):
        """Transforms a dataset, converting all continuous features to discrete using the previously fit KMeans models for labels
        Parameters
        ----------
        df : pandas dataframe object to transform
            Columns that have a saved model are replaced by their bin labels in place.
        Returns
        -------
        the transformed dataset (a new, one-hot encoded dataframe)
        """
        # Iterate over each column in the dataset
        for column in df.columns:

            # Columns without a saved model were never binned; leave them untouched
            if column not in self.models_:
                continue

            # Get the model for the column
            model = self.models_[column]

            # Get the discrete values using the model on the new data
            predicted_labels = model.predict(np.array(df[column]).reshape(-1,1))

            # predict() returns the model's raw cluster indices, so apply the same
            # ascending renumbering that create_bins applied to the training labels
            df[column] = self.sort_model_labels(model, predicted_labels)

        # Convert dataset into all discrete valued columns
        return pd.get_dummies(df, columns=df.columns)

    def get_pattern_support_rate(self, pattern, df, verbose=False):
        """Gets the support rate for a pattern for a new dataset
        Parameters
        ----------
        pattern : a pattern object
            Anything indexable by 'itemsets' (iterable of column names) and 'support',
            e.g. a row of the dataframe returned by `find_patterns`.
        df : pandas dataframe object
            One-hot encoded dataset whose columns are named like the pattern items.
        verbose : boolean, optional
            Whether to print out info
        Returns
        -------
        the fraction of rows in `df` for which every item of the pattern equals 1;
        0 if an item is not a column of `df` or if `df` has no rows
        """
        # Get the column names for the pattern items
        pattern_columns = list(pattern['itemsets'])

        # No support if column from pattern does not exist in the dataset
        if any(item not in df for item in pattern_columns):
            return 0

        # An empty dataset cannot support any pattern (and would divide by zero below)
        total_len = len(df)
        if total_len == 0:
            return 0

        # Count the rows in which all pattern columns are set
        support_num = int(df[pattern_columns].eq(1).all(axis=1).sum())
        support_rate = support_num/total_len

        # Print out message if verbose
        if verbose:
            print("Pattern Items: {}".format(pattern['itemsets']))
            print("Original Pattern Support Rate: {}".format(pattern['support']))
            print("Dataset Pattern Support Rate: {}".format(support_rate))

        # Return support rate for the given dataset
        return support_rate

    def get_best_subset_patterns(self, full_df, subset_df, columns_to_drop=None, minimal_support_rate=.25, threshold=1.0, number_of_bins_range=(2,10), verbose=False):
        """Finds the patterns with the biggest difference between subset data support rate and full data support rate
        Parameters
        ----------
        full_df : pandas dataframe object, full dataset to use for comparison (copied, not modified)
        subset_df : pandas dataframe object, subset dataset to use for comparison (copied, not modified)
        columns_to_drop : list of column names to be excluding when finding patterns (default: no columns)
        minimal_support_rate : float, optional (default: .25)
            a float between 0 and 1 for minimum support of the itemsets returned
        threshold : float, optional (default: 1.0)
            the threshold to use for the condition: support_rate(full data) * threshold < support_rate(subset)
        number_of_bins_range : tuple, optional (default: (2,10))
            The range of values for number of bins to try, the algorithm will automatically
            select the best one from the range.
        verbose : boolean, optional
            Whether to print out info
        Returns
        -------
        a dataframe with all the patterns that exceed the threshold equation for subset vs full dataset support rate,
        with columns `pattern`, `subset_support` and `full_dataset_support`, sorted by descending
        subset_support / full_dataset_support
        """
        if columns_to_drop is None:
            columns_to_drop = []

        # Assign value to class variable
        self.threshold_ = threshold
        self.full_df_ = full_df.copy()
        self.subset_df_ = subset_df.copy()

        # Drop columns that are to be excluded
        for column in columns_to_drop:
            if column in self.full_df_:
                del self.full_df_[column]
            if column in self.subset_df_:
                del self.subset_df_[column]

        # Find all patterns above support rate in full dataframe.
        # find_patterns bins the numeric columns of self.full_df_ in place and fits self.models_.
        full_df_patterns = self.find_patterns(self.full_df_, minimal_support_rate=minimal_support_rate, number_of_bins_range=number_of_bins_range, verbose=verbose)
        subset_key_patterns = []
        if verbose:
            print("All Patterns:")

        # Transform the datasets to be of the same form (one-hot columns named <column>_<value>)
        self.full_df_ = pd.get_dummies(self.full_df_, columns=self.full_df_.columns)
        self.subset_df_ = self.transform_dataset(self.subset_df_)

        # Iterate over each pattern found
        for _, pattern in full_df_patterns.iterrows():

            # Get the subset dataset support rate for the pattern
            subset_support_rate = self.get_pattern_support_rate(pattern, self.subset_df_)
            full_dataset_support_rate = pattern['support']

            # If above threshold in equation, add pattern to list
            if full_dataset_support_rate * threshold < subset_support_rate:
                subset_key_patterns.append({"pattern": list(pattern['itemsets']), "subset_support": subset_support_rate, "full_dataset_support": full_dataset_support_rate})
                if verbose:
                    print(pattern['itemsets'])

        # Sort best patterns by how much more frequent they are in the subset than in the full data
        subset_key_patterns = sorted(subset_key_patterns, key=lambda x: x['subset_support'] / x['full_dataset_support'], reverse=True)

        # Return dataframe with all pattern information; explicit columns keep the
        # result shape stable even when no pattern passes the threshold
        return pd.DataFrame(subset_key_patterns, columns=["pattern", "subset_support", "full_dataset_support"])

time: 40.6 ms


# Example Usage

In [3]:
# Load the HR dataset from a CSV file in the working directory and create a
# fresh Binner (no KMeans models are fit until a binning method is called).
df = pd.read_csv("hr.csv")
binner = Binner()

time: 20.6 ms


In [4]:
# Compare the rows with Attrition == "Yes" against the whole HR dataset.
# 'Attrition' itself is dropped so it cannot appear in the patterns; a pattern is
# kept when full-data support * 1.5 < subset support.
subset_key_patterns_hr_df = binner.get_best_subset_patterns(full_df=df, subset_df=df[df['Attrition'] == "Yes"], 
                                                  columns_to_drop=['Attrition'], minimal_support_rate=.2, threshold=1.5, number_of_bins_range=(2,10))
# Last expression of the cell: displays the resulting dataframe
subset_key_patterns_hr_df

Unnamed: 0,pattern,subset_support,full_dataset_support
0,"[StandardHours_80, Over18_Y, YearsAtCompany_0,...",0.392405,0.202041
1,"[EmployeeCount_1, YearsAtCompany_0, MaritalSta...",0.392405,0.202041
2,"[StockOptionLevel_0, YearsAtCompany_0, Marital...",0.392405,0.202041
3,"[Over18_Y, YearsAtCompany_0, MaritalStatus_Sin...",0.392405,0.202041
4,"[StandardHours_80, YearsAtCompany_0, MaritalSt...",0.392405,0.202041
...,...,...,...
595,"[StandardHours_80, JobLevel_1, MonthlyIncome_0...",0.320675,0.213605
596,"[JobLevel_1, MonthlyIncome_0, YearsAtCompany_0...",0.320675,0.213605
597,"[JobLevel_1, MonthlyIncome_0, YearsAtCompany_0...",0.320675,0.213605
598,"[StandardHours_80, JobLevel_1, MonthlyIncome_0...",0.320675,0.213605


time: 50.2 s


In [6]:
# Inspect the first 12 rows of the one-hot column MonthlyRate_0 in the
# transformed subset stored on the binner by get_best_subset_patterns.
binner.subset_df_.MonthlyRate_0[:12]

0     0
2     1
14    0
21    1
24    0
26    1
33    1
34    1
36    1
42    0
45    0
50    0
Name: MonthlyRate_0, dtype: uint8

time: 13.4 ms


In [7]:
# Same column in the one-hot encoded full dataset (first 50 rows), to compare
# against the subset values above at matching index labels.
binner.full_df_.MonthlyRate_0[:50]

0     0
1     0
2     1
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    1
18    0
19    1
20    1
21    1
22    0
23    0
24    0
25    0
26    1
27    0
28    1
29    0
30    1
31    0
32    0
33    1
34    1
35    0
36    1
37    0
38    0
39    1
40    0
41    0
42    0
43    1
44    0
45    0
46    0
47    0
48    0
49    1
Name: MonthlyRate_0, dtype: uint8

time: 6.31 ms


In [24]:
# Load the example datasets used in the remaining cells.
import seaborn as sns
iris = sns.load_dataset('iris')
titanic = sns.load_dataset('titanic')

time: 1.32 s


### Turn continuous column into discrete

In [25]:
# Bin a single column. replace_in_df defaults to True, so iris['petal_length']
# is overwritten in place with the bin labels that are also returned here.
binned_values_iris_petal_length = binner.create_bins('petal_length', iris, number_of_bins_range=(2,10), verbose=True)

For column: petal_length, optimal number of bins: 4
time: 403 ms


### Find best patterns in dataset

In [26]:
# Find frequent patterns (support >= 0.25) among the iris rows with sepal_length > 4.7.
# 'sepal_length' is excluded from the patterns; verbose=True prints the bin count per column.
subset_patterns_iris = binner.find_patterns(iris[iris['sepal_length'] > 4.7], columns_to_drop=['sepal_length'], minimal_support_rate=.25, number_of_bins_range=(2,10), verbose=True)
# Last expression of the cell: displays the patterns sorted by descending support
subset_patterns_iris

For column: sepal_width, optimal number of bins: 4
Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required by check_pairwise_arrays.

For column: petal_width, optimal number of bins: 3


Unnamed: 0,support,itemsets
4,0.47482,(sepal_width_1)
5,0.374101,(petal_width_1)
6,0.359712,(species_versicolor)
9,0.359712,(species_virginica)
8,0.345324,(petal_width_2)
14,0.345324,"(petal_width_1, species_versicolor)"
15,0.330935,"(petal_width_2, species_virginica)"
7,0.323741,(petal_length_2)
0,0.280576,(species_setosa)
1,0.280576,(petal_width_0)


time: 865 ms


### Find best patterns when comparing subset to full dataset

In [27]:
# Compare passengers with fare > 30 against the full titanic dataset. 'fare' defines
# the subset, so it is dropped from the patterns; threshold=1 keeps every pattern
# that is strictly more frequent in the subset than in the full data.
subset_key_patterns_titanic = binner.get_best_subset_patterns(full_df=titanic, subset_df=titanic[titanic['fare'] > 30], 
                                                  columns_to_drop=['fare'], minimal_support_rate=.2, threshold=1, number_of_bins_range=(2,10))
# Last expression of the cell: displays the resulting dataframe
subset_key_patterns_titanic

Unnamed: 0,pattern,subset_support,full_dataset_support
0,"[pclass_1, class_First]",0.722222,0.242424
1,[pclass_1],0.722222,0.242424
2,[class_First],0.722222,0.242424
3,"[survived_1, alive_yes, alone_False]",0.393162,0.200898
4,"[survived_1, alone_False]",0.393162,0.200898
...,...,...,...
72,"[parch_0, sex_female]",0.256410,0.217733
73,"[parch_0, adult_male_False]",0.256410,0.219978
74,"[alive_yes, sibsp_0]",0.273504,0.235690
75,"[survived_1, alive_yes, sibsp_0]",0.273504,0.235690


time: 13.4 s


In [28]:
# Compare the setosa rows against the full iris dataset. 'species' defines the
# subset, so it is dropped; a pattern is kept when full-data support * 1.45 < subset support.
subset_key_patterns = binner.get_best_subset_patterns(full_df=iris, subset_df=iris[iris['species'] == 'setosa'], 
                                                      columns_to_drop=['species'], minimal_support_rate=.25, threshold=1.45, number_of_bins_range=(2,10))
# Last expression of the cell: displays the resulting dataframe
subset_key_patterns

Unnamed: 0,pattern,subset_support,full_dataset_support
0,[petal_width_0],1.0,0.333333
1,[petal_length_0],1.0,0.333333
2,"[petal_length_0, petal_width_0]",1.0,0.333333
3,"[petal_length_0, sepal_length_0]",0.78,0.26
4,"[petal_width_0, sepal_length_0]",0.78,0.26
5,"[petal_length_0, petal_width_0, sepal_length_0]",0.78,0.26
6,[sepal_length_0],0.78,0.3
7,[sepal_width_2],0.5,0.273333


time: 1.46 s


In [29]:
# Compare the rows with sepal_length > 4.7 against the full iris dataset, with
# 'sepal_length' dropped. verbose=True prints the bin count per column and each
# pattern that passes the threshold (full-data support * 1.07 < subset support).
subset_key_patterns = binner.get_best_subset_patterns(full_df=iris, subset_df=iris[iris['sepal_length'] > 4.7], 
                                                      columns_to_drop=['sepal_length'], minimal_support_rate=.25, threshold=1.07, number_of_bins_range=(2,10), verbose=True)
# Last expression of the cell: displays the resulting dataframe
subset_key_patterns

For column: sepal_width, optimal number of bins: 4
Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required by check_pairwise_arrays.

For column: petal_width, optimal number of bins: 3
All Patterns:
frozenset({'petal_width_1'})
frozenset({'species_versicolor'})
frozenset({'species_virginica'})
frozenset({'petal_width_2'})
frozenset({'petal_width_1', 'species_versicolor'})
frozenset({'petal_width_2', 'species_virginica'})
frozenset({'petal_length_2'})


Unnamed: 0,pattern,subset_support,full_dataset_support
0,"[petal_width_2, species_virginica]",0.330935,0.306667
1,[petal_width_1],0.374101,0.346667
2,[species_versicolor],0.359712,0.333333
3,[species_virginica],0.359712,0.333333
4,[petal_width_2],0.345324,0.32
5,"[petal_width_1, species_versicolor]",0.345324,0.32
6,[petal_length_2],0.323741,0.3


time: 858 ms
