# Contrast Pattern Mining



In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from pattern_mining import VectorToTransactions
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns.fpgrowth import fpgrowth

In [2]:
# Load the breast-cancer dataset as pandas objects: features in `X`, labels in `y`.
X, y = load_breast_cancer(
    as_frame=True,
    return_X_y=True,
)

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [4]:
# Re-attach the label column to each split so features and target sit in one frame.
train_set = pd.concat([X_train, y_train], axis="columns")
test_set = pd.concat([X_test, y_test], axis="columns")

# Preview the first rows of the training frame.
train_set.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
190,14.22,23.12,94.37,609.9,0.1075,0.2413,0.1981,0.06618,0.2384,0.07542,...,37.18,106.4,762.4,0.1533,0.9327,0.8488,0.1772,0.5166,0.1446,0
134,18.45,21.91,120.2,1075.0,0.0943,0.09709,0.1153,0.06847,0.1692,0.05727,...,31.39,145.6,1590.0,0.1465,0.2275,0.3965,0.1379,0.3109,0.0761,0
386,12.21,14.09,78.78,462.0,0.08108,0.07823,0.06839,0.02534,0.1646,0.06154,...,19.29,87.65,529.9,0.1026,0.2431,0.3076,0.0914,0.2677,0.08824,1
118,15.78,22.91,105.7,782.6,0.1155,0.1752,0.2133,0.09479,0.2096,0.07331,...,30.5,130.3,1272.0,0.1855,0.4925,0.7356,0.2034,0.3274,0.1252,0
316,12.18,14.08,77.25,461.4,0.07734,0.03212,0.01123,0.005051,0.1673,0.05649,...,16.47,81.6,513.1,0.1001,0.05332,0.04116,0.01852,0.2293,0.06037,1


In [5]:
# Fit the project-local vector-to-transaction converter on the training frame.
# NOTE(review): the argument 5 is presumably a number of bins per feature --
# confirm against pattern_mining.VectorToTransactions.
# NOTE(review): the converter is fitted on train_set, which still contains the
# "target" column, while the frames transformed later have "target" dropped --
# confirm the converter tolerates the missing column.
vec2trans = VectorToTransactions(5).fit(train_set)

In [6]:
# Partition the training rows by class label, then drop the label column.
pos_trainset = train_set.loc[train_set["target"] == 1].drop(columns=["target"])
neg_trainset = train_set.loc[train_set["target"] == 0].drop(columns=["target"])

In [7]:
# Convert each class-specific frame with the already-fitted converter
# (positive class first, then negative).
pos_train_trans, neg_train_trans = map(
    vec2trans.transform, (pos_trainset, neg_trainset)
)

In [8]:
class FrequentPattern:
    """Frequent itemsets of a transaction dataset, mined with FP-Growth.

    Iterating over an instance yields the mined itemsets, one per row of
    ``freq_items``. Every ``for`` loop gets its own iterator, so nested or
    repeated loops over the same instance do not interfere with each other.
    """

    def __init__(self, dataset, min_supp):
        """Store ``dataset`` and mine its frequent itemsets.

        Args:
            dataset: Collection of transactions, each an iterable of items.
                It is handed to ``TransactionEncoder`` and is also scanned
                by ``get_support``, so it must support ``len()`` and
                repeated iteration.
            min_supp: Minimum support, forwarded to ``fpgrowth`` as
                ``min_support``.
        """
        self.ds = dataset
        # Cursor used only by the legacy ``next(obj)`` protocol.
        self.index = -1
        self.mine(min_supp)
        self.closed_items = None
        self.max_items = None

    def mine(self, supp):
        """Encode the dataset and (re)compute ``freq_items``.

        Args:
            supp: Minimum support, forwarded to ``fpgrowth``.
        """
        self.enc = TransactionEncoder().fit(self.ds)
        self.enc_ds = self.enc.transform(self.ds)
        self.enc_ds = pd.DataFrame(data=self.enc_ds, columns=self.enc.columns_)
        self.freq_items = fpgrowth(self.enc_ds, min_support=supp, use_colnames=True)
        # Drop any lookup table built from a previous mining run.
        self._support_by_itemset = None

    def _support_table(self):
        """Return a ``{frozenset: support}`` lookup built from ``freq_items``."""
        table = getattr(self, "_support_by_itemset", None)
        if table is None:
            table = {
                frozenset(items): float(support)
                for items, support in zip(
                    self.freq_items["itemsets"], self.freq_items["support"]
                )
            }
            self._support_by_itemset = table
        return table

    def get_support(self, pattern):
        """Return the relative support of ``pattern`` as a float.

        The value is taken from the mined itemsets when ``pattern`` is one of
        them; otherwise the dataset is scanned and the fraction of
        transactions containing every item of ``pattern`` is returned.

        Args:
            pattern: Iterable of items (typically a ``frozenset``).

        Returns:
            Support in ``[0, 1]``; ``0.0`` for an empty dataset.
        """
        pattern = frozenset(pattern)

        # ``pattern in DataFrame`` would test column labels, so membership is
        # checked against an explicit itemset -> support mapping instead.
        known = self._support_table().get(pattern)
        if known is not None:
            return known

        total = len(self.ds)
        if total == 0:
            return 0.0
        # Count first and divide once to avoid accumulating rounding error.
        hits = sum(1 for transaction in self.ds if pattern.issubset(transaction))
        return hits / total

    def __iter__(self):
        """Return a fresh iterator over the mined itemsets."""
        # Reset the legacy cursor too, so ``iter(obj)`` followed by
        # ``next(obj)`` keeps starting from the first itemset.
        self.index = -1
        return iter(self.freq_items["itemsets"].tolist())

    def __next__(self):
        """Advance the instance-level cursor and return the next itemset.

        Kept for backward compatibility with code that calls ``next(obj)``
        directly; ``for`` loops use the independent iterator returned by
        ``__iter__``.
        """
        self.index += 1
        if self.index >= len(self.freq_items):
            raise StopIteration
        # Positional access: does not rely on the frame having a RangeIndex.
        return self.freq_items["itemsets"].iloc[self.index]

In [9]:
# Mine frequent itemsets over the whole training set (label column removed),
# keeping itemsets with support of at least 0.15.
fp = FrequentPattern(
    dataset=vec2trans.transform(train_set.drop(columns="target")),
    min_supp=0.15,
)

In [10]:
# Frequent itemsets among the target == 1 transactions only.
fp_pos = FrequentPattern(dataset=pos_train_trans, min_supp=0.15)

In [11]:
# Frequent itemsets among the target == 0 transactions only.
fp_neg = FrequentPattern(dataset=neg_train_trans, min_supp=0.15)

In [13]:
def contrast_patterns(fp, min_ratio, pos_patterns, neg_patterns, α=0.001):
    """Select the patterns whose smoothed support ratio reaches ``min_ratio``.

    For every pattern yielded by ``fp`` the ratio
    ``(pos_support + α) / (neg_support + α)`` is computed, where the supports
    come from ``pos_patterns.get_support`` and ``neg_patterns.get_support``.

    Args:
        fp: Iterable of candidate patterns (they are used as dict keys).
        min_ratio: Smallest ratio a pattern needs in order to be kept.
        pos_patterns: Object exposing ``get_support(pattern)``; numerator.
        neg_patterns: Object exposing ``get_support(pattern)``; denominator.
        α: Constant added to both supports before dividing.

    Returns:
        Dict mapping each retained pattern to its ratio, in iteration order.
    """

    def smoothed_ratio(candidate):
        numerator = pos_patterns.get_support(candidate) + α
        denominator = neg_patterns.get_support(candidate) + α
        return numerator / denominator

    scored = ((candidate, smoothed_ratio(candidate)) for candidate in fp)
    return {candidate: score for candidate, score in scored if score >= min_ratio}

In [14]:
# Keep every frequent pattern whose positive/negative support ratio is at least 1.
cp = contrast_patterns(
    fp,
    min_ratio=1,
    pos_patterns=fp_pos,
    neg_patterns=fp_neg,
)

In [15]:
# Number of patterns that met the ratio threshold.
len(cp)

2167