In [1]:
import numpy as np
import pandas as pd

from scipy.sparse import dok_matrix
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from pattern_mining import VectorToTransactions, FrequentPattern

In [2]:
# Load the scikit-learn breast cancer dataset as pandas objects:
# X holds the feature columns, y the `target` labels.
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

In [3]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [4]:
# Re-attach the labels to each split so features and target sit side by side.
train_set = pd.concat([X_train, y_train], axis="columns")
test_set = pd.concat([X_test, y_test], axis="columns")

# Preview the first rows of the training frame.
train_set.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
190,14.22,23.12,94.37,609.9,0.1075,0.2413,0.1981,0.06618,0.2384,0.07542,...,37.18,106.4,762.4,0.1533,0.9327,0.8488,0.1772,0.5166,0.1446,0
134,18.45,21.91,120.2,1075.0,0.0943,0.09709,0.1153,0.06847,0.1692,0.05727,...,31.39,145.6,1590.0,0.1465,0.2275,0.3965,0.1379,0.3109,0.0761,0
386,12.21,14.09,78.78,462.0,0.08108,0.07823,0.06839,0.02534,0.1646,0.06154,...,19.29,87.65,529.9,0.1026,0.2431,0.3076,0.0914,0.2677,0.08824,1
118,15.78,22.91,105.7,782.6,0.1155,0.1752,0.2133,0.09479,0.2096,0.07331,...,30.5,130.3,1272.0,0.1855,0.4925,0.7356,0.2034,0.3274,0.1252,0
316,12.18,14.08,77.25,461.4,0.07734,0.03212,0.01123,0.005051,0.1673,0.05649,...,16.47,81.6,513.1,0.1001,0.05332,0.04116,0.01852,0.2293,0.06037,1


In [5]:
# Fit the vector-to-transaction converter on the training frame.
# NOTE(review): the frame passed to fit() still contains the `target` column,
# whereas every later transform() call receives the frame with `target`
# dropped — confirm that VectorToTransactions tolerates this column mismatch
# and that exposing the label at fit time is intended.
# The positional 5 is presumably the number of bins per feature — TODO confirm.
vec2trans = VectorToTransactions(5).fit(train_set)

In [6]:
# Convert the training rows (label column removed) into transactions.
train_set_trans = vec2trans.transform(train_set.drop(columns="target"))

In [7]:
# Mine patterns from the training transactions.
# 0.15 is presumably the minimum support threshold — TODO confirm against pattern_mining.
fp = FrequentPattern(train_set_trans, 0.15)

In [9]:
# Spot check: is the itemset stored under index label 42 of the maximal
# patterns contained in the transaction at index 0? (42 is an arbitrary pick.)
fp.get_maximal().loc[42, "itemsets"].issubset(train_set_trans[0])

False

In [18]:
class PatternFeatureGenerator:
    """Encode transactions as binary pattern-membership features.

    Column ``j`` of the transformed matrix is set to 1 for every transaction
    that contains all items of the ``j``-th fitted pattern; all other entries
    stay unset (implicitly 0).
    """

    def __init__(self):
        # Populated by ``fit``; ``None`` marks the generator as not fitted yet.
        self.patterns = None

    def fit(self, patterns):
        """Store the patterns that define the output columns.

        Parameters
        ----------
        patterns : iterable of iterables of hashable items
            One pattern (itemset) per output column, in column order. Sets and
            frozensets are stored as given; any other iterable (list, tuple,
            ...) is converted to a ``frozenset`` so the subset test in
            ``transform`` works for it as well.

        Returns
        -------
        PatternFeatureGenerator
            ``self``, so calls can be chained.
        """
        self.patterns = [
            pattern if isinstance(pattern, (set, frozenset)) else frozenset(pattern)
            for pattern in patterns
        ]
        return self

    def transform(self, data):
        """Build the transaction-by-pattern indicator matrix.

        Parameters
        ----------
        data : iterable of iterables of hashable items
            The transactions to encode, one per output row. Inputs without a
            length (e.g. generators) are materialised into a list first.

        Returns
        -------
        scipy.sparse.dok_matrix
            Matrix of shape ``(len(data), len(self.patterns))`` in which entry
            ``(i, j)`` is 1 when pattern ``j`` is a subset of transaction ``i``.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called yet.
        """
        patterns = getattr(self, "patterns", None)
        if patterns is None:
            raise AttributeError(
                "PatternFeatureGenerator is not fitted yet; call fit() before transform()."
            )

        # The matrix shape needs the row count up front, so un-sized
        # iterables are materialised once.
        if not hasattr(data, "__len__"):
            data = list(data)

        transformation = dok_matrix((len(data), len(patterns)))

        for i, transaction in enumerate(data):
            # ``set.issubset`` builds a temporary set from any non-set argument
            # on every call; converting once per row hoists that work out of
            # the inner loop over patterns.
            if isinstance(transaction, (set, frozenset)):
                items = transaction
            else:
                items = set(transaction)

            for j, pattern in enumerate(patterns):
                if pattern.issubset(items):
                    transformation[i, j] = 1

        return transformation

In [19]:
# Use the itemsets of the maximal patterns as the feature columns.
pfg = PatternFeatureGenerator().fit(fp.get_maximal().itemsets)

In [20]:
# Encode every training transaction as a row of pattern-membership indicators.
train_patterns = pfg.transform(train_set_trans)

In [21]:
# Display the sparse matrix summary (shape, dtype, number of stored entries).
train_patterns

<455x826 sparse matrix of type '<class 'numpy.float64'>'
	with 59671 stored elements in Dictionary Of Keys format>

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
# Training accuracy: the model is scored on the same rows it was fitted on,
# so this figure is optimistic and only confirms the features separate the training data.
LogisticRegression().fit(train_patterns, y_train).score(train_patterns, y_train)

1.0

In [25]:
# Held-out accuracy: push the test rows through the already-fitted vec2trans
# and pfg, then score a freshly fitted logistic regression on them.
test_set_trans = vec2trans.transform(test_set.drop(columns="target"))
test_patterns = pfg.transform(test_set_trans)
LogisticRegression().fit(train_patterns, y_train).score(test_patterns, y_test)

0.956140350877193

Good pattern features were generated in an unsupervised manner, so even a simple linear model (logistic regression) achieves good performance: about 95.6% accuracy on the held-out test set.