In [1]:
import ruleset as rs
from ruleset import RIPPER, ripper, base
import pickle
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score

In [3]:
def make_rs_dataset(dataset_filename, random_state=42, test_size=.33):
    """Load a CSV dataset and split it into train and test DataFrames.

    Args:
        dataset_filename: Path of the CSV file, handed to ``pd.read_csv``.
        random_state: Seed forwarded to ``train_test_split``.
        test_size: Forwarded unchanged to ``train_test_split``. Defaults to
            .33, the value that was previously hard-coded.

    Returns:
        A ``(train, test)`` tuple holding the two parts of the split.
    """
    df = pd.read_csv(dataset_filename)

    # Unpack and re-pack so callers always receive a 2-tuple.
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)
    return train, test

In [4]:
def make_ripper(train, class_feat, pos_class, k=2, random_state=42, verbosity=0,
                n_discretize_bins=5):
    """Build an ``rs.RIPPER`` classifier and fit it on ``train``.

    Args:
        train: Training data passed as the first argument to ``fit``.
        class_feat: Name of the class column, forwarded to ``fit``.
        pos_class: Value of ``class_feat`` treated as the positive class,
            forwarded to ``fit``.
        k: Forwarded to the ``RIPPER`` constructor (presumably the number of
            optimization passes -- confirm against the ruleset package).
        random_state: Seed forwarded to ``fit``.
        verbosity: Forwarded to the ``RIPPER`` constructor.
        n_discretize_bins: Forwarded to ``fit``. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        The fitted classifier object.
    """
    rip_clf = rs.RIPPER(k=k, verbosity=verbosity)
    # fit() is called for its side effect; the classifier itself is returned.
    rip_clf.fit(train, class_feat=class_feat, pos_class=pos_class,
                n_discretize_bins=n_discretize_bins, random_state=random_state)
    return rip_clf

In [2]:
# Directory holding the dataset files. Keep the trailing slash: file paths are
# built below by plain string concatenation (datasets_path + dataset).
datasets_path = '../datasets/'
# Seed value kept alongside the other notebook-level settings.
random_state = 42

In [6]:
dataset = 'mushroom.csv'
filename = datasets_path + dataset
class_feat = 'Poisonous/Edible'
pos_class = 'p'

# The pickled results sit next to the CSV, with the same stem and a .pkl suffix.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# produced by a trusted run of this project.
# The context manager closes the handle, which a bare open() call would leak.
with open(filename.replace('.csv','.pkl'), "rb") as results_file:
    result = pickle.load(results_file)
model = result['ripper_models'][0]

In [7]:
train, test = make_rs_dataset(filename, random_state=42)

# Reuse the cached classifier when it exists; otherwise train it once and
# cache it, so the notebook still runs when temp_rip.pkl is absent.
# NOTE(security): pickle.load can execute arbitrary code -- only load a cache
# file written by this notebook.
try:
    with open('temp_rip.pkl','rb') as cache_file:
        model = pickle.load(cache_file)
except FileNotFoundError:
    model = make_ripper(train, class_feat, pos_class, k=0, random_state=42, verbosity=0)
    with open('temp_rip.pkl','wb') as cache_file:
        pickle.dump(model, cache_file)

In [8]:
# Pull the two attributes the stats calls below need off the loaded model.
possible_conds = model.possible_conds
ruleset = model.ruleset_

# Split the training frame on class_feat / pos_class
# (presumably into positive and negative examples -- confirm in ruleset.base).
pos_df, neg_df = base.pos_neg_split(train, class_feat, pos_class)

# Empty stats object; a later stats.update(...) call populates it.
stats = ripper.RulesetStats()

In [10]:
# Update the stats object from the loaded ruleset. With verbosity=5 the call
# prints the per-rule "theory bits" / "exceptions_bits" trace captured below.
stats.update(ruleset, possible_conds, pos_df, neg_df, verbosity=5)

updating stats from index 0
rule theory bits| [Stalk-surface-above-ring=k^Gill-spacing=c] k 2 n 119 pr 0.02: 7.8 bits
ruleset theory bits| 7.8
exceptions_bits| [Stalk-surface-above-ring=k^Gill-spacing=c]: 
 N 5443 p 1505 fp 0 fn 1108: exceptions_bits 3370
total ruleset bits | 3378
rule theory bits| [Stalk-surface-above-ring=k^Gill-spacing=c] k 2 n 119 pr 0.02: 7.8 bits
rule theory bits| [Gill-size=n^Spore-print-color=w^Gill-spacing=c] k 3 n 119 pr 0.03: 10.9 bits
ruleset theory bits| 18.7
exceptions_bits| [Stalk-surface-above-ring=k^Gill-spacing=c] V [Gill-size=n^Spore-print-color=w^Gill-spacing=c]...: 
 N 5443 p 2073 fp 0 fn 540: exceptions_bits 2134
total ruleset bits | 2153
rule theory bits| [Stalk-surface-above-ring=k^Gill-spacing=c] k 2 n 119 pr 0.02: 7.8 bits
rule theory bits| [Gill-size=n^Spore-print-color=w^Gill-spacing=c] k 3 n 119 pr 0.03: 10.9 bits
rule theory bits| [Gill-size=n^Population=s] k 2 n 119 pr 0.02: 7.8 bits
ruleset theory bits| 26.5
exceptions_bits| [Stalk-surfa

exceptions_bits| [Stalk-surface-above-ring=k^Gill-spacing=c] V [Gill-size=n^Spore-print-color=w^Gill-spacing=c]...: 
 N 5443 p 2599 fp 0 fn 14: exceptions_bits 124
total ruleset bits | 250
rule theory bits| [Stalk-surface-above-ring=k^Gill-spacing=c] k 2 n 119 pr 0.02: 7.8 bits
rule theory bits| [Gill-size=n^Spore-print-color=w^Gill-spacing=c] k 3 n 119 pr 0.03: 10.9 bits
rule theory bits| [Gill-size=n^Population=s] k 2 n 119 pr 0.02: 7.8 bits
rule theory bits| [Spore-print-color=h^Cap-surface=s] k 2 n 119 pr 0.02: 7.8 bits
rule theory bits| [Gill-size=n^Cap-surface=s^Stalk-shape=e] k 3 n 119 pr 0.03: 10.9 bits
rule theory bits| [Gill-size=n^Habitat=g] k 2 n 119 pr 0.02: 7.8 bits
rule theory bits| [Gill-size=n^Spore-print-color=k^Stalk-root=b] k 3 n 119 pr 0.03: 10.9 bits
rule theory bits| [Population=v^Stalk-shape=e^Bruises?=t^Cap-shape=b] k 4 n 119 pr 0.03: 13.6 bits
rule theory bits| [Gill-size=n^Cap-surface=y^Bruises?=t] k 3 n 119 pr 0.03: 10.9 bits
rule theory bits| [Gill-size=n^S

In [11]:
# Pair every subset_dls entry with its position for easier reading.
list(enumerate(stats.subset_dls))

[(0, 3377.7235010761033),
 (1, 2152.5167396808397),
 (2, 1778.1709835519496),
 (3, 1142.696786306302),
 (4, 827.9312750686732),
 (5, 734.9581516407509),
 (6, 663.9150892363692),
 (7, 538.3074896667496),
 (8, 412.7397059699782),
 (9, 377.6444895630475),
 (10, 294.9152417352419),
 (11, 250.31921401773693),
 (12, 213.42243213548036),
 (13, 166.730637899361),
 (14, 152.62125713656573)]

In [12]:
# Scratch experiment: overwrite entries 1 and 4 with hand-picked values.
# This mutates stats.subset_dls in place.
stats.subset_dls[1], stats.subset_dls[4] = 4000, 3000

In [13]:
#temprs = base.Ruleset(ruleset.rules[:9]+[ruleset.rules[4]]+ruleset.rules[10:]+ruleset.rules[1:3])
#stats.update(temprs, possible_conds, pos_df, neg_df, bestsubset_dl=False, ret_bestsubset=False, verbosity=5)

In [14]:
# Same (index, value) listing as before, now showing the overwritten entries.
list(enumerate(stats.subset_dls))

[(0, 3377.7235010761033),
 (1, 4000),
 (2, 1778.1709835519496),
 (3, 1142.696786306302),
 (4, 3000),
 (5, 734.9581516407509),
 (6, 663.9150892363692),
 (7, 538.3074896667496),
 (8, 412.7397059699782),
 (9, 377.6444895630475),
 (10, 294.9152417352419),
 (11, 250.31921401773693),
 (12, 213.42243213548036),
 (13, 166.730637899361),
 (14, 152.62125713656573)]

In [17]:
#x=stats.dl_pruned_ruleset(possible_conds, pos_df, neg_df)

In [18]:
# Display the ruleset currently held by the stats object.
stats.ruleset

<Ruleset object: [Stalk-surface-above-ring=k^Gill-spacing=c] V [Gill-size=n^Spore-print-color=w^Gill-spacing=c] V [Gill-size=n^Population=s] V [Spore-print-color=h^Cap-surface=s] V [Gill-size=n^Cap-surface=s^Stalk-shape=e] V [Gill-size=n^Habitat=g] V [Gill-size=n^Spore-print-color=k^Stalk-root=b] V [Population=v^Stalk-shape=e^Bruises?=t^Cap-shape=b] V [Gill-size=n^Cap-surface=y^Bruises?=t] V [Gill-size=n^Stalk-root=c] V [Population=v^Stalk-shape=e^Stalk-root=b^Stalk-color-below-ring=w^Cap-surface=f] V [Ring-number=t^Population=v^Cap-shape=f^Gill-color=g] V [Ring-number=t^Population=v^Habitat=g] V [Habitat=m^Cap-shape=f] V [Habitat=l^Cap-color=w]>

In [19]:
# Display the raw subset_dls list (entries 1 and 4 were overwritten by hand earlier).
stats.subset_dls

[3377.7235010761033,
 4000,
 1778.1709835519496,
 1142.696786306302,
 3000,
 734.9581516407509,
 663.9150892363692,
 538.3074896667496,
 412.7397059699782,
 377.6444895630475,
 294.9152417352419,
 250.31921401773693,
 213.42243213548036,
 166.730637899361,
 152.62125713656573]

In [None]:
# NOTE(review): in the visible notebook `x` is only assigned in the
# commented-out dl_pruned_ruleset cell, so this raises NameError on a fresh
# kernel -- confirm which ruleset was meant to be timed.
%timeit stats.update(x, possible_conds, pos_df, neg_df, bestsubset_dl=False, ret_bestsubset=False, verbosity=5)

In [None]:
# Time a bare read of stats.subset_dls.
%timeit stats.subset_dls

In [21]:
# Time stats.potential_dl_stats on the same conditions and pos/neg frames used above.
%timeit stats.potential_dl_stats(possible_conds, pos_df, neg_df)

3.14 s ± 63.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
# NOTE(review): scratch arithmetic (evaluates to 84); its purpose is not
# recorded in the notebook -- document what the factors mean or remove the cell.
3*2*14

84

In [25]:
# Time the module-private helper ripper._rs_total_bits on the stats object's
# ruleset, with bestsubset_dl=True and output silenced (verbosity=0).
%timeit ripper._rs_total_bits(stats.ruleset, possible_conds, pos_df, neg_df, bestsubset_dl=True, verbosity=0)


2.35 s ± 41.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
# Preview the first rows of the raw CSV straight from disk.
dataset = 'mushroom.csv'
filename = f'{datasets_path}{dataset}'
pd.read_csv(filename).head()


Unnamed: 0,Poisonous/Edible,Cap-shape,Cap-surface,Cap-color,Bruises?,Odor,Gill-attachment,Gill-spacing,Gill-size,Gill-color,...,Stalk-surface-below-ring,Stalk-color-above-ring,Stalk-color-below-ring,Veil-type,Veil-color,Ring-number,Ring-type,Spore-print-color,Population,Habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
