In [4]:
import numpy as np, pandas as pd

In [5]:
import fim

def process_dataset(df):
    """Convert a DataFrame into a list of "column=value" transactions.

    Each row becomes one list holding a string ``"<column>=<value>"`` for
    every column, in column order. Rows are read by position, so the frame's
    index labels are irrelevant (the index does not have to be 0..n-1).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are turned into transactions.

    Returns
    -------
    list[list[str]]
        One inner list per row of ``df``; empty when ``df`` has no rows.
    """
    n_rows, n_cols = df.shape
    # Stringify the labels once so non-string column names do not break the
    # string concatenation below.
    names = [str(col_name) for col_name in df.columns]

    # .iat is purely positional. Label lookup (df[col][i]) raises KeyError or
    # reads the wrong row whenever the index is not the default RangeIndex.
    return [
        [names[j] + "=" + str(df.iat[i, j]) for j in range(n_cols)]
        for i in range(n_rows)
    ]
    

def run_fim_apriori(df, minsup):
    """Mine itemsets from ``df`` with ``fim.apriori``.

    The frame is first turned into transactions by ``process_dataset``.
    ``minsup`` is multiplied by 100 before being passed as ``supp``
    (presumably because fim takes the support as a percentage — confirm
    against the fim documentation).

    Returns a list with one entry per mined result: the first element of each
    result, converted to a list.
    """
    transactions = process_dataset(df)
    support_value = minsup * 100

    mined = fim.apriori(transactions, supp=support_value)

    # Only the first field of each result is kept; the rest is dropped.
    return [list(entry[0]) for entry in mined]

In [8]:
# Load the data set; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/iris0.csv')

# All columns except the last go to df_raw; the last column is kept as Y.
df_raw = df.iloc[:, :-1]
Y = df.iloc[:, -1]


# Mine itemsets from df_raw with minsup = 0.35.
rules = run_fim_apriori(df_raw, 0.35)

len(rules)


5

In [9]:
# Execute the helper script inside this kernel. createrules and the other
# helpers used by later cells are presumably defined there — confirm.
%run ../IDS_smooth_local.py

# Build rule objects from the mined itemsets and the distinct values of Y.
list_of_rules = createrules(rules, list(set(Y)))

len(list_of_rules)

15

In [14]:
def prepare_caches(list_of_rules):
    """Precompute per-rule covers and pairwise overlaps.

    For every rule, stores the results of ``get_cover`` and
    ``get_correct_cover`` on the rule as ``cover`` and ``correct_cover``.
    Then fills the global ``OVERLAP_CACHE`` for every ordered pair of rules,
    keyed by the frozenset of the two rules.

    Reads the notebook globals ``df`` and ``Y``; nothing is returned.
    """
    for current in list_of_rules:
        current.cover = current.get_cover(df)
        current.correct_cover = current.get_correct_cover(df, Y)

    for left in list_of_rules:
        for right in list_of_rules:
            shared = overlap(left, right, df)
            # (left, right) and (right, left) map to the same key, so the
            # later write wins.
            OVERLAP_CACHE[frozenset((left, right))] = shared
            
# Fill the per-rule cover attributes and OVERLAP_CACHE for the current rules.
prepare_caches(list_of_rules)

In [17]:
def optimize():
    """Run smooth local search twice and return the better solution set.

    Uses the notebook globals ``list_of_rules``, ``df`` and ``Y``. Two
    searches are run that differ only in their last argument (0.33 vs -1.0);
    each result is scored with ``func_evaluation`` and the higher-scoring one
    is printed and returned. On a tie the second solution is chosen.

    Returns
    -------
    list
        The chosen solution set converted to a list, so the caller can keep
        it (for example ``result_set = optimize()``).
    """
    lambda_array = [1.0]*7     # use separate hyperparameter search routine
    s1 = smooth_local_search(list_of_rules, df, Y, lambda_array, 0.33, 0.33)
    s2 = smooth_local_search(list_of_rules, df, Y, lambda_array, 0.33, -1.0)
    f1 = func_evaluation(s1, list_of_rules, df, Y, lambda_array)
    f2 = func_evaluation(s2, list_of_rules, df, Y, lambda_array)

    # Strict '>' keeps the original tie-break: equal scores select s2.
    best = s1 if f1 > f2 else s2
    print("The Solution Set is: "+str(best))

    # Previously the selection was bound to a local and lost on return.
    return list(best)

In [19]:
import cProfile

# Profile one full optimize() call. The statement is passed as a string, so
# whatever optimize() returns is not captured by this cell.
cProfile.run("optimize()")

2/n*n OPT value is 549.9377777777778
Estimating omega for rule 0
Standard Error 112.6159002983149
Estimating omega for rule 1
Standard Error 85.4942863587971
Estimating omega for rule 2
Standard Error 102.74428451257033
Estimating omega for rule 3
Standard Error 80.94457980618591
Estimating omega for rule 4
Standard Error 66.6066438127609
Estimating omega for rule 5
Standard Error 93.35271286898951
Estimating omega for rule 6
Standard Error 93.9888823212618
Estimating omega for rule 7
Standard Error 83.97545474720575
Estimating omega for rule 8
Standard Error 62.721798443603326
Estimating omega for rule 9
Standard Error 84.06393995049244
Estimating omega for rule 10
Standard Error 85.72399897344967
Estimating omega for rule 11
Standard Error 108.42075908238238
Estimating omega for rule 12
Standard Error 97.94843541374206
Estimating omega for rule 13
Standard Error 108.74368947207924
Estimating omega for rule 14
Standard Error 99.48751177911728
2/n*n OPT value is 550.9688888888888
Estim

In [55]:
# Put the rules in an array so they can be selected with a list of positions.
np_rules = np.array(list_of_rules)
# NOTE(review): result_set must already exist in the kernel namespace; no
# visible cell assigns it at module scope — confirm where it comes from.
solution_rules = np_rules[result_set]

# print_rule() prints each rule; the displayed list only holds its return values.
list(map(lambda r: r.print_rule(), solution_rules))

If petalwidth == 0.8_to_1.75, then Iris-virginica
If petalwidth == 0.8_to_1.75, then Iris-versicolor
If petallength == 4.75_to_inf, then Iris-setosa
If sepalwidth == 2.95_to_3.35, then Iris-virginica
If sepalwidth == 2.95_to_3.35, then Iris-versicolor
If sepallength == -inf_to_5.55, then Iris-virginica


[None, None, None, None, None, None]

In [87]:
def prepare_dataset_to_tuples(df):
    """Turn each row of ``df`` (last column excluded) into a set of items.

    Every row becomes a set of ``(column_name, value)`` tuples, one tuple per
    remaining column. The sets are returned in row order.
    """
    # The last column is dropped before the rows are converted.
    feature_records = df.iloc[:, :-1].to_dict(orient="records")

    return [set(record.items()) for record in feature_records]


def predict(rule_list, instance):
    """Return the rules whose itemset is contained in ``instance``.

    Parameters
    ----------
    rule_list : iterable
        Rule objects exposing an ``itemset`` attribute that supports ``<=``
        against ``instance`` (a set of items).
    instance : set
        Items describing one data point.

    Returns
    -------
    list
        Matching rules in their original order; empty when nothing matches.
    """
    # The loop variable is used directly. The earlier version read an
    # unrelated name `r`, which either raised NameError or tested a stale
    # global rule for every iteration.
    return [rule for rule in rule_list if rule.itemset <= instance]


# Convert every row of df (last column excluded) into a set of (column, value) items.
df_items = prepare_dataset_to_tuples(df)


# The result of this call is not shown: it is neither assigned nor the cell's
# last expression.
predict(solution_rules, df_items[3])

print(df_items[1])

# print_rule() prints each rule; the displayed list only holds its return values.
list(map(lambda r: r.print_rule(), solution_rules))

{('petallength', '-inf_to_2.45'), ('sepalwidth', '2.95_to_3.35'), ('petalwidth', '-inf_to_0.8'), ('sepallength', '-inf_to_5.55')}
If petalwidth == 0.8_to_1.75, then Iris-virginica
If petalwidth == 0.8_to_1.75, then Iris-versicolor
If petallength == 4.75_to_inf, then Iris-setosa
If sepalwidth == 2.95_to_3.35, then Iris-virginica
If sepalwidth == 2.95_to_3.35, then Iris-versicolor
If sepallength == -inf_to_5.55, then Iris-virginica


[None, None, None, None, None, None]

In [145]:
def get_cover(r, df):
    """Return the index labels of the rows in ``df`` that satisfy rule ``r``.

    A row satisfies the rule when, for every item in ``r.itemset``, the column
    named by the item's first element equals the item's second element. An
    empty itemset selects every row.
    """
    remaining = df.copy()
    for item in r.itemset:
        column, wanted = item[0], item[1]
        keep = remaining[column] == wanted
        # Narrow the frame one item at a time.
        remaining = remaining[keep]

    covered_labels = remaining.index.values
    return list(covered_labels)

def get_cover_faster(r, df):
    """Return the index labels of the rows in ``df`` that satisfy rule ``r``.

    Builds a single boolean row mask, AND-ing in one column comparison per
    item of ``r.itemset``, and applies it once at the end. An empty itemset
    selects every row.
    """
    selected = np.ones(len(df), dtype=bool)

    for item in r.itemset:
        matches = df[item[0]] == item[1]
        # In-place AND keeps `selected` a plain NumPy boolean array.
        selected &= matches

    covered_labels = df.index.values[selected]
    return list(covered_labels)


# Memo table for get_cover_memoized: repr(rule) -> list of covered index labels.
cache = {}
def get_cover_memoized(r, df):
    """Return the index labels of the rows satisfying ``r``, memoized per rule.

    The result is stored in the module-level ``cache`` under ``repr(r)``. The
    key does not include ``df``, so a cached cover is returned even when a
    different frame is passed later. The cached list object itself is
    returned; callers should not mutate it.

    Parameters
    ----------
    r : rule object exposing ``itemset`` (iterable of (column, value) items)
    df : pandas.DataFrame

    Returns
    -------
    list
        Index labels of the covered rows; may be empty.
    """
    key = repr(r)
    result = cache.get(key)

    # Compare against None rather than truthiness: an empty cover ([]) is a
    # valid cached value and must count as a hit, not be recomputed each call.
    if result is not None:
        return result

    mask = np.ones(len(df), dtype=bool)
    for pattern in r.itemset:
        mask &= df[pattern[0]] == pattern[1]

    result = list(df[mask].index.values)

    cache[key] = result

    return result
    
# NOTE(review): this copy is discarded — it is neither assigned nor the
# cell's last expression.
df.copy()

#%timeit get_cover(r, df)
#%timeit get_cover_faster(r, df)


# NOTE(review): `r` is not assigned by any visible cell; it must already
# exist in the kernel namespace — confirm.
get_cover_memoized(r, df)

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 13,
 14,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 47,
 51,
 53,
 54,
 71,
 74,
 79,
 80,
 83,
 88,
 96]

In [130]:
# Scratch check: dict.get returns the stored value for a key that exists.
d = {"1": 2}

d.get("1")

2

In [120]:
# NOTE(review): the constants are presumably timings measured elsewhere — confirm.
faster_ratio = 1.03 / 1.3

# This intermediate value is not displayed (not the last expression of the cell).
faster_ratio * 1.3

# Scale 39.067 by the same ratio; this is the value the cell displays.
39.067 * faster_ratio

30.953084615384615

In [12]:
def get_correct_cover(r, df, Y):
    """Return the covered points whose label equals the rule's class label.

    Returns a tuple ``(correct_cover, cover)`` where ``cover`` is exactly what
    ``r.get_cover(df)`` returned and ``correct_cover`` is the list of those
    entries whose label in ``Y`` equals ``r.class_label``.
    """
    cover = r.get_cover(df)                      # everything the rule covers
    covered_labels = list(pd.Series(Y)[cover])   # labels of the covered points

    matching = [
        cover[position]
        for position, label in enumerate(covered_labels)
        if label == r.class_label
    ]
    return matching, cover


def get_correct_cover_faster(r, df, Y):
    """Array-based variant: covered points whose label equals the rule's class.

    Returns a tuple ``(correct_cover, cover)`` where ``cover`` is exactly what
    ``r.get_cover(df)`` returned and ``correct_cover`` is a NumPy array with
    those entries of ``cover`` whose label in ``Y`` equals ``r.class_label``.
    """
    cover = r.get_cover(df)

    # Positional lookup of the labels belonging to the covered points.
    covered_labels = np.asarray(Y)[cover]
    is_correct = covered_labels == r.class_label

    return np.array(cover)[is_correct], cover


# Memo table for get_correct_cover_memoized: repr(rule) -> (correct_cover, cover).
CORRECT_COVER_CACHE = {}

def get_correct_cover_memoized(r, df, Y):
    """Memoized variant of the array-based correct-cover computation.

    Results are stored in the module-level ``CORRECT_COVER_CACHE`` under
    ``repr(r)``; neither ``df`` nor ``Y`` is part of the key. Returns the
    tuple ``(correct_cover, cover)``: ``cover`` is what ``r.get_cover(df)``
    returned and ``correct_cover`` is a NumPy array with the entries of
    ``cover`` whose label in ``Y`` equals ``r.class_label``.
    """
    key = repr(r)
    cached = CORRECT_COVER_CACHE.get(key)
    if cached:
        return cached

    cover = r.get_cover(df)                 # everything the rule covers
    covered_labels = np.array(Y)[cover]     # labels of the covered points
    is_correct = covered_labels == r.class_label
    correct = np.array(cover)[is_correct]

    entry = (correct, cover)
    CORRECT_COVER_CACHE[key] = entry

    return entry
    
    
# Timing comparisons of the three variants, left disabled.
#%timeit get_correct_cover(r, df, Y)
#%timeit get_correct_cover_faster(r, df, Y)
#%timeit get_correct_cover_memoized(r, df, Y)

# NOTE(review): `r` is not assigned by any visible cell; it must already
# exist in the kernel namespace — confirm.
get_correct_cover_memoized(r, df, Y)

(array([96], dtype=int64),
 [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  13,
  14,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  47,
  51,
  53,
  54,
  71,
  74,
  79,
  80,
  83,
  88,
  96])

In [12]:



# NOTE(review): the recorded run of this cell stopped with NameError because
# `r` was not bound; only the second line relies on a name that visible cells define.
repr(r)
repr(list_of_rules[1])

NameError: name 'r' is not defined

In [25]:
def get_incorrect_cover(r, df, Y):
    """Return the covered points that are not in the rule's correct cover.

    Calls ``r.get_correct_cover(df, Y)``, which yields
    ``(correct_cover, full_cover)``, and returns the sorted list of distinct
    entries of ``full_cover`` that do not appear in ``correct_cover``.
    """
    correct_cover, full_cover = r.get_correct_cover(df, Y)

    wrongly_covered = set(full_cover).difference(correct_cover)
    return sorted(wrongly_covered)


def get_incorrect_cover_faster(r, df, Y):
    """Array-based variant: covered points that are not correctly covered.

    Calls ``r.get_correct_cover(df, Y)``, which yields
    ``(correct_cover, full_cover)``, and returns the entries of ``full_cover``
    that do not appear in ``correct_cover``.

    Returns
    -------
    numpy.ndarray
        The incorrectly covered entries, in the order they have in
        ``full_cover``; empty when every covered point is correct.
    """
    correct_cover, full_cover = r.get_correct_cover(df, Y)

    # Membership test instead of `full_cover == correct_cover`: the two
    # sequences generally differ in length, so an elementwise comparison
    # cannot produce a per-point mask.
    full_cover_arr = np.asarray(full_cover)
    is_incorrect = np.isin(full_cover_arr, correct_cover, invert=True)

    return full_cover_arr[is_incorrect]


# Compare the two variants for one rule.
# NOTE(review): `r` is not assigned by any visible cell; it must already
# exist in the kernel namespace — confirm.
all(get_incorrect_cover_faster(r, df, Y) == get_incorrect_cover(r, df, Y))




11.6 µs ± 77.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
20.9 µs ± 581 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [15]:
# (Re)create the global overlap cache as an empty dict.
OVERLAP_CACHE = {}

# Store overlap(r1, r2, df) for every ordered pair of rules. Keys here are the
# two reprs concatenated, whereas prepare_caches keys the same dict by
# frozenset([r1, r2]).
# NOTE(review): confirm which key format the readers of OVERLAP_CACHE expect.
for r1 in list_of_rules:
    for r2 in list_of_rules:
        OVERLAP_CACHE[repr(r1) + repr(r2)] = overlap(r1, r2, df)
        
        
len(OVERLAP_CACHE)

6561