## 10. (coding) Discretization

#### 1) Implement equal width discretization

In [5]:
import numpy as np

def equal_width(data, num_bins):
    """Return the edges of ``num_bins`` equal-width intervals covering ``data``.

    Parameters
    ----------
    data : array-like of numbers
        Values whose range is to be partitioned.
    num_bins : int
        Number of intervals to create.

    Returns
    -------
    numpy.ndarray
        ``num_bins + 1`` evenly spaced edges; the first is the minimum of
        ``data`` and the last is its maximum.
    """
    lowest, highest = np.min(data), np.max(data)
    # n intervals are delimited by n + 1 edges
    edge_count = num_bins + 1
    return np.linspace(lowest, highest, edge_count)

# Continuous data
data = np.array([1.5, 2.3, 4.7, 3.6, 5.8, 7.9, 8.2, 9.4])

# Define the number of bins
num_bins = 3

# Compute intervals using the equal width function
bins = equal_width(data, num_bins)

# Using bins, discretize the data by assigning it to bins
disc_data = np.digitize(data, bins, right=True)

# Display the results
print("Original Data: ", data)
print("Computed Bins: ", bins)
print("Discretized Data: ", disc_data)

Original Data:  [1.5 2.3 4.7 3.6 5.8 7.9 8.2 9.4]
Computed Bins:  [1.5        4.13333333 6.76666667 9.4       ]
Discretized Data:  [0 1 2 1 2 3 3 3]


#### 2) Entropy-based discretization: The following program finds the best split point in a numeric data array.

#### Calculate Entropy

In [16]:
def calculate_entropy(y):
    """Return the Shannon entropy (in bits) of the labels in ``y``.

    Each distinct label contributes ``-p * log2(p)``, where ``p`` is the
    fraction of entries of ``y`` carrying that label.

    Parameters
    ----------
    y : array-like
        Class labels.

    Returns
    -------
    numpy.float64
        The entropy; zero when ``y`` is empty or holds a single class.
    """
    _, label_counts = np.unique(y, return_counts=True)
    sample_size = len(y)

    weighted_logs = []
    for count in label_counts:
        share = count / sample_size
        if share > 0:
            weighted_logs.append(share * np.log2(share))

    return -np.sum(weighted_logs)

#### Calculate Information Gain

In [17]:
def information_gain(y, y_left, y_right):
    """Return the entropy reduction obtained by splitting ``y`` in two.

    Parameters
    ----------
    y : sequence
        Labels before the split.
    y_left, y_right : sequence
        Labels falling on either side of the split.

    Returns
    -------
    float
        ``entropy(y)`` minus the size-weighted average of the entropies of
        ``y_left`` and ``y_right``.
    """
    parent_size = len(y)

    # Each side counts in proportion to the share of samples it receives
    left_part = (len(y_left) / parent_size) * calculate_entropy(y_left)
    right_part = (len(y_right) / parent_size) * calculate_entropy(y_right)

    return calculate_entropy(y) - (left_part + right_part)

#### Find the Best Split Point

In [18]:
def best_split(X, y):
    """Return the value of ``X`` whose ``<=`` / ``>`` split has the highest information gain.

    Every distinct value of ``X`` is tried as a threshold, in ascending
    order. A threshold replaces the current best only when its gain is
    strictly larger, so among equally good thresholds the smallest wins.

    Parameters
    ----------
    X : numpy.ndarray
        Numeric feature values.
    y : numpy.ndarray
        Class labels aligned with ``X``.

    Returns
    -------
    The chosen threshold, or ``None`` when no threshold produced a gain
    above the initial sentinel of -1 (e.g. ``X`` is empty).
    """
    winner = None
    winning_gain = -1

    for threshold in np.unique(X):
        gain = information_gain(y, y[X <= threshold], y[X > threshold])
        if gain > winning_gain:
            winning_gain, winner = gain, threshold

    return winner

#### Example Usage and Explanation

In [19]:
# Toy data: six ascending feature values and their binary class labels
X = np.array([1, 2, 3, 4, 5, 6])
y = np.array([0, 0, 1, 0, 1, 1])

# Threshold t whose split (X <= t versus X > t) has the highest information
# gain. For this data t = 2 and t = 4 tie; best_split keeps the first one.
split_point = best_split(X, y)

print(f"Best Split Point: {split_point}")

Best Split Point: 2


## 11. (coding) Apriori

#### Transaction Database and the Minimum Support

In [32]:
# Transaction database: one list of purchased items per transaction
trans_db = [
    ['milk', 'bread', 'biscuit'],
    ['bread', 'milk', 'biscuit', 'cornflakes'],
    ['bread', 'tea', 'bournvita'],
    ['jam', 'maggi', 'bread', 'milk'],
    ['maggi', 'tea', 'biscuit'],
    ['bread', 'tea', 'bournvita'],
    ['maggi', 'tea', 'cornflakes'],
    ['maggi', 'bread', 'tea', 'biscuit'],
    ['jam', 'maggi', 'bread', 'tea'],
    ['bread', 'milk'],
    ['coffee', 'cock', 'biscuit', 'cornflakes'],
    ['coffee', 'cock', 'biscuit', 'cornflakes'],
    ['coffee', 'sugar', 'bournvita'],
    ['bread', 'coffee', 'cock'],
    ['bread', 'sugar', 'biscuit'],
    ['coffee', 'sugar', 'cornflakes'],
    ['bread', 'sugar', 'bournvita'],
    ['bread', 'coffee', 'sugar'],
    ['bread', 'coffee', 'sugar'],
    ['tea', 'milk', 'coffee', 'cornflakes'],
]

# Minimum support, expressed as an absolute number of transactions
min_support = 3
# Itemsets already found to fall below min_support
# (generate_k_1_itemsets appends to this list)
infreq_itemsets = []


def compute_support(itemset):
    """Count the transactions in ``trans_db`` that contain every item of ``itemset``."""
    wanted = set(itemset)
    return sum(1 for basket in trans_db if wanted.issubset(basket))

#### Generate k+1 Frequent Itemsets 

In [33]:
# Generate (k+1) itemsets from k itemsets
def generate_k_1_itemsets(k_itemsets):
    """Join k-itemsets pairwise and return the frequent (k+1)-itemsets.

    Every pair of itemsets in ``k_itemsets`` is merged; a merge is a
    candidate when it has exactly one item more than the first entry of
    ``k_itemsets``. Each distinct candidate is evaluated once, however many
    pairs produce it, and candidates are compared as sets, so the order of
    items inside an itemset never affects the result.

    Reads the notebook-level ``min_support`` and ``compute_support`` and
    appends every newly rejected candidate to the notebook-level
    ``infreq_itemsets`` list.

    Parameters
    ----------
    k_itemsets : list of list
        Itemsets of equal length k; items must be hashable.

    Returns
    -------
    list of list
        The candidates whose support is at least ``min_support``, without
        duplicates. Items keep the order in which they first appear in the
        two merged itemsets.
    """
    if not k_itemsets:
        return []

    target_size = len(k_itemsets[0]) + 1
    # Order-independent view of the itemsets already known to be infrequent
    known_infrequent = {frozenset(itemset) for itemset in infreq_itemsets}
    # Candidates handled during this call; several pairs can yield the same
    # candidate (e.g. {a,b}+{a,c} and {a,b}+{b,c} both give {a,b,c})
    evaluated = set()
    k_1_itemsets = []

    for position, first in enumerate(k_itemsets):
        for second in k_itemsets[position + 1:]:
            # Ordered, duplicate-free union of the two itemsets
            new_itemset = list(dict.fromkeys([*first, *second]))
            if len(new_itemset) != target_size:
                continue

            candidate = frozenset(new_itemset)
            if candidate in evaluated or candidate in known_infrequent:
                continue
            evaluated.add(candidate)

            if compute_support(new_itemset) >= min_support:
                k_1_itemsets.append(new_itemset)
            else:
                infreq_itemsets.append(new_itemset)  # Mark as infrequent if support < min_support

    return k_1_itemsets

#### Generate Frequent 1-itemsets

In [34]:
# Every distinct item that occurs in at least one transaction
all_list_items = list(set(product for basket in trans_db for product in basket))

# Frequent 1-itemsets: single items contained in at least min_support transactions
itemsets_1 = []
for item in all_list_items:
    if compute_support([item]) < min_support:
        continue  # too rare to be frequent
    itemsets_1.append([item])

print("Frequent 1-itemsets:", itemsets_1)

Frequent 1-itemsets: [['sugar'], ['bournvita'], ['tea'], ['cock'], ['bread'], ['maggi'], ['milk'], ['biscuit'], ['coffee'], ['cornflakes']]


#### Generate Frequent 2-itemsets and 3-itemsets

In [35]:
# Grow the frequent itemsets level by level: each level is built from the
# frequent itemsets of the level below.
itemsets_2 = generate_k_1_itemsets(itemsets_1)
itemsets_3 = generate_k_1_itemsets(itemsets_2)

print("Frequent 2-itemsets:", itemsets_2)
print("Frequent 3-itemsets:", itemsets_3)

Frequent 2-itemsets: [['sugar', 'bread'], ['sugar', 'coffee'], ['bournvita', 'bread'], ['tea', 'bread'], ['maggi', 'tea'], ['coffee', 'cock'], ['maggi', 'bread'], ['milk', 'bread'], ['bread', 'biscuit'], ['coffee', 'bread'], ['cornflakes', 'biscuit'], ['coffee', 'cornflakes']]
Frequent 3-itemsets: []


## 12. Discretization using sklearn

In [30]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer
import pandas as pd

# Load the iris data: feature matrix, target vector and column names
dataset = load_iris()
X, y = dataset.data, dataset.target
feature_names = dataset.feature_names

df = pd.DataFrame(X, columns=feature_names)

# Equal-width binning: strategy='uniform', 3 bins per feature, encoded as ordinal codes
equal_width_discretizer = KBinsDiscretizer(strategy='uniform', n_bins=3, encode='ordinal')
X_equal_width = equal_width_discretizer.fit_transform(X)
df_equal_width = pd.DataFrame(X_equal_width, columns=feature_names)

# Equal-frequency binning: same settings, but strategy='quantile'
equal_freq_discretizer = KBinsDiscretizer(strategy='quantile', n_bins=3, encode='ordinal')
X_equal_freq = equal_freq_discretizer.fit_transform(X)
df_equal_freq = pd.DataFrame(X_equal_freq, columns=feature_names)

# Show the first rows of the original and of both discretized tables
for caption, frame in (
    ("Original Data:\n", df),
    ("Equal-width discretized Data:\n", df_equal_width),
    ("Equal-frequency discretized Data:\n", df_equal_freq),
):
    print(caption, frame.head(), "\n")

Original Data:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2 

Equal-width discretized Data:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                0.0               1.0                0.0               0.0
1                0.0               1.0                0.0               0.0
2                0.0               1.0                0.0               0.0
3                0.0               1.0                0.0               0.0
4                0.0               1.0                0.0               0.0 

Equal-frequency discretized Data:
   

## 13. Apriori using library

In [36]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Same transaction database as in the hand-written Apriori section
trans_db = [
    ['milk', 'bread', 'biscuit'],
    ['bread', 'milk', 'biscuit', 'cornflakes'],
    ['bread', 'tea', 'bournvita'],
    ['jam', 'maggi', 'bread', 'milk'],
    ['maggi', 'tea', 'biscuit'],
    ['bread', 'tea', 'bournvita'],
    ['maggi', 'tea', 'cornflakes'],
    ['maggi', 'bread', 'tea', 'biscuit'],
    ['jam', 'maggi', 'bread', 'tea'],
    ['bread', 'milk'],
    ['coffee', 'cock', 'biscuit', 'cornflakes'],
    ['coffee', 'cock', 'biscuit', 'cornflakes'],
    ['coffee', 'sugar', 'bournvita'],
    ['bread', 'coffee', 'cock'],
    ['bread', 'sugar', 'biscuit'],
    ['coffee', 'sugar', 'cornflakes'],
    ['bread', 'sugar', 'bournvita'],
    ['bread', 'coffee', 'sugar'],
    ['bread', 'coffee', 'sugar'],
    ['tea', 'milk', 'coffee', 'cornflakes']
]

# Encode the transactions as a table with one column per item
te = TransactionEncoder()
te.fit(trans_db)
trans_array = te.transform(trans_db)
trans_df = pd.DataFrame(trans_array, columns=te.columns_)

# The library call below is given the support as a fraction of all
# transactions: 3 of the 20 transactions.
# NOTE(review): this rebinds the notebook-level `min_support`, which the
# hand-written Apriori cells use as an absolute count (3). Re-running those
# cells after this one would compare their counts against this fraction.
min_support = 3 / len(trans_db)
frequent_itemsets = apriori(trans_df, min_support=min_support, use_colnames=True)

# Keep only the rules whose confidence is at least 0.7
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

print("Frequent Itemsets:")
print(frequent_itemsets)

print("\nAssociation Rules:")
print(rules)

Frequent Itemsets:
    support               itemsets
0      0.35              (biscuit)
1      0.20            (bournvita)
2      0.65                (bread)
3      0.15                 (cock)
4      0.40               (coffee)
5      0.30           (cornflakes)
6      0.25                (maggi)
7      0.25                 (milk)
8      0.30                (sugar)
9      0.35                  (tea)
10     0.20       (bread, biscuit)
11     0.15  (cornflakes, biscuit)
12     0.15     (bournvita, bread)
13     0.15        (coffee, bread)
14     0.15         (maggi, bread)
15     0.20          (milk, bread)
16     0.20         (sugar, bread)
17     0.20           (tea, bread)
18     0.15         (coffee, cock)
19     0.20   (coffee, cornflakes)
20     0.20        (sugar, coffee)
21     0.20           (maggi, tea)

Association Rules:
   antecedents consequents  antecedent support  consequent support  support  \
0  (bournvita)     (bread)                0.20                0.65     0.15  