    Apriori Algorithm

Association rule mining

Support and confidence calculations

Frequent itemset generation

Rule generation

In [None]:
def apriori(transactions, min_support=0.5, min_conf=0.7):
    """Mine frequent itemsets and association rules with a level-wise search.

    Args:
        transactions: Iterable of transactions; each transaction is an
            iterable of hashable items.
        min_support: Minimum fraction of transactions that must contain an
            itemset for it to be kept as frequent.
        min_conf: Minimum confidence, computed as
            ``support(itemset) / support(left)``, for a rule to be kept.

    Returns:
        A ``(frequent, rules)`` tuple. ``frequent`` is a list of levels where
        ``frequent[k]`` holds the frequent itemsets of size ``k + 1`` as
        frozensets, each listed once. ``rules`` is a list of
        ``(left, right)`` frozenset pairs meaning ``left -> right``.
    """
    from itertools import combinations

    # Build each transaction's set once instead of on every subset test.
    transaction_sets = [frozenset(t) for t in transactions]
    total = len(transaction_sets)
    items = set(item for t in transaction_sets for item in t)
    supports = {}
    frequent = []
    current = [frozenset([item]) for item in items]

    while current:
        next_frequent = []
        for itemset in current:
            count = sum(1 for t in transaction_sets if itemset.issubset(t))
            support = count / total
            if support >= min_support:
                supports[itemset] = support
                next_frequent.append(itemset)
        if not next_frequent:
            break
        frequent.append(next_frequent)

        # Join pairs that differ by exactly one item. Several pairs can
        # produce the same union (e.g. {a,b}|{a,c} and {a,b}|{b,c}), so track
        # what was already emitted; otherwise the itemset and all of its
        # rules would be reported more than once.
        seen = set()
        current = []
        for a, b in combinations(next_frequent, 2):
            candidate = a | b
            if len(candidate) == len(a) + 1 and candidate not in seen:
                seen.add(candidate)
                current.append(candidate)

    rules = []
    for group in frequent[1:]:
        for itemset in group:
            for size in range(1, len(itemset)):
                for left in combinations(itemset, size):
                    left = frozenset(left)
                    left_support = supports[left]
                    # A zero-support antecedent (only possible when
                    # min_support <= 0) has undefined confidence; skip it
                    # rather than divide by zero.
                    if left_support == 0:
                        continue
                    if supports[itemset] / left_support >= min_conf:
                        rules.append((left, itemset - left))
    return frequent, rules

    Classification Models

Precision, recall, F1-score calculations

Model comparison (Random Forest vs SVM)

Threshold analysis

In [None]:
def calculate_metrics(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for binary 0/1 labels.

    Labels are compared position by position, so any equally long sequences
    work (lists, tuples, 1-D arrays, Series). Pairs in which either label is
    neither 0 nor 1 are not counted in any confusion-matrix cell.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        Dict with keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``. A metric whose denominator is zero is reported as 0.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    if len(y_true) != len(y_pred):
        raise ValueError(
            "y_true and y_pred must have the same length "
            "({} != {})".format(len(y_true), len(y_pred))
        )

    # Single pass over the label pairs to fill the confusion matrix.
    tp = fp = fn = tn = 0
    for actual, predicted in zip(y_true, y_pred):
        if predicted == 1:
            if actual == 1:
                tp += 1
            elif actual == 0:
                fp += 1
        elif predicted == 0:
            if actual == 1:
                fn += 1
            elif actual == 0:
                tn += 1

    # Guard accuracy the same way as the other ratios so empty input
    # yields 0 instead of raising ZeroDivisionError.
    counted = tp + tn + fp + fn
    accuracy = (tp + tn) / counted if counted > 0 else 0

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

    Closed and Maximal Patterns 

Closed frequent itemsets

Maximal frequent itemsets

Vertical database format

In [None]:
# Closed frequent itemsets: keep a row unless some *other* row's itemset
# contains it and has exactly the same support.
# NOTE(review): `frequent_itemsets` is defined outside this cell; it is read
# here as a DataFrame with 'itemsets' and 'support' columns.
closed_patterns = []
for _, candidate in frequent_itemsets.iterrows():
    candidate_items = set(candidate['itemsets'])
    for _, other in frequent_itemsets.iterrows():
        if (candidate_items.issubset(set(other['itemsets']))
                and candidate['support'] == other['support']
                and candidate['itemsets'] != other['itemsets']):
            # An equally frequent superset exists, so the candidate is not closed.
            break
    else:
        # The inner loop finished without finding such a superset.
        closed_patterns.append(candidate)

# Maximal frequent itemsets: keep a row only when no other frequent itemset
# is a proper superset of it.
# Support is deliberately NOT compared. A superset can never be more frequent
# than its subset, so an extra `row support <= superset support` test is only
# satisfied when the supports are equal, which selects closed itemsets rather
# than maximal ones.
# NOTE(review): `fp_frequent_itemsets` is defined outside this cell; it is
# read here as a DataFrame with an 'itemsets' column.
maximal_patterns = []
for _, row in fp_frequent_itemsets.iterrows():
    # Convert to set so the proper-subset test is valid even when the cell
    # holds a list or tuple (where `<` would compare lexicographically).
    row_items = set(row['itemsets'])
    is_maximal = not any(
        row_items < set(superset['itemsets'])
        for _, superset in fp_frequent_itemsets.iterrows()
    )

    if is_maximal:
        maximal_patterns.append(row)

    Decision Tree

Entropy calculation

Information gain

Bootstrap sampling

In [None]:
def entropy(labels):
    """Return the Shannon entropy (base 2) of a list of class labels.

    Args:
        labels: List of hashable labels; must support ``len`` and ``count``.

    Returns:
        The entropy in bits; 0 for an empty list.
    """
    from math import log2

    n = len(labels)
    # Relative frequency of every distinct label.
    proportions = [labels.count(cls) / n for cls in set(labels)]
    return -sum(p * log2(p) for p in proportions)

def info_gain(df, attr, target):
    """Return the information gain of splitting ``df`` on column ``attr``.

    Args:
        df: DataFrame holding both the attribute and the target column.
        attr: Name of the column to split on.
        target: Name of the class-label column.

    Returns:
        Entropy of ``target`` over all rows minus the size-weighted entropy
        of ``target`` within each distinct value of ``attr``.
    """
    base_entropy = entropy(df[target].tolist())
    n_rows = len(df)

    remainder = 0
    for level in df[attr].unique():
        # Target labels of the rows that take this attribute value.
        branch_labels = df.loc[df[attr] == level, target].tolist()
        remainder += len(branch_labels) / n_rows * entropy(branch_labels)

    return base_entropy - remainder

    FP-Growth

FP-tree construction

Header table

Conditional pattern bases

In [None]:
# FP-tree construction
# The tree is a nested dict: fp_tree['root'] maps item -> node, and every
# node is {'count': <int>, 'children': {item -> node}}.
# header_table maps each item to the list of tree nodes created for it.
# NOTE(review): `sorted_transactions` comes from outside this cell;
# presumably each transaction is already filtered and ordered — confirm.
fp_tree = {'root': {}}
header_table = defaultdict(list)

for txn in sorted_transactions:
    # Walk down from the root, one level per item of the transaction.
    children = fp_tree['root']
    for item in txn:
        node = children.get(item)
        if node is None:
            # First time this prefix is seen: create the node and register
            # it under its item in the header table.
            node = {'count': 1, 'children': {}}
            children[item] = node
            header_table[item].append(node)
        else:
            # Shared prefix: bump the existing node's count.
            node['count'] += 1
        children = node['children']

    K-Means Clustering

Centroid initialization

Cluster assignment

Centroid update

Silhouette score

In [None]:
def kmeans(X, k, max_iter=10):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's algorithm.

    Initial centroids are ``k`` distinct rows of ``X`` drawn with
    ``np.random.choice``; seed NumPy's global RNG for reproducible results.

    Args:
        X: 2-D array-like of shape ``(n_samples, n_features)``.
        k: Number of clusters; must not exceed the number of rows.
        max_iter: Maximum number of centroid updates.

    Returns:
        ``(labels, centroids)``: ``labels`` is an integer array with the
        index of the nearest returned centroid for every row, and
        ``centroids`` is always an ndarray of shape ``(k, n_features)``.

    Raises:
        ValueError: Raised by ``np.random.choice`` when ``k`` is larger than
            the number of rows.
    """
    X = np.asarray(X)
    centroids = X[np.random.choice(len(X), k, replace=False)]

    def assign(centers):
        # Euclidean distance from every point to every centre -> (n, k).
        distances = np.sqrt(((X[:, np.newaxis] - centers) ** 2).sum(axis=2))
        return np.argmin(distances, axis=1)

    # Assign up front so labels exist even when max_iter is 0.
    labels = assign(centroids)

    for _ in range(max_iter):
        # Update step
        new_centroids = []
        for j in range(k):
            members = labels == j
            if np.any(members):
                new_centroids.append(X[members].mean(axis=0))
            else:
                # Empty cluster: re-seed it with a random data point.
                new_centroids.append(X[np.random.choice(len(X))])
        # Keep centroids an ndarray so the return type does not depend on
        # how many iterations ran.
        new_centroids = np.array(new_centroids)

        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids

        # Re-assign so the returned labels always match the returned
        # centroids, including when max_iter is exhausted.
        labels = assign(centroids)

    return labels, centroids

    Naive Bayes

Categorical vs continuous features

Prior probability calculation

Likelihood estimation

Posterior probability calculation

In [None]:
# For categorical data
# For every column except the last, store the relative frequency of each
# value among the rows of class 0 and class 1 as
# feature_probs[column][value][class].
# NOTE(review): `train_data` and `feature_probs` come from outside this cell;
# the nested assignment assumes feature_probs auto-creates inner mappings
# (e.g. nested defaultdicts) — confirm.
for col in train_data.columns[:-1]:
    for label in [0, 1]:
        class_rows = train_data[train_data['Class'] == label]
        relative_freq = class_rows[col].value_counts(normalize=True)
        for val, prob in relative_freq.to_dict().items():
            feature_probs[col][val][label] = prob

# For continuous data (Gaussian Naive Bayes)
# For every column except the last, store the (mean, standard deviation) of
# that column within each class as mean_std[column][class].
# NOTE(review): `train_data`, `mean_std` and `class_counts` come from outside
# this cell; class_counts.index is presumably the set of class labels — confirm.
for col in train_data.columns[:-1]:
    per_class = mean_std[col] = {}
    for label in class_counts.index:
        in_class = train_data['Class'] == label
        values = train_data.loc[in_class, col]
        per_class[label] = (values.mean(), values.std())

    OLAP Operations

Slice

Dice

Roll-up

Drill-down

In [None]:
def slice(dim, value):
    """Return the sub-cube of ``data`` that fixes one dimension to one value.

    Reads the module-level cube ``data`` (axes: location, time, item) and the
    label lists ``locations``, ``times`` and ``items``.

    Args:
        dim: Dimension to fix: ``"location"``, ``"time"`` or ``"item"``.
        value: Label to select along that dimension.

    Returns:
        The 2-D array obtained by indexing ``data`` at ``value`` on ``dim``.

    Raises:
        ValueError: If ``dim`` is not a known dimension, or ``value`` is not
            in that dimension's label list.
    """
    if dim == "location":
        return data[locations.index(value), :, :]
    if dim == "time":
        return data[:, times.index(value), :]
    if dim == "item":
        return data[:, :, items.index(value)]
    # Fail loudly instead of silently returning None for a mistyped dimension.
    raise ValueError(
        "unknown dimension {!r}; expected 'location', 'time' or 'item'".format(dim)
    )

def dice(locs, times_, items_):
    """Return the sub-cube of ``data`` restricted to the given labels.

    Args:
        locs: Location labels to keep (looked up in ``locations``).
        times_: Time labels to keep (looked up in ``times``).
        items_: Item labels to keep (looked up in ``items``).

    Returns:
        A 3-D array whose axes follow the order of the requested labels.
    """
    # Translate labels to positions, then build an open mesh so every
    # combination of the selected positions is picked.
    selector = np.ix_(
        list(map(locations.index, locs)),
        list(map(times.index, times_)),
        list(map(items.index, items_)),
    )
    return data[selector]

def rollup(dim):
    """Aggregate the module-level cube ``data`` by summing out one dimension.

    Args:
        dim: Dimension to sum over: ``"location"`` (axis 0), ``"time"``
            (axis 1) or ``"item"`` (axis 2).

    Returns:
        The 2-D array of sums over the chosen axis.

    Raises:
        ValueError: If ``dim`` is not a known dimension.
    """
    if dim == "location":
        return np.sum(data, axis=0)
    if dim == "time":
        return np.sum(data, axis=1)
    if dim == "item":
        return np.sum(data, axis=2)
    # Fail loudly instead of silently returning None for a mistyped dimension.
    raise ValueError(
        "unknown dimension {!r}; expected 'location', 'time' or 'item'".format(dim)
    )