In [1]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pandas as pd
import seaborn as sns
from sklearn.tree import _tree
from kmodes.kprototypes import KPrototypes
from kmodes.kmodes import KModes
from plotnine import *
import plotnine
import warnings
from catboost import CatBoostClassifier, Pool, CatBoost
warnings.filterwarnings('ignore', category = FutureWarning)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
def filter_rows_with_threshold(dataset, thresholds):
    filtered_dataset = dataset.copy()  # Create a copy of the dataset to modify

    type_counts = {}  # Dictionary to store total occurrences of each type

    # Identify the non-numerical features
    non_numerical_features = dataset.select_dtypes(exclude=[np.number]).columns.tolist()

    # Iterate over each row in the dataset
    for _, row in dataset.iterrows():
        # Iterate over each feature and update type counts
        for feature_name in non_numerical_features:
            feature_value = row[feature_name]

            # Check if the feature value is already counted
            if feature_name in type_counts:
                if feature_value in type_counts[feature_name]:
                    type_counts[feature_name][feature_value] += 1
                else:
                    type_counts[feature_name][feature_value] = 1
            else:
                type_counts[feature_name] = {feature_value: 1}


    # Iterate over each row again to check threshold conditions and remove rows
    for idx, row in dataset.iterrows():
        remove_row = False

        # Iterate over each feature and its corresponding threshold
        for feature_idx, threshold in enumerate(thresholds):
            feature_name = dataset.columns[feature_idx]

            # Skip numerical features
            if feature_name not in non_numerical_features:
                continue

            feature_value = row[feature_name]
            occurrence_count = type_counts[feature_name].get(feature_value, 0)
            # Check if the occurrences meet the threshold condition
            threshold_count = threshold * len(dataset) / 100
            if occurrence_count < threshold_count:
                remove_row = True
                break  # No need to check other features, mark row for removal

        # If row doesn't satisfy the threshold condition, remove it
        if remove_row:
            filtered_dataset.drop(idx, inplace=True)

    return filtered_dataset

In [3]:
X = pd.read_csv("./datasets/insurance.csv")
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.552
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.471
4,32,male,28.88,0,no,northwest,3866.855


In [4]:
filter_rows_with_threshold(X, [0, 50, 0, 0, 0, 27, 0])

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1,18,male,33.770,1,no,southeast,1725.552
2,28,male,33.000,3,no,southeast,4449.462
14,27,male,42.130,0,yes,southeast,39611.758
22,18,male,34.100,0,no,southeast,1137.011
42,41,male,21.780,1,no,southeast,6272.477
...,...,...,...,...,...,...,...
1292,21,male,23.210,0,no,southeast,1515.345
1300,45,male,30.360,0,yes,southeast,62592.873
1317,18,male,53.130,0,no,southeast,1163.463
1322,62,male,38.830,0,no,southeast,12981.346


In [28]:
from collections import defaultdict

def get_high_confidence_rows(data, num_entries, confidence_thresholds):
    num_columns = len(data[0])
    total_entries = len(data)
    high_confidence_dict = defaultdict(list)

    for row in data:
        num_high_confidence_entries = 0
        high_confidence_entries = []

        for i, entry in enumerate(row):
            entry_occurrences = sum([1 for r in data if entry == r[i]])

            if entry_occurrences / total_entries >= confidence_thresholds[i]:
                num_high_confidence_entries += 1
                high_confidence_entries.append(entry)

        if num_high_confidence_entries >= num_entries:
            high_confidence_dict[tuple(high_confidence_entries)].append(row)

    print(total_entries, len(high_confidence_dict.keys()))

    return high_confidence_dict


In [29]:
data = [[random.randint(1, 10) for _ in range(10)] for _ in range(1000)]

num_entries = 2
confidence_thresholds = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

high_confidence_dict = get_high_confidence_rows(data, num_entries, confidence_thresholds)
print(high_confidence_dict)


1000 1000
defaultdict(<class 'list'>, {(9, 1, 9, 3, 9, 5, 8, 7, 4): [[2, 9, 1, 9, 3, 9, 5, 8, 7, 4]], (2, 5, 5, 8, 5, 8, 6, 3, 6): [[10, 2, 5, 5, 8, 5, 8, 6, 3, 6]], (9, 7, 10, 6, 10, 9, 4, 7, 7): [[8, 9, 7, 10, 6, 10, 9, 4, 7, 7]], (6, 4, 5, 6, 5, 7, 3, 1, 6): [[10, 6, 4, 5, 6, 5, 7, 3, 1, 6]], (9, 5, 7, 5, 3, 7, 4, 9, 4): [[7, 9, 5, 7, 5, 3, 7, 4, 9, 4]], (7, 3, 2, 8, 5, 7, 5, 2, 1): [[3, 7, 3, 2, 8, 5, 7, 5, 2, 1]], (1, 4, 8, 5, 7, 3, 2, 4, 9): [[1, 1, 4, 8, 5, 7, 3, 2, 4, 9]], (6, 5, 6, 9, 5, 8, 10, 8, 8): [[1, 6, 5, 6, 9, 5, 8, 10, 8, 8]], (10, 1, 10, 9, 6, 7, 3, 5, 3): [[6, 10, 1, 10, 9, 6, 7, 3, 5, 3]], (7, 7, 1, 9, 9, 9, 3, 2, 4): [[9, 7, 7, 1, 9, 9, 9, 3, 2, 4]], (5, 1, 7, 1, 10, 10, 1, 8, 10): [[3, 5, 1, 7, 1, 10, 10, 1, 8, 10]], (5, 3, 9, 10, 4, 1, 2, 9, 6): [[1, 5, 3, 9, 10, 4, 1, 2, 9, 6]], (6, 8, 5, 3, 7, 2, 1, 8, 4): [[5, 6, 8, 5, 3, 7, 2, 1, 8, 4]], (9, 2, 7, 10, 8, 4, 10, 1, 7): [[8, 9, 2, 7, 10, 8, 4, 10, 1, 7]], (4, 8, 4, 6, 1, 7, 3, 2, 2): [[1, 4, 8, 4, 6, 1, 7, 3, 

In [68]:
import pandas as pd
from itertools import combinations

def find_interesting_patterns(dataset, num_columns):
    """
    Finds interesting patterns in a given dataset with respect to a specified number of columns and returns them as sentences, along with the occurrence percentage.

    Args:
    dataset: A pandas DataFrame containing the dataset.
    num_columns: An integer specifying the number of columns to consider in the patterns.

    Returns:
    A list of tuples, where each tuple contains a sentence describing an interesting pattern in the dataset, along with the percentage of times it occurs.
    """

    # Find the total number of rows in the dataset
    total_rows = len(dataset)

    # Get all combinations of column names with the specified number of columns
    column_combinations = combinations(dataset.columns, num_columns)

    # Create a dictionary to store the occurrence count for each unique combination of values
    occurrence_count = {}

    # Iterate over each row in the dataset
    for _, row in dataset.iterrows():
        # Generate a unique key for the specified columns in the row
        key = tuple(row[column] for column in column_combinations)

        # Increment the occurrence count for the unique combination of values
        occurrence_count[key] = occurrence_count.get(key, 0) + 1

    # Create a list to store the interesting patterns
    interesting_patterns = []

    # Iterate over the unique combinations of values
    for key, count in occurrence_count.items():
        occurrence_percentage = (count / total_rows) * 100

        # Check if the occurrence percentage exceeds a threshold (e.g., 5%)
        if occurrence_percentage > 5:
            pattern_sentence = f"The combination of values {key} occurs with a percentage of {occurrence_percentage:.2f}%."
            interesting_patterns.append((pattern_sentence, occurrence_percentage))

    # Return the list of tuples containing the interesting patterns and their occurrence percentages
    return interesting_patterns


In [69]:
X = pd.read_csv("./datasets/None_data2.csv")
interesting_patterns = find_interesting_patterns(X, 2)

for pattern, occurrence_percentage in interesting_patterns:
    print(f"{pattern} (Occurrence: {occurrence_percentage}%)")


KeyError: 'key of type tuple not found and not a MultiIndex'