In [84]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pandas as pd
import seaborn as sns
from sklearn.tree import _tree

In [85]:
import os

import openai

# Never commit API credentials to a notebook: anyone with read access to the
# file (or its history) can use them. The key is read from the environment.
# NOTE(review): the key that used to be hardcoded here should be treated as
# compromised and revoked in the OpenAI dashboard.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )

In [86]:
# Tunable parameters for the whole notebook.

# --- Clustering (DBSCAN) ---
# Passed as `eps`: neighbourhood radius, in the raw units of the feature columns.
cluster_distance = 50_000
# Passed as `min_samples`.
minimum_similar = 10

# --- Rule extraction ---
# Passed as `max_depth` of the decision tree.
max_pattern_features = 5
# Despite the name, get_rules() uses this as the MINIMUM leaf confidence
# (in percent) a rule needs in order to be kept.
max_confidence = 98
# Rules backed by fewer samples than this are reported as "rare".
rarity_threshold = 100

In [87]:
# Data extraction and manipulation

# Human-readable replacements for the raw CSV headers. Columns whose name is
# already readable map to themselves.
new_columns = {
    'AveragePrice': 'Average Price',
    'Total Volume': 'Total Volume',
    '4046': 'Volume of Small Seed',
    '4225': 'Volume of Medium Seed',
    '4770': 'Volume of Large Seed',
    'Total Bags': 'Total Bags',
    'Small Bags': 'Small Bags',
    'Large Bags': 'Large Bags',
    'XLarge Bags': 'XLarge Bags',
    'year': 'Year'
}

# Load the CSV, drop the four columns that are not used as features, and
# apply the readable column names, all in one chain.
X = (
    pd.read_csv("./datasets/avocado.csv")
    .drop(columns=["Unnamed: 0", "Date", "type", "region"])
    .rename(columns=new_columns)
)
X.head()


Unnamed: 0,Average Price,Total Volume,Volume of Small Seed,Volume of Medium Seed,Volume of Large Seed,Total Bags,Small Bags,Large Bags,XLarge Bags,Year
0,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,2015
1,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,2015
2,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,2015
3,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,2015
4,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,2015


In [88]:
# Cluster the records with DBSCAN. The model is fitted exactly once: calling
# fit() and then fit_predict() on the same data repeats the whole clustering
# and only returns the labels_ the first fit already produced.
db = DBSCAN(eps=cluster_distance, min_samples=minimum_similar).fit(X)
labels = db.labels_
# Kept for backward compatibility with cells that expect `y_pred`.
y_pred = labels

# Number of clusters in labels, ignoring noise if present (noise is labelled -1).
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of outliers: %d" % n_noise_)

Estimated number of clusters: 9
Estimated number of outliers: 3515


In [89]:
# One entry per distinct label, in ascending order, which is the order a
# classifier trained on `labels` stores its classes. This includes the DBSCAN
# noise label (-1) when present; leaving it out makes every argmax-based
# lookup into class_names off by one. Without noise this equals
# list(range(n_clusters_)).
class_names = [int(label) for label in np.unique(labels)]
feature_names = list(X.columns)

In [90]:
# Assign cluster labels to data points
cluster_labels = db.labels_

# Create a decision tree classifier. max_depth bounds how many conditions a
# single extracted rule can contain. random_state is fixed because the tree
# breaks ties between equally good splits randomly; unseeded, the extracted
# rules can differ between runs on identical data.
clf = DecisionTreeClassifier(max_depth=max_pattern_features, random_state=0)

# Train the decision tree classifier on the labeled data
clf.fit(X, cluster_labels)

In [91]:
# Render the fitted tree as indented text, including the per-class weights
# at every leaf.
text_representation = tree.export_text(
    clf,
    feature_names=feature_names,
    spacing=4,
    show_weights=True,
)
print(text_representation)

|---- Total Volume <= 584276.22
|    |---- Total Bags <= 197910.45
|    |    |---- Volume of Large Seed <= 81240.57
|    |    |    |---- Total Volume <= 509127.12
|    |    |    |    |---- Volume of Large Seed <= 58151.18
|    |    |    |    |    |---- weights: [31.00, 13908.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 5.00] class: 0
|    |    |    |    |---- Volume of Large Seed >  58151.18
|    |    |    |    |    |---- weights: [10.00, 37.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00] class: 0
|    |    |    |---- Total Volume >  509127.12
|    |    |    |    |---- Large Bags <= 78413.94
|    |    |    |    |    |---- weights: [39.00, 186.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 2.00, 0.00] class: 0
|    |    |    |    |---- Large Bags >  78413.94
|    |    |    |    |    |---- weights: [6.00, 0.00, 0.00, 0.00, 14.00, 0.00, 0.00, 0.00, 0.00, 0.00] class: 3
|    |    |---- Volume of Large Seed >  81240.57
|    |    |    |---- Average Price <= 1.50
|    |    |    |    |---- Total

In [92]:
# Start with three independent, empty rule containers.
rare_patterns, frequent_patterns, all_patterns = [], [], []

In [93]:
def unique_conditions(condition_list):
    """Collapse repeated conditions on the same variable and operator.

    Every element except the last is treated as a condition string containing
    either " <= " or " > ". When several conditions share the same left-hand
    side and operator, only the value of the last one is kept, at the position
    where that left-hand side first appeared. Strings containing neither
    operator are dropped. The final element of ``condition_list`` is passed
    through unchanged at the end of the result.
    """
    latest_value = {}
    for condition in condition_list[:-1]:
        # " <= " is tested before " > ", and only the first match is used.
        for operator in (" <= ", " > "):
            if operator in condition:
                left, right = condition.split(operator)
                latest_value[(left + operator).strip()] = right.strip()
                break

    merged = [f"{key} {value}" for key, value in latest_value.items()]
    return merged + condition_list[-1:]

def get_rules(tree, feature_names, class_names, min_confidence=None, rarity_cutoff=None):
    """Turn every root-to-leaf path of a fitted decision tree into a text rule.

    Parameters
    ----------
    tree : fitted estimator exposing a ``tree_`` attribute.
    feature_names : sequence of str, indexed by the tree's feature indices.
    class_names : any value or None. Only used to choose the mode: ``None``
        formats leaves as a regression response, anything else as a
        classification confidence.
    min_confidence : number, optional. Minimum leaf confidence (percent) a
        classification rule needs to be kept. Defaults to the notebook-level
        ``max_confidence`` setting.
    rarity_cutoff : number, optional. Rules backed by fewer samples than this
        are "rare", the others "frequent". Defaults to the notebook-level
        ``rarity_threshold`` setting.

    Returns
    -------
    (rules, rare_rules, frequent_rules) : three lists of str, each ordered by
        descending leaf sample count. ``rules`` is the union of the other two.

    Notes
    -----
    Regression rules (``class_names is None``) have no confidence and are
    never filtered by ``min_confidence``. Previously they were compared
    against a confidence of 0 and therefore always discarded.
    """
    # Resolve the thresholds at call time so the notebook-level settings keep
    # working for existing three-argument callers.
    if min_confidence is None:
        min_confidence = max_confidence
    if rarity_cutoff is None:
        rarity_cutoff = rarity_threshold

    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    paths = []

    def recurse(node, path):
        """Depth-first walk; `path` holds the conditions leading to `node`."""
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = np.round(tree_.threshold[node], 3)
            recurse(tree_.children_left[node], path + [f"({name} <= {threshold})"])
            recurse(tree_.children_right[node], path + [f"({name} > {threshold})"])
        else:
            # Leaf: close the path with (value array, sample count) and merge
            # repeated conditions on the same feature/operator.
            leaf = (tree_.value[node], tree_.n_node_samples[node])
            paths.append(unique_conditions(path + [leaf]))

    recurse(0, [])

    # Sort by samples count, largest leaf first.
    samples_count = [p[-1][1] for p in paths]
    paths = [paths[i] for i in reversed(list(np.argsort(samples_count)))]

    rules = []
    rare_rules = []
    frequent_rules = []

    for path in paths:
        value, n_samples = path[-1]
        rule = "if " + " and ".join(str(p) for p in path[:-1]) + " then "

        if class_names is None:
            confidence = None  # not defined for a regression leaf
            rule += "response: " + str(np.round(value[0][0], 3))
        else:
            classes = value[0]
            total = np.sum(classes)
            if total <= 0:
                # An empty leaf has no meaningful confidence; skip it
                # explicitly instead of hiding it behind a bare `except`.
                continue
            best = np.argmax(classes)
            confidence = np.round(100.0 * classes[best] / total, 2)
            rule += f"confidence: {confidence}%"

        rule += f" of occurence, based on {n_samples:,} samples"

        if confidence is not None and not (confidence >= min_confidence):
            continue

        rules.append(rule)
        if n_samples < rarity_cutoff:
            rare_rules.append(rule)
        else:
            frequent_rules.append(rule)

    return rules, rare_rules, frequent_rules

In [94]:
# Extract the rule lists from the fitted tree: all rules that pass the
# confidence filter, then the same rules split by sample count into rare
# and frequent.
all_patterns, rare_patterns, frequent_patterns = get_rules(
    clf,
    feature_names,
    class_names,
)

In [95]:
# Setup OpenAI

# Shared instruction text; each pattern cell appends its rule list to it.
prompt = (
    "Imagine you are a person who takes in if conditions and returns "
    "understandable information to human as statements. "
    "Formulate simple sentences based on list of conditional statements given "
    "and only use max, min value of a given conditional variable. "
    "Give numbers to each of the output. "
    "The list of rules are as follows: \n"
)
# Completion engine used by every pattern cell.
model = "text-davinci-003"

In [96]:
# Rare Patterns: ask the completion model to restate the rare rules in plain English.

if not rare_patterns:
    print("No Rare Patterns in given dataset !!!")
else:
    # The rule list is appended to the shared instructions as its Python repr.
    rare_prompt = f"{prompt}{rare_patterns}"

    response = openai.Completion.create(
        engine=model,
        prompt=rare_prompt,
        max_tokens=1000,
    )

    generated_text = response.choices[0].text
    print("All Rare Patterns in given data are as follows:\n" + generated_text)

All Rare Patterns in given data are as follows:


1. If Total Volume is 584276.219 or less, Total Bags is more than 197910.453 and Volume of Small Seed is 165233.234 or less, Total Volume is more than 422214.531 and Small Bags is less than 277894.766, then there is a 98.82% chance of occurence, based on 85 samples. 
2. If Total Volume is more than 862939.938 and less than 896908.781, then there is a 98.75% chance of occurence, based on 80 samples. 
3. If Total Volume is more than 584276.219 and less than 804939.938, Volume of Medium Seed is more than 332840.75, Volume of Large Seed is less than 77672.469 and Large Bags is more than 44099.234, then there is a 100% chance of occurence, based on 30 samples.
4. If Total Volume is 584276.219 or less, Total Bags is less than 197910.453 and Volume of Large Seed is more than 81240.566 and Average Price is less than 1.495 and Total Volume is more than 405197.094, then there is a 100% chance of occurence, based on 16 samples.
5. If Total Volume 

In [97]:
# Frequent Patterns: ask the completion model to restate the frequent rules in plain English.

if not frequent_patterns:
    print("No Frequent Patterns in given dataset !!!")
else:
    # The rule list is appended to the shared instructions as its Python repr.
    frequent_prompt = f"{prompt}{frequent_patterns}"

    response = openai.Completion.create(
        engine=model,
        prompt=frequent_prompt,
        max_tokens=1000,
    )

    generated_text = response.choices[0].text
    print("All Frequent Patterns in given data are as follows:\n" + generated_text)

All Frequent Patterns in given data are as follows:


1. Occurrences with a Total Volume less than or equal to 509127.125, Total Bags less than or equal to 197910.453, and Volume of Large Seed less than or equal to 58151.176 have a 99.74% chance of occuring, based on 13,944 samples. 
2. Occurrences with a Total Volume greater than 897075.219 have a 100.0% chance of occuring, based on 2,617 samples.
3. Occurrences with a Total Volume greater than 804939.938 and less than or equal to 862939.938 and Volume of Small Seed greater than 87638.0 have a 100.0% chance of occuring, based on 124 samples.


In [98]:
# All Patterns: ask the completion model to restate every retained rule in plain English.

if not all_patterns:
    print("No Patterns in given dataset !!!")
else:
    # The rule list is appended to the shared instructions as its Python repr.
    all_prompt = f"{prompt}{all_patterns}"

    response = openai.Completion.create(
        engine=model,
        prompt=all_prompt,
        max_tokens=1000,
    )

    generated_text = response.choices[0].text
    print("All Patterns in given data are as follows:\n" + generated_text)

All Patterns in given data are as follows:


1. If the total volume is less than or equal to 509127.125 and total bags are less than or equal to 197910.453 and the volume of large seed is less than or equal to 58151.176 then there is a 99.74% chance of occurence, based on 13,944 samples.
2. If the total volume is greater than 897075.219 then there is a 100.0% chance of occurence, based on 2,617 samples.
3.If the total volume is greater than 804939.938 and less than or equal to 862939.938 and the volume of small seed is greater than 87638.0 then there is a 100.0% chance of occurence, based on 124 samples.
4. If the total volume is less than or equal to 584276.219 and total bags is greater than 197910.453 and the volume of small seed is less than or equal to 165233.234 and the total volume is greater than 422214.531 and small bags is less than or equal to 277894.766 then there is a 98.82% chance of occurence, based on 85 samples.
5.If the total volume is greater than 862939.938 and less 