In [2]:
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from mlxtend.preprocessing import TransactionEncoder
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates

In [3]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kouro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kouro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
def correct_types(df, column='SO_Omschrijving'):
    """
    Preprocess the specified column in a DataFrame by replacing NaN values with an empty string
    and converting non-string values to string objects.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - column (str): The column to be preprocessed. Default is 'SO_Omschrijving'.

    Returns:
    - pd.DataFrame: The DataFrame with the specified column preprocessed.
    """

    df[column] = df[column].fillna('')
    df[column] = df[column].astype(str)
    return df

def replace_punctuation(text):
    """
    Replace punctuation in the input text.

    Parameters:
    - text (str): The input text in which punctuation will be replaced.

    Returns:
    - str: The input text with punctuation replaced.
    """
    # Replace "'s" with empty space.
    text = text.replace("'s", '')

    # Create a translation table to replace '.' and "'" with empty space, and other punctuation with spaces.
    translator = str.maketrans({'.': '', "'": '', **{p: ' ' for p in string.punctuation if p not in ['.', "'"]}})

    cleaned_text = text.translate(translator)

    return cleaned_text

def remove_numerical_values(text):
    """
    Remove numbers and model-related patterns from the input text.

    Parameters:
    - text (str): The input text from which numbers and model-related patterns will be removed.

    Returns:
    - str: The input text with numbers and model-related patterns removed.
    """
    # Remove standalone numbers with optional floating points; such as 154, 99.31.
    text = re.sub(r'\b\d+(\.\d+)?\b', '', text)

    # Remove ordinal numbers; such as 1e, 2ste, 3de, 4e.
    text = re.sub(r'\b\d+(e|ste|de|e)\b', '', text)

    # Remove numerical quantifiers; such as 1x OH.
    text = re.sub(r'\b\d+[xX]\b', '', text)

    return text

def stemize(text):
    """
    Perform stemming on the input text using the Dutch Snowball Stemmer.

    Parameters:
    - text (str): The input text to be stemmed.

    Returns:
    str: The stemmed text.

    Example:
    >>> stemize("This is an example text for stemming.")
    'thi is an exampl text for stem.'
    """
    stemmer = SnowballStemmer("dutch")
    tokens = word_tokenize(text)
    text = ' '.join([stemmer.stem(token) for token in tokens])
    return text

def normalize_lookups():
    """
    Normalize and preprocess the global lookup lists.

    This function performs the following operations on each global lookup list:
    1. Orders lookup lists by length in descending order.
    2. Initializes the Dutch Snowball Stemmer.
    3. Replaces punctuations using the `replace_punctuation` function.
    4. Removes numerical values using the `remove_numerical_values` function.
    5. Applies stemming using the `stemize` function.
    6. Converts the resulting list to a set.

    Note: The original lookup lists are modified in-place.

    Returns:
    None
    """
    # Get all lookup lists using regular expression.
    lookup_lists = [var for var in globals() if re.match(r'^lookup_', var)]

    # Initialize stemmer.
    # stemmer = SnowballStemmer("dutch")

    for lookup_list_name in lookup_lists:

        lookup_list = globals()[lookup_list_name]

        # Replace punctuations, remove numerical values, and apply stemming.
        normalized_list = [stemize(remove_numerical_values(replace_punctuation(word))) for word in lookup_list]

        # Convert to set.
        lookup_list.clear()
        lookup_list.extend(set(normalized_list))

def group_items(description):
    """
    Group items in a description based on predefined lookup lists.

    Parameters:
    - description (str): The input description to be categorized.

    Returns:
    str: The category type of the description based on predefined lookup lists.
         If no match is found, 'Unknown' is returned.

    Example:
    >>> group_items("This is a description containing ventilation keywords.")
    'Ventilation'
    """
    # Get all lookup lists using regular expression.
    lookup_lists = [var for var in globals() if re.match(r'^lookup_', var)]

    # Order lookup lists by length in descending order.
    lookup_lists = sorted(lookup_lists, key=lambda x: len(globals()[x]), reverse=True)

    for lookup_list_name in lookup_lists:
        lookup_list = globals()[lookup_list_name]

        for word in lookup_list:
            if word in description:
                type_name = module_names[lookup_list_name.split('_')[1]]
                return type_name
    return 'Unknown'

def rules_to_coordinates(rules):
    """
    Convert association rules to coordinates.

    Parameters:
    - rules (pd.DataFrame): DataFrame containing association rules with 'antecedents', 'consequents', and other columns.

    Returns:
    - pd.DataFrame: DataFrame with 'antecedent', 'consequent', and 'rule' columns representing coordinates.
    """

    # Extract the first item from antecedents and consequents.
    rules['antecedent'] = rules['antecedents'].apply(lambda antecedent: list(antecedent)[0])
    rules['consequent'] = rules['consequents'].apply(lambda consequent: list(consequent)[0])

    # Assign rule index to a new column 'rule'.
    rules['rule'] = rules.index

    # Select relevant columns for coordinates.
    coords = rules[['antecedent', 'consequent', 'rule']]

    return coords

def contains_word_regex(sentence, target_word):
    """
    Check if a given word is present in a sentence using regex.

    Parameters:
    - sentence (str): The input sentence to check.
    - target_word (str): The word to look for in the sentence.

    Returns:
    - bool: True if the word is found, False otherwise.
    """
    # Construct a regex pattern to match the whole word, case-insensitive.
    pattern = r'\b' + re.escape(target_word) + r'\b'
    
    # Use re.search to find the pattern in the sentence.
    match = re.search(pattern, sentence, flags=re.IGNORECASE)
    
    # Return True if a match is found, False otherwise.
    return bool(match)

In [5]:
lookup_ventilation = [
    "lbk",
    "luchtbehandeling",
    "luchtbehandelen",
    "luchbehandeling",
    "luchtbehandeling",
    "ventilatiesysteem",
    "ventilatie",
    "luchtklep",
    "stoombevochtiger",
    "stoombevochtiging",
    "bevochtiger",
    "toevoerventilator",
    "afvoerventilator",
    "ventilatormotor",
    "dakventilatoren",
    "Dakventilator",
    "Afzuiventilator",
    "wiel",
    "afzuigvent",
    "Vsnaren",
    "V snaren",
    "filters",
    "snaarbreuk",
    "condensafvoer",
    "condensor",
    "filter",
    "Luchtbeh",
    "LBH",
    "vorst",
    "verwarmingsbatterij",
    "stoomvochtiger",
    "luchtdebiet",
    "luchtzakken",
    "fancoil",
    "splitunit",
    "fancoil",
    "splitunit",
    "fancoil",
    "split-unit"
]
lookup_cooling = [
    "airco",
    "aircos",
    "drogekoeler",
    "koelmachine",
    "koeling",
    "KM",
    "koelunit",
    "koelinstallatie",
    "topcooling",
    "gkw",
    "koeltoren",
    "chillers",
    "drycooler",
    "koelplafond",
    "koelklep",
    "koelwaterpomp",
    "carrier",
    "DX koeler",
    "DXkoeler",
    "draaikoeler",
    "condensventilator",
    "chiller",
    "koelventilator",
    "Condensorventilator",
    "condenser"
]
lookup_heating = [
    "ketel",
    "CV",
    "c.v.",
    "c.v",
    "Kachel",
    "verwarming",
    "radiatoren",
    "radiator",
    "vloerverwarming",
    "rookgasventilator"
]
lookup_fireSafety = [
    "brandmeld",
    "rookmelder",
    "branddeur",
    "brandklep",
    "brandweer",
    "brandhaspel",
    "brandblus",
    "ontruiming",
    "Brandventilatoren",
    "trappenhuis"
]
lookup_entrance = [
    "toegangspoort",
    "tourniqet",
    "paslezer","tourniquet",
    "tourniqeut",
    "tourniqut",
    "toegang",
    "Toerniqet",
    "tourniget",
    "garagedeur",
    "schuifhek",
    "slagboom"
]
lookup_shading = [
    "zonwering",
    "zonneschermen",
    "zonnewering"
]
lookup_sanitary = [
    "toilet",
    "WC",
    "urinoir",
    "wastafel",
    "sanitair",
    "wasbak",
    "afvoer"
]
lookup_heatPump = [
    "warmtepomp",
    "warmte pomp",
    "WKO"
]
lookup_lighting = [
    "verlichting",
    "lamp",
    "Tl-buis",
    "licht",
    "tlarmatuur",
    "tlbuizen",
    "armatuur",
    "armaturen",
    "armanturen"
]
lookup_elevator = [
    "Lift"
]

lookup_wkk = [
    "WKK"
]
lookup_bms = [
    "GBS",
    "Data",
    "logger",
    "lon",
    "BMC",
    "priva",
    "software",
    "regeling",
    "hardware",
    "regelkast",
    "RK 1",
    "rk5",
    "rk2",
    "RK5",
    "rk3",
    "Rk 3",
    "rk4",
    "rk 4",
    "rk1",
    "rk2",
    "rk6",
    "rk 6",
    "RK7",
    "sensor",
    "regelaar",
    "opnemer",
    "thermostaat",
    "meting",
    "onderstation",
    "Kloktijden",
    "kastventilatoren",
    "kastventilatoren",
    "kastventilator",
    "Kastventilator"
]


lookup_waterDistribution = [
    "regelklep",
    "driewegklep",
    "TSA",
    "Warmtewisselaar",
    "pomp",
    "hydrofoor",
    "expansie",
    "drukvat",
    "waterleiding",
    "3wegklep",
    "transportnet"
]

lookup_office = [
    "werkvoorbereiding",
    "Contractbeheerder",
    "materiaal",
    "Onderaanneming",
    "Werkvoorbereider",
    "kantoor",
    "Materiaalbon",
    "Urenbon",
    "Contractbegeleiding",
    "Contractbeheer",
    "overleg",
    "Meet-enregeltechniek",
    "Inlenen",
    "Weekplanning",
    "calculatie",
    "Onderaannemering",
    "contractmanager",
    "Contractmanagement",
    "Onderaannemer"
]

lookup_domesticWater = [
    "warm water",
    "ww",
    "w.w.",
    "w.w",
    "warmtapwater",
    "warmwater",
    "boiler"
]

loopup_faultRedemption = [
    "Storingsafkoop",
    "Afkoopstoringen",
    "Verrekening afkoop"
]

lookup_regularMaintenance = [
    "onderhoud",
    "inspectie",
    "OH",
    "OHD",
    "controle",
    "Preventief",
    "testen",
    "Bedrijfvoering",
    "Bedrijfsvoering"
]

lookup_complaints = [
    "klachten",
    "klacht",
    "te warm",
    "tekoud",
    "luchtvochtigheid",
    "klimaatbeheersing",
    "klimaat",
    "teheet",
    "benauwd",
    "tocht",
    "R.V.",
    "ergkoud",
    "RV telaag",
    "lekkage",
    "Ruimtevochtigheid",
    "erg warm",
    "erg koud"
]
module_names = {
    "complaints": "Complaints",
    "regularMaintenance": "Regular Maintenance",
    "faultRedemption": "Fault Redemption",
    "domesticWater": "Domestic Water",
    "office": "Office",
    "waterDistribution": "Water Distribution",
    "bms": "BMS",
    "wkk": "WKK",
    "elevator": "Elevator",
    "lighting": "Lighting",
    "heatPump": "Heat Pump",
    "sanitary": "Sanitary",
    "fireSafety": "Fire Safety",
    "shading": "Shading",
    "entrance": "Entrance",
    "ventilation": "Ventilation",
    "heating": "Heating",
    "cooling": "Cooling"
}

In [6]:
normalize_lookups()

In [7]:
path = './Navision Serviceorder data.xlsx'

# Read the Excel file into a DataFrame.
df_so = pd.read_excel(path)

df_so.reset_index(drop=True, inplace=True)

# print(df_so.info())

columns_to_select = ['SO_Omschrijving', 'SO_Orderdatum (Begindatum)', 'Order technisch gereed (Einddatum)', 'Factuurkosten SO']
df_so = df_so[columns_to_select]

df_so.head()


Unnamed: 0,SO_Omschrijving,SO_Orderdatum (Begindatum),Order technisch gereed (Einddatum),Factuurkosten SO
0,storing stoombevochtiger ruimte 4.29,44608,44609.0,154.18
1,Wasbak pantry ruimte 4.38 loopt slecht door,44624,44624.0,229.58
2,Storing bevochtiger van LBK-Links.,44627,44628.0,188.14
3,Afzuig schakelaar werkt niet. Keuken.,44704,44704.0,250.17
4,Snaarbreuk afzuigvent. LBK rechts,44742,44746.0,447.16


**Data Cleaning**

In [8]:
# Create a copy from the original dataset.
df_so_cleaned = correct_types(df_so, column='SO_Omschrijving')

# Replace punctuations.
df_so_cleaned['SO_Omschrijving'] = df_so_cleaned['SO_Omschrijving'].apply(replace_punctuation)

# Remove numerical values.
df_so_cleaned['SO_Omschrijving'] = df_so_cleaned['SO_Omschrijving'].apply(remove_numerical_values)

# Remove Dutch stop words.
stop_words = set(stopwords.words('dutch'))
stop_words.add('via')
df_so_cleaned['SO_Omschrijving'] = df_so_cleaned['SO_Omschrijving'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# Stemize the text.
df_so_cleaned['SO_Omschrijving'] = df_so_cleaned['SO_Omschrijving'].apply(stemize)

df_so_cleaned.head()

Unnamed: 0,SO_Omschrijving,SO_Orderdatum (Begindatum),Order technisch gereed (Einddatum),Factuurkosten SO
0,storing stoombevochtiger ruimt,44608,44609.0,154.18
1,wasbak pantry ruimt loopt slecht,44624,44624.0,229.58
2,storing bevochtiger lbk link,44627,44628.0,188.14
3,afzuig schakelar werkt keuk,44704,44704.0,250.17
4,snaarbreuk afzuigvent lbk recht,44742,44746.0,447.16


In [9]:
df_so_cleaned['target'] = df_so_cleaned['SO_Omschrijving'].apply(group_items)
categorized_count = 0
for _, value in module_names.items():
    categorized_count = categorized_count + len(df_so_cleaned["target"][df_so_cleaned["target"] == value])
    print(f'{value} count: {len(df_so_cleaned["target"][df_so_cleaned["target"] == value])}')

Complaints count: 1834
Regular Maintenance count: 110
Fault Redemption count: 0
Domestic Water count: 930
Office count: 80
Water Distribution count: 655
BMS count: 1976
WKK count: 1
Elevator count: 374
Lighting count: 885
Heat Pump count: 68
Sanitary count: 584
Fire Safety count: 81
Shading count: 22
Entrance count: 59
Ventilation count: 604
Heating count: 1848
Cooling count: 818


In [10]:
df_so_ventilation = df_so_cleaned[df_so_cleaned["target"].isin(["Ventilation"])]
df_so_cooling = df_so_cleaned[df_so_cleaned["target"].isin(["Cooling"])]
df_so_heating = df_so_cleaned[df_so_cleaned["target"].isin(["Heating"])]
df_so_cleaned.head()

Unnamed: 0,SO_Omschrijving,SO_Orderdatum (Begindatum),Order technisch gereed (Einddatum),Factuurkosten SO,target
0,storing stoombevochtiger ruimt,44608,44609.0,154.18,Ventilation
1,wasbak pantry ruimt loopt slecht,44624,44624.0,229.58,Sanitary
2,storing bevochtiger lbk link,44627,44628.0,188.14,Ventilation
3,afzuig schakelar werkt keuk,44704,44704.0,250.17,BMS
4,snaarbreuk afzuigvent lbk recht,44742,44746.0,447.16,Ventilation


In [11]:
# Split transaction strings into lists.
transactions = df_so_ventilation['SO_Omschrijving'].apply(lambda t: t.split(' '))
# Convert DataFrame column into list of strings.
transactions_ventilation = list(transactions)
print(transactions_ventilation)

transactions = df_so_cooling['SO_Omschrijving'].apply(lambda t: t.split(' '))
transactions_cooling = list(transactions)
print(transactions_cooling)

transactions = df_so_heating['SO_Omschrijving'].apply(lambda t: t.split(' '))
transactions_heating = list(transactions)
print(transactions_heating)

[['storing', 'stoombevochtiger', 'ruimt'], ['storing', 'bevochtiger', 'lbk', 'link'], ['snaarbreuk', 'afzuigvent', 'lbk', 'recht'], ['lbk', 'recht', 'stoomomvormer', '100sb01'], ['stoomomvormer', 'lbk', 'recht'], ['rk', 'lbk', 'link', 'afzuigventilator', '100av01'], ['mer', 'ruimt', 'rk', 'lbk', 'link', 'communicatie', 'alarm'], ['snaarbreuk', 'afzuigvent', 'rk', 'lbk', 'recht'], ['storing', 'snaarbreuk', 'afzuigvent', '100pdt02'], ['snaarbreuk', 'afzuigvent'], ['lbk', 'recht', 'snaarbreuk'], ['stoomomvormer', 'rk3', 'lbk', 'link', 'storing'], ['storing', 'lbk'], ['fancoil', 'unit', 'mak', 'lawaai'], ['ventilatiesystem', 'stat', 'storing'], ['afzuigventilator', 'produktie', 'start'], ['storing', 'frequentieregelar', 'ventilatie', 'keuk'], ['luchtbehandelingsystem', 'valt', 'sted', 'storing'], ['pomp', 'storing', 'fancoil'], ['pomp', 'storing', 'fancoil'], ['ruimt', 'lbk', 'melding'], ['storing', 'stoomvormer', 'lbk'], ['ruimtetemp', 'ruimt', 'lbk', 'max', 'gr'], ['lbk', 'schakel'], ['s

In [12]:
# Instantiate transaction encoder and identify unique items in transactions.
encoder = TransactionEncoder().fit(transactions_ventilation)
# One-hot encode transactions.
onehot_ventilation = encoder.transform(transactions_ventilation)
onehot_ventilation = pd.DataFrame(onehot_ventilation, columns=encoder.columns_)

encoder = TransactionEncoder().fit(transactions_cooling)
onehot_cooling = encoder.transform(transactions_cooling)
onehot_cooling = pd.DataFrame(onehot_cooling, columns=encoder.columns_)

encoder = TransactionEncoder().fit(transactions_heating)
onehot_heating = encoder.transform(transactions_heating)
onehot_heating = pd.DataFrame(onehot_heating, columns=encoder.columns_)
onehot_heating

Unnamed: 0,02ke02,03ke01,03ke02,03pt01,0a,10kt1,111ke01,111ke05,111ke06,12kg1,...,zit,zithoek,zoem,zoemend,zoemer,zolder,zorg,zorgcluster,zwar,zwembad
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1843,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1844,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1845,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1846,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


**Algorithm**

Ventilation

In [13]:
# Compute frequent itemsets using the Apriori algorithm.
frequent_itemsets_ventilation = apriori(onehot_ventilation, 
                            min_support =  0.01,
                            use_colnames = True,
                            verbose = 1)

print(len(frequent_itemsets_ventilation))

# frequent_itemsets_ventilation = frequent_itemsets_ventilation[frequent_itemsets_ventilation['itemsets'].apply(lambda x: len(x)) > 1]
frequent_itemsets_ventilation = frequent_itemsets_ventilation.sort_values(by=['support'], ascending=False).reset_index(drop=True)
frequent_itemsets_ventilation.head(50)

Processing 8 combinations | Sampling itemset size 4 32
89


Unnamed: 0,support,itemsets
0,0.324503,(lbk)
1,0.30298,(storing)
2,0.162252,(ventilatie)
3,0.120861,"(lbk, storing)"
4,0.094371,(luchtbehandel)
5,0.077815,(mp)
6,0.06457,(ventilatiesystem)
7,0.057947,(filter)
8,0.046358,(werkt)
9,0.044702,(lekkag)


In [35]:
# Compute all association rules for frequent_itemsets.
rules_ventilation = association_rules(frequent_itemsets_ventilation, 
                            metric = "lift", 
                         	min_threshold = 1)

rules_ventilation = rules_ventilation.sort_values(by=['lift'], ascending=False).reset_index(drop=True)
print(len(rules_ventilation))
# rules_ventilation.head(50)

rules_one_consequent = rules_ventilation[rules_ventilation['antecedents'].apply(len) == 2]
print(len(rules_one_consequent))
rules_one_consequent.head(60)

66
6


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,"(koud, blaast)",(lucht),0.011589,0.028146,0.011589,1.0,35.529412,0.011263,inf,0.98325
3,"(lucht, koud)",(blaast),0.014901,0.028146,0.011589,0.777778,27.633987,0.01117,4.373344,0.978391
5,"(lucht, blaast)",(koud),0.018212,0.023179,0.011589,0.636364,27.454545,0.011167,2.686258,0.98145
18,"(lbk, storing)",(urgent),0.120861,0.031457,0.016556,0.136986,4.354722,0.012754,1.12228,0.876271
25,"(lbk, urgent)",(storing),0.019868,0.30298,0.016556,0.833333,2.750455,0.010537,4.182119,0.649324
41,"(urgent, storing)",(lbk),0.02649,0.324503,0.016556,0.625,1.92602,0.00796,1.801325,0.493878


**Observe Itemsets____________________________________________________________**

In [15]:
df_original = pd.read_excel(path)
df_original.reset_index(drop=True, inplace=True)
columns_to_select = ['SO_Omschrijving', 'SO_Orderdatum (Begindatum)', 'Order technisch gereed (Einddatum)', 'Factuurkosten SO']
df_original = df_original[columns_to_select]

In [16]:
word_list = list(rules_ventilation.iloc[0]['antecedents'])
indices = []
for transaction in df_so['SO_Omschrijving']:
    if all(contains_word_regex(transaction, word) for word in word_list):
        indices.append(df_so[df_so['SO_Omschrijving'] == transaction].index[0])

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(f"Original Dataset: {len(indices)} items\n{df_original.iloc[indices]['SO_Omschrijving']}")
    print(f"\nCleaned Dataset: {len(indices)} items\n{df_so.iloc[indices]['SO_Omschrijving']}")

Original Dataset: 73 items
88                        Koeling blaast geen koude lucht.
88                        Koeling blaast geen koude lucht.
100                 Geluidsklachten / lucht in installatie
364               (MP)storing hygromatik lucht bevochtiger
1181                             LBK Blaast te koude lucht
1315                             Blowers blazen koud lucht
1648                  (MP)Airco blaast alleen warmte lucht
1806      Luchtventilatie werkt niet, blaast warme lucht. 
1806      Luchtventilatie werkt niet, blaast warme lucht. 
1849                     Linker dakunit zuigt valse lucht.
1903               Warmelucht gordijnen blazen koude lucht
1905                             Heater blaast koude lucht
1972                  Luchtheater geeft alleen koude lucht
2111          ventilatiesysteem voert warme lucht niet af 
2150                    Warmte gordijn blaast koude lucht 
2191                            (MP) Storing lucht toevoer
2468              Pomp is def

**____________________________________________________________**

Cooling

In [17]:
# Compute frequent itemsets using the Apriori algorithm.
frequent_itemsets_cooling = apriori(onehot_cooling, 
                            min_support =  0.00000001,
                            verbose = 1, 
                            use_colnames = True)


print(len(frequent_itemsets_cooling))

frequent_itemsets_cooling = frequent_itemsets_cooling[frequent_itemsets_cooling['itemsets'].apply(lambda x: len(x)) > 1]
frequent_itemsets_cooling = frequent_itemsets_cooling.sort_values(by=['support'], ascending=False).reset_index(drop=True)
frequent_itemsets_cooling.head(50)

Processing 1464 combinations | Sampling itemset size 8765
10809


Unnamed: 0,support,itemsets
0,0.169927,"(storing, airco)"
1,0.101467,"(koelmachin, storing)"
2,0.064792,"(koeling, storing)"
3,0.06357,"(airco, lekkag)"
4,0.040342,"(defect, airco)"
5,0.03423,"(airco, unit)"
6,0.033007,"(ruimt, airco)"
7,0.030562,"(lekt, airco)"
8,0.028117,"(airco, kantor)"
9,0.026895,"(airco, serverruimt)"


**Observe Itemsets____________________________________________________________**

In [18]:
word_list = list(frequent_itemsets_ventilation.iloc[33]['itemsets'])
indices = []
for transaction in df_so['SO_Omschrijving']:
    if all(contains_word_regex(transaction, word) for word in word_list):
        indices.append(df_so[df_so['SO_Omschrijving'] == transaction].index[0])

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(f"Original Dataset: {len(indices)} items\n{df_original.iloc[indices]['SO_Omschrijving']}")
    print(f"\nCleaned Dataset: {len(indices)} items\n{df_so.iloc[indices]['SO_Omschrijving']}")

Original Dataset: 14 items
2339                                    Storing warmtewiel
2848                            (JB) Warmtewiel staat stil
2848                            (JB) Warmtewiel staat stil
2856           storing frequentieregelaar warmtewiel bwd A
3785            Vervolgactie M&R Systemair-kast warmtewiel
3800                        Storing Warmtewiel LBK kantoor
5969                                Warmtewiel storing LBK
5969                                Warmtewiel storing LBK
6049                                warmtewiel draaid niet
6050                             LBK warmtewiel afgebroken
7073                     Storing warmtewiel en bevochtiger
7073                     Storing warmtewiel en bevochtiger
13673    GBS: Storing W installatie hoog urgent Warmtewiel
13681                               Storing warmtewiel LBK
Name: SO_Omschrijving, dtype: object

Cleaned Dataset: 14 items
2339                                  storing warmtewiel
2848                      

**____________________________________________________________**

In [19]:
# Compute all association rules for frequent_itemsets.
rules_cooling = association_rules(frequent_itemsets_cooling, 
                            metric = "lift", 
                         	min_threshold = 1)

print(len(rules_cooling))
rules_cooling

KeyError: "frozenset({'storing'})You are likely getting this error because the DataFrame is missing  antecedent and/or consequent  information. You can try using the  `support_only=True` option"

Heating

In [None]:
# Compute frequent itemsets using the Apriori algorithm.
frequent_itemsets_heating = apriori(onehot_heating, 
                            min_support =  0.005,
                            max_len = 5, 
                            use_colnames = True)

# Print a preview of the frequent itemsets
print(len(frequent_itemsets_heating))
print(frequent_itemsets_heating)

In [None]:
# Compute all association rules for frequent_itemsets.
rules_heating = association_rules(frequent_itemsets_heating, 
                            metric = "lift", 
                         	min_threshold = 1)

print(len(rules_heating))
rules_heating

**Visualization**

Ventilation

In [None]:
rules_ventilation = association_rules(frequent_itemsets_ventilation, metric = 'support', min_threshold = 0.02)

# Replace frozen sets with strings.
rules_ventilation['antecedents'] = rules_ventilation['antecedents'].apply(lambda a: ','.join(list(a)))
rules_ventilation['consequents'] = rules_ventilation['consequents'].apply(lambda a: ','.join(list(a)))

# Transform data to matrix format and generate heatmap.
pivot = rules_ventilation.pivot(index='consequents', columns='antecedents', values='support')
sns.heatmap(pivot)

plt.title('Heatmap of Supports')
plt.yticks(rotation=0)
plt.show()

In [None]:
rules_ventilation = association_rules(frequent_itemsets_ventilation, metric = 'support', 
                          min_threshold = 0.0)

sns.scatterplot(x = "support", y = "confidence", data = rules_ventilation)
plt.show()

In [None]:
rules_ventilation = association_rules(frequent_itemsets_ventilation, metric = "support", 
                          min_threshold = 0.0)

sns.scatterplot(x = "support", y = "confidence", 
                size = "lift", data = rules_ventilation)
plt.show()

In [None]:
# Compute the frequent itemsets.
frequent_itemsets_ventilation = apriori(onehot_ventilation, min_support = 0.02, 
                         use_colnames = True, max_len = 2)

# Compute rules from the frequent itemsets with the confidence metric.
rules_ventilation = association_rules(frequent_itemsets_ventilation, metric = 'support', 
                          min_threshold = 0.0)

# Convert rules into coordinates suitable for use in a parallel coordinates plot.
coords = rules_to_coordinates(rules_ventilation)

# Generate parallel coordinates plot
parallel_coordinates(coords, 'rule')
plt.legend([])
plt.show()

Cooling

In [None]:
rules_cooling = association_rules(frequent_itemsets_cooling, metric = 'support', min_threshold = 0.02)

# Replace frozen sets with strings.
rules_cooling['antecedents'] = rules_cooling['antecedents'].apply(lambda a: ','.join(list(a)))
rules_cooling['consequents'] = rules_cooling['consequents'].apply(lambda a: ','.join(list(a)))

# Transform data to matrix format and generate heatmap.
pivot = rules_cooling.pivot(index='consequents', columns='antecedents', values='support')
sns.heatmap(pivot)

plt.title('Heatmap of Supports')
plt.yticks(rotation=0)
plt.show()

In [None]:
rules_cooling = association_rules(frequent_itemsets_cooling, metric = 'support', 
                          min_threshold = 0.0)

sns.scatterplot(x = "support", y = "confidence", data = rules_cooling)
plt.show()

In [None]:
rules_cooling = association_rules(frequent_itemsets_cooling, metric = "support", 
                          min_threshold = 0.0)

sns.scatterplot(x = "support", y = "confidence", 
                size = "lift", data = rules_cooling)
plt.show()

In [None]:
# Compute the frequent itemsets.
frequent_itemsets_cooling = apriori(onehot_cooling, min_support = 0.02, 
                         use_colnames = True, max_len = 2)

# Compute rules from the frequent itemsets with the confidence metric.
rules_cooling = association_rules(frequent_itemsets_cooling, metric = 'support', 
                          min_threshold = 0.0)

# Convert rules into coordinates suitable for use in a parallel coordinates plot.
coords = rules_to_coordinates(rules_cooling)

# Generate parallel coordinates plot
parallel_coordinates(coords, 'rule')
plt.legend([])
plt.show()

Heating

In [None]:
rules_heating = association_rules(frequent_itemsets_heating, metric = 'support', min_threshold = 0.02)

# Replace frozen sets with strings.
rules_heating['antecedents'] = rules_heating['antecedents'].apply(lambda a: ','.join(list(a)))
rules_heating['consequents'] = rules_heating['consequents'].apply(lambda a: ','.join(list(a)))

# Transform data to matrix format and generate heatmap.
pivot = rules_heating.pivot(index='consequents', columns='antecedents', values='support')
sns.heatmap(pivot)

plt.title('Heatmap of Supports')
plt.yticks(rotation=0)
plt.show()

In [None]:
rules_heating = association_rules(frequent_itemsets_heating, metric = 'support', 
                          min_threshold = 0.0)

sns.scatterplot(x = "support", y = "confidence", data = rules_heating)
plt.show()

In [None]:
rules_heating = association_rules(frequent_itemsets_heating, metric = "support", 
                          min_threshold = 0.0)

sns.scatterplot(x = "support", y = "confidence", 
                size = "lift", data = rules_heating)
plt.show()

In [None]:
# Compute the frequent itemsets.
frequent_itemsets_heating = apriori(onehot_heating, min_support = 0.02, 
                         use_colnames = True, max_len = 2)

# Compute rules from the frequent itemsets with the confidence metric.
rules_heating = association_rules(frequent_itemsets_heating, metric = 'support', 
                          min_threshold = 0.0)

# Convert rules into coordinates suitable for use in a parallel coordinates plot.
coords = rules_to_coordinates(rules_heating)

# Generate parallel coordinates plot
parallel_coordinates(coords, 'rule')
plt.legend([])
plt.show()