In [115]:
import pandas as pd
import os
import numpy as np
import csv
from pathlib import Path
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules

In [64]:
current_dir = os.getcwd()

In [95]:
#dataset_path = os.path.join(current_dir, "../data/raw/dataset-kaggle.csv")

In [99]:
#df = pd.read_csv(dataset_path,header=None, index_col=None, names=[f"Item_{i}" for i in range(1, 21)])

In [106]:
np.random.seed(42)
data = {
    "nivel": np.random.randint(1, 4, size=1000),
    "idade": np.random.randint(18, 65, size=1000),
    "experiencia": np.random.randint(0, 40, size=1000),
    "salario": np.random.randint(2000, 20000, size=1000),
    "horas_trabalho": np.random.randint(20, 60, size=1000),
    "target": np.random.choice([0, 1], size=1000, p=[0.6, 0.4]),
}

In [107]:
df = pd.DataFrame(data)

In [109]:
df.head(5)

Unnamed: 0,nivel,idade,experiencia,salario,horas_trabalho,target
0,3,20,23,5864,22,0
1,1,48,19,2226,25,1
2,3,57,36,17036,34,0
3,3,54,3,13739,46,0
4,1,53,27,18648,29,1


# Filtering only data with target = 1

In [110]:
df_target1 = df[df['target'] == 1].copy()

# Apply discretization for all numerical columns

In [112]:
num_cols = ["idade", "experiencia", "salario", "horas_trabalho"]
for col in num_cols:
    bins = np.percentile(df_target1[col], [0, 25, 50, 75, 100])
    labels = [f"{bins[i]:.0f} <= {col} < {bins[i+1]:.0f}" for i in range(len(bins) - 1)]
    df_target1[col] = pd.cut(df_target1[col], bins=bins, labels=labels, include_lowest=True)

In [121]:
transactions = df_target1.apply(lambda row: [f"{col}={val}" for col, val in row.items() if col != "target"], axis=1).tolist()

In [126]:
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
df_trans = pd.DataFrame(te_array, columns=te.columns_)

In [129]:
frequent_itemsets = fpgrowth(df_trans, min_support=0.2, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)

In [130]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric


In [120]:
print("Padrões Frequentes:")
print(frequent_itemsets)
print("\nRegras de Associação:")
print(rules[["antecedents", "consequents", "support", "confidence"]])

Padrões Frequentes:
     support                                    itemsets
0   0.371353                                   (nivel=1)
1   0.302387         (experiencia=9 <= experiencia < 20)
2   0.270557  (horas_trabalho=20 <= horas_trabalho < 30)
3   0.251989            (salario=2024 <= salario < 6776)
4   0.244032                    (idade=45 <= idade < 54)
5   0.249337          (salario=14867 <= salario < 19951)
6   0.334218                                   (nivel=3)
7   0.249337          (salario=11101 <= salario < 14867)
8   0.228117  (horas_trabalho=49 <= horas_trabalho < 59)
9   0.270557  (horas_trabalho=38 <= horas_trabalho < 49)
10  0.254642          (experiencia=0 <= experiencia < 9)
11  0.294430                                   (nivel=2)
12  0.249337        (experiencia=29 <= experiencia < 39)
13  0.233422                    (idade=54 <= idade < 64)
14  0.275862                    (idade=18 <= idade < 32)
15  0.230769  (horas_trabalho=30 <= horas_trabalho < 38)
16  0.24668

In [135]:
rules_filtered = rules[(rules["support"] > 0.2) & (rules["confidence"] > 0.3)]
print(rules_filtered)


Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric]
Index: []


In [None]:

from joblib import Parallel, delayed

def discretize_column(df, col):
    bins = np.percentile(df[col], [0, 25, 50, 75, 100])
    labels = [f"{bins[i]:.0f} <= {col} < {bins[i+1]:.0f}" for i in range(len(bins) - 1)]
    return pd.cut(df[col], bins=bins, labels=labels, include_lowest=True)

results = Parallel(n_jobs=-1)(delayed(discretize_column)(df_target1, col) for col in num_cols)

for i, col in enumerate(num_cols):
    df_target1[col] = results[i]
