In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Normalizer

# Font sizes (in points) shared by every matplotlib figure in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 15, 18, 22

plt.rc('font', size=SMALL_SIZE)                               # default text size
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)   # axes title / x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)                         # x tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)                         # y tick labels
plt.rc('legend', fontsize=SMALL_SIZE)                         # legend entries
plt.rc('figure', titlesize=BIGGER_SIZE)                       # figure-level title

# K-NN

In [None]:
# Load the cardiovascular dataset; the file uses ';' as the field separator.
df = pd.read_csv('cardio_train.csv', sep=';')
df

## Remoção de outliers

In [None]:
# Flag rows whose blood-pressure readings are treated as invalid, then drop them.
# NOTE(review): the `ap_hi < 120` rule discards every record with a systolic value below 120,
# which is a large share of ordinary readings — confirm this threshold is intended.
pressao_invalida = (
    (df.ap_hi <= df.ap_lo)      # systolic must be strictly above diastolic
    | (df.ap_hi < 120)
    | (df.ap_hi > 300)
    | (df.ap_lo < 0)
    | (df.ap_lo > 300)
)
df_outliers = df[pressao_invalida]
df = df.drop(index=df_outliers.index)
df

## Escolha dos atributos

In [None]:
# Keep only the three features used by the classifier plus the target column 'cardio'.
df = df[['age', 'weight', 'ap_hi', 'cardio']]

In [None]:
# Divide age by 365 and truncate back to int32 (presumably converting days to whole years —
# TODO confirm the unit of the raw column).
# Not idempotent: re-running this cell divides the already converted ages again.
df = df.assign(age=(df['age'].astype('int32') / 365).astype('int32'))
df

## Grupos

In [None]:
# Plot colour per row: red where cardio == 1, blue for everything else.
df['color'] = np.where(df.cardio == 1, 'red', 'blue')
df

In [None]:
# Replace the index (which has gaps after the outlier removal) with a consecutive 0..N-1
# index named 'id'; the base-size experiment further down slices rows by this index.
df = df.assign(id=range(len(df.index))).set_index('id')
df

In [None]:
# 3D scatter of age x weight x systolic pressure, coloured by the 'color' column.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection on older matplotlib

fig = plt.figure(figsize=(15, 10))
# A single 3D axes is enough. The previous extra `ax = Axes3D(fig)` replaced this axes with a
# second one, which older matplotlib stacks on top and newer releases do not attach to the
# figure automatically.
ax = fig.add_subplot(projection='3d')
ax.set_title('Pessoas que possuem ou não alguma doença cardiovascular', fontsize=18)
ax.set_xlabel('Idade', fontsize=18)
# Weight is plotted on the y axis (ys=df.weight); it used to be set with set_zlabel and was
# immediately overwritten by the systolic-pressure label.
ax.set_ylabel('Peso', fontsize=18)
ax.set_zlabel('Pressão sistólica', fontsize=18)

ax.scatter(xs=df.age, ys=df.weight, zs=df.ap_hi, c=df['color'])
plt.show()

## Training and Testing Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

def training_test(df, k, p=2, weights='uniform'):
    """Train a k-NN classifier on `df` and return its accuracy on a held-out split.

    Parameters
    ----------
    df : DataFrame with the columns 'age', 'weight', 'ap_hi' (features) and 'cardio' (target).
    k : number of neighbours passed to KNeighborsClassifier.
    p : Minkowski power parameter passed to KNeighborsClassifier (default 2).
    weights : neighbour weighting passed to KNeighborsClassifier (default 'uniform').

    Returns
    -------
    The accuracy_score of the predictions on the 30% test split.
    """
    features = df[['age', 'weight', 'ap_hi']]
    target = df.cardio

    # Fixed random_state so every call splits the same frame the same way.
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    # The scaler is fitted on the training rows only and then applied to both splits.
    scaler = StandardScaler().fit(X_train)

    knn = KNeighborsClassifier(n_neighbors=k, p=p, weights=weights)
    knn.fit(scaler.transform(X_train), Y_train)
    predicted = knn.predict(scaler.transform(X_test))
    return accuracy_score(Y_test, predicted)

## Como a acurácia se comporta com relação ao tamanho da base?

In [None]:
# Train on the first 10%, 20%, ..., 100% of the rows and record the test accuracy for each size.
accuracy_tam_base = {'tam': [], 'accuracy': []}
total_linhas = len(df.index)

for n in range(10, 101, 10):
    # Selecting by "index < cutoff" keeps the first n% of rows only because the index was
    # rebuilt as 0..N-1 earlier in the notebook.
    df_aux = df[df.index < total_linhas * n / 100]
    print(len(df_aux.index))
    accuracy = training_test(df=df_aux, k=3, weights='distance', p=1)
    accuracy_tam_base['tam'].append(n)
    accuracy_tam_base['accuracy'].append(accuracy)

fig, ax = plt.subplots(figsize=(15, 7))
ax.set_title('Acurácia X Tamanho da base', fontsize=22)
ax.set_xlabel('Tamanho da base (%)', fontsize=18)
ax.set_ylabel('Acurácia', fontsize=18)
ax.plot(accuracy_tam_base['tam'], accuracy_tam_base['accuracy'])
plt.show()

## Quanto ao número de vizinhos?

In [None]:
import math

# Evaluate every k from 1 up to round(sqrt(number of rows)); one classifier is trained per k,
# so this cell can take a while on the full dataset.
tamanho_base = len(df.index)
k_candidatos = list(range(1, round(math.sqrt(tamanho_base)) + 1))
accuracy_k_base = {
    'k': k_candidatos,
    'accuracy': [training_test(df=df, k=k) for k in k_candidatos],
}

fig, ax = plt.subplots(figsize=(15, 7))
ax.set_title('Acurácia X Quantidade de vizinhos', fontsize=22)
ax.set_xlabel('Quantidade de vizinhos', fontsize=18)
ax.set_ylabel('Acurácia', fontsize=18)
ax.plot(accuracy_k_base['k'], accuracy_k_base['accuracy'])

In [None]:
def melhor_k(accuracy_k_base):
    """Return the k value that achieved the highest accuracy.

    Parameters
    ----------
    accuracy_k_base : dict with two parallel lists, 'k' (the neighbour counts that were
        evaluated) and 'accuracy' (the accuracy obtained for each of them).

    Returns
    -------
    The entry of 'k' whose accuracy is the largest; on ties the earliest entry wins.

    Raises
    ------
    ValueError if no accuracies were recorded.
    """
    ks = accuracy_k_base['k']
    accuracies = accuracy_k_base['accuracy']
    if not accuracies:
        raise ValueError('accuracy_k_base has no recorded accuracies')

    # max() keeps the first maximal position, matching a strict "<" scan from the left.
    index_maior = max(range(len(accuracies)), key=accuracies.__getitem__)
    # Return the k stored at that position, not the position itself: the 'k' list starts at 1,
    # so the raw index would be one less than the best k.
    return ks[index_maior]

# Pick the best value for K
k = melhor_k(accuracy_k_base)
if (k % 2) == 0 : k += 1  # an even result is bumped to the next odd number
k

## Quanto à medida de distância utilizada

In [None]:
# Same k, two Minkowski powers: p=1 first, then p=2.
for p in (1, 2):
    accuracy = training_test(df=df, k=k, p=p)
    print(accuracy)

## Atribuição de pesos para vizinhos

In [None]:
# Same k, two neighbour weightings: 'distance' first, then training_test's default 'uniform'.
for weights in ('distance', 'uniform'):
    accuracy = training_test(df=df, k=k, weights=weights)
    print(accuracy)

# Clusterização

In [None]:
import my_methods as m
from sklearn.metrics import silhouette_score

In [None]:
# Load the mall-customers file, shorten the column names and keep the three features used below.
# Note that `df` is reused here and no longer refers to the cardio data.
nomes_curtos = {
    'Age': 'age',
    'Annual Income (k$)': 'annual_income',
    'Spending Score (1-100)': 'score',
}
df = (
    pd.read_csv('Mall_Customers.csv', sep=',')
    .rename(columns=nomes_curtos)
    [['age', 'annual_income', 'score']]
)
df

## Removendo linhas com dados nulos

In [None]:
# Drop every row that has a null in any column.
df = df.dropna()
df

## k-means

In [None]:
from sklearn.cluster import KMeans

### Verificando se há uma relação entre a idade e rendimento do consumidor

In [None]:
# Feature matrix for the age x annual-income clustering.
# NOTE(review): sklearn's Normalizer rescales each ROW to unit norm, so only the ratio between
# the two features is kept — confirm that this, and not per-column scaling, is what is wanted.
df_aux = df.loc[:, ['age', 'annual_income']]
X = Normalizer().fit_transform(df_aux.to_numpy())

In [None]:
# Fit k-means for every candidate k (2..10) and store, per k: the cluster labels, the result of
# m.quantidade_elementos_cluster (presumably the per-cluster sizes — TODO confirm), the
# silhouette score and the inertia.
k_values = range(2, 11)
clusters = {}

for k in k_values:
    # KMeans starts from random centroids; a fixed seed keeps the labels, plots and scores
    # identical across re-runs (42 is the seed already used for train_test_split above).
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    labels = kmeans.labels_

    clusters[str(k)] = {
        'k_elementos': m.quantidade_elementos_cluster(list(labels), k),
        'labels': labels,
        # One-element lists: the elbow-plot cell reads 'distancia' with [0].
        'silhueta': [silhouette_score(X, labels, metric='euclidean')],
        'distancia': [kmeans.inertia_],
    }

In [None]:
# Elbow curve: the inertia stored under 'distancia' for each candidate number of clusters.
y = [clusters[str(k)]['distancia'][0] for k in k_values]

fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(k_values, y)
ax.set_title('n clusters X k-means score')
ax.set_xlabel('n clusters')
ax.set_ylabel('k-means score')
plt.show()

In [None]:
# k = 2: colour every customer by its k-means cluster, then plot age against annual income.
k = 2
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df_aux.plot(
    kind='scatter', x='age', y='annual_income',
    color=df['color'], figsize=(10, 8), title='K = 2',
)
plt.show()

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 3: colour every customer by its k-means cluster, then plot age against annual income.
k = 3
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='age', y='annual_income', color=df['color'], figsize=(10, 8), title='K = 3')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 4: colour every customer by its k-means cluster, then plot age against annual income.
k = 4
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='age', y='annual_income', color=df['color'], figsize=(10, 8), title='K = 4')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 5: colour every customer by its k-means cluster, then plot age against annual income.
k = 5
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='age', y='annual_income', color=df['color'], figsize=(10, 8), title='K = 5')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 6: colour every customer by its k-means cluster, then plot age against annual income.
k = 6
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='age', y='annual_income', color=df['color'], figsize=(10, 8), title='K = 6')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 7: colour every customer by its k-means cluster, then plot age against annual income.
k = 7
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='age', y='annual_income', color=df['color'], figsize=(10, 8), title='K = 7')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

### Verificando se há uma relação entre idade e score

In [None]:
# Feature matrix for the age x spending-score clustering.
# NOTE(review): sklearn's Normalizer rescales each ROW to unit norm, so only the ratio between
# the two features is kept — confirm that this, and not per-column scaling, is what is wanted.
df_aux = df.loc[:, ['age', 'score']]
X = Normalizer().fit_transform(df_aux.to_numpy())

In [None]:
# Fit k-means for every candidate k (2..10) and store, per k: the cluster labels, the result of
# m.quantidade_elementos_cluster (presumably the per-cluster sizes — TODO confirm), the
# silhouette score and the inertia.
k_values = range(2, 11)
clusters = {}

for k in k_values:
    # KMeans starts from random centroids; a fixed seed keeps the labels, plots and scores
    # identical across re-runs (42 is the seed already used for train_test_split above).
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    labels = kmeans.labels_

    clusters[str(k)] = {
        'k_elementos': m.quantidade_elementos_cluster(list(labels), k),
        'labels': labels,
        # One-element lists: the elbow-plot cell reads 'distancia' with [0].
        'silhueta': [silhouette_score(X, labels, metric='euclidean')],
        'distancia': [kmeans.inertia_],
    }

In [None]:
# Elbow curve: the inertia stored under 'distancia' for each candidate number of clusters.
y = [clusters[str(k)]['distancia'][0] for k in k_values]

fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(k_values, y)
ax.set_title('n clusters X k-means score')
ax.set_xlabel('n clusters')
ax.set_ylabel('k-means score')
plt.show()

In [None]:
# k = 2: colour every customer by its k-means cluster, then plot age against spending score.
k = 2
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='age', y='score', color=df['color'], figsize=(10, 8), title='K = 2')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 3: colour every customer by its k-means cluster, then plot age against spending score.
k = 3
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='age', y='score', color=df['color'], figsize=(10, 8), title='K = 3')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 4: colour every customer by its k-means cluster, then plot age against spending score.
k = 4
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='age', y='score', color=df['color'], figsize=(10, 8), title='K = 4')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 5: colour every customer by its k-means cluster, then plot age against spending score.
k = 5
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='age', y='score', color=df['color'], figsize=(10, 8), title='K = 5')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 6: colour every customer by its k-means cluster, then plot age against spending score.
k = 6
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='age', y='score', color=df['color'], figsize=(10, 8), title='K = 6')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 7: colour every customer by its k-means cluster, then plot age against spending score.
k = 7
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='age', y='score', color=df['color'], figsize=(10, 8), title='K = 7')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

### Verificando se há uma relação entre rendimento anual e score

In [None]:
# Feature matrix for the annual-income x spending-score clustering.
# NOTE(review): sklearn's Normalizer rescales each ROW to unit norm, so only the ratio between
# the two features is kept — confirm that this, and not per-column scaling, is what is wanted.
df_aux = df.loc[:, ['annual_income', 'score']]
X = Normalizer().fit_transform(df_aux.to_numpy())

In [None]:
# Fit k-means for every candidate k (2..10) and store, per k: the cluster labels, the result of
# m.quantidade_elementos_cluster (presumably the per-cluster sizes — TODO confirm), the
# silhouette score and the inertia.
k_values = range(2, 11)
clusters = {}

for k in k_values:
    # KMeans starts from random centroids; a fixed seed keeps the labels, plots and scores
    # identical across re-runs (42 is the seed already used for train_test_split above).
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    labels = kmeans.labels_

    clusters[str(k)] = {
        'k_elementos': m.quantidade_elementos_cluster(list(labels), k),
        'labels': labels,
        # One-element lists: the elbow-plot cell reads 'distancia' with [0].
        'silhueta': [silhouette_score(X, labels, metric='euclidean')],
        'distancia': [kmeans.inertia_],
    }

In [None]:
# Elbow curve: the inertia stored under 'distancia' for each candidate number of clusters.
y = [clusters[str(k)]['distancia'][0] for k in k_values]

fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(k_values, y)
ax.set_title('n clusters X k-means score')
ax.set_xlabel('n clusters')
ax.set_ylabel('k-means score')
plt.show()

In [None]:
# k = 2: colour every customer by its k-means cluster, then plot spending score against annual income.
k = 2
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='score', y='annual_income', color=df['color'], figsize=(10, 8), title='K = 2')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 3: colour every customer by its k-means cluster, then plot spending score against annual income.
k = 3
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='score', y='annual_income', color=df['color'], figsize=(10, 8), title='K = 3')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 4: colour every customer by its k-means cluster, then plot spending score against annual income.
k = 4
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='score', y='annual_income', color=df['color'], figsize=(10, 8), title='K = 4')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 5: colour every customer by its k-means cluster, then plot spending score against annual income.
k = 5
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='score', y='annual_income', color=df['color'], figsize=(10, 8), title='K = 5')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 6: colour every customer by its k-means cluster, then plot spending score against annual income.
k = 6
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='score', y='annual_income', color=df['color'], figsize=(10, 8), title='K = 6')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 7: colour every customer by its k-means cluster, then plot spending score against annual income.
k = 7
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = pd.Series(clusters[str(k)]['labels'], index=df_labels.index).map(m.get_color)

df.plot(kind='scatter', x='score', y='annual_income', color=df['color'], figsize=(10, 8), title='K = 7')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 8: colour every customer by its k-means cluster, then plot spending score against annual income.
k = 8
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = clusters[str(k)]['labels']
df_labels['color'] = df_labels.color.map(m.get_color)

# The title was missing here, unlike every other per-k plot in the notebook.
df.plot(x='score', y='annual_income', kind='scatter', color=df['color'], figsize=(10,8), title='K = 8')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

In [None]:
# k = 10: colour every customer by its k-means cluster, then plot spending score against annual income.
k = 10
df_labels = df  # alias of df (not a copy): the 'color' column is written onto df itself
df_labels['color'] = clusters[str(k)]['labels']
df_labels['color'] = df_labels.color.map(m.get_color)

# The title was missing here, unlike every other per-k plot in the notebook.
df.plot(x='score', y='annual_income', kind='scatter', color=df['color'], figsize=(10,8), title='K = 10')

In [None]:
# Report the cluster sizes for this k (helper from my_methods; behaviour inferred from its name — TODO confirm).
m.mostrar_quantidade_elementos_cluster(k, clusters)

In [None]:
# Silhouette view for this k, built from X and the stored labels (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, clusters[str(k)]['labels'])

## AgglomerativeClustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Reload the mall-customers file from scratch and keep only age and annual income.
# NOTE(review): unlike the k-means section, rows with nulls are not dropped here — confirm
# the file has none, otherwise the Normalizer call that follows will fail.
nomes_curtos = {
    'Age': 'age',
    'Annual Income (k$)': 'annual_income',
    'Spending Score (1-100)': 'score',
}
df = (
    pd.read_csv('Mall_Customers.csv', sep=',')
    .rename(columns=nomes_curtos)
    [['age', 'annual_income']]
)
df

In [None]:
# Feature matrix for the hierarchical clustering (each row rescaled to unit norm by Normalizer).
X = Normalizer().fit_transform(df.to_numpy())

In [None]:
# distance_threshold=0 together with n_clusters=None makes the model compute the full merge tree.
model = AgglomerativeClustering(n_clusters=None, distance_threshold=0).fit(X)

plt.figure(figsize=(15, 7))
plt.title('Dendrograma')
plt.xlabel('Clientes')
plt.ylabel('Distância euclidiana')
# Drawn by the project helper; truncate_mode/p are forwarded to it (presumably limiting the
# tree to its top 3 levels — TODO confirm against my_methods.plot_dendrogram).
m.plot_dendrogram(model, truncate_mode="level", p=3)

In [None]:
# Cut the hierarchy at three distance thresholds (1, 1.5 and 3) and keep the labels of each cut,
# keyed by the threshold rendered as a string.
distancias = [1, 1.5, 3]
distancias_labels = {
    str(distancia): AgglomerativeClustering(n_clusters=None, distance_threshold=distancia).fit(X).labels_
    for distancia in distancias
}

### Para distância igual a 1, temos 4 grupos

In [None]:
# k is hard-coded from the section heading, not derived from the labels.
# NOTE(review): confirm it equals the number of distinct labels at this threshold.
k = 4
distancia = distancias[0]
# Store the raw cluster ids; the next cell turns them into colours.
df['color'] = distancias_labels[str(distancia)]
df

In [None]:
# Turn each cluster id into a colour via m.get_color. Re-running only this cell would pass the
# colours themselves back into get_color, so re-run the previous cell first.
df['color'] = df['color'].map(m.get_color)
df

In [None]:
# Scatter of age vs. annual income, coloured by the 'color' column set in the previous cells.
df.plot(x='age', y='annual_income', kind='scatter', color=df['color'], figsize=(10,8), title='K = 4')
plt.show()

In [None]:
# Silhouette view for the labels obtained at this distance threshold (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, distancias_labels[str(distancia)])

### Para distância igual a 1.5, temos 3 grupos

In [None]:
# k is hard-coded from the section heading, not derived from the labels.
# NOTE(review): confirm it equals the number of distinct labels at this threshold.
k = 3
distancia = distancias[1]
# Store the raw cluster ids; the next cell turns them into colours.
df['color'] = distancias_labels[str(distancia)]
df

In [None]:
# Turn each cluster id into a colour via m.get_color. Re-running only this cell would pass the
# colours themselves back into get_color, so re-run the previous cell first.
df['color'] = df['color'].map(m.get_color)
df

In [None]:
# Scatter of age vs. annual income, coloured by the 'color' column set in the previous cells.
df.plot(x='age', y='annual_income', kind='scatter', color=df['color'], figsize=(10,8), title='K = 3')
plt.show()

In [None]:
# Silhouette view for the labels obtained at this distance threshold (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, distancias_labels[str(distancia)])

### Para distância igual a 3, temos 2 grupos

In [None]:
# k is hard-coded from the section heading, not derived from the labels.
# NOTE(review): confirm it equals the number of distinct labels at this threshold.
k = 2
distancia = distancias[2]
# Store the raw cluster ids; the next cell turns them into colours.
df['color'] = distancias_labels[str(distancia)]
df

In [None]:
# Turn each cluster id into a colour via m.get_color. Re-running only this cell would pass the
# colours themselves back into get_color, so re-run the previous cell first.
df['color'] = df['color'].map(m.get_color)
df

In [None]:
# Scatter of age vs. annual income, coloured by the 'color' column set in the previous cells.
df.plot(x='age', y='annual_income', kind='scatter', color=df['color'], figsize=(10,8), title='K = 2')
plt.show()

In [None]:
# Silhouette view for the labels obtained at this distance threshold (presumably a per-sample silhouette plot — TODO confirm).
m.plot_silhouette(X, k, distancias_labels[str(distancia)])

# Apriori/FP-Growth

In [53]:
import requests
from mlxtend.frequent_patterns import apriori, fpgrowth
from mlxtend.frequent_patterns import association_rules

In [54]:
# Download up to 10,000 records from the dados.es.gov.br datastore_search endpoint.
# The timeout stops the cell from hanging forever on an unresponsive server, and
# raise_for_status() turns an HTTP error into an exception here instead of a confusing
# KeyError on the next line.
http_response = requests.get(
    'https://dados.es.gov.br/api/3/action/datastore_search?resource_id=38cc5066-020d-4c5a-b4c0-e9f690deb6d4&limit=10000',
    timeout=60,
)
http_response.raise_for_status()
response = http_response.json()
records = response['result']['records']

In [55]:
def true_false(status):
    """Return True when `status` equals the string 'Sim', and False for any other value."""
    return status == 'Sim'

In [56]:
# Build the symptom/comorbidity table: one row per record, True where the source field is 'Sim'.
# Maps each DataFrame column to the field name used in the downloaded records.
CAMPOS = {
    'febre': 'Febre',
    'tosse': 'Tosse',
    'coriza': 'Coriza',
    'dor_garganta': 'DorGarganta',
    'diarreia': 'Diarreia',
    'cefaleia': 'Cefaleia',
    'dificuldade_respirar': 'DificuldadeRespiratoria',
    'comorbidade_pulmao': 'ComorbidadePulmao',
    'comorbidade_cardio': 'ComorbidadeCardio',
    'comorbidade_renal': 'ComorbidadeRenal',
    'comorbidade_diabetes': 'ComorbidadeDiabetes',
    'comorbidade_obesidade': 'ComorbidadeObesidade',
    'comorbidade_tabagismo': 'ComorbidadeTabagismo',
}

# Collect every row first and build the frame once: DataFrame.append was removed in pandas 2.0,
# and calling it per record copied the whole frame on every iteration.
linhas = [
    {coluna: true_false(record[campo]) for coluna, campo in CAMPOS.items()}
    for record in records
]
# Passing `columns` keeps the column order (and the columns themselves when `records` is empty).
df = pd.DataFrame(linhas, columns=list(CAMPOS))
df

Unnamed: 0,febre,tosse,coriza,dor_garganta,diarreia,cefaleia,dificuldade_respirar,comorbidade_pulmao,comorbidade_cardio,comorbidade_renal,comorbidade_diabetes,comorbidade_obesidade,comorbidade_tabagismo
0,True,True,False,True,True,False,False,False,False,False,False,False,False
1,False,False,True,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,True,True,True,False,False,True,False,False,False,False,False,False,False
4,True,True,True,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,True,True,False,False,False,True,False,False,False,False,False,False,False
9996,True,True,False,False,False,False,False,False,False,False,False,False,False
9997,False,True,False,True,False,False,False,False,False,False,False,False,False
9998,False,False,False,False,True,False,False,False,False,False,False,False,False


In [57]:
# Show the rows that contain a null in any column (an empty result means there are none).
df[df.isnull().any(axis=1)]

Unnamed: 0,febre,tosse,coriza,dor_garganta,diarreia,cefaleia,dificuldade_respirar,comorbidade_pulmao,comorbidade_cardio,comorbidade_renal,comorbidade_diabetes,comorbidade_obesidade,comorbidade_tabagismo


## Apriori

### Padrões Frequentes

In [58]:
# Frequent itemsets with Apriori (minimum support 0.2), listed from most to least frequent.
fp = apriori(df, min_support=0.2, use_colnames=True)
fp.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
4,0.5009,(cefaleia)
1,0.4984,(tosse)
2,0.3719,(coriza)
0,0.3447,(febre)
3,0.3244,(dor_garganta)
8,0.2793,"(cefaleia, tosse)"
6,0.254,"(tosse, coriza)"
9,0.2197,"(cefaleia, coriza)"
10,0.2115,"(dor_garganta, cefaleia)"
7,0.2079,"(dor_garganta, tosse)"


### Regras de Associação

In [59]:
# Association rules derived from the Apriori itemsets, filtered on confidence with a 0.5
# threshold and listed from highest to lowest confidence.
ar = association_rules(fp, metric='confidence', min_threshold=0.5)
ar.sort_values('confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(coriza),(tosse),0.3719,0.4984,0.254,0.682979,1.370344,0.068645,1.582231
7,(dor_garganta),(cefaleia),0.3244,0.5009,0.2115,0.651973,1.301603,0.049008,1.434084
3,(dor_garganta),(tosse),0.3244,0.4984,0.2079,0.640875,1.285866,0.046219,1.39673
0,(febre),(tosse),0.3447,0.4984,0.2076,0.602263,1.208393,0.035802,1.261134
6,(coriza),(cefaleia),0.3719,0.5009,0.2197,0.59075,1.179378,0.033415,1.219549
5,(tosse),(cefaleia),0.4984,0.5009,0.2793,0.560393,1.118773,0.029651,1.135333
4,(cefaleia),(tosse),0.5009,0.4984,0.2793,0.557596,1.118773,0.029651,1.133806
1,(tosse),(coriza),0.4984,0.3719,0.254,0.509631,1.370344,0.068645,1.280872


## FP-Growth

### Padrões Frequentes

In [62]:
# Frequent itemsets with FP-Growth (minimum support 0.25), listed from most to least frequent.
# NOTE(review): the Apriori cell uses min_support=0.2, so the two tables are not directly
# comparable — confirm the different threshold is intended.
fp = fpgrowth(df, min_support=0.25, use_colnames=True)
fp.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
4,0.5009,(cefaleia)
0,0.4984,(tosse)
3,0.3719,(coriza)
1,0.3447,(febre)
2,0.3244,(dor_garganta)
5,0.2793,"(cefaleia, tosse)"
6,0.254,"(tosse, coriza)"


### Regras de Associação

In [64]:
# Association rules derived from the FP-Growth itemsets, filtered on confidence with a 0.5
# threshold and listed from highest to lowest confidence.
ar = association_rules(fp, metric='confidence', min_threshold=0.5)
ar.sort_values('confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
3,(coriza),(tosse),0.3719,0.4984,0.254,0.682979,1.370344,0.068645,1.582231
1,(tosse),(cefaleia),0.4984,0.5009,0.2793,0.560393,1.118773,0.029651,1.135333
0,(cefaleia),(tosse),0.5009,0.4984,0.2793,0.557596,1.118773,0.029651,1.133806
2,(tosse),(coriza),0.4984,0.3719,0.254,0.509631,1.370344,0.068645,1.280872
