# TB1: Fundamentos de Modelos de Grafos Probabilísticos
## CC58 Tópicos en Ciencias de la Computación
### Pedro Shiguihara
#### UPC 2021-1

In [None]:
import ipywidgets as widgets
from ipywidgets import HBox, VBox
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import math
%matplotlib inline

## Generación de DataSet

In [None]:
%%writefile data.txt
outlook,temperature,humidity,windy,play
overcast,hot,high,FALSE,yes
overcast,cool,normal,TRUE,yes
overcast,mild,high,TRUE,yes
overcast,hot,normal,FALSE,yes
rainy,mild,high,FALSE,yes
rainy,cool,normal,FALSE,yes
rainy,cool,normal,TRUE,no
rainy,mild,normal,FALSE,yes
rainy,mild,high,TRUE,no
sunny,hot,high,FALSE,no
sunny,hot,high,TRUE,no
sunny,mild,high,FALSE,no
sunny,cool,normal,FALSE,yes
sunny,mild,normal,TRUE,yes

Overwriting data.txt


## Clase Dataframe

In [None]:
class Dataframe:
    """
    Small in-memory table read from a comma-separated text file.

    The first non-blank line of the file supplies the column headers and every
    following non-blank line is one row. All cells are kept as strings. On top
    of the raw rows the class offers counting, cardinality, and marginal /
    joint probability estimates with additive (pseudo-count) smoothing.

    Attributes:
        M: number of data rows (the header line is not counted).
        hiper_param: pseudo-count added by the probability estimates.
        headers: list of column names.
        data: list of rows, each one a list of string cells.
        cols: list of (column_index, value) pairs read by ``m_c``; starts empty.
        unique_val: dict mapping each header to the distinct values of that
            column, in order of first appearance.
    """
    def __init__(self,path):
        """
        Load the table stored at ``path``.

        Args:
            path: location of the comma-separated text file to read.

        Raises:
            OSError: if the file cannot be opened.
        """
        # Read the file named by ``path`` exactly once; the context manager
        # guarantees the handle is closed even if parsing fails.
        with open(path,"r") as finput:
            lines = [line.rstrip() for line in finput]
        # Blank lines carry no data, so they are neither header nor row.
        lines = [line for line in lines if line]

        self.hiper_param = 1
        self.headers = lines[0].split(',') if lines else []
        self.data = [line.split(',') for line in lines[1:]]
        self.M = len(self.data)
        # Kept from the original implementation: report the row count on load.
        print(self.M)
        self.cols = []
        # Distinct values per column; dict.fromkeys keeps first-appearance order.
        self.unique_val = dict()
        for i, head in enumerate(self.headers):
            self.unique_val[head] = list(dict.fromkeys(row[i] for row in self.data))

    def __str__(self):
        """Return the table as text: a header line, a dashed rule, one line per row."""
        parts = ['|{:>11}| '.format(head) for head in self.headers]
        parts.append("\n---------------------------------------------------------------------\n")
        for row in self.data:
            parts.extend('|{:>11}| '.format(val) for val in row)
            parts.append("\n")
        return "".join(parts)

    def idx(self, name, headers):
        """
        Return the position of ``name`` inside ``headers``.

        NOTE(review): when ``name`` is absent this returns 0, which cannot be
        told apart from a match on the first header. The behaviour is kept
        because callers outside this cell may rely on it.
        """
        for i, header in enumerate(headers):
            if header == name:
                return i
        return 0

    def m(self, param):
        """
        Count the rows that match every ``column: value`` pair in ``param``.

        Args:
            param: dict mapping column names to the required cell value.
                An empty dict matches every row.

        Returns:
            Number of matching rows.

        Raises:
            ValueError: if a key of ``param`` is not one of ``self.headers``.
        """
        # Resolve column names to positions once instead of once per row.
        wanted = [(self.headers.index(column), value) for column, value in param.items()]
        return sum(1 for row in self.data if all(row[pos] == value for pos, value in wanted))

    def m_c(self):
        """Count the rows that match every (column_index, value) pair stored in ``self.cols``."""
        return sum(1 for x in self.data if all(x[col[0]] == col[1] for col in self.cols))

    def card(self, column):
        """
        Return how many distinct values ``column`` takes.

        When the column is unknown a message is printed and None is returned.
        """
        try:
            return len(self.unique_val[column])
        except (KeyError, TypeError):
            # Only a failed lookup is handled here; a bare ``except`` would also
            # swallow KeyboardInterrupt and unrelated programming errors.
            print("No se encuentra la columna")
            return None

    def card_c(self, columns):
        """
        Return the product of the cardinalities of ``columns``.

        An unknown column makes ``card`` return None, so the multiplication
        then raises TypeError.
        """
        cards = 1
        for column in columns:
            cards *= self.card(column)
        return cards

    def prob_marg(self,column, value):
        """
        Smoothed marginal probability estimate of ``column == value``.

        Computed as (m[value] + p) / (M + p * card(column)) with
        p = ``self.hiper_param``, rounded to 6 decimals.
        """
        exp1 = self.m({column:value}) + self.hiper_param
        exp2 = self.M + (self.hiper_param * self.card(column))
        return round(exp1/exp2,6)

    def prob_conjunta(self, param):
        """
        Smoothed joint probability estimate of the assignment in ``param``.

        Computed as (m[param] + p) / (M + p * card_c(columns of param)) with
        p = ``self.hiper_param``, rounded to 6 decimals.
        """
        exp1 = self.m(param) + self.hiper_param
        exp2 = self.M + (self.hiper_param * self.card_c(list(param.keys())))
        return round(exp1/exp2,6)

In [None]:
# Build the Dataframe for data.txt and print it as a fixed-width text table.
df = Dataframe("data.txt")
print(df)

14
|    outlook| |temperature| |   humidity| |      windy| |       play| 
---------------------------------------------------------------------
|   overcast| |        hot| |       high| |      FALSE| |        yes| 
|   overcast| |       cool| |     normal| |       TRUE| |        yes| 
|   overcast| |       mild| |       high| |       TRUE| |        yes| 
|   overcast| |        hot| |     normal| |      FALSE| |        yes| 
|      rainy| |       mild| |       high| |      FALSE| |        yes| 
|      rainy| |       cool| |     normal| |      FALSE| |        yes| 
|      rainy| |       cool| |     normal| |       TRUE| |         no| 
|      rainy| |       mild| |     normal| |      FALSE| |        yes| 
|      rainy| |       mild| |       high| |       TRUE| |         no| 
|      sunny| |        hot| |       high| |      FALSE| |         no| 
|      sunny| |        hot| |       high| |       TRUE| |         no| 
|      sunny| |       mild| |       high| |      FALSE| |         no| 
|   

In [None]:
# Distinct values observed in each column of the dataset.
for key, val in df.unique_val.items():
    print(key,val)

outlook ['overcast', 'rainy', 'sunny']
temperature ['hot', 'cool', 'mild']
humidity ['high', 'normal']
windy ['FALSE', 'TRUE']
play ['yes', 'no']


In [None]:
# Count the rows where outlook is 'overcast' and temperature is 'hot'.
df.m({'outlook':'overcast','temperature':'hot'})

2

In [None]:
# Number of distinct values in the 'outlook' column.
df.card('outlook')

3

In [None]:
# Smoothed marginal probability estimate of outlook == 'overcast'.
df.prob_marg('outlook','overcast')

0.294118

## FUNCIÓN CONTEO

In [None]:
def combinations(iterable, r):
    """
    Lazily yield every length-``r`` selection from ``iterable`` as a tuple.

    Elements keep the order in which they appear in the input, and selections
    are produced in lexicographic order of their positions. Nothing is yielded
    when ``r`` is larger than the number of elements.
    """
    items = tuple(iterable)
    size = len(items)
    if r > size:
        return
    # positions[k] is the index in ``items`` currently chosen for slot k.
    positions = list(range(r))
    yield tuple(items[p] for p in positions)
    while True:
        # Walk from the rightmost slot to the first one that can still advance;
        # slot k tops out at index k + size - r.
        slot = r - 1
        while slot >= 0 and positions[slot] == slot + size - r:
            slot -= 1
        if slot < 0:
            # Every slot is at its maximum: all selections have been produced.
            return
        positions[slot] += 1
        # Reset the slots to the right to the smallest indices that follow.
        for later in range(slot + 1, r):
            positions[later] = positions[later - 1] + 1
        yield tuple(items[p] for p in positions)

In [None]:
def product(*args, repeat=1):
    """
    Yield the Cartesian product of the input iterables as tuples.

    The rightmost input varies fastest. ``repeat`` repeats the whole sequence
    of inputs that many times before the product is taken.
    """
    partial = [[]]
    for candidates in [tuple(seq) for seq in args] * repeat:
        # Extend every prefix built so far with each candidate of this input.
        extended = []
        for prefix in partial:
            for item in candidates:
                extended.append(prefix + [item])
        partial = extended
    yield from map(tuple, partial)

In [None]:
# Cartesian-product check: every pairing of the distinct 'outlook' and 'temperature' values.
for a in product(df.unique_val['outlook'],df.unique_val['temperature'],repeat=1):
    print(a)

('overcast', 'hot')
('overcast', 'cool')
('overcast', 'mild')
('rainy', 'hot')
('rainy', 'cool')
('rainy', 'mild')
('sunny', 'hot')
('sunny', 'cool')
('sunny', 'mild')


In [None]:
#FUNCIÓN AUXILIAR PARA CONVERTIR UN CONJUNTO A TITULO
def to_title(headers):
    """
    Build a short tab title from column names.

    Each name is cut to its first four characters and the pieces are joined
    with " & ". Joining avoids the dangling separator, so the result has no
    trailing space.

    Args:
        headers: iterable of column-name strings.

    Returns:
        The joined title, or an empty string when ``headers`` is empty.
    """
    return " & ".join(head[:4] for head in headers)

In [None]:
#FUNCIÓN AUXILIAR PARA CONVERTIR UN CONJUNTO A LABEL
def to_label(values):
    """
    Build a display label from a combination of values.

    The values are joined with " & ". Joining avoids the dangling separator,
    so the result has no trailing space.

    Args:
        values: iterable of strings.

    Returns:
        The joined label, or an empty string when ``values`` is empty.
    """
    return " & ".join(values)

In [None]:
# For every pair of columns, print the pair and then each combination of their distinct values.
combs = combinations(df.headers,2)
for comb in combs:
    print(comb)
    for values in product(*[df.unique_val[x] for x in comb],repeat=1):
        print(values)
    print()

('outlook', 'temperature')
('overcast', 'hot')
('overcast', 'cool')
('overcast', 'mild')
('rainy', 'hot')
('rainy', 'cool')
('rainy', 'mild')
('sunny', 'hot')
('sunny', 'cool')
('sunny', 'mild')

('outlook', 'humidity')
('overcast', 'high')
('overcast', 'normal')
('rainy', 'high')
('rainy', 'normal')
('sunny', 'high')
('sunny', 'normal')

('outlook', 'windy')
('overcast', 'FALSE')
('overcast', 'TRUE')
('rainy', 'FALSE')
('rainy', 'TRUE')
('sunny', 'FALSE')
('sunny', 'TRUE')

('outlook', 'play')
('overcast', 'yes')
('overcast', 'no')
('rainy', 'yes')
('rainy', 'no')
('sunny', 'yes')
('sunny', 'no')

('temperature', 'humidity')
('hot', 'high')
('hot', 'normal')
('cool', 'high')
('cool', 'normal')
('mild', 'high')
('mild', 'normal')

('temperature', 'windy')
('hot', 'FALSE')
('hot', 'TRUE')
('cool', 'FALSE')
('cool', 'TRUE')
('mild', 'FALSE')
('mild', 'TRUE')

('temperature', 'play')
('hot', 'yes')
('hot', 'no')
('cool', 'yes')
('cool', 'no')
('mild', 'yes')
('mild', 'no')

('humidity', '

In [None]:
#@title Tabla de Conteo
n_variables =  2#@param {type:"integer"}

# Materialise the header combinations once so the same sequence drives both
# the tab contents and the tab titles.
combs = list(combinations(df.headers,n_variables))
children = list()

for head_comb in combs:
    labels = list()
    values = list()

    plt_labels = []
    plt_values = []

    for val_combs in product(*[df.unique_val[x] for x in head_comb],repeat=1):
        # Left column: one button showing this combination of values.
        text = to_label(val_combs)
        label = widgets.Button(description = text,layout=widgets.Layout(width="100%"))
        plt_labels.append(text)

        # Pair each selected column with its value and count the matching rows
        # once; the same number feeds both the button and the bar chart.
        param = dict(zip(head_comb, val_combs))
        count = df.m(param)

        # Right column: the count for this combination.
        value = widgets.Button(description = str(count))
        plt_values.append(count)

        labels.append(label)
        values.append(value)

    # Render the bar chart inside an Output widget so it sits next to the table.
    out = widgets.Output()
    with out:
        plt.figure(figsize=(10, 5))
        plt.bar(plt_labels,plt_values,align='center',edgecolor='black') # A bar chart
        plt.ylabel('Frecuencia')
        plt.yticks(np.arange(0, max(plt_values)+1, 1))
        plt.xticks(rotation=45)
        plt.show()

    label_box = widgets.VBox(labels)
    val_box = widgets.VBox(values)
    # One row per tab: labels column, counts column, chart.
    children.append(widgets.HBox([label_box,val_box,out]))

tab = widgets.Tab()
tab.children = children
for i,title in enumerate([to_title(comb) for comb in combs]):
    tab.set_title(i,title)
display(tab)

Tab(children=(HBox(children=(VBox(children=(Button(description='overcast & hot & high ', layout=Layout(width='…

# DISTRIBUCIÓN MARGINAL

In [None]:
#@title Tabla de Distribucion Marginal
children = list()
for head in df.headers:
    labels = [widgets.Button(description = x) for x in df.unique_val[head]]
    values = []

    plt_labels = [x for x in df.unique_val[head]]
    plt_values = []

    for elem in df.unique_val[head]:
        # Evaluate the estimate once; it feeds both the button and the chart.
        prob = df.prob_marg(head, elem)
        values.append(widgets.Button(description = str(prob)))
        plt_values.append(float(prob))

    # Render the bar chart inside an Output widget so it sits next to the table.
    out = widgets.Output()
    with out:
        plt.figure(figsize=(10, 5))
        plt.bar(plt_labels,plt_values,align='center',edgecolor='black') # A bar chart
        # The bars are probability estimates in [0, 1], not raw counts.
        plt.ylabel('Probabilidad')
        plt.xticks(rotation=45)
        plt.ylim([0, 1])
        plt.yticks(np.arange(0, 1.1, 0.1))
        plt.show()

    label_box = widgets.VBox(labels)
    val_box = widgets.VBox(values)
    children.append(widgets.HBox([label_box,val_box,out]))

# One tab per column, titled with the column name.
tab = widgets.Tab()
tab.children = children
for i in range(len(df.headers)):
    tab.set_title(i,df.headers[i])

tab

Tab(children=(HBox(children=(VBox(children=(Button(description='overcast', style=ButtonStyle()), Button(descri…

# DISTRIBUCIÓN CONJUNTA

In [None]:
#PRUEBA PROB_CONJUNTA
df.prob_conjunta({'outlook':'overcast', 'temperature':'hot'})

0.130435

In [None]:
#@title Tabla de Distribucion Conjunta
n_variables =  3#@param {type:"integer"}

# Materialise the header combinations once so the same sequence drives both
# the tab contents and the tab titles.
combs = list(combinations(df.headers,n_variables))
children = list()

for head_comb in combs:
    labels = list()
    values = list()

    plt_labels = []
    plt_values = []

    for val_combs in product(*[df.unique_val[x] for x in head_comb],repeat=1):
        # Left column: one button showing this combination of values.
        text = to_label(val_combs)
        label = widgets.Button(description = text,layout=widgets.Layout(width="100%"))
        plt_labels.append(text)

        # Pair each selected column with its value and evaluate the joint
        # estimate once; the same number feeds both the button and the chart.
        param = dict(zip(head_comb, val_combs))
        prob = df.prob_conjunta(param)

        # Right column: the estimate for this combination.
        value = widgets.Button(description = str(prob))
        plt_values.append(prob)

        labels.append(label)
        values.append(value)

    # Render the bar chart inside an Output widget so it sits next to the table.
    out = widgets.Output()
    with out:
        plt.figure(figsize=(10, 5))
        plt.bar(plt_labels,plt_values,align='center',edgecolor='black') # A bar chart
        # The bars are probability estimates in [0, 1], not raw counts.
        plt.ylabel('Probabilidad')
        plt.ylim([0, 1])
        plt.yticks(np.arange(0, 1.1, 0.1))
        plt.xticks(rotation=90)
        plt.show()

    label_box = widgets.VBox(labels)
    val_box = widgets.VBox(values)
    # One row per tab: labels column, estimates column, chart.
    children.append(widgets.HBox([label_box,val_box,out]))

tab = widgets.Tab()
tab.children = children
for i,title in enumerate([to_title(comb) for comb in combs]):
    tab.set_title(i,title)
tab

Tab(children=(HBox(children=(VBox(children=(Button(description='overcast & hot & high ', layout=Layout(width='…

# PROBABILIDAD CONDICIONAL

In [None]:
def fact(n):
  """Return n! for an integer n; any n below 2 (including negatives) gives 1."""
  result = 1
  # Multiply downwards from n to 2; the order does not affect an integer product.
  for factor in range(n, 1, -1):
    result *= factor
  return result

In [None]:
def get_subsets(df):
  """
  Return every non-empty subset of ``df.headers`` as a list of lists.

  Subsets are ordered by size; within one size they keep the order of their
  bitmask (bit j of the mask selects ``df.headers[j]``). Inside each subset
  the names follow header order.
  """
  total = len(df.headers)
  by_mask = [
    [df.headers[bit] for bit in range(total) if (mask >> bit) & 1]
    for mask in range(1, 2 ** total)
  ]
  # sorted() is stable, so equal-sized subsets stay in mask order.
  return sorted(by_mask, key=len)

In [None]:
# Build the labels "P(X|Y1,...,Yk)" for every column X and every conditioning
# set of k other columns, grouped by k = 1 .. len(headers) - 1.
subsets = get_subsets(df)
combs = []

nfact = fact(len(df.headers))
nkfact = nfact
k = 1
lo = 0

for var in range(1, len(df.headers)):
  # After these two updates nkfact == (n - var)! and k == var!, so hi below is
  # the binomial coefficient C(n, var): the number of subsets of size var.
  # Integer division keeps the arithmetic exact (both divisions are exact).
  nkfact = nkfact//(len(df.headers) - var + 1)
  k*= var

  hi = nfact//(k*nkfact)

  aux = []

  for elem in df.headers:
    # subsets is sorted by size, so [lo, lo + hi) holds exactly the subsets of size var.
    for i in range(lo, lo + hi):
      # A variable cannot be conditioned on itself.
      if elem in subsets[i]:
        continue

      aux.append("P(" + elem + "|" + ','.join(subsets[i]) + ")" )

  combs.append(aux)
  lo+= hi

In [None]:
# Show the conditional-probability labels built above (one inner list per conditioning-set size).
combs

In [None]:
#@title Probabilidades Condicionales
children = list()
for comb in combs:
    # One full-width button per conditional-probability label in this group.
    labels = list()
    for elem in comb:
        label = widgets.Button(description=(elem),layout=widgets.Layout(width="100%"))
        labels.append(label)

    label_box = widgets.VBox(labels)
    children.append(widgets.HBox([label_box]))

tab = widgets.Tab()
tab.children = children
# Title exactly the tabs that exist; a hard-coded count addresses a tab that
# is not there whenever the number of groups differs from it.
for i in range(len(children)):
    tab.set_title(i,str(i+1)+" VARIABLES")
tab

Tab(children=(HBox(children=(VBox(children=(Button(description='P(outlook|temperature)', layout=Layout(width='…

In [None]:
def calc_factores(*args):
    """
    Return every way of picking one element from each group in ``args``.

    Each result is a list with one element per group, in group order. The
    first group varies fastest and the last group slowest. With no groups
    the result is ``[[]]``; if any group is empty the result is ``[]``.
    """
    factors = [[]]
    for group in args:
        # Iterating the new group in the outer position keeps the earlier
        # groups cycling fastest.
        factors = [prefix + [item] for item in group for prefix in factors]
    return factors
  

In [None]:
# Small integer lists used as sample inputs (see the calc_factores demo).
cero = [0,1]
uno = [0,1]
dos = [0,1,2]  

In [None]:
# Print every combination of one element from dos and one from uno; the first group advances fastest.
for factor in calc_factores(dos,uno):
  print(factor)

[0, 0]
[1, 0]
[2, 0]
[0, 1]
[1, 1]
[2, 1]


In [None]:
def normalizar(valores):
    valores_normalizados = []
    total = 0
    for valor in valores:
        total += valor
    for valor in valores:
        valores_normalizados.append(valor/total)
      