In [1]:
import math
import json
import numpy as np
#Para leer archivo arfff
from scipy.io import arff
#Para manipular los conjuntos de datos más fácil
import pandas as pd
#Para medir la precisión del clasificador
from sklearn.metrics import accuracy_score

### Carga los conjuntos de datos flags-train y flags-test 

In [9]:
# Lee el archivo .arff y lo carga a algo parecido a un diccionario
data_flag = arff.loadarff('./flags/flags-train.arff')
# Crea el dataframe del conjunto de entrenamiento
train_df_flags = pd.DataFrame(data_flag[0])
# Lo mismo pero con el conjunto de prueba
data_flag = arff.loadarff('./flags/flags-test.arff')
test_df_flags = pd.DataFrame(data_flag[0])
# Descomentar si sólo se quieren unas cuantas instancias
labels = ['red','green','blue','yellow','white','black','orange']
train_df_flags.shape

(129, 26)

In [10]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.base import clone
from sklearn.utils.validation import check_is_fitted
class CircularChainClassifier(object):
	"""
	Súper clase de clasificador en cadena circular, recibe
	como parámetros un dataframe de pandas que contiene al 
	conjunto de datos y la lista de atributos que son las posibles
	etiquetas de cada ejemplo.
	"""
	def __init__(self, classifier):
		self.classifier = classifier

	def train(self, X, labels):
		self.list_of_classifiers = { label : clone(self.classifier) for label in labels }
		for lab in self.list_of_classifiers:
			print(id(self.list_of_classifiers[lab]))
		self.train_set_x = X
		self.labels = labels
		self.visited = { label : False for label in labels }
		for label in self.labels:
			self.train_one_link(label)
	def train_one_link(self, label, weirdo=False):
		X = self.drop_not_depend_on_columns(self.train_set_x, label)
		y = self.train_set_x[label]
		print("label {} with shape {}".format(label, X.shape))
		self.list_of_classifiers[label].fit(X, y)
		print(self.list_of_classifiers[label].fit(X, y))
	def drop_not_depend_on_columns(self, X, label):
		label_index = self.labels.index(label)
		if label_index + 1 == len(self.labels):
			return X.drop(label, axis=1)
		labels_to_drop = self.labels[label_index:]
		X = X.drop(labels_to_drop, axis=1)
		return X
	def run(self, X):
		print("running...")
		for classifier_label in self.list_of_classifiers:
			print(classifier_label)
			label = classifier_label
			y = X[label]
			X_hat = self.drop_not_depend_on_columns(X, classifier_label)
			print(X_hat.shape)
			try:
				y_pred = self.list_of_classifiers[classifier_label].predict(X_hat)
				print(y_pred)
			except Exception as e:
				print(e)
			print("Accuracy score for classifier {} = {}".format(label, accuracy_score(y, y_pred)))
	def classify(self):
		pass
	def evaluation(self):
		pass


In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
labels = ['red','green','blue','yellow','white','black','orange']
train_df_flags = train_df_flags[train_df_flags.columns[:]].apply(le.fit_transform)
test_df_flags = test_df_flags[test_df_flags.columns[:]].apply(le.fit_transform)
ccc = CircularChainClassifier(MultinomialNB())
ccc.train(train_df_flags, labels)
ccc.run(test_df_flags)

139946612634904
139946612635072
139946612634568
139946612633672
139946612634848
139946612634232
139946612633896
label red with shape (129, 19)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
label green with shape (129, 20)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
label blue with shape (129, 21)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
label yellow with shape (129, 22)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
label white with shape (129, 23)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
label black with shape (129, 24)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
label orange with shape (129, 25)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
running...
yellow
(65, 22)
[0 0 1 1 1 1 0 0 1 0 0 0 1 0 1 0 1 0 1 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 0 1
 0 1 0 0 0 1 0 0 1 1 1 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1]
Accuracy score for classifier yellow = 0.676923076923077
blue
(65, 21)
[0 1 1 1 1 1 1

### Carga los conjuntos de datos emotions-train y emotions-test 

In [3]:
data_emotions = arff.loadarff('./emotions/emotions-train.arff')
train_df_emo = pd.DataFrame(data_emotions[0])
data_emotions = arff.loadarff('./emotions/emotions-test.arff')
test_df_emo = pd.DataFrame(data_emotions[0])

### Funciones para hacer la discretización de variables continuas usando PKID

In [101]:
global inf
inf = 10**20

def different(prev, current):
    eps=10e-5
    if prev != inf:
        return abs(prev-current) > eps
    else:
        return True

def proportional_k_interval_discretization(df):
    """
    Aplica dicretización intervalo k proporcional a un dataframe de
    pandas.
    """
    n = len(df)
    n_sqrt = int(math.sqrt(n))
    for attribute in df:
        if df[attribute].dtype != 'object':
            df = df.sort_values(attribute)
            local_index = 0
            current_interval = 0
            pred = inf
            for index, row in df.iterrows():
                #el intervalo incrementa cada sqrt(n) 
                #el tamaño si debe crecer
                local_index %= n_sqrt
                pred = df.at[index, attribute]
                df.at[index, attribute] = current_interval
                if local_index == n_sqrt - 1 and different(pred, df.at[index,attribute]) and current_interval < n_sqrt-1:
                    current_interval += 1
                    
                if not (local_index == n_sqrt-1 and not different(pred,df.at[index,attribute])):
                    local_index += 1                   
    return df.sort_index()

### Implementación del entrenador de Bayes

In [102]:
def train_bayes_freq(labels, dataframe, output_file="freqs.json"):
    """
    Función para crear un JSON con las frecuencias de los atributos para después
    calcular las probabilidades condicionales y a priori.
    
    Dado un conjunto de datos como el siguiente:
    
    +----------+------+----------+----------+----------+----------+------+---------+------+------+
    | landmass | zone | language | religion | crescent | triangle | icon | animate | text | red  |
    +----------+------+----------+----------+----------+----------+------+---------+------+------+
    | b'5'     | b'1' | b'10'    | b'7'     | b'0'     | b'0'     | b'0' | b'0'    | b'0' | b'0' |
    +----------+------+----------+----------+----------+----------+------+---------+------+------+
    | b'6'     | b'1' | b'1'     | b'1'     | b'0'     | b'0'     | b'1' | b'1'    | b'1' | b'1' |
    +----------+------+----------+----------+----------+----------+------+---------+------+------+
    | b'5'     | b'1' | b'8'     | b'2'     | b'0'     | b'0'     | b'0' | b'0'    | b'0' | b'1' |
    +----------+------+----------+----------+----------+----------+------+---------+------+------+
    | b'5'     | b'1' | b'8'     | b'2'     | b'0'     | b'0'     | b'0' | b'0'    | b'0' | b'1' |
    +----------+------+----------+----------+----------+----------+------+---------+------+------+
    
    Se genera el JSON:
    
    {
      "language": {
        "b'10'": 1,
        "b'2'": 1,
        "b'8'": 1,
        "numberOfClasses": 4,
        "b'1'": 1,
        "red": {
          "b'10'": {
            "b'1'": 1
          },
          "b'1'": {
            "b'1'": 1
          },
          "b'2'": {
            "b'0'": 1
          },
          "b'8'": {
            "b'1'": 1
          }
        }
      },
      "triangle": {
        "numberOfClasses": 2,
        "b'1'": 1,
        "b'0'": 3,
        "red": {
          "b'1'": {
            "b'1'": 1
          },
          "b'0'": {
            "b'0'": 1
          }
        }
      },
      "text": {
        "numberOfClasses": 1,
        "b'0'": 4,
        "red": {
          "b'0'": {
            "b'0'": 1
          }
        }
      },
      "zone": {
        "b'4'": 3,
        "numberOfClasses": 2,
        "b'1'": 1,
        "red": {
          "b'4'": {
            "b'0'": 1
          },
          "b'1'": {
            "b'1'": 1
          }
        }
      },
      "landmass": {
        "b'4'": 2,
        "numberOfClasses": 2,
        "b'1'": 2,
        "red": {
          "b'4'": {
            "b'1'": 2
          },
          "b'1'": {
            "b'1'": 1
          }
        }
      },
      "crescent": {
        "numberOfClasses": 2,
        "b'1'": 1,
        "b'0'": 3,
        "red": {
          "b'1'": {
            "b'1'": 1
          },
          "b'0'": {
            "b'0'": 1
          }
        }
      },
      "N": 4,
      "icon": {
        "numberOfClasses": 1,
        "b'0'": 4,
        "red": {
          "b'0'": {
            "b'0'": 1
          }
        }
      },
      "religion": {
        "b'2'": 1,
        "b'5'": 1,
        "numberOfClasses": 4,
        "b'1'": 1,
        "b'0'": 1,
        "red": {
          "b'5'": {
            "b'1'": 1
          },
          "b'1'": {
            "b'1'": 1
          },
          "b'2'": {
            "b'1'": 1
          },
          "b'0'": {
            "b'0'": 1
          }
        }
      },
      "animate": {
        "numberOfClasses": 2,
        "b'1'": 1,
        "b'0'": 3,
        "red": {
          "b'1'": {
            "b'1'": 1
          },
          "b'0'": {
            "b'0'": 1
          }
        }
      },
      "red": {
        "numberOfClasses": 2,
        "b'1'": 3,
        "b'0'": 1
      }
    }
    
    :param labels: Un arreglo con el nombre de las etiquetas u objetivos del conjunto de datos.
    :type labels: list.
    :param dataframe: El dataframe de pandas con el conjunto de datos de entrenamiento.
    :type dataframe: pandas.Dataframe.
    
    """
    
    nc = 'numberOfClasses'
    len_training_instances = 'N'
    
    # El diccionario donde se almacenan las frecuencias 
    # y que se guarda en un JSON al final de la función
    
    frequency = dict()
    frequency[len_training_instances] = len(dataframe)
    
    # Por cada atributo objetivo (etiqueta) contamos frecuencias
    for label in labels:
        
        freq_label = dataframe[label].value_counts()
        
        # Si la etiqueta no está en el diccionario la agregamos y 
        # además añadimos el número de clases distintas
        if not label in frequency:
            frequency[label] = dict()
            frequency[label][nc] = len(dataframe[label].unique())
        # Iteramos sobre todos los posibles valores de la clase y sus 
        # frecuencias
        for label_val, freq in freq_label.iteritems():
            # Guardamos en el diccionarios los nombres de las clases de nuestra etiqueta y
            # sus frecuencias
            frequency[label][label_val] = freq
        
        # Iteramos sobre todas las instancias de entrenamiento.
        for attribute in dataframe:
            if attribute == label:
                continue
            # Si no existe el atributo en nuestro diccionario de frecuencias.
            if not attribute in frequency:
                frequency[attribute] = dict()
                # El número de valores distintos que puede tomar el atributo
                frequency[attribute][nc] =len(dataframe[attribute].unique())
                
                # Por cada valor distinto que puede tomar el atributo,
                # sacamos la frecuencia.
                freq_attr = dataframe[attribute].value_counts()
                for att_val, freq in freq_attr.iteritems():
                    frequency[attribute][att_val] = freq
            
            # Calculas las frecuencias por cada valor que toma el atributo 
            # y cada valor que puede tomar la etiqueta. Esto es X_i = x_i AND C = c
            frequency[attribute][label] = dict()
            freq_attr_and_label_series = dataframe.groupby(attribute)[label].value_counts()
            for index, value in freq_attr_and_label_series.iteritems():
                attr_value = index[0]
                label_value = index[1]
                frequency[attribute][label][attr_value] = dict()
                frequency[attribute][label][attr_value][label_value] = value

    # Guardo el diccionario como un JSON.
    with open(output_file, 'w') as outfile:
        json.dump(frequency, outfile)


def apply_bayes(label, dataframe, input_file="freqs.json", k=1, m=2):
    """
    Aplica Naive Bayes a un conjunto de prueba en un dataframe de pandas.
    
    
    """
    pred_labels = []
    key_nc = 'numberOfClasses'
    
    # Obtiene un objeto con los valores de la clase.
    y = dataframe[label].unique()
    
    # Abre el archivo con las frecuencias del conjunto de entrenamiento.
    with open(input_file, 'r') as f:
        freq = json.loads(f.read())
    
    # Define el N, el número de instancias.
    N = freq['N']
    
    
    # Se itera sobre cada instancia de prueba.
    for _, row in dataframe.iterrows():
        
        # Se incializa la variable que guarda la etiqueta de la instancia.
        row_label = None
        max_prob = -10E10
        
        # Por cada valor posible de la clase u etiqueta.
        for y_hat in y:
            #cast_y_hat = y_hat.decode('ASCII') if type(y_hat) is bytes else y_hat
            
            # nc = # de instancias que satisfacen C = c
            nc = freq[label].get(y_hat, 0)
            
            # n  = # de clases
            n = freq[label][key_nc]
            
            # Calculamos P(C=c) usando Laplace-estimate
            p_c_laplace_estimator = (nc + k) / (N + n * k)
            ans = p_c_laplace_estimator
            
            #print("P({}={}) = {}".format(label, cast_y_hat, ans))
            # Iteramos sobre cada atributo de la instancia para
            # calcular P(Xi=xi | C=c) usando M-estimate
            for attr, x_i in row.iteritems():
                if attr == label:
                    continue
                p_xi_laplace_estimator = 0
                
                # Obtenemos el # de instancias que satisfacen Xi = xi
                n_xi = freq[attr].get(x_i, 0)
                
                # Obtenemos el # de valores posibles que toma Xi
                n = freq[attr][key_nc]
                
                #print("({} + {}) / ({} + {} * {})".format(n_xi, k, N, n, k))
                
                # Calculamos P(Xi=xi) usando Laplace-estimate
                p_xi_laplace_estimator = (n_xi + k) / (N + n * k)
                nci = 0
                #print("P({}={}) = {}".format(attr, x_i, p_xi_laplace_estimator))
                
                # Obtenemos el número de instancias que satisdacen Xi = xi y C = c
                if x_i in freq[attr][label] and y_hat in freq[attr][label][x_i]:
                    nci = freq[attr][label][x_i][y_hat]
                #print("({} + {} * {}) / ({} + {})".format(nci, m, p_xi_laplace_estimator, nc, m))
                
                # Calculamos P(Xi=xi | C=c) usando M-estimate
                m_estimator_xi_given_c = (nci + m * p_xi_laplace_estimator) / (nc + m)
                
                #print("P({}={}|{}={}) = {}".format(attr, x_i, label, cast_y_hat, m_estimator_xi_given_c))
                
                # Hacemos el producto de la probabilidades
                ans *= m_estimator_xi_given_c
            # Cambio la etiqueta si la probabilidad es mayor para este valor de la 
            # etiqueta.
            if ans > max_prob:
                row_label = y_hat
                max_prob = ans
        # Agrego a mi vector de etiquetas inferida
#         pred_labels.append("b'0'" if row_label == "b'1'" else "b'1'")
        pred_labels.append(row_label)
    
    pred_labels = np.array(pred_labels)
    correct_labels = dataframe[label]
    print("Accuracy Score = {}".format(accuracy_score(correct_labels, pred_labels)))
    #print("Accuracy Score Not Normalized = {}".format(accuracy_score(correct_labels, pred_labels, normalize=False)))

In [177]:
from sklearn.preprocessing import KBinsDiscretizer
# train_df_flags['area'] = pd.qcut(train_df_flags['area'], 5, labels=False)
for attr in train_df_flags:
    if train_df_flags[attr].dtype != 'object':
        train_df_flags[attr] = pd.cut(train_df_flags[attr], 10, labels=False)
        test_df_flags[attr] = pd.cut(test_df_flags[attr], 10, labels=False)

In [185]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
labels = ['red','green','blue','yellow','white','black','orange']
# train_df_flags = proportional_k_interval_discretization(train_df_flags)
# test_df_flags = proportional_k_interval_discretization(test_df_flags)
train_df_flags = train_df_flags[train_df_flags.columns[:]].apply(le.fit_transform)
test_df_flags = test_df_flags[test_df_flags.columns[:]].apply(le.fit_transform)
train_bayes_freq(labels, train_df_flags)

In [186]:
for lab in labels:
    apply_bayes(lab, test_df_flags)


Accuracy Score = 0.7538461538461538
Accuracy Score = 0.5846153846153846
Accuracy Score = 0.49230769230769234
Accuracy Score = 0.46153846153846156
Accuracy Score = 0.7846153846153846
Accuracy Score = 0.3230769230769231
Accuracy Score = 0.9076923076923077


In [187]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

for label in labels:
    X = train_df_flags.drop(label, axis=1)
    y = train_df_flags[label]
    X  = proportional_k_interval_discretization(X)
    
    clf = MultinomialNB()
    le = LabelEncoder()
    X = X[X.columns[:]].apply(le.fit_transform)
    Y = pd.to_numeric(y, downcast='signed')
    clf.fit(X, Y)
    X_test = test_df_flags.drop(label, axis=1)
    X_test = X_test[X_test.columns[:]].apply(le.fit_transform)
    Y_test = pd.to_numeric(test_df_flags[label], downcast='signed')
    y_pred = clf.predict(X_test)
    print("Accuracy score = {}".format(accuracy_score(Y_test, y_pred)))

Accuracy score = 0.49230769230769234
Accuracy score = 0.49230769230769234
Accuracy score = 0.676923076923077
Accuracy score = 0.6307692307692307
Accuracy score = 0.4153846153846154
Accuracy score = 0.6923076923076923
Accuracy score = 0.7846153846153846


In [16]:
labels = ['amazed-suprised', 'happy-pleased', 'relaxing-calm', 'quiet-still', 'sad-lonely', 'angry-aggresive']
#train_df_emo = proportional_k_interval_discretization(train_df_emo)
#test_df_emo = proportional_k_interval_discretization(test_df_emo)
train_bayes_freq(labels, train_df_emo, 'train_emo_freq.json')


In [17]:
for lab in labels:
    apply_bayes(lab, test_df_emo, 'train_emo_freq.json')


Accuracy Score = 0.7326732673267327
Accuracy Score Not Normalized = 148
Accuracy Score = 0.7079207920792079
Accuracy Score Not Normalized = 143
Accuracy Score = 0.5247524752475248
Accuracy Score Not Normalized = 106
Accuracy Score = 0.7079207920792079
Accuracy Score Not Normalized = 143
Accuracy Score = 0.6386138613861386
Accuracy Score Not Normalized = 129
Accuracy Score = 0.7128712871287128
Accuracy Score Not Normalized = 144


In [18]:
data = pd.DataFrame()

# Create our target variable
data['Gender'] = ['male','male','male','male','female','female','female','female']

# Create our feature variables
data['Height'] = [6,5.92,5.58,5.92,5,5.5,5.42,5.75]
data['Weight'] = [180,190,170,165,100,150,130,150]
data['Foot_Size'] = [12,11,12,10,6,8,7,9]

# View the data
data_discrete = proportional_k_interval_discretization(data)
data_discrete

Unnamed: 0,Gender,Height,Weight,Foot_Size
4,female,0.0,0,0
6,female,0.0,0,0
5,female,1.0,1,1
7,female,1.0,1,1
3,male,1.0,1,1
1,male,1.0,1,1
2,male,1.0,1,1
0,male,1.0,1,1


In [19]:
data['Gender']

0      male
1      male
2      male
3      male
4    female
5    female
6    female
7    female
Name: Gender, dtype: object

In [20]:
train_bayes_freq(['Gender'], data_discrete, output_file="sex")

In [21]:
person = pd.DataFrame()

# Create some feature values for this single row
person['Gender'] = 'female'
person['Height'] = [6]
person['Weight'] = [130]
person['Foot_Size'] = [8]

# View the data 
person = proportional_k_interval_discretization(person)

In [22]:
apply_bayes('Gender', person, "sex")

Accuracy Score = 0.0
Accuracy Score Not Normalized = 0
