In [748]:
import decimal
import math
import sys
from math import e

import numpy as np
from scipy.special import expit

from describe import DataSet
from describe import Math_calculat


# Hogwarts House labels: Ravenclaw, Slytherin, Gryffindor, Hufflepuff

class LogisticRegression:
    """Binary (one-vs-rest) logistic regression for a single Hogwarts house.

    Hogwarts House = [Ravenclaw, Slytherin, Gryffindor, Hufflepuff]

    Samples are read from a CSV file through the project's ``DataSet``
    helper.  Rows whose value in column 1 equals ``y_true`` are labelled 1,
    every other row 0, and ``theta`` is learned by batch gradient descent.

    Parameters
    ----------
    file : str
        Path of the CSV file handed to ``DataSet``.
    y_true : str
        House treated as the positive class; must be one of the four houses.
    x_columns : list or None
        Column indices to use as features.  Entries that are not in
        ``ds.numeric_columns`` are ignored; when nothing usable is left,
        every numeric column is used.
    size : int
        Number of leading rows taken from each column.
    lr : float
        Learning rate, must lie in [0, 1].
    num_iter : int
        Number of gradient-descent iterations, must be >= 1.
    fit_intercept : bool
        Whether a leading column of ones is added to the feature matrix.
    verbose : bool
        When true, the loss is printed every 10000 iterations.

    Invalid parameters print an error message and terminate via ``sys.exit()``.
    """

    _HOUSES = ('Ravenclaw', 'Slytherin', 'Gryffindor', 'Hufflepuff')
    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logarithms.
    _EPS = 1e-15

    def __init__(self, file='datasets/dataset_train.csv', y_true='Slytherin', x_columns=None, size=10, lr=0.15, num_iter=2000, fit_intercept=True, verbose=False):
        if x_columns is None:
            # A fresh list per instance; a shared ``[]`` default would leak
            # mutations from one model into every later one.
            x_columns = []
        if y_true not in self._HOUSES:
            print ("Error: bad parameter y_true")
            sys.exit()
        if num_iter < 1 or lr < 0 or lr > 1:
            print ("Error: bad parameter num_iter or lr")
            sys.exit()
        if not isinstance(x_columns, list):
            print ("Error: x_columns must will be list int")
            sys.exit()
        self.lr = lr
        self.num_iter = num_iter
        self.verbose = verbose
        self.fit_intercept = fit_intercept
        self.y_true = y_true
        self.size = size
        self.x_columns = list(x_columns)
        self.file = file
        self.theta = []


    def __add_intercept(self, X):
        """Return ``X`` with a leading column of ones (the bias feature)."""
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)


    def __sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z)).

        ``scipy.special.expit`` is used instead of the hand-written formula so
        large negative ``z`` does not trigger an overflow in ``np.exp(-z)``.
        """
        return expit(z)


    def __loss(self, h, y):
        """Mean binary cross-entropy of probabilities ``h`` against labels ``y``."""
        # Keep h strictly inside (0, 1): log(0) would turn the mean into NaN/inf.
        h = np.clip(h, self._EPS, 1 - self._EPS)
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()


    def get_x_y(self, ds, return_y=True):
        """Build the feature matrix (and optionally the labels) from ``ds``.

        ``ds`` must provide ``numeric_columns``, ``get_float_col(i)`` and
        ``get_col(i)``.  Only the first ``self.size`` values of each column
        are used.

        Returns
        -------
        X : ndarray, one row per sample and one column per selected feature.
        y : ndarray of 0/1, only when ``return_y`` is true; 1 where column 1
            of ``ds`` equals ``self.y_true``.
        """
        columns = [c for c in self.x_columns if c in ds.numeric_columns]
        if not columns:
            # Nothing requested (or nothing usable): use every numeric column.
            columns = ds.numeric_columns
        if not columns:
            print ("Error: no numeric columns in dataset")
            sys.exit()

        values = [ds.get_float_col(c)[:self.size] for c in columns]
        # Transpose column-major lists into one row per sample.
        X = np.array([[col[row] for col in values] for row in range(len(values[0]))])
        if not return_y:
            return X
        y = [1 if house == self.y_true else 0 for house in ds.get_col(1)[:self.size]]
        return X, np.array(y)

    def fit(self):
        """Train ``self.theta`` by batch gradient descent on ``self.file``.

        ``theta`` starts from unseeded ``np.random.randn`` values, so results
        vary from run to run unless the caller seeds NumPy beforehand.
        """
        ds = DataSet(filename=self.file)
        ds.find_numeric_label()
        X, y = self.get_x_y(ds)
        if self.fit_intercept:
            X = self.__add_intercept(X)

        self.theta = np.random.randn(X.shape[1])
        for i in range(self.num_iter):
            h = self.__sigmoid(np.dot(X, self.theta))
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient
            if self.verbose and i % 10000 == 0:
                # Report the loss measured after this iteration's update.
                h = self.__sigmoid(np.dot(X, self.theta))
                print(f'loss: {self.__loss(h, y)} \t')


    def predict_file(self, theta = np.array([]), theta_exit=0):
        """Predict for the rows of ``self.file`` using the given ``theta``.

        Parameters
        ----------
        theta : sequence or ndarray
            Model weights.  An empty value (or ``None``) means "no weights";
            a vector of ones is used instead.
        theta_exit : truthy/falsy
            When set and no weights were given, an error message is printed.
            NOTE(review): despite the name, execution then continues with the
            all-ones fallback, exactly as before - confirm whether an exit
            was intended.

        Returns
        -------
        list
            ``[boolean predictions, probabilities]``.

        A ``theta`` whose length does not match the feature matrix prints an
        error and terminates via ``sys.exit()``.
        """
        df = DataSet(filename=self.file)
        df.find_numeric_label()
        X = self.get_x_y(df, return_y=False)
        # Mirror fit(): the bias column exists only when fit_intercept is set,
        # otherwise theta trained without intercept could never match X.
        if self.fit_intercept:
            X = self.__add_intercept(X)

        weights = np.array([] if theta is None else theta)
        # ``not ndarray`` is ambiguous for arrays, so test the size explicitly.
        has_theta = weights.size > 0
        if not has_theta and theta_exit:
            print ("Error: Have not theta")
        if not has_theta:
            weights = np.ones(X.shape[1])
        self.theta = weights
        if self.theta.ndim == 0 or self.theta.shape[0] != X.shape[1]:
            print ('Error: bad theta or X')
            sys.exit()
        return [self.predict(X), self.predict_prob(X)]


    def predict_prob(self, X, fit_intercept=0):
        """Return sigmoid(X . theta); prepend the bias column if ``fit_intercept``."""
        if fit_intercept:
            X = self.__add_intercept(X)
        return self.__sigmoid(np.dot(X, self.theta))


    def predict(self, X, threshold=0.5, fit_intercept=0):
        """Return booleans: True where the predicted probability >= ``threshold``."""
        return self.predict_prob(X, fit_intercept=fit_intercept) >= threshold

def load_theta(path='model.txt', n_models=4):
    """Read model weights from ``path``.

    The file holds one line per classifier, with weights separated by ';'
    (empty fields are skipped).  Returns a list with one list of floats per
    line.  If the file cannot be read or contains a non-numeric field,
    ``n_models`` empty weight lists are returned instead.
    """
    try:
        with open(path, 'r') as f:
            lines = f.read().strip().split('\n')
        return [
            [float(field) for field in line.strip().split(';') if field.strip() != '']
            for line in lines
        ]
    except (OSError, ValueError):
        # Only "file unreadable" / "bad number" fall back to empty weights;
        # a bare except would also swallow KeyboardInterrupt and SystemExit.
        return [[] for _ in range(n_models)]


theta = load_theta()

# One weight vector per house is required.
if len(theta) != 4:
    print ('Error: bad model.txt')
    sys.exit()

# One one-vs-rest classifier per house.  After the loop every entry is
# [model, boolean predictions, probabilities]; theta[k] is the weight vector
# for the k-th house in the dict's insertion order.
lr = {'Ravenclaw' : [], 'Slytherin' : [], 'Gryffindor' : [], 'Hufflepuff' : []}
for k, house in enumerate(lr):
    model = LogisticRegression(file='datasets/dataset_train.csv', y_true=house, x_columns=[9, 10])
    predictions, probabilities = model.predict_file(theta[k])
    lr[house].extend([model, predictions, probabilities])


with open('houses.csv', 'w') as f:
    f.write('Index,Hogwarts House\n')
    for i in range(len(lr['Ravenclaw'][1])):
        # One-vs-rest decision: take the house with the highest probability.
        # The choice is no longer gated on the 0.5 threshold, which made every
        # row where no classifier reached 0.5 fall back to 'Ravenclaw'.
        max_prob = -1
        key = 'Ravenclaw'
        for house in lr:
            if lr[house][2][i] >= max_prob:
                max_prob = lr[house][2][i]
                key = house
        f.write(f'{i},{key}\n')
    

In [767]:
# Show the Ravenclaw entry without the model object: [predictions, probabilities].
# NOTE(review): the recorded output for this cell is a bare `True`, not a
# two-element list - it looks stale; re-run to confirm.
lr['Ravenclaw'][1:]

True

In [751]:
# Show the Slytherin entry without the model object: [predictions, probabilities].
lr['Slytherin'][1:]

[array([ True, False,  True, False, False, False, False, False, False,
        False]),
 array([0.83471472, 0.06237309, 0.87531499, 0.18850375, 0.3540628 ,
        0.04784784, 0.04557142, 0.20850882, 0.45135896, 0.2057575 ])]

In [752]:
# Show the Gryffindor entry without the model object: [predictions, probabilities].
lr['Gryffindor'][1:]

[array([False, False, False,  True, False, False,  True,  True,  True,
         True]),
 array([4.98070354e-05, 2.23080313e-03, 1.83311678e-04, 9.99930081e-01,
        2.96745245e-01, 1.30602895e-03, 9.99967305e-01, 9.96058784e-01,
        9.95761698e-01, 9.99197808e-01])]

In [753]:
# Show the Hufflepuff entry without the model object: [predictions, probabilities].
lr['Hufflepuff'][1:]

[array([False,  True, False, False, False,  True, False, False, False,
        False]),
 array([0.14640553, 0.99985577, 0.02483568, 0.00300022, 0.22729181,
        0.99995775, 0.18317099, 0.03222282, 0.00107647, 0.01150788])]

In [645]:
# import math
# from math import e
# import sys
# from describe import Math_calculat
# from describe import DataSet
# import decimal

# from scipy.special import expit
# # import numpy as np

# class Logreg:
#     """
#         Hogwarts House = [Ravenclaw, Slytherin, Gryffindor, Hufflepuff]
#     """
#     def __init__(self, file=None, num_iter=50, y_true='Ravenclaw', x_columns=[], lr = 0.15, loss = True, size = 10):
#         self.file = file
#         self.num_iter = num_iter
#         self.x_columns = x_columns
#         self.y = 1
#         self.y_true = y_true
#         self.size = size
#         self.lr = lr
#         self.theta = [0]
#         for i in x_columns:
#             self.theta.append(0)
#         self.loss = loss
#         if not (y_true in ['Ravenclaw', 'Slytherin', 'Gryffindor', 'Hufflepuff']):
#             print ("Error: bad parameter y_true")
#             sys.exit()
#         if num_iter < 1 or lr < 0 or lr > 1:
#             print ("Error: bad parameter num_iter or lr")
#             sys.exit()
#         if not (type(x_columns) == list):
#             print ("Error: x_columns must will be list int")
#             sys.exit()
            
#     def sigmoid(self, xi, theta):
#         z = 0
#         for i in range(len(theta)):
#             z += theta[i] * xi[i]
# #         print ('-----------')
# #         print (-z)
# #         print (expit(-z))
# #         print (math.exp(-z))
# #         print ('-----------')
# #         return (1.0 / (1.0 + math.exp(-z)))
#         return (1.0 / (1.0 + expit(z)))
    
#     def __loss(self, x, y):
#         tmp = 0
#         for i in range(len(x)):
#             tmp += y[i] * np.log(self.sigmoid(x[i], self.theta)) + (1 - y[i]) * np.log(1 - self.sigmoid(x[i], self.theta))
#         return -tmp / (len(x) * 1.0)
    
#     def predict_proba(self, x, theta):
#         return (self.sigmoid(x, theta))
    
#     def predict(self, x, theta, threshold=0.5):
#         return 1 if self.predict_proba(x, theta) >= threshold else 0
    
#     def get_x_y(self, ds):
#         mas_columns = []
#         x = []
#         y = []
#         theta = [0]
#         if not self.x_columns:
#             for i in self.x_columns:
#                 if i in ds.numeric_columns:
#                     mas_columns.append(i)
#         if not mas_columns:
#             mas_columns = ds.numeric_columns
#         for i in mas_columns:
#             x.append(ds.get_float_col(i)[:self.size])
#             theta.append(0)
#         x_new = []
#         for i in range(len(x[0])):
#             new = [1]
#             for j in range(len(mas_columns)):
#                 new.append(x[j][i])
#             x_new.append(new)
#         for i in ds.get_col(1)[:self.size]:
#             if i == self.y_true:
#                 y.append(1)
#             else:
#                 y.append(0)
#         return x_new, y, theta
                
#     def fit(self):
#         ds = DataSet(filename=self.file)
#         ds.find_numeric_label()
#         x, y, self.theta = self.get_x_y(ds)
# #         x = [[1,1,2,3]]
# #         y = [1]
# #         self.theta = [0,0,0,0]
    
    
#         for n in range(self.num_iter):
#             for j in range(len(x[0])):
#                 tmp = 0
#                 for i in range(len(x)):
#                     tmp += (self.sigmoid(x[i], self.theta) - y[i]) * x[i][j]
#                 self.theta[j] -= self.lr * tmp / (len(x) * 1.0)
#             if self.loss == True and n % 1000 == 0:
#                 print ('niter = {0} ; loss = {1}'.format(n, self.__loss(x, y)))
#         print (x)
#         print (y)
#         return self.theta

In [647]:
# lg = Logreg(file='datasets/dataset_train.csv', lr=0.7, y_true='Slytherin', num_iter=100, size=20)
# theta = lg.fit()