In [54]:
import numpy as np
import pandas as pd
from sys import argv
import os
import json
from pathlib import Path


MY_FEATURES = ["Herbology", "Ancient Runes", "Astronomy", "Charms", "Defense Against the Dark Arts"]

class LogReg:

    def __init__(self, filename):

        self.mean = None
        self.std = None
        self.df = pd.read_csv(filename)
        self.df = self.df.dropna()
        self.Y = self.df['Hogwarts House']
        self.house_names = self.Y.unique()
        self.df = self.df[MY_FEATURES]
        self.df = self.normalize_data(self.df)

        self.df = self.df.to_numpy(dtype=np.float128)

        self.convert_housenames()

        #flattern
        self.df = self.df.reshape(self.df.shape[0], -1).T
        self.Y = self.Y.reshape(self.Y.shape[0], -1).T

        self.thetas = []
        self.bs = []


    def convert_housenames(self):
        self.Y = np.asarray([np.float128(np.where(self.house_names == x)[0][0] + 1) for x in self.Y]).reshape(-1, 1)
        # self.Y = np.array(self.Y)


        self.Y_1 = (self.Y.astype(int) == 1).astype(int).reshape(-1,1)
        self.Y_2 = (self.Y.astype(int) == 2).astype(int).reshape(-1,1)
        self.Y_3 = (self.Y.astype(int) == 3).astype(int).reshape(-1,1)
        self.Y_4 = (self.Y.astype(int) == 4).astype(int).reshape(-1,1)

        self.y_s = [self.Y_1, self.Y_2, self.Y_3, self.Y_4]

    def normalize_data(self, data):
        self.mean = data.mean()
        self.std = data.std()
        return (data - data.mean()) / data.std()

    def init_with_zeroes(self, dim):
        self.theta = np.zeros(shape=(dim, 1))
        self.b = 0
        return self.theta, self.b

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def propagate(self, theta, b, X, Y):
        m = X.shape[1]
        g = (np.dot(theta.T,X) + b).T
        h = self.sigmoid(g)
        cost = (1/m) * np.sum(np.dot(-Y.T , np.log(h)) - 
        np.dot((1 - Y).T , np.log(1 - h)))
        print(X.shape)
        dtheta = (1 / m) * np.dot(X, (h - Y))
        db = (1 / m) * np.sum(h - Y)

        cost = np.squeeze(cost)

        grads = {"dtheta": dtheta,
                 "db": db}
        return grads, cost

    def optimize(self, theta, b, X, Y, num_iterations, lr):
        costs = []
        for i in range(num_iterations):
            print(i, num_iterations)
            grads, cost = self.propagate(theta, b, X, Y)

            dtheta = grads["dtheta"]
            db = grads["db"]

            theta = theta - lr * dtheta
            b = b - lr * db

            if i % 100 == 0:
                costs.append(cost)
        params = {"theta": theta,
                  "b": b}
        grads = {"dtheta": dtheta,
                 "db": db}
        print(params, grads)
        return params, grads, cost

    def save_coefficients(self, filename='coefficients.json', folder='data'):
        results = []
        for theta, b in zip(self.thetas, self.bs):
          results.append({'b': str(b), 'theta': [str(th[0]) for th in theta.tolist()]})
        '''
        заменить путь внутри with
        '''
        print(results)
        with open('/content.json', 'w+') as file:
            file.write(json.dumps(results))

    def model(self, X, Y, lr=0.001, num_iterations=1000, visu=0):
        theta, b = self.init_with_zeroes(X.shape[0])
        parametrs = []
        grads = []
        costs = []
        for i,y in enumerate(self.y_s):
          parameter, grad, cost = self.optimize(theta, b, X, y, num_iterations, lr)
          parametrs.append(parameter)
          grads.append(grad)
          costs.append(cost)
          self.thetas.append(parameter['theta'])
          self.bs.append(parameter['b'])
        print('ok')
        self.save_coefficients(i)
        print('ok')
      
      def predict




In [55]:
log_reg = LogReg('/content/dataset_train.csv')
log_reg.model(log_reg.df, log_reg.Y)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
515 1000
(5, 1251)
516 1000
(5, 1251)
517 1000
(5, 1251)
518 1000
(5, 1251)
519 1000
(5, 1251)
520 1000
(5, 1251)
521 1000
(5, 1251)
522 1000
(5, 1251)
523 1000
(5, 1251)
524 1000
(5, 1251)
525 1000
(5, 1251)
526 1000
(5, 1251)
527 1000
(5, 1251)
528 1000
(5, 1251)
529 1000
(5, 1251)
530 1000
(5, 1251)
531 1000
(5, 1251)
532 1000
(5, 1251)
533 1000
(5, 1251)
534 1000
(5, 1251)
535 1000
(5, 1251)
536 1000
(5, 1251)
537 1000
(5, 1251)
538 1000
(5, 1251)
539 1000
(5, 1251)
540 1000
(5, 1251)
541 1000
(5, 1251)
542 1000
(5, 1251)
543 1000
(5, 1251)
544 1000
(5, 1251)
545 1000
(5, 1251)
546 1000
(5, 1251)
547 1000
(5, 1251)
548 1000
(5, 1251)
549 1000
(5, 1251)
550 1000
(5, 1251)
551 1000
(5, 1251)
552 1000
(5, 1251)
553 1000
(5, 1251)
554 1000
(5, 1251)
555 1000
(5, 1251)
556 1000
(5, 1251)
557 1000
(5, 1251)
558 1000
(5, 1251)
559 1000
(5, 1251)
560 1000
(5, 1251)
561 1000
(5, 1251)
562 1000
(5, 1251)
563 1000
(5, 1251)
564 