In [11]:
import numpy as np
import pandas as pd
import random


import numpy as np
import pandas as pd
import random


class MyLogReg():
    """Binary logistic regression trained with (mini-batch) gradient descent.

    Parameters
    ----------
    n_iter : int
        Number of gradient steps performed by ``fit``.
    learning_rate : number or callable
        Constant step size, or a callable ``f(step)`` that receives the
        1-based step number and returns the step size for that step.
    weights : array-like, optional
        Stored as the initial ``weights`` attribute; ``fit`` replaces it with
        a vector of ones before training.
    metric : {'accuracy', 'precision', 'recall', 'f1', 'roc_auc'}, optional
        Name of the score reported while training and stored after ``fit``.
    reg : {'l1', 'l2', 'elasticnet'}, optional
        Regularisation term added to the gradient.
    l1_coef, l2_coef : float
        Regularisation strengths; must be non-zero for the selected ``reg``.
    sgd_sample : int or float, optional
        Mini-batch size: an ``int`` is a row count, a ``float`` is a fraction
        of the training rows, ``None`` uses every row on each step.
    random_state : int
        Seed passed to ``random.seed`` at the start of ``fit``.
    """

    def __init__(self, n_iter, learning_rate, weights=None, metric=None, reg=None, l1_coef=0, l2_coef=0, sgd_sample=None, random_state=42):
        self.n_iter = n_iter
        self.weights = weights
        self.reg = reg
        self.l1_coef = l1_coef
        self.l2_coef = l2_coef
        self.random_state = random_state
        self.sgd_sample = sgd_sample
        # Defined up front so get_best_score() never hits a missing attribute.
        self.final_metric = None
        # Raw argument kept only so __str__/__repr__ can show what was passed
        # instead of the repr of the wrapping lambda.
        self._learning_rate_arg = learning_rate

        self.metric = self.metrics(metric)
        self.learning_rate = self.learning_rate_type(learning_rate)

    def __str__(self) -> str:
        return f"MyLogReg class: n_iter={self.n_iter}, learning_rate={self._learning_rate_arg}"

    def __repr__(self) -> str:
        return f"MyLogReg class: n_iter={self.n_iter}, learning_rate={self._learning_rate_arg}"

    @staticmethod
    def _sigmoid(z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise."""
        return np.array(1 / (1 + np.exp(-z)))

    @staticmethod
    def _design_matrix(samples):
        """Return ``samples`` as a float array with a leading column of ones.

        Working on a plain array avoids pandas index alignment, which would
        turn the bias column into NaN for frames without a 0..n-1 index.
        """
        features = np.asarray(samples, dtype=float)
        return np.column_stack((np.ones(features.shape[0]), features))

    def calculate_gradient(self, X, y, y_pred):
        """Return the log-loss gradient for one batch plus the regularisation term.

        Parameters
        ----------
        X : array-like of shape (n_rows, n_weights)
            Batch design matrix (bias column included).
        y : array-like
            Batch targets; flattened before use.
        y_pred : numpy.ndarray
            Predicted probabilities for the batch.

        Raises
        ------
        AssertionError
            If the coefficient required by ``self.reg`` is zero.
        """
        residual = y_pred - np.asarray(y).ravel()
        grad = (1 / X.shape[0]) * np.dot(residual, X)
        if self.reg in ('l1', 'elasticnet'):
            assert self.l1_coef != 0
            grad = grad + self.l1_coef * np.sign(self.weights)
        if self.reg in ('l2', 'elasticnet'):
            assert self.l2_coef != 0
            grad = grad + self.l2_coef * 2 * (self.weights)
        return grad

    def learning_rate_type(self, LR):
        """Return ``LR`` as a callable of the step number.

        Callables are returned unchanged; any constant (float or int) is
        wrapped so that ``fit`` can always call ``self.learning_rate(step)``.
        """
        if callable(LR):
            return LR
        return lambda _step: LR

    def sgd_sample_size(self, sgd_sample_num, samples):
        """Return the number of rows drawn for each mini-batch.

        A float is treated as a fraction of ``samples.shape[0]`` (at least one
        row, so a small fraction cannot produce an empty batch), an int as an
        absolute row count, and anything else selects every row.
        """
        n_rows = samples.shape[0]
        if isinstance(sgd_sample_num, float):
            return max(1, int(n_rows * sgd_sample_num))
        if isinstance(sgd_sample_num, int):
            return sgd_sample_num
        return n_rows

    @staticmethod
    def _confusion_counts(y, y_pred):
        """Return ``(TP, FP, FN)`` for probabilities thresholded at 0.5."""
        truth = np.asarray(y).ravel()
        predicted = (np.asarray(y_pred) > 0.5).ravel()
        tp = np.sum((truth == 1) & predicted)
        fp = np.sum((truth == 0) & predicted)
        fn = np.sum((truth == 1) & ~predicted)
        return tp, fp, fn

    @staticmethod
    def _accuracy(y, y_pred):
        """Share of rows whose thresholded prediction equals the label."""
        truth = np.asarray(y).ravel()
        predicted = (np.asarray(y_pred) > 0.5).ravel()
        return np.sum(truth == predicted) / truth.size

    def _precision(self, y, y_pred):
        """TP / (TP + FP); nan when nothing is predicted positive."""
        tp, fp, _ = self._confusion_counts(y, y_pred)
        return tp / (tp + fp)

    def _recall(self, y, y_pred):
        """TP / (TP + FN); nan when there are no positive labels."""
        tp, _, fn = self._confusion_counts(y, y_pred)
        return tp / (tp + fn)

    def _f1(self, y, y_pred):
        """2 * precision * recall / (precision + recall)."""
        tp, fp, fn = self._confusion_counts(y, y_pred)
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        return 2 * precision * recall / (precision + recall)

    @staticmethod
    def _roc_auc(y, y_pred):
        """ROC AUC as the share of correctly ordered (positive, negative) pairs.

        Scores are rounded to 10 decimals before comparison. For every
        negative row, each positive with a strictly higher score counts 1 and
        each positive with the same score counts 0.5. Returns nan when either
        class is absent.
        """
        labels = np.asarray(y).ravel()
        scores = np.round(np.asarray(y_pred, dtype=float).ravel(), 10)
        pos_scores = np.sort(scores[labels == 1])
        neg_scores = scores[labels == 0]
        n_pairs = pos_scores.size * neg_scores.size
        if n_pairs == 0:
            return float('nan')
        # Positions of each negative score inside the sorted positive scores.
        right = np.searchsorted(pos_scores, neg_scores, side='right')
        left = np.searchsorted(pos_scores, neg_scores, side='left')
        above = pos_scores.size - right
        tied = right - left
        return (above.sum() + 0.5 * tied.sum()) / n_pairs

    def metrics(self, metric):
        """Resolve a metric name into a ``[name, scorer]`` pair.

        ``scorer(y, y_pred)`` takes the true labels and predicted
        probabilities. Falsy values and non-string values are returned
        unchanged, so ``None`` stays ``None`` and an already-built pair passes
        through.

        Raises
        ------
        ValueError
            If ``metric`` is a string that names no supported metric.
        """
        if not metric or not isinstance(metric, str):
            return metric
        scorers = {
            'accuracy': self._accuracy,
            'precision': self._precision,
            'recall': self._recall,
            'f1': self._f1,
            'roc_auc': self._roc_auc,
        }
        if metric not in scorers:
            raise ValueError(
                f"unknown metric {metric!r}; expected one of {sorted(scorers)}")
        return [metric, scorers[metric]]

    def fit(self, samples: pd.DataFrame, y: pd.Series, verbose=False) -> None:
        """Train the weights with ``n_iter`` mini-batch gradient steps.

        Parameters
        ----------
        samples : pandas.DataFrame
            Feature matrix, one row per observation.
        y : pandas.Series or single-column DataFrame
            Binary targets aligned with ``samples`` by position.
        verbose : int or bool
            When truthy, the loss (and the metric, if configured) on the full
            training set is printed on every step divisible by ``verbose``.
        """
        random.seed(self.random_state)  # fix the random seed

        batch_size = self.sgd_sample_size(self.sgd_sample, samples)

        X = self._design_matrix(samples)
        target = np.asarray(y).ravel()
        n_rows = X.shape[0]

        self.weights = np.ones(X.shape[1])

        for step in range(1, self.n_iter + 1):
            batch_idx = random.sample(range(n_rows), batch_size)
            X_batch = X[batch_idx]
            y_batch = target[batch_idx]

            # The full-data loss is only needed for logging, so it is computed
            # on logging steps only, from the weights before this step's update.
            if verbose and (step % verbose) == 0:
                proba_all = self._sigmoid(np.dot(X, self.weights))
                # 1e-100 keeps log() finite when a probability saturates at 0 or 1.
                loss = -np.mean(np.log(proba_all + 1e-100) * target +
                                np.log(1 - proba_all + 1e-100) * (1 - target))
                # NOTE(review): the label is step + 1 although step is already
                # 1-based; kept as is because existing logs use this numbering.
                if self.metric:
                    print(
                        f'iter = {step+1} ||| Loss = {loss} ||| {self.metric[0]} = {self.metric[1](y, proba_all)}')
                else:
                    print(f'iter = {step+1} ||| Loss = {loss}')

            batch_proba = self._sigmoid(np.dot(X_batch, self.weights))
            grad = self.calculate_gradient(X_batch, y_batch, batch_proba)
            self.weights = self.weights - grad * self.learning_rate(step)

        if self.metric:
            self.final_metric = self.metric[1](
                y, self._sigmoid(np.dot(X, self.weights)))

    def predict(self, samples):
        """Return 0/1 class labels (``int8``) using a 0.5 probability threshold."""
        return (self.predict_proba(samples) > 0.5).astype(np.int8)

    def predict_proba(self, samples):
        """Return the predicted probability of class 1 for every row of ``samples``."""
        X = self._design_matrix(samples)
        return self._sigmoid(np.dot(X, self.weights))

    def get_coef(self):
        """Return the feature weights without the bias term.

        Returns the string ``'fit before!'`` while no weights are available.
        """
        if self.weights is None:
            return 'fit before!'
        return np.array(self.weights[1:])

    def get_best_score(self):
        """Return the metric computed at the end of ``fit``.

        ``None`` when no metric was configured or ``fit`` has not run yet.
        """
        return self.final_metric

In [12]:
# Elastic-net model with an exponentially decaying learning rate,
# trained on 10% mini-batches and reporting precision.
x = MyLogReg(
    n_iter=50,
    learning_rate=lambda step: 0.5 * (0.85 ** step),
    metric='precision',
    reg='elasticnet',
    l1_coef=0.1,
    l2_coef=0.1,
    sgd_sample=0.1,
)

np.random.seed(42)
# X_1 = pd.DataFrame({'first':[50 for x in range(200)], 'second':[0 for x in range(200)]})
# X_2 = pd.DataFrame({'first':[0 for x in range(200)], 'second':[50 for x in range(200)]})

# X = pd.concat((X_1, X_2), axis=0)
# 400 rows x 100 random integer features; first 200 rows are class 0, the rest class 1.
X = pd.DataFrame(np.random.randint(-25, 25, size=(400, 100)))
y = pd.DataFrame({'target': [0] * 200 + [1] * 200})

x.fit(X, y, verbose=10)

# np.mean((x.predict_proba(X) > 0.5) == y.values)

iter = 11 ||| Loss = 35.631830863566286 ||| precision = 0.643979057591623
iter = 21 ||| Loss = 16.181827325897686 ||| precision = 0.6911764705882353
iter = 31 ||| Loss = 13.34606526399461 ||| precision = 0.7177033492822966
iter = 41 ||| Loss = 12.627028679628506 ||| precision = 0.7142857142857143
iter = 51 ||| Loss = 12.579146054146257 ||| precision = 0.7142857142857143


In [13]:
# Display the coefficients returned by the model's get_coef().
x.get_coef()

array([-3.29319297e-02,  1.53071071e-01, -2.40637417e-01, -3.86252539e-01,
       -3.17564549e-02, -4.90962676e-03,  2.07070920e-01, -8.29053212e-02,
        5.82039659e-02,  3.66321087e-01,  2.61473751e-01, -1.69966116e-01,
        1.82640145e-01, -9.16783275e-02, -5.21694779e-01,  1.21994545e-01,
       -1.90354214e-01,  3.45929609e-01, -6.39620215e-02,  6.98215836e-01,
        6.50689611e-02, -2.29641814e-02,  1.57412351e-02, -8.39547702e-03,
        3.20811310e-01,  4.98331535e-01,  1.98689248e-01,  2.81684333e-01,
        2.56380488e-01,  3.00457887e-01, -2.23391219e-04,  5.88743975e-01,
        1.70543787e-01,  5.15946416e-01,  5.28951363e-01,  5.47468505e-03,
       -2.21889069e-01, -7.27561108e-02,  3.66049847e-01, -4.51141840e-02,
       -1.67830212e-01,  2.74762161e-01, -1.30383492e-01,  2.56454542e-02,
       -4.30234307e-02, -5.32909834e-01, -9.04343319e-02,  4.62973046e-01,
        6.16148618e-02,  1.09103519e-01, -4.56836385e-01, -2.21006493e-01,
       -8.15083597e-02, -

In [423]:
def try_me(a):
    """Compute ROC AUC from rows of ``[label, score]``.

    Parameters
    ----------
    a : array-like of shape (n, 2)
        Column 0 holds the 0/1 label, column 1 the predicted score.

    Returns
    -------
    float
        Share of (positive, negative) pairs ranked correctly: for every
        negative row each positive with a strictly higher score counts 1 and
        each positive with an equal score counts 0.5. nan when either class
        is absent.
    """
    pairs = np.asarray(a, dtype=float)
    labels, scores = pairs[:, 0], pairs[:, 1]
    pos_scores = np.sort(scores[labels == 1])
    neg_scores = scores[labels == 0]
    n_pairs = pos_scores.size * neg_scores.size
    if n_pairs == 0:
        return float('nan')
    # Positions of each negative score inside the sorted positive scores.
    right = np.searchsorted(pos_scores, neg_scores, side='right')
    left = np.searchsorted(pos_scores, neg_scores, side='left')
    above = pos_scores.size - right
    tied = right - left
    return (above.sum() + 0.5 * tied.sum()) / n_pairs

In [424]:
# try_me(a)

In [425]:
# Hand-made example: column 0 is the label, column 1 the score
# (one positive/negative pair shares the score 0.6).
qwe = np.column_stack((
    [1, 0, 0, 1, 0, 1, 0, 0, 0],
    [0.91, 0.86, 0.78, 0.6, 0.6, 0.55, 0.51, 0.46, 0.42],
))


# Evaluate try_me on the hand-made example array.
try_me(qwe)

0.6944444444444444

In [232]:
def loss(x, y):
    """Return the mean binary cross-entropy of probabilities ``x`` against labels ``y``.

    Parameters
    ----------
    x : array-like
        Predicted probabilities of class 1.
    y : array-like
        True 0/1 labels, same length as ``x``.

    Probabilities are clipped to ``[1e-15, 1 - 1e-15]`` so that an exact 0 or
    1 does not produce ``log(0)`` (which turned the mean into nan or inf).
    """
    eps = 1e-15
    p = np.clip(x, eps, 1 - eps)
    return -np.mean(np.log(p) * y + np.log(1 - p) * (1 - y))

In [233]:
# Toy probabilities and their matching 0/1 labels.
x = pd.Series(np.array([0.001] * 3 + [0.999] * 2))
y = pd.Series(np.array([0] * 3 + [1] * 2))

In [234]:
import numpy as np

In [235]:
# Predicted and true labels used for the hand-checked precision example.
pred, true = (
    np.array(values)
    for values in ([1, 0, 1, 1, 0], [1, 1, 1, 0, 1])
)

In [236]:
# np.sum(pred == true) / (np.sum(pred == true) + np.sum((pred == 1) & (true == 0)))
def q(x, y):
    """Return ``x`` when ``y`` exceeds 0.5, otherwise ``x`` times False (zero)."""
    above_threshold = y > 0.5
    return x * above_threshold

In [237]:
# Call q with y above the 0.5 threshold.
q(5, 0.6)

5

In [238]:
from sklearn.metrics import precision_score

# Example true and predicted class labels
true_labels = [1, 1, 1, 0, 1]
predicted_labels = [1, 0, 1, 1, 0]

# Compute precision with scikit-learn as a reference value
precision = precision_score(y_true=true_labels, y_pred=predicted_labels)

print("Precision:", precision)


Precision: 0.6666666666666666


In [239]:
def x():
    """Print the fixed marker string and return nothing."""
    marker = '1231213'
    print(marker)

In [240]:
# Version without mini-batch

# class MyLogReg():
#     def __init__(self, n_iter, learning_rate, weights=None, metric=None, reg=None, l1_coef=0, l2_coef=0):
#         self.n_iter = n_iter
#         self.learning_rate = learning_rate
#         self.weights = weights
#         self.metric = metric
#         self.reg = reg
#         self.l1_coef = l1_coef
#         self.l2_coef = l2_coef

#         self.metric = self.metrics(metric)
#         self.learning_rate = self.learning_rate_type(learning_rate)

#     def __str__(self) -> str:
#         return f"MyLogReg class: n_iter={self.n_iter}, learning_rate={self.learning_rate}"

#     def __repr__(self) -> str:
#         return f"MyLogReg class: n_iter={self.n_iter}, learning_rate={self.learning_rate}"

#     def calculate_gradient(self, X, y, y_pred):
#         if self.reg == 'l1':
#             assert self.l1_coef != 0
#             return (1/X.shape[0]) * np.dot((y_pred - y.values.ravel()), X) + self.l1_coef*np.sign(self.weights)
#         elif self.reg == 'l2':
#             assert self.l2_coef != 0
#             return (1/X.shape[0]) * np.dot((y_pred - y.values.ravel()), X) + self.l2_coef*2*(self.weights)
#         elif self.reg == 'elasticnet':
#             assert self.l1_coef != 0 and self.l2_coef != 0
#             return (1/X.shape[0]) * np.dot((y_pred - y.values.ravel()), X) + self.l1_coef*np.sign(self.weights) + self.l2_coef*2*(self.weights)
#         else:
#             return (1/X.shape[0]) * np.dot((y_pred - y.values.ravel()), X)

#     def learning_rate_type(self, LR):
#         if not isinstance(LR, float):
#             return LR
#         else:
#             return lambda x: LR

#     def metrics(self, metric) -> None:
#         if metric:
#             if metric == 'accuracy':
#                 metric = ['accuracy', lambda y, y_pred: (
#                     np.sum(y.values.ravel() == (y_pred > 0.5).ravel())) / len(y)]

#             elif metric == 'precision':
#                 # TP / (TP + FP)
#                 def precision_score_func(y, y_pred):
#                     y_pred_binary = (y_pred > 0.5).ravel()
#                     y_ravel = y.values.ravel()
#                     TP = np.sum((y_ravel == 1) & (y_pred_binary == 1))
#                     FP = np.sum((y_ravel == 0) & (y_pred_binary == 1))
#                     return TP/(TP+FP)

#                 metric = ['precision', precision_score_func]

#             elif metric == 'recall':
#                 # TP / (TP + FN)
#                 def recall_score_func(y, y_pred):
#                     y_pred_binary = (y_pred > 0.5)
#                     y_ravel = y.values.ravel()
#                     TP = np.sum((y_ravel == 1) & (y_pred_binary == 1))
#                     FN = np.sum((y_ravel == 1) & (y_pred_binary == 0))
#                     return TP/(TP+FN)
                
#                 metric = ['recall', recall_score_func]

#             elif metric == 'f1':
#                 # 2 * precision * recall / (precision + recall)
#                 def f1_score_func(y, y_pred):
#                     y_pred_binary = (y_pred > 0.5)
#                     y_ravel = y.values.ravel()
#                     TP = np.sum((y_ravel == 1) & (y_pred_binary == 1))
#                     FP = np.sum((y_ravel == 0) & (y_pred_binary == 1))
#                     FN = np.sum((y_ravel == 1) & (y_pred_binary == 0))
#                     precision = TP/(TP+FP)
#                     recall = TP/(TP+FN)
#                     return 2 * precision * recall / (precision + recall)
                
#                 metric = ['f1', f1_score_func]


#             elif metric == 'roc_auc':
#                 def auc_score_def(y, y_pred):
#                     data = np.concatenate(
#                         (y.to_numpy().reshape(-1, 1), np.round(y_pred.reshape(-1, 1), 10)), axis=1)
#                     data = data[data[:, 1].argsort()][::-1]

#                     pos_above_iter = 0

#                     for y, pred in data:
#                         if y == 0:
#                             if (data[data[:, 1] == pred]).shape[0] > 1:
#                                 pos_above_iter += np.sum(
#                                     data[data[:, 1] > pred][:, 0], axis=0) / 2
#                             pos_above_iter += np.sum(
#                                 data[data[:, 1] > pred][:, 0], axis=0)
#                     return pos_above_iter / (np.sum(data[:, 0] == 1) * np.sum(data[:, 0] == 0))

#                 metric = ['roc_auc', auc_score_def]

#         return metric

#     def fit(self, samples: pd.DataFrame, y: pd.Series, verbose=False) -> None:
#         X = samples.copy()
#         X.insert(0, 'bias', pd.Series(1, index=range(X.shape[0])))

#         self.weights = np.ones(X.shape[1])

#         for iter in range(1, self.n_iter+1):

#             y_pred = np.array(1/(1 + np.exp(-np.dot(X, self.weights))))

#             loss = -np.mean(np.log(y_pred+1e-100)*y.values.ravel() +
#                             np.log(1 - y_pred+1e-100)*(1-y.values.ravel()))

#             # grad = (1/(X.shape[0]) * np.dot((y_pred - y.values.ravel()), X))
#             grad = self.calculate_gradient(X, y, y_pred)

#             self.weights = self.weights - grad * self.learning_rate(iter)
#             if verbose and (iter % verbose) == 0 and self.metric is not None:
#                 print(
#                     f'iter = {iter+1} ||| Loss = {loss} ||| {self.metric[0]} = {self.metric[1](y, y_pred)}')
#             elif verbose and (iter % verbose) == 0:
#                 print(f'iter = {iter+1} ||| Loss = {loss}')
#         if self.metric:
#             self.final_metric = self.metric[1](y, np.array(
#                 1/(1 + np.exp(-np.dot(X, self.weights)))))

#     def predict(self, samples):
#         X = samples.copy()
#         X.insert(0, 'bias', pd.Series(1, index=range(X.shape[0])))
#         return (np.array(1/(1 + np.exp(-np.dot(X, self.weights)))) > 0.5).astype(np.int8)

#     def predict_proba(self, samples):
#         X = samples.copy()
#         X.insert(0, 'bias', pd.Series(1, index=range(X.shape[0])))
#         return np.array(1/(1 + np.exp(-np.dot(X, self.weights))))

#     def get_coef(self) -> list():
#         try:
#             assert self.weights is not None
#             return np.array(self.weights[1:])
#         except:
#             return 'fit before!'

#     def get_best_score(self) -> int:
#         return self.final_metric

In [121]:
import numpy as np
import pandas as pd
import random


class MyLogReg():
    """Logistic regression trained with mini-batch gradient descent.

    This variant has no metric or regularisation support.

    Parameters
    ----------
    n_iter : int
        Number of gradient steps performed by ``fit``.
    learning_rate : number or callable
        Constant step size, or a callable ``f(step)`` that receives the
        1-based step number and returns the step size for that step.
    weights : array-like, optional
        Stored as the initial ``weights`` attribute; ``fit`` replaces it with
        a vector of ones before training.
    sgd_sample : int or float, optional
        Mini-batch size: an ``int`` is a row count, a ``float`` is a fraction
        of the training rows, ``None`` uses every row on each step.
    random_state : int
        Seed passed to ``random.seed`` and ``np.random.seed`` in ``fit``.
    """

    def __init__(self, n_iter, learning_rate, weights=None, sgd_sample=None, random_state=42):
        self.n_iter = n_iter
        self.weights = weights
        self.random_state = random_state
        self.sgd_sample = sgd_sample
        # Never assigned elsewhere in this variant; defined here so that
        # get_best_score() returns None instead of raising AttributeError.
        self.final_metric = None
        # Raw argument kept only so __str__/__repr__ can show what was passed
        # instead of the repr of the wrapping lambda.
        self._learning_rate_arg = learning_rate

        self.learning_rate = self.learning_rate_type(learning_rate)

    def __str__(self) -> str:
        return f"MyLogReg class: n_iter={self.n_iter}, learning_rate={self._learning_rate_arg}"

    def __repr__(self) -> str:
        return f"MyLogReg class: n_iter={self.n_iter}, learning_rate={self._learning_rate_arg}"

    @staticmethod
    def _sigmoid(z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise."""
        return np.array(1 / (1 + np.exp(-z)))

    @staticmethod
    def _design_matrix(samples):
        """Return ``samples`` as a float array with a leading column of ones.

        Working on a plain array avoids pandas index alignment, which would
        turn the bias column into NaN for frames without a 0..n-1 index.
        """
        features = np.asarray(samples, dtype=float)
        return np.column_stack((np.ones(features.shape[0]), features))

    def learning_rate_type(self, LR):
        """Return ``LR`` as a callable of the step number.

        Callables are returned unchanged; any constant (float or int) is
        wrapped so that ``fit`` can always call ``self.learning_rate(step)``.
        """
        if callable(LR):
            return LR
        return lambda _step: LR

    def sgd_sample_size(self, sgd_sample_num, samples):
        """Return the number of rows drawn for each mini-batch.

        ``samples`` is the total row count. A float ``sgd_sample_num`` is a
        fraction of it (at least one row, so a small fraction cannot produce
        an empty batch), an int is an absolute row count, and anything else
        selects every row.
        """
        if isinstance(sgd_sample_num, float):
            return max(1, int(samples * sgd_sample_num))
        if isinstance(sgd_sample_num, int):
            return sgd_sample_num
        return samples

    def fit(self, samples: pd.DataFrame, y: pd.Series, verbose=False) -> None:
        """Train the weights with ``n_iter`` mini-batch gradient steps.

        Parameters
        ----------
        samples : pandas.DataFrame
            Feature matrix, one row per observation.
        y : pandas.Series or single-column DataFrame
            Binary targets aligned with ``samples`` by position.
        verbose : int or bool
            Accepted for interface compatibility; this variant prints nothing.
        """
        random.seed(self.random_state)  # fix the random seed
        np.random.seed(self.random_state)

        X = self._design_matrix(samples)
        target = np.asarray(y).ravel()
        n_rows = X.shape[0]
        batch_size = self.sgd_sample_size(self.sgd_sample, n_rows)

        self.weights = np.ones(X.shape[1])

        for step in range(1, self.n_iter + 1):
            batch_idx = random.sample(range(n_rows), batch_size)
            X_batch = X[batch_idx]
            y_batch = target[batch_idx]

            batch_proba = self._sigmoid(np.dot(X_batch, self.weights))

            # Average over the rows actually used in this step; dividing by
            # the full dataset size would shrink every mini-batch update.
            grad = np.dot(batch_proba - y_batch, X_batch) / X_batch.shape[0]

            self.weights = self.weights - grad * self.learning_rate(step)

    def predict(self, samples):
        """Return 0/1 class labels (``int8``) using a 0.5 probability threshold."""
        return (self.predict_proba(samples) > 0.5).astype(np.int8)

    def predict_proba(self, samples):
        """Return the predicted probability of class 1 for every row of ``samples``."""
        X = self._design_matrix(samples)
        return self._sigmoid(np.dot(X, self.weights))

    def get_coef(self):
        """Return the feature weights without the bias term.

        Returns the string ``'fit before!'`` while no weights are available.
        """
        if self.weights is None:
            return 'fit before!'
        return np.array(self.weights[1:])

    def get_best_score(self):
        """Return ``self.final_metric``, which stays ``None`` in this variant."""
        return self.final_metric

In [122]:
# Plain mini-batch model: 50 steps, constant learning rate, 10% of rows per step.
x = MyLogReg(n_iter=50, learning_rate=0.1, sgd_sample=0.1)
np.random.seed(42)

# X = pd.concat((X_1, X_2), axis=0)
# 400 rows x 100 random integer features; first 200 rows are class 0, the rest class 1.
X = pd.DataFrame(np.random.randint(-25, 25, size=(400, 100)))
y = pd.DataFrame({'target': [0] * 200 + [1] * 200})

x.fit(X, y, verbose=10)

# np.mean((x.predict_proba(X) > 0.5) == y.values)

(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 101)
(101,)
(40,) (40, 