In [62]:
import numpy as np
import pandas as pd
import random


import numpy as np
import pandas as pd
import random


class MyLineReg():
    def __init__(self, n_iter, learning_rate, metric=None, reg=None, l1_coef=0, l2_coef=0, sgd_sample=None, random_state=42):
        self.n_iter = n_iter
        self.weights = None
        self.metric = metric
        self.reg = reg
        self.l1_coef = l1_coef
        self.l2_coef = l2_coef
        self.random_state = random_state
        self.sgd_sample = sgd_sample

        self.learning_rate = self.learning_rate_type(learning_rate)
        self.metric = self.metrics(self.metric)

    def sgd_sample_size(self, sgd_sample_num, samples):
        if isinstance(sgd_sample_num, float):
            return int(samples.shape[0] * sgd_sample_num)
        elif isinstance(sgd_sample_num, int):
            return sgd_sample_num
        else:
            return samples.shape[0]

    def calculate_gradient(self, X, y, y_pred):
        if self.reg == 'l1':
            assert self.l1_coef != 0
            return (2/X.shape[0]) * np.dot((y_pred - y.values.ravel()), X) + self.l1_coef*np.sign(self.weights)
        elif self.reg == 'l2':
            assert self.l2_coef != 0
            return (2/X.shape[0]) * np.dot((y_pred - y.values.ravel()), X) + self.l2_coef*2*(self.weights)
        elif self.reg == 'elasticnet':
            assert self.l1_coef != 0 and self.l2_coef != 0
            return (2/X.shape[0]) * np.dot((y_pred - y.values.ravel()), X) + self.l1_coef*np.sign(self.weights) + self.l2_coef*2*(self.weights)
        else:
            return (2/X.shape[0]) * np.dot((y_pred - y.values.ravel()), X)

    def learning_rate_type(self, LR):
        if not isinstance(LR, float):
            return LR
        else:
            return lambda x: LR

    def metrics(self, metric) -> None:
        if metric:
            if metric == 'mae':
                metric = ['mae', lambda y,
                          y_pred: np.mean(np.abs(y.values - y_pred))]

            elif metric == 'mse':
                metric = ['mse', lambda y,
                          y_pred: np.mean((y.values - y_pred)**2)]

            elif metric == 'rmse':
                metric = ['rmse', lambda y, y_pred: (
                    np.mean((y.values - y_pred)**2))**(0.5)]

            elif metric == 'mape':
                metric = ['mape', lambda y, y_pred: 100 *
                          np.mean(np.abs((y.values - y_pred)/y))]

            elif metric == 'r2':
                metric = ['r2', lambda y, y_pred: (1 -
                                                   (np.sum((y.values - y_pred)**2))/(np.sum((y.values - np.mean(y.values))**2)))]
        return metric

    def __str__(self):
        return f"MyLineReg class: n_iter={self.n_iter}, learning_rate={self.learning_rate}"

    def __repr__(self):
        return f"MyLineReg class: n_iter={self.n_iter}, learning_rate={self.learning_rate}"

    def fit(self, samples: pd.DataFrame, y: pd.Series, verbose=False) -> None:
        random.seed(self.random_state)  # фиксируем рандом сид

        sgd_sample_quantity = self.sgd_sample_size(self.sgd_sample, samples)
        X = samples.copy()
        X.insert(0, 'bias', pd.Series(1, index=X.index))

        self.weights = np.ones(X.shape[1])

        for iter in range(1, self.n_iter+1):
            sample_rows_idx = random.sample(
                range(X.shape[0]), sgd_sample_quantity)
            X_mini_batch = X.iloc[sample_rows_idx]
            y_mini_batch = y.iloc[sample_rows_idx]

            y_pred = np.dot(X_mini_batch, self.weights)
            # loss = np.mean((y_pred - y_mini_batch.values)**2)
            loss = np.mean((np.dot(X, self.weights) - y.values)**2)
            grad = self.calculate_gradient(X_mini_batch, y_mini_batch, y_pred)

            self.weights = self.weights - grad * self.learning_rate(iter)

            if verbose and (iter % verbose) == 0 and self.metric is not None:
                print(
                    f'iter = {iter} ||| Loss = {loss} ||| {self.metric[0]} = {self.metric[1](y, y_pred)}')
            elif verbose and (iter % verbose) == 0:
                print(f'iter = {iter} ||| Loss = {loss}')
        if self.metric:
            self.final_metric = self.metric[1](y, np.dot(X, self.weights))

    def predict(self, samples: pd.DataFrame) -> int:
        """
        Принимает на вход  матрицу фичей в виде датафрейма пандаса.
        Дополняет матрицу фичей единичным вектором (первый столбец).
        Возвращает вектор предсказаний.
        """

        X = samples.copy()
        X.insert(0, 'bias', pd.Series(1, index=range(X.shape[0])))

        return np.dot(X, self.weights)

    def get_coef(self) -> list():
        try:
            assert self.weights is not None
            return np.array(self.weights[1:])
        except:
            return 'fit before!'

    def get_best_score(self) -> int:
        return self.final_metric

In [63]:
X = pd.DataFrame(range(1000), index=[str(x) + 'qwe' for x in range(1000)])
y = (pd.DataFrame(list(range(1000)))*150)

model = MyLineReg(50, lambda iter: 0.5 * (0.85 ** iter), 'r2', reg='l2', l2_coef=1, sgd_sample=123)

model.fit(X, y, 10)
model.get_coef()

        bias    0
0qwe       1    0
1qwe       1    1
2qwe       1    2
3qwe       1    3
4qwe       1    4
...      ...  ...
995qwe     1  995
996qwe     1  996
997qwe     1  997
998qwe     1  998
999qwe     1  999

[1000 rows x 2 columns]
        bias    0
0qwe       1    0
1qwe       1    1
2qwe       1    2
3qwe       1    3
4qwe       1    4
...      ...  ...
995qwe     1  995
996qwe     1  996
997qwe     1  997
998qwe     1  998
999qwe     1  999

[1000 rows x 2 columns]
        bias    0
0qwe       1    0
1qwe       1    1
2qwe       1    2
3qwe       1    3
4qwe       1    4
...      ...  ...
995qwe     1  995
996qwe     1  996
997qwe     1  997
998qwe     1  998
999qwe     1  999

[1000 rows x 2 columns]
        bias    0
0qwe       1    0
1qwe       1    1
2qwe       1    2
3qwe       1    3
4qwe       1    4
...      ...  ...
995qwe     1  995
996qwe     1  996
997qwe     1  997
998qwe     1  998
999qwe     1  999

[1000 rows x 2 columns]
        bias    0
0qwe       1    0


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  (np.sum((y.values - y_pred)**2))/(np.sum((y.values - np.mean(y.values))**2)))]
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  loss = np.mean((np.dot(X, self.weights) - y.values)**2)
  (np.sum((y.values - y_pred)**2))/(np.sum((y

array([-1.90035005e+188])

iter = 10 ||| Loss = nan ||| r2 = nan
iter = 20 ||| Loss = nan ||| r2 = nan
iter = 30 ||| Loss = nan ||| r2 = nan
iter = 40 ||| Loss = nan ||| r2 = nan
iter = 50 ||| Loss = nan ||| r2 = nan


array([nan])

In [5]:
model.get_best_score()

-inf

In [11]:
import pandas as pd

df = pd.DataFrame({'target':range(1, 11, 2)}, index=[0, 1, 'dd', 'q', 99])

In [58]:
df.insert()

Unnamed: 0,target
0,1
1,3
dd,5
q,7
99,9


In [59]:
X.insert(0, 'bias', pd.Series(1, index=range(X.shape[0])))

In [61]:
df.index

Index([0, 1, 'dd', 'q', 99], dtype='object')