In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt 

# Ошибка
## Начинается всё с метода максимизации правдоподобия 
### На классах {0, 1}
$$Likehood = \prod \pi(x)^{y_i} \cdot (1 - \pi(x)^{(1 - y_i)}$$
$$\pi(x) = \frac{1}{1 + w^T \cdot x}$$
$$LogLikehood = \ln Likehood = \sum(y_i \ln(\pi(x_i) + (1-y_i)\ln(1-\pi(x_i))$$

### На классах {-1, 1}
$$Q = \sum \ln(1 + e^{-y_i <w, x>})$$

## Производная по w

$$\nabla_{w} Q = \sum \frac{1}{1 + exp{(-y_i<w, x_i>)}} \cdot exp{(-y_i<w, x_i>)} \cdot \{-y_i \vec{x_i}\}$$

# GD
## SGD
Будем задавать `batch_size` как долю от всех строк матрицы, которые будут рандомно сэмплироваться.

При `batch_size`=1 получится градиентный спуск

Возьмём датасет про вино. Много классов, но я возьму всё что больше 90-ого прерсентиля за 1 класс, остальное -1

In [2]:
data = pd.read_csv('winequality-red.csv', sep=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
q_thrh = data['quality'].quantile(0.9)
q_thrh
data['quality'] = data['quality'].apply(lambda x: 1 if x >= q_thrh else -1)

In [4]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,-1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,-1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,-1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,-1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,-1


In [5]:
x = data.drop(columns=['quality']).values
y = data['quality'].values

In [6]:
from  sklearn.preprocessing import StandardScaler

In [7]:
sc = StandardScaler()
sc.fit(x)
x = sc.transform(x)

In [8]:
import imp
import classiFIRE as FIRE
from sklearn.linear_model import LogisticRegression
imp.reload(FIRE)

clf_fire = FIRE.classiFIRE()

Очень быстрый sklearn-овский класификатор

In [9]:
clf = LogisticRegression(random_state=134, solver='liblinear')
clf.fit(x, y)

LogisticRegression(random_state=134, solver='liblinear')

И мой

In [10]:
clf_fire = FIRE.classiFIRE()
clf_fire.fit(x, y, learning_rate=1e-4, tol=1e-3, batch_size=0.2)

In [16]:
answers = pd.DataFrame()
answers['fire'] = clf_fire.predict_proba(x)[: , 0]
answers['sklearn'] = clf.predict_proba(x)[: , 0]
answers

Unnamed: 0,fire,sklearn
0,0.955293,0.991978
1,0.950220,0.990513
2,0.938282,0.987645
3,0.819188,0.926170
4,0.955293,0.991978
...,...,...
1594,0.899766,0.964110
1595,0.810902,0.865585
1596,0.809574,0.882121
1597,0.901470,0.960941


Выставим threshold = 0.5 и классифицируем

In [12]:
def decode(x):
    out = []
    for z in x:
        if z > 0.5:
            out.append(1)
        else:
            out.append(-1)
    return out

Неверно классифицированные

In [17]:
answers = answers.apply(decode)
answers

Unnamed: 0,fire,sklearn
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
1594,1,1
1595,1,1
1596,1,1
1597,1,1


In [18]:
answers[answers['fire'] != answers['sklearn']]

Unnamed: 0,fire,sklearn
339,1,-1
341,1,-1
365,1,-1
423,1,-1
430,1,-1
462,-1,1
503,1,-1
504,1,-1
506,1,-1
538,1,-1


In [22]:
miss = answers[answers['fire'] != answers['sklearn']]
print(f'{miss.shape[0]} of {x.shape[0]} missclassified')

31 of 1599 missclassified


Видно что в основном наш классификатор ошибается, предсказывая 1ый класс вина(хорошие вина). Всего хороших вина замечено

In [29]:
print(f'{y[y == 1].shape[0]} шт.')

217 шт.


Результаты судя по всему непротиворечивы, а работает намоног медленней. Исследуем время работы в зависимости от `batch_size`

In [None]:
sizes = np.linspace(0.1, 1, 100)
times = []
for _i, size in enumerate(sizes):
    print(f'{_i+1}/{len(sizes)}', end = '\r')
    loc_times = np.array([])
    for _ in range(4):
        start = time.time()
        # каждый раз обновляем результаты
        clf_fire = FIRE.classiFIRE()
        clf_fire.fit(x=x, y=y, learning_rate=1e-4, tol=1e-3, batch_size=size)
        end = time.time()
        loc_times = np.insert(arr=loc_times, obj=0, values=end - start)
    times.append(loc_times.mean())

    
for_plot = pd.DataFrame()
for_plot['times'] = times
for_plot['batches'] = sizes
for_plot.to_csv('batch_time.csv')

9/100

In [None]:
for_plot = pd.read_csv('batch_time.csv')

In [None]:
plt.plot(for_plot['batches'], for_plot['times'])
plt.xlabel('batch_size')
plt.ylabel('time')
plt.show()

In [None]:
x.shape[0]

Да, получилась фигня т.к. размер выборки маловат