In [125]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [126]:
class LinRegRidge():
    def __init__(self, batch_size=25, num_steps=350, lr=1e-2, alpha=1.0):
        self.batch_size = batch_size
        self.alpha = alpha
        self.lr = lr
        self.num_steps = num_steps

    def fit(self, X, Y):
        w = np.random.randn(X.shape[1])[:, None]
        n_obj = len(X)
        for i in range(self.num_steps):
            ind = np.random.randint(0, n_obj, self.batch_size)
            w -= self.lr * (2 * np.dot(X[ind].T, np.dot(X[ind], w) - Y[ind]) / self.batch_size + self.alpha * w/n_obj)
        self.w = w
        return self

    def predict(self, X):
        return np.dot(X, self.w)

In [127]:
n_features = 700
n_objects = 100000
w_true = np.random.uniform(-2, 2, (n_features, 1))
X = np.random.uniform(-100, 100, (n_objects, n_features)) * np.arange(n_features)
Y = X.dot(w_true) + np.random.normal(0, 10, (n_objects, 1))

In [128]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [129]:
Own_model = LinRegRidge()
sklearn_model = Ridge()

Own_model.fit(X_train_scaled, Y_train)
sklearn_model.fit(X_train_scaled, Y_train)
Y_pred_own = Own_model.predict(X_test_scaled)
Y_pred = sklearn_model.predict(X_test_scaled)

print('R^2 in own model:', r2_score(Y_test, Y_pred_own))
print('R^2 in sklearn Ridge:', r2_score(Y_test, Y_pred))

R^2 in own model: 0.9999584538355106
R^2 in sklearn Ridge: 0.9999999996439242


Логистическая регрессия:

In [130]:
class LogReg_with_batch():
    def __init__(self, batch_size=25, num_steps=350, lr=1e-2):
        self.batch_size = batch_size
        self.lr = lr
        self.num_steps = num_steps

    def fit(self, X, Y):
        w = np.random.randn(X.shape[1])[:, None]
        Y_1 = Y[:, None]
        for i in range(self.num_steps):
            ind = np.random.randint(0, len(X), self.batch_size)
            w += self.lr * np.dot(X[ind].T, (Y_1[ind] - 1/(1 + np.exp(-np.dot(X[ind], w)))))/self.batch_size
        self.w = w
        return self

    def predict(self, X):
        return 1/(1 + np.exp(-np.dot(X, self.w)))

In [131]:
class LogReg():
    def __init__(self, num_steps=350, lr=1e-2):
        self.lr = lr
        self.num_steps = num_steps

    def fit(self, X, Y):
        w = np.random.randn(X.shape[1])[:, None]
        Y_1 = Y[:, None]
        for i in range(self.num_steps):
            w += self.lr * np.dot(X.T, (Y_1 - 1/(1 + np.exp(-np.dot(X, w)))))/len(Y)
        self.w = w
        return self

    def predict(self, X):
        return 1/(1 + np.exp(-np.dot(X, self.w)))

проверим на датасете из интернета:

P.S. Далее я буду обрабатывать датасет для работы с нашей моделью. Эту часть можно не проверять.

In [132]:
df = pd.read_csv('train.csv')
df.head(10)

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod,Churn
0,55,19.5,1026.35,Male,0,Yes,Yes,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,0
1,72,25.85,1872.2,Male,0,Yes,No,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),0
2,1,75.9,75.9,Male,0,No,No,Yes,No,Fiber optic,No,No,No,Yes,No,No,Month-to-month,Yes,Electronic check,1
3,32,79.3,2570.0,Female,1,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Mailed check,0
4,60,115.25,6758.45,Female,0,Yes,Yes,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Credit card (automatic),0
5,25,19.8,475.2,Female,0,No,No,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),0
6,27,90.15,2423.4,Female,0,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,No,No,Yes,Month-to-month,No,Bank transfer (automatic),0
7,1,45.7,45.7,Male,0,No,No,Yes,No,DSL,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,1
8,50,105.95,5341.8,Male,0,Yes,Yes,Yes,Yes,Fiber optic,Yes,No,Yes,No,Yes,Yes,Month-to-month,No,Credit card (automatic),1
9,72,61.2,4390.25,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),0


In [133]:
cat_cols = [
    'Sex',
    'IsSeniorCitizen',
    'HasPartner',
    'HasChild',
    'HasPhoneService',
    'HasMultiplePhoneNumbers',
    'HasInternetService',
    'HasOnlineSecurityService',
    'HasOnlineBackup',
    'HasDeviceProtection',
    'HasTechSupportAccess',
    'HasOnlineTV',
    'HasMovieSubscription',
    'HasContractPhone',
    'IsBillingPaperless',
    'PaymentMethod'
]

num_cols = [
    'ClientPeriod',
    'MonthlySpending',
    'TotalSpent'
]

In [134]:
def conv(x):
  if x != ' ':
    return int(x.split('.')[0])
  else:
    return np.nan
df1 = df[num_cols[2]].apply(conv)
df[num_cols[2]] = df1

In [135]:
df = df.dropna(how = 'any')

In [136]:
y = df.iloc[:, -1]
X_orig = df.iloc[:, :-1]
h1 = pd.get_dummies(df[cat_cols])
X = pd.concat([df[num_cols], h1], axis = 1)

In [137]:
X_train, X_test, Y_train, Y_test = train_test_split(X.values, y.values, 
                                                    train_size=0.8,
                                                    random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Закончили с обработкой, теперь будем с ним работать:

Сначала сравним модель без батчей и модель из ящика:

In [138]:
Own_model = LogReg(lr = 1e-1)
sklearn_model = LogisticRegression()

Own_model.fit(X_train_scaled, Y_train)
sklearn_model.fit(X_train_scaled, Y_train)
Y_pred_own = Own_model.predict(X_test_scaled)
Y_pred = sklearn_model.predict_proba(X_test_scaled)
Y_pred_own_1 = (Y_pred_own > 1/2).astype(int)

In [139]:
roc_auc_score(Y_test, Y_pred_own), roc_auc_score(Y_test, Y_pred[:, 1])

(0.8300920028308564, 0.8442934654399623)

In [140]:
accuracy_score(Y_test, Y_pred_own_1), accuracy_score(Y_test, sklearn_model.predict(X_test_scaled))

(0.7165876777251184, 0.804739336492891)

Теперь та же модель из sklearn и модель с батчами:

In [141]:
Own_model1 = LogReg_with_batch(lr = 1e-1)
Own_model1.fit(X_train_scaled, Y_train)
Y_pred_own1 = Own_model1.predict(X_test_scaled)
Y_pred_own1_1 = (Y_pred_own > 1/2).astype(int)

In [142]:
roc_auc_score(Y_test, Y_pred_own1), roc_auc_score(Y_test, Y_pred[:, 1])

(0.8230242981835338, 0.8442934654399623)

In [143]:
accuracy_score(Y_test, Y_pred_own1_1), accuracy_score(Y_test, sklearn_model.predict(X_test_scaled))

(0.7165876777251184, 0.804739336492891)

На accuracy_score модель показывает себя не очень хорошо. Возможно следует выбирать другой порог для определения класса?

Наши модели лучше всего работают с lr=1e-1, но всё равно не дотягивают до реализации из sklearn, которая стабильно выдаёт хороший результат, хотя, судя по коду, использует lr= 1e-4, с которым наши модели в лучшем случае выдают скор 0.6:

In [144]:
Own_model = LogReg(lr = 1e-4)
sklearn_model = LogisticRegression()

Own_model.fit(X_train_scaled, Y_train)
sklearn_model.fit(X_train_scaled, Y_train)
Y_pred_own = Own_model.predict(X_test_scaled)
Y_pred = sklearn_model.predict_proba(X_test_scaled)
Y_pred_own_1 = (Y_pred_own > 1/2).astype(int)

In [145]:
roc_auc_score(Y_test, Y_pred_own), roc_auc_score(Y_test, Y_pred[:, 1])

(0.5783250766690257, 0.8442934654399623)

In [146]:
accuracy_score(Y_test, Y_pred_own_1), accuracy_score(Y_test, sklearn_model.predict(X_test_scaled))

(0.4966824644549763, 0.804739336492891)

In [147]:
Own_model1 = LogReg_with_batch(lr = 1e-4)
Own_model1.fit(X_train_scaled, Y_train)
Y_pred_own1 = Own_model1.predict(X_test_scaled)
Y_pred_own1_1 = (Y_pred_own > 1/2).astype(int)

In [148]:
roc_auc_score(Y_test, Y_pred_own1), roc_auc_score(Y_test, Y_pred[:, 1])

(0.5691106393017221, 0.8442934654399623)

In [149]:
accuracy_score(Y_test, Y_pred_own1_1), accuracy_score(Y_test, sklearn_model.predict(X_test_scaled))

(0.4966824644549763, 0.804739336492891)