In [1]:
import numpy as np
import pandas as pd
import timeit
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

In [10]:
n_rows = 100000 * 11
df = pd.read_csv("train.gz", nrows=n_rows)

X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values
Y = df['click'].values

n_train = 100000 * 10
X_train = X[:n_train]
Y_train = Y[:n_train]
X_test = X[n_train:]
Y_test = Y[n_train:]


In [11]:
enc = OneHotEncoder(handle_unknown='ignore')

enc.fit(X_train)

In [12]:
# The number of iterations is set to 1 if using partial_fit.

sgd_lr_online = SGDClassifier(loss='log_loss',
                              penalty=None,
                              fit_intercept=True,
                              max_iter=1,
                              learning_rate='constant',
                              eta0=0.01,
                              random_state=42
                              )

In [13]:
start_time = timeit.default_timer()

# Use the first 1,000,000 samples for training, and the next 100,000 for testing
for i in range(10):
    x_train = X_train[i*100000:(i+1)*100000]
    y_train = Y_train[i*100000:(i+1)*100000]
    x_train_enc = enc.transform(x_train)
    sgd_lr_online.partial_fit(x_train_enc.toarray(), y_train, classes=[0, 1])

print(f"--- {(timeit.default_timer() - start_time):.3f} seconds ---")


--- 117.249 seconds ---


In [14]:
x_test_enc = enc.transform(X_test)

pred = sgd_lr_online.predict_proba(x_test_enc.toarray())[:, 1]
print(f'Training samples: {n_train * 10}, AUC on testing set: {roc_auc_score(Y_test, pred):.3f}')


Training samples: 10000000, AUC on testing set: 0.762


## Handling multiclass classification

In [15]:
from sklearn import datasets

digits = datasets.load_digits()
n_samples = len(digits.images)

In [16]:
X = digits.images.reshape((n_samples, -1))

Y = digits.target

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [18]:
from sklearn.model_selection import GridSearchCV
parameters = {'penalty': ['l2', None],
              'alpha': [1e-07, 1e-06, 1e-05, 1e-04],
              'eta0': [0.01, 0.1, 1, 10]}

sgd_lr = SGDClassifier(loss='log_loss',
                       learning_rate='constant',
                       fit_intercept=True,
                       max_iter=50,
                       random_state=42)

grid_search = GridSearchCV(sgd_lr, parameters, n_jobs=-1, cv=5)

grid_search.fit(X_train, Y_train)

print(grid_search.best_params_)

{'alpha': 1e-07, 'eta0': 1, 'penalty': 'l2'}


In [19]:
sgd_lr_best = grid_search.best_estimator_
accuracy = sgd_lr_best.score(X_test, Y_test)

print(f'The accuracy on testing set is: {accuracy*100:.1f}%')

The accuracy on testing set is: 94.7%


## Implementing logistic regression using TensorFlow

In [20]:
import tensorflow as tf


In [22]:
n_rows = 100000
df = pd.read_csv("train.gz", nrows=n_rows)

X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values
Y = df['click'].values

n_train = int(n_rows * 0.9)
X_train = X[:n_train]
Y_train = Y[:n_train]
X_test = X[n_train:]
Y_test = Y[n_train:]

In [23]:
enc = OneHotEncoder(handle_unknown='ignore')

X_train_enc = enc.fit_transform(X_train).toarray().astype('float32')
X_test_enc = enc.transform(X_test).toarray().astype('float32')

Y_train = Y_train.astype('float32')
Y_test = Y_test.astype('float32')

In [24]:
batch_size = 1000

train_data = tf.data.Dataset.from_tensor_slices((X_train_enc, Y_train))

train_data = train_data.repeat().shuffle(5000).batch(batch_size).prefetch(1)

In [25]:
n_features = X_train_enc.shape[1]

W = tf.Variable(tf.zeros([n_features, 1]))
b = tf.Variable(tf.zeros([1]))

In [26]:
learning_rate = 0.001

optimizer = tf.optimizers.Adam(learning_rate)

In [27]:
def run_optimization(x, y):
    with tf.GradientTape() as tape:
        logits = tf.add(tf.matmul(x, W), b)[:, 0]
        loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits))

    # Update the parameters with respect to the gradient calculations
    gradients = tape.gradient(loss, [W, b])
    optimizer.apply_gradients(zip(gradients, [W, b]))


In [28]:
training_steps = 5000
for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1):
    run_optimization(batch_x, batch_y)
    if step % 500 == 0:
        logits = tf.add(tf.matmul(batch_x, W), b)[:, 0]
        loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=batch_y, logits=logits))
        print("step: %i, loss: %f" % (step, loss))

step: 500, loss: 0.419531
step: 1000, loss: 0.420348
step: 1500, loss: 0.381467
step: 2000, loss: 0.402164
step: 2500, loss: 0.375709
step: 3000, loss: 0.411544
step: 3500, loss: 0.349678
step: 4000, loss: 0.391284
step: 4500, loss: 0.396006
step: 5000, loss: 0.400889


In [29]:
logits = tf.add(tf.matmul(X_test_enc, W), b)[:, 0]
pred = tf.nn.sigmoid(logits)
auc_metric = tf.keras.metrics.AUC()
auc_metric.update_state(Y_test, pred)

print(f'AUC on testing set: {auc_metric.result().numpy():.3f}')


AUC on testing set: 0.736


## Feature selection using random forest

In [30]:
X_train = X

Y_train = Y

enc = OneHotEncoder(handle_unknown='ignore')

X_train_enc = enc.fit_transform(X_train)

In [31]:
# Feature selection with random forest

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    min_samples_split=30,
    n_jobs=-1,
    random_state=42)

random_forest.fit(X_train_enc.toarray(), Y_train)

In [32]:
feature_imp = random_forest.feature_importances_

print(feature_imp)

[1.22776093e-05 1.42544940e-03 8.11601536e-04 ... 7.51812083e-04
 8.79340746e-04 8.49537255e-03]


In [33]:
# bottom 10 weights and the corresponding 10 least important features

feature_names = enc.get_feature_names_out()
print(np.sort(feature_imp)[:10])

bottom_10 = np.argsort(feature_imp)[:10]
print('10 least important features are:\n', feature_names[bottom_10])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
10 least important features are:
 ['x8_c03d34a2' 'x2_e72b82ae' 'x8_eb3f4b48' 'x3_503f3f1b' 'x3_65295ffa'
 'x3_3d3f385d' 'x8_a496f117' 'x8_28daa008' 'x8_7d196936' 'x5_72c55d0b']


In [34]:

# top 10 weights and the corresponding 10 most important features

print(np.sort(feature_imp)[-10:])

top_10 = np.argsort(feature_imp)[-10:]
print('10 most important features are:\n', feature_names[top_10])

[0.00849437 0.00849537 0.00872154 0.01010324 0.0109653  0.01099363
 0.01319093 0.01471638 0.01802233 0.01889752]
10 most important features are:
 ['x3_7687a86e' 'x18_157' 'x17_-1' 'x14_1993' 'x8_8a4875bd' 'x2_d9750ee7'
 'x3_98572c79' 'x16_1063' 'x15_2' 'x18_33']
