In [45]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import json
import pickle
import numpy as np
from tqdm.notebook import trange, tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from writeprints import get_writeprints_transformer, prepare_entry
from utills import batch
from pytorch_models import NeuralNet
import torch
import torch.nn as nn
from torch.utils import data
import copy
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

In [3]:
from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
init_notebook_mode(connected=True)

In [5]:
# Constants
DATA_DIR = 'data/pan_clustering/pan20_small/'
TEMP_DATA_DIR = 'temp_data/pan20_computed/'

Load ground truth and split dataset
====

In [6]:
# Load gound truth
ground_truth = {}
partition = {}
with open(DATA_DIR + '/pan20-authorship-verification-training-small-truth.jsonl', 'r') as f:
    for l in f:
        d = json.loads(l)
        ground_truth[d['id']] = d['same']
        r = np.random.rand()
        if r < 0.7: # 70%
            partition[d['id']] = 'train'
        elif r < 0.85: # 15%
            partition[d['id']] = 'val'
        else: # 15%
            partition[d['id']] = 'test'

In [7]:
# Split datasets in to three sets and assign random order for train set
train_sz = len([v for v in partition.values() if v == 'train'])
test_sz = len([v for v in partition.values() if v == 'test'])
val_sz = len([v for v in partition.values() if v == 'val'])

Preprocess
=====

In [None]:
# Preprocess data and split to files
with open(DATA_DIR + 'pan20-authorship-verification-training-small.jsonl', 'r') as f,\
    open(TEMP_DATA_DIR + 'processed_docs_train.jsonl', 'w') as f_train,\
    open(TEMP_DATA_DIR + 'processed_docs_test.jsonl', 'w') as f_test,\
    open(TEMP_DATA_DIR + 'processed_docs_val.jsonl', 'w') as f_val:
    for l in tqdm(f, total=len(ground_truth)):
        d = json.loads(l)
        e1 = prepare_entry(d['pair'][0])
        e2 = prepare_entry(d['pair'][1])
        
        if partition[d['id']] == 'train':
            output_file = f_train
        elif partition[d['id']] == 'test':
            output_file = f_test
        elif partition[d['id']] == 'val':
            output_file = f_val
        else:
            raise('Invalid partition')
        json.dump({'id': d['id'], 'doc1': e1, 'doc2': e2}, output_file)
        output_file.write('\n')

In [None]:
# Load previously prepared preprocessed data, split them into three files
with open('temp_data/pan20/processed_docs_small.jsonl', 'r') as f,\
    open(TEMP_DATA_DIR + 'processed_docs_train.jsonl', 'w') as f_train,\
    open(TEMP_DATA_DIR + 'processed_docs_test.jsonl', 'w') as f_test,\
    open(TEMP_DATA_DIR + 'processed_docs_val.jsonl', 'w') as f_val:
    for l in tqdm(f, total=len(ground_truth)):
        d = json.loads(l)
        if partition[d['id']] == 'train':
            f_train.write(l)
        elif partition[d['id']] == 'test':
            f_test.write(l)
        elif partition[d['id']] == 'val':
            f_val.write(l)
        else:
            print('Invalid partition')

Fit the feature transformer
======

In [None]:
with open(TEMP_DATA_DIR + 'processed_docs_train.jsonl', 'r') as f:
    docs = []
    for l in tqdm(f, total=train_sz):
#         Only retain a sample of records to train the transformer
        if np.random.rand() > 0.5:
            continue
            
        d = json.loads(l)
        docs.append(d['doc1'])
        docs.append(d['doc2'])

In [None]:
transformer = get_writeprints_transformer()
X = transformer.fit_transform(docs)
scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X)
with open(TEMP_DATA_DIR + 'transformers.p', 'wb') as f:
    pickle.dump((transformer, scaler), f)


In [None]:
train_idxs = np.array(range(train_sz))
np.random.shuffle(train_idxs)

with open(TEMP_DATA_DIR + 'ordering_metadata.p', 'wb') as f:
    pickle.dump((train_sz, test_sz, val_sz, train_idxs), f)

In [5]:
with open(TEMP_DATA_DIR + 'transformers.p', 'rb') as f:
    transformer, scaler = pickle.load(f)
    
with open(TEMP_DATA_DIR + 'ordering_metadata.p', 'rb') as f:
    train_sz, test_sz, val_sz, train_idxs = pickle.load(f)

In [6]:

with open(TEMP_DATA_DIR + 'ordering_metadata.p', 'rb') as f:
    train_idxs = pickle.load(f)

Vectorize the training data
======

In [16]:
feature_sz = len(transformer.get_feature_names())
X_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_train.npy', dtype='float32', mode='w+', shape=(train_sz, feature_sz))
Y_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_train.npy', dtype='int32', mode='w+', shape=(train_sz))

In [17]:
with open(TEMP_DATA_DIR + 'processed_docs_train.jsonl', 'r') as f:
    batch_size = 20000
    i = 0;
    docs1 = []
    docs2 = []
    idxs = []
    labels = []
    for l in tqdm(f, total=train_sz):
        d = json.loads(l)
        docs1.append(d['doc1'])
        docs2.append(d['doc2'])
        labels.append(ground_truth[d['id']])
        idxs.append(train_idxs[i])
        i += 1
        if len(labels) >= batch_size:
            x1 = scaler.transform(transformer.transform(docs1))
            x2 = scaler.transform(transformer.transform(docs2))
            X_train[idxs, :] = np.abs(x1-x2).todense()
            Y_train[idxs] = labels
            
            docs1 = []
            docs2 = []
            idxs = []
            labels = []
            
x1 = scaler.transform(transformer.transform(docs1))
x2 = scaler.transform(transformer.transform(docs2))
X_train[idxs, :] = np.abs(x1-x2).todense()
Y_train[idxs] = labels




Vectorize the test data
=====

In [26]:
# feature_sz = len(transformer.get_feature_names())
X_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_test.npy', dtype='float32', mode='w+', shape=(test_sz, feature_sz))
Y_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_test.npy', dtype='int32', mode='w+', shape=(test_sz))

with open(TEMP_DATA_DIR + 'processed_docs_test.jsonl', 'r') as f:
    batch_size = 10000
    i = 0;
    docs1 = []
    docs2 = []
    labels = []
    idxs = []
    for l in f:
        d = json.loads(l)
        docs1.append(d['doc1'])
        docs2.append(d['doc2'])
        labels.append(ground_truth[d['id']])
        idxs.append(i)
        i += 1

In [27]:
x1 = scaler.transform(transformer.transform(docs1))
x2 = scaler.transform(transformer.transform(docs2))
X_test[idxs, :] = np.abs(x1-x2).todense()
Y_test[idxs] = labels

Vectorize the Val data
====

In [29]:

feature_sz = len(transformer.get_feature_names())
X_val = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_val.npy', dtype='float32', mode='w+', shape=(val_sz, feature_sz))
Y_val = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_val.npy', dtype='int32', mode='w+', shape=(val_sz))


In [30]:

with open(TEMP_DATA_DIR + 'processed_docs_val.jsonl', 'r') as f:
    batch_size = 10000
    i = 0;
    docs1 = []
    docs2 = []
    labels = []
    idxs = []
    for l in f:
        if i % 10000 == 0:
            print(i)
        d = json.loads(l)
        docs1.append(d['doc1'])
        docs2.append(d['doc2'])
        labels.append(ground_truth[d['id']])
        idxs.append(i)
        i += 1

0


In [31]:
x1 = scaler.transform(transformer.transform(docs1))
x2 = scaler.transform(transformer.transform(docs2))
X_val[idxs, :] = np.abs(x1-x2).todense()
Y_val[idxs] = labels

Train Classifier: SGD
====

In [5]:
with open(TEMP_DATA_DIR + 'ordering_metadata.p', 'rb') as f:
    train_sz, test_sz, val_sz, train_idxs = pickle.load(f)
    
with open(TEMP_DATA_DIR + 'transformers.p', 'rb') as f:
    transformer, scaler = pickle.load(f)

In [6]:
feature_sz = len(transformer.get_feature_names())
X_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_train.npy', dtype='float32', mode='r', shape=(train_sz, feature_sz))
Y_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_train.npy', dtype='int32', mode='r', shape=(train_sz))

In [9]:
X_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_test.npy', dtype='float32', mode='r', shape=(test_sz, feature_sz))
Y_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_test.npy', dtype='int32', mode='r', shape=(test_sz))

In [7]:
clf = SGDClassifier(loss='log', verbose=True)

In [None]:
batch_size=100000
num_epochs = 100
for i in range(num_epochs):
    print('Epoch - ', i)
    print('-' * 30)
    for idxs in tqdm(batch(range(train_sz), batch_size), total=int(train_sz/batch_size) + 1):
        clf.partial_fit(X_train[idxs, :], Y_train[idxs], classes=[0, 1])
        
    probs = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresh = roc_curve(Y_test, probs)
    roc_auc = auc(fpr, tpr)
    print('AUC: ', roc_auc)

In [25]:
roc_auc = auc(fpr, tpr)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = fpr,
    y = tpr,
    text = thresh,
    mode='lines'
))
fig.show()
print(roc_auc)

0.9020908251721158


Train Classifier: PyTorch NN
====

In [14]:
with open(TEMP_DATA_DIR + 'ordering_metadata.p', 'rb') as f:
    train_sz, test_sz, val_sz, train_idxs = pickle.load(f)
    
with open(TEMP_DATA_DIR + 'transformers.p', 'rb') as f:
    transformer, scaler = pickle.load(f)

In [32]:
feature_sz = len(transformer.get_feature_names())
X_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_train.npy', dtype='float32', mode='r', shape=(train_sz, feature_sz))
Y_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_train.npy', dtype='int32', mode='r', shape=(train_sz))

X_val = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_val.npy', dtype='float32', mode='r', shape=(val_sz, feature_sz))
Y_val = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_val.npy', dtype='int32', mode='r', shape=(val_sz))

X_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_test.npy', dtype='float32', mode='r', shape=(test_sz, feature_sz))
Y_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_test.npy', dtype='int32', mode='r', shape=(test_sz))

In [33]:
# Device configuration
device = torch.device('cpu')

# Hyper-parameters 
input_size = X_train.shape[1]
hidden_size = 500
num_classes = 1
num_epochs = 20
batch_size = 1000
dropout_rate = 0.9
learning_rate = 0.0001

In [34]:
train_dataset = data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(Y_train.astype('float32')))
val_dataset = data.TensorDataset(torch.from_numpy(X_val), torch.from_numpy(Y_val.astype('float32')))
test_dataset = data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(Y_test.astype('float32')))

# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False)

val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

model = NeuralNet(input_size, hidden_size, num_classes, dropout_rate).to(device)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

In [35]:
# Train the model
training_loss = []
validation_loss = []
aucs = []
total_step = len(train_loader)
for epoch in range(num_epochs):
    model.train()
    running_training_loss = 0.0
    for i, (x, y) in enumerate(train_loader):  
        # Move tensors to the configured device
        x = x.to(device)
        y = y.to(device).unsqueeze(1)
        
        # Forward pass
        outputs = model(x)
        loss = criterion(outputs, y)
        
        running_training_loss += loss.item()
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    training_loss.append(running_training_loss / len(train_loader))
    
    model.eval()
    # Validation loop
    running_val_loss = 0.0
    actual = []
    preds = []
    
    best_auc = 0.0
    best_model = model
    for X_batch, y_batch in val_loader:
        with torch.no_grad():
            y_pred = model(X_batch)
            y_batch = y_batch.to(device).unsqueeze(1)
            loss = criterion(y_pred, y_batch)
            running_val_loss += loss.item()
            preds.extend(list(y_pred.numpy()[:, 0]))
            actual.extend(list(y_batch.numpy()))
            
    fpr, tpr, thresh = roc_curve(actual, preds)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    if roc_auc > best_auc:
        best_auc = roc_auc
        best_model = copy.deepcopy(model)
    validation_loss.append(running_val_loss / len(val_loader))


    print ('Epoch [{}/{}], Training Loss: {:.4f}, Val Loss: {:.4f}, AUC: {:.4f}' 
           .format(epoch+1, num_epochs, running_training_loss / len(train_loader), running_val_loss / len(val_loader), roc_auc))

Epoch [1/20], Training Loss: 0.4225, Val Loss: 1.3419, AUC: 0.9628
Epoch [2/20], Training Loss: 0.2535, Val Loss: 0.2605, AUC: 0.9719
Epoch [3/20], Training Loss: 0.2094, Val Loss: 0.2336, AUC: 0.9755
Epoch [4/20], Training Loss: 0.1824, Val Loss: 0.2185, AUC: 0.9774
Epoch [5/20], Training Loss: 0.1610, Val Loss: 0.2004, AUC: 0.9783
Epoch [6/20], Training Loss: 0.1445, Val Loss: 0.2069, AUC: 0.9787
Epoch [7/20], Training Loss: 0.1295, Val Loss: 0.1954, AUC: 0.9788
Epoch [8/20], Training Loss: 0.1162, Val Loss: 0.1884, AUC: 0.9790
Epoch [9/20], Training Loss: 0.1102, Val Loss: 0.1891, AUC: 0.9792
Epoch [10/20], Training Loss: 0.1001, Val Loss: 0.1933, AUC: 0.9791
Epoch [11/20], Training Loss: 0.0949, Val Loss: 0.1909, AUC: 0.9790
Epoch [12/20], Training Loss: 0.0864, Val Loss: 0.1978, AUC: 0.9789
Epoch [13/20], Training Loss: 0.0813, Val Loss: 0.1946, AUC: 0.9789
Epoch [14/20], Training Loss: 0.0773, Val Loss: 0.1982, AUC: 0.9787
Epoch [15/20], Training Loss: 0.0718, Val Loss: 0.1995, A

In [36]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = list(range(len(training_loss))),
    y = training_loss,
    mode='lines',
    name='Training Loss'
))
fig.add_trace(go.Scatter(
    x = list(range(len(validation_loss))),
    y = validation_loss,
    mode='lines',
    name='Validation Loss'
))
fig.show()

In [37]:
with torch.no_grad():
    preds = []
    actual = []
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device).unsqueeze(1)
        outputs = best_model(x)
        preds.extend(list(outputs.numpy()[:, 0]))
        actual.extend(list(y.numpy()[:, 0]))

In [38]:
print('FPR-TRP Curve')

fpr, tpr, thresh = roc_curve(actual, preds)
roc_auc = auc(fpr, tpr)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = fpr,
    y = tpr,
    text = thresh,
    mode='lines'
))
fig.show()
print(roc_auc)


print('P-R Curve')
precision, recall, thresholds = precision_recall_curve(actual, preds)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=recall,
    y=precision,
    text=np.array(thresholds).astype(str)
))
fig.show()
print('AUC: ', auc(recall, precision))

FPR-TRP Curve


0.97654344384857
P-R Curve


AUC:  0.9788340524906007


In [39]:
torch.save(best_model, TEMP_DATA_DIR + 'best_model.pt')

Training Classifier: Logistic Regression
====

In [5]:
with open(TEMP_DATA_DIR + 'ordering_metadata.p', 'rb') as f:
    train_sz, test_sz, val_sz, train_idxs = pickle.load(f)
    
with open(TEMP_DATA_DIR + 'transformers.p', 'rb') as f:
    transformer, scaler = pickle.load(f)

In [40]:
feature_sz = len(transformer.get_feature_names())
X_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_train.npy', dtype='float32', mode='r', shape=(train_sz, feature_sz))
Y_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_train.npy', dtype='int32', mode='r', shape=(train_sz))

X_val = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_val.npy', dtype='float32', mode='r', shape=(val_sz, feature_sz))
Y_val = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_val.npy', dtype='int32', mode='r', shape=(val_sz))

X_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_test.npy', dtype='float32', mode='r', shape=(test_sz, feature_sz))
Y_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_test.npy', dtype='int32', mode='r', shape=(test_sz))

In [41]:
X_train = np.array(X_train)

In [None]:
clf = LogisticRegression(solver='lbfgs', max_iter=500)
distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])
param_clf = RandomizedSearchCV(clf, distributions, random_state=0, verbose=2, scoring='roc_auc')
search = param_clf.fit(X_train, Y_train)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] C=2.195254015709299, penalty=l1 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.

Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: 
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.3s remaining:    0.0s


[CV] .................. C=2.195254015709299, penalty=l1, total=   2.9s
[CV] C=2.195254015709299, penalty=l1 .................................
[CV] .................. C=2.195254015709299, penalty=l1, total=   2.8s
[CV] C=2.195254015709299, penalty=l1 .................................
[CV] .................. C=2.195254015709299, penalty=l1, total=   2.9s
[CV] C=2.195254015709299, penalty=l1 .................................
[CV] .................. C=2.195254015709299, penalty=l1, total=   3.0s
[CV] C=2.195254015709299, penalty=l1 .................................
[CV] .................. C=2.195254015709299, penalty=l1, total=   3.1s
[CV] C=3.3770629943240693, penalty=l1 ................................
[CV] ................. C=3.3770629943240693, penalty=l1, total=   2.6s
[CV] C=3.3770629943240693, penalty=l1 ................................
[CV] ................. C=3.3770629943240693, penalty=l1, total=   2.7s
[CV] C=3.3770629943240693, penalty=l1 ................................
[CV] .


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................. C=2.4942547871438894, penalty=l2, total= 6.5min
[CV] C=2.4942547871438894, penalty=l2 ................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................. C=2.4942547871438894, penalty=l2, total= 6.7min
[CV] C=2.4942547871438894, penalty=l2 ................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................. C=2.4942547871438894, penalty=l2, total= 6.3min
[CV] C=2.4942547871438894, penalty=l2 ................................
[CV] ................. C=2.4942547871438894, penalty=l2, total= 5.7min
[CV] C=2.4942547871438894, penalty=l2 ................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................. C=2.4942547871438894, penalty=l2, total= 6.2min
[CV] C=1.75034884505077, penalty=l2 ..................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................... C=1.75034884505077, penalty=l2, total= 6.2min
[CV] C=1.75034884505077, penalty=l2 ..................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................... C=1.75034884505077, penalty=l2, total= 6.4min
[CV] C=1.75034884505077, penalty=l2 ..................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................. C=1.5337660753031108, penalty=l2, total= 6.6min
[CV] C=1.5337660753031108, penalty=l2 ................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................. C=1.5337660753031108, penalty=l2, total= 6.3min
[CV] C=1.5337660753031108, penalty=l2 ................................
[CV] ................. C=1.5337660753031108, penalty=l2, total= 5.4min
[CV] C=1.5337660753031108, penalty=l2 ................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................. C=1.5337660753031108, penalty=l2, total= 6.2min
[CV] C=3.2486749151019727, penalty=l2 ................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................. C=3.2486749151019727, penalty=l2, total= 5.9min
[CV] C=3.2486749151019727, penalty=l2 ................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................. C=3.2486749151019727, penalty=l2, total= 6.1min
[CV] C=3.2486749151019727, penalty=l2 ................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................. C=3.2486749151019727, penalty=l2, total= 6.1min
[CV] C=3.2486749151019727, penalty=l2 ................................
[CV] ................. C=3.2486749151019727, penalty=l2, total= 4.8min
[CV] C=3.2486749151019727, penalty=l2 ................................
[CV] ................. C=3.2486749151019727, penalty=l2, total= 5.7min
[CV] C=2.2721782443757292, penalty=l1 ................................



Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: 
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.




[CV] ................. C=2.2721782443757292, penalty=l1, total=   2.9s
[CV] C=2.2721782443757292, penalty=l1 ................................
[CV] ................. C=2.2721782443757292, penalty=l1, total=   2.9s
[CV] C=2.2721782443757292, penalty=l1 ................................
[CV] ................. C=2.2721782443757292, penalty=l1, total=   2.6s
[CV] C=2.2721782443757292, penalty=l1 ................................
[CV] ................. C=2.2721782443757292, penalty=l1, total=   2.8s
[CV] C=2.2721782443757292, penalty=l1 ................................
[CV] ................. C=2.2721782443757292, penalty=l1, total=   2.8s
[CV] C=3.34431505414951, penalty=l2 ..................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................... C=3.34431505414951, penalty=l2, total= 6.1min
[CV] C=3.34431505414951, penalty=l2 ..................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................... C=3.34431505414951, penalty=l2, total= 6.2min
[CV] C=3.34431505414951, penalty=l2 ..................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................... C=3.34431505414951, penalty=l2, total= 6.3min
[CV] C=3.34431505414951, penalty=l2 ..................................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV] ................... C=3.34431505414951, penalty=l2, total= 5.8min
[CV] C=3.34431505414951, penalty=l2 ..................................


In [25]:
search.best_params_

{'C': 0.22685190926977272, 'penalty': 'l2'}

In [42]:
clf = LogisticRegression(C=0.2, solver='lbfgs', max_iter=5000, verbose=True)
clf.fit(X_train, Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 15.8min finished


LogisticRegression(C=0.2, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=5000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=True,
                   warm_start=False)

In [43]:
preds = clf.predict_proba(X_test)[:, 1]
print('FPR-TRP Curve')

fpr, tpr, thresh = roc_curve(Y_test, preds)
roc_auc = auc(fpr, tpr)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = fpr,
    y = tpr,
    text = thresh,
    mode='lines'
))
fig.show()
print(roc_auc)


print('P-R Curve')
precision, recall, thresholds = precision_recall_curve(Y_test, preds)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=recall,
    y=precision,
    text=np.array(thresholds).astype(str)
))
fig.show()
print('AUC: ', auc(recall, precision))

FPR-TRP Curve


0.9727431447453054
P-R Curve


AUC:  0.9751846343582337


In [44]:
with open(TEMP_DATA_DIR + 'LiniearRegressionModal.p', 'wb') as f:
    pickle.dump(clf, f)