In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import pickle
import numpy as np
import copy
from tqdm.notebook import trange, tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from writeprints import get_writeprints_transformer, prepare_entry
from utills import batch
from utills import batch
from pytorch_models import NeuralNet
import torch
import torch.nn as nn
from torch.utils import data

In [3]:
from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
init_notebook_mode(connected=True)

In [4]:
# Constants
DATA_DIR = 'data/pan_clustering/pan20_large/'
TEMP_DATA_DIR = 'temp_data/pan20_large_computed/'

Load ground truth and split dataset
===

In [5]:
# Load gound truth
ground_truth = {}
partition = {}
with open(DATA_DIR + '/pan20-authorship-verification-training-large-truth.jsonl', 'r') as f:
    for l in f:
        d = json.loads(l)
        ground_truth[d['id']] = d['same']
        r = np.random.rand()
        if r < 0.7: # 70%
            partition[d['id']] = 'train'
        elif r < 0.85: # 15%
            partition[d['id']] = 'val'
        else: # 15%
            partition[d['id']] = 'test'

In [6]:
# Split datasets in to three sets and assign random order for train set
train_sz = len([v for v in partition.values() if v == 'train'])
test_sz = len([v for v in partition.values() if v == 'test'])
val_sz = len([v for v in partition.values() if v == 'val'])
# train_idxs = np.array(range(train_sz))
# np.random.shuffle(train_idxs)

In [13]:
train_idxs = np.array(range(train_sz))
np.random.shuffle(train_idxs)

Preprocess
====

In [None]:
# Preprocess data and split to files
with open(DATA_DIR + 'pan20-authorship-verification-training-large.jsonl', 'r') as f,\
    open(TEMP_DATA_DIR + 'processed_docs_train.jsonl', 'w') as f_train,\
    open(TEMP_DATA_DIR + 'processed_docs_test.jsonl', 'w') as f_test,\
    open(TEMP_DATA_DIR + 'processed_docs_val.jsonl', 'w') as f_val:
    i = 0
    for l in tqdm(f, total=len(ground_truth)):
        i += 1
        if i % 10000 == 0:
            print(i)
        d = json.loads(l)
        e1 = prepare_entry(d['pair'][0])
        e2 = prepare_entry(d['pair'][1])
        
        if partition[d['id']] == 'train':
            output_file = f_train
        elif partition[d['id']] == 'test':
            output_file = f_test
        elif partition[d['id']] == 'val':
            output_file = f_val
        else:
            raise('Invalid partition')
        json.dump({'id': d['id'], 'doc1': e1, 'doc2': e2}, output_file)
        output_file.write('\n')

10000
20000
30000
40000
50000
60000
70000
80000


In [None]:
# Load previously prepared preprocessed data, split them into three files
with open('temp_data/pan20/processed_docs.jsonl', 'r') as f,\
    open(TEMP_DATA_DIR + 'processed_docs_train.jsonl', 'w') as f_train,\
    open(TEMP_DATA_DIR + 'processed_docs_test.jsonl', 'w') as f_test,\
    open(TEMP_DATA_DIR + 'processed_docs_val.jsonl', 'w') as f_val:
    for l in tqdm(f, total=len(ground_truth)):
        d = json.loads(l)
        if partition[d['id']] == 'train':
            f_train.write(l)
        elif partition[d['id']] == 'test':
            f_test.write(l)
        elif partition[d['id']] == 'val':
            f_val.write(l)
        else:
            print('Invalid partition')

Fit the feature transforme
====

In [10]:
with open(TEMP_DATA_DIR + 'processed_docs_train.jsonl', 'r') as f:
    docs = []
    for l in tqdm(f, total=train_sz):
        # Only retain a sample of records to train the transformer
        if np.random.rand() < 0.9:
            continue
            
        d = json.loads(l)
        docs.append(d['doc1'])
        docs.append(d['doc2'])




In [31]:
with open(TEMP_DATA_DIR + 'ordering_metadata.p', 'wb') as f:
    pickle.dump((train_sz, test_sz, val_sz, train_idxs), f)

In [11]:
transformer = get_writeprints_transformer()
X = transformer.fit_transform(docs)
scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X)
import pickle
with open(TEMP_DATA_DIR + 'transformers.p', 'wb') as f:
    pickle.dump((transformer, scaler), f)


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [None]:
with open(TEMP_DATA_DIR + 'ordering_metadata.p', 'rb') as f:
    train_sz, test_sz, val_sz, train_idxs = pickle.load(f)
    
with open(TEMP_DATA_DIR + 'transformers.p', 'rb') as f:
    transformer, scaler = pickle.load(f)

Vectorize the training data
====

In [15]:
feature_sz = len(transformer.get_feature_names())
X_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_train.npy', dtype='float32', mode='w+', shape=(train_sz, feature_sz))
Y_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_train.npy', dtype='int32', mode='w+', shape=(train_sz))

In [None]:
with open(TEMP_DATA_DIR + 'processed_docs_train.jsonl', 'r') as f:
    batch_size = 10000
    i = 0;
    docs1 = []
    docs2 = []
    idxs = []
    labels = []
    for l in tqdm(f, total=train_sz):
        d = json.loads(l)
        docs1.append(d['doc1'])
        docs2.append(d['doc2'])
        labels.append(ground_truth[d['id']])
        idxs.append(train_idxs[i])
        i += 1
        if len(labels) >= batch_size:
            x1 = scaler.transform(transformer.transform(docs1))
            x2 = scaler.transform(transformer.transform(docs2))
            X_train[idxs, :] = np.abs(x1-x2).todense()
            Y_train[idxs] = labels
            
            docs1 = []
            docs2 = []
            idxs = []
            labels = []
            
    x1 = scaler.transform(transformer.transform(docs1))
    x2 = scaler.transform(transformer.transform(docs2))
    X_train[idxs, :] = np.abs(x1-x2).todense()
    Y_train[idxs] = labels

Vectorize the test data
==

In [33]:
feature_sz = len(transformer.get_feature_names())
X_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_test.npy', dtype='float32', mode='w+', shape=(test_sz, feature_sz))
Y_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_test.npy', dtype='int32', mode='w+', shape=(test_sz))

with open(TEMP_DATA_DIR + 'processed_docs_test.jsonl', 'r') as f:
    batch_size = 10000
    i = 0;
    idxs = []
    docs1 = []
    docs2 = []
    labels = []
    for l in tqdm(f, total=test_sz):
        d = json.loads(l)
        docs1.append(d['doc1'])
        docs2.append(d['doc2'])
        labels.append(ground_truth[d['id']])
        idxs.append(i)
        i += 1
        if len(labels) >= batch_size:
            x1 = scaler.transform(transformer.transform(docs1))
            x2 = scaler.transform(transformer.transform(docs2))
            X_test[idxs, :] = np.abs(x1-x2).todense()
            Y_test[idxs] = labels
            
            docs1 = []
            docs2 = []
            idxs = []
            labels = []
            
    x1 = scaler.transform(transformer.transform(docs1))
    x2 = scaler.transform(transformer.transform(docs2))
    X_test[idxs, :] = np.abs(x1-x2).todense()
    Y_test[idxs] = labels




Vectorize the validation data
====

In [34]:

feature_sz = len(transformer.get_feature_names())
X_val = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_val.npy', dtype='float32', mode='w+', shape=(val_sz, feature_sz))
Y_val = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_val.npy', dtype='int32', mode='w+', shape=(val_sz))

with open(TEMP_DATA_DIR + 'processed_docs_val.jsonl', 'r') as f:
    batch_size = 10000
    i = 0;
    docs1 = []
    docs2 = []
    labels = []
    idxs = []
    for l in tqdm(f, total=val_sz):
        d = json.loads(l)
        docs1.append(d['doc1'])
        docs2.append(d['doc2'])
        labels.append(ground_truth[d['id']])
        idxs.append(i)
        i += 1
        if len(labels) >= batch_size:
            x1 = scaler.transform(transformer.transform(docs1))
            x2 = scaler.transform(transformer.transform(docs2))
            X_val[idxs, :] = np.abs(x1-x2).todense()
            Y_val[idxs] = labels
            
            docs1 = []
            docs2 = []
            idxs = []
            labels = []
            
    x1 = scaler.transform(transformer.transform(docs1))
    x2 = scaler.transform(transformer.transform(docs2))
    X_val[idxs, :] = np.abs(x1-x2).todense()
    Y_val[idxs] = labels




Train Classifier: SGD
===

In [4]:
with open(TEMP_DATA_DIR + 'ordering_metadata.p', 'rb') as f:
    train_sz, test_sz, val_sz, train_idxs = pickle.load(f)
    
with open(TEMP_DATA_DIR + 'transformers.p', 'rb') as f:
    transformer, scaler = pickle.load(f)

In [5]:
feature_sz = len(transformer.get_feature_names())
X_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_train.npy', dtype='float32', mode='r', shape=(train_sz, feature_sz))
Y_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_train.npy', dtype='int32', mode='r', shape=(train_sz))

In [6]:
X_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_test.npy', dtype='float32', mode='r', shape=(test_sz, feature_sz))
Y_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_test.npy', dtype='int32', mode='r', shape=(test_sz))

In [11]:
clf = SGDClassifier(loss='log', verbose=True)

In [None]:
batch_size=50000
num_epochs = 100
for i in range(num_epochs):
    print('Epoch - ', i)
    print('-' * 30)
    for idxs in tqdm(batch(range(train_sz), batch_size), total=int(train_sz/batch_size) + 1):
        clf.partial_fit(X_train[idxs, :], Y_train[idxs], classes=[0, 1])
        
    probs = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresh = roc_curve(Y_test, probs)
    roc_auc = auc(fpr, tpr)
    print('AUC: ', roc_auc)

In [15]:
roc_auc = auc(fpr, tpr)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = fpr,
    y = tpr,
    text = thresh,
    mode='lines'
))
fig.show()
print(roc_auc)

0.9361641699426799


Train Classifier: PyTorch NN
=====

In [35]:
with open(TEMP_DATA_DIR + 'ordering_metadata.p', 'rb') as f:
    train_sz, test_sz, val_sz, train_idxs = pickle.load(f)
    
with open(TEMP_DATA_DIR + 'transformers.p', 'rb') as f:
    transformer, scaler = pickle.load(f)
    
feature_sz = len(transformer.get_feature_names())
X_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_train.npy', dtype='float32', mode='r', shape=(train_sz, feature_sz))
Y_train = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_train.npy', dtype='int32', mode='r', shape=(train_sz))

X_val = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_val.npy', dtype='float32', mode='r', shape=(val_sz, feature_sz))
Y_val = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_val.npy', dtype='int32', mode='r', shape=(val_sz))

X_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_X_test.npy', dtype='float32', mode='r', shape=(test_sz, feature_sz))
Y_test = np.memmap(TEMP_DATA_DIR + 'extracted_features_Y_test.npy', dtype='int32', mode='r', shape=(test_sz))

In [36]:
# Device configuration
device = torch.device('cpu')

# Hyper-parameters 
input_size = X_train.shape[1]
hidden_size = 100
num_classes = 1
num_epochs = 100
batch_size = 1000
dropout_rate = 0.8
learning_rate = 0.01

In [37]:
train_dataset = data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(Y_train.astype('float32')))
val_dataset = data.TensorDataset(torch.from_numpy(X_val), torch.from_numpy(Y_val.astype('float32')))
test_dataset = data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(Y_test.astype('float32')))

# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False)

val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

model = NeuralNet(input_size, hidden_size, num_classes, dropout_rate).to(device)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

In [38]:
# Train the model
training_loss = []
validation_loss = []
aucs = []
total_step = len(train_loader)
for epoch in range(num_epochs):
    model.train()
    running_training_loss = 0.0
    for i, (x, y) in enumerate(train_loader):  
        # Move tensors to the configured device
        x = x.to(device)
        y = y.to(device).unsqueeze(1)
        
        # Forward pass
        outputs = model(x)
        loss = criterion(outputs, y)
        
        running_training_loss += loss.item()
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    training_loss.append(running_training_loss / len(train_loader))
    
    model.eval()
    # Validation loop
    running_val_loss = 0.0
    actual = []
    preds = []
    
    best_auc = 0.0
    best_model = model
    for X_batch, y_batch in val_loader:
        with torch.no_grad():
            y_pred = model(X_batch)
            y_batch = y_batch.to(device).unsqueeze(1)
            loss = criterion(y_pred, y_batch)
            running_val_loss += loss.item()
            preds.extend(list(y_pred.numpy()[:, 0]))
            actual.extend(list(y_batch.numpy()))
            
    fpr, tpr, thresh = roc_curve(actual, preds)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    if roc_auc > best_auc:
        best_auc = roc_auc
        best_model = copy.deepcopy(model)
    validation_loss.append(running_val_loss / len(val_loader))


    print ('Epoch [{}/{}], Training Loss: {:.4f}, Val Loss: {:.4f}, AUC: {:.4f}' 
           .format(epoch+1, num_epochs, running_training_loss / len(train_loader), running_val_loss / len(val_loader), roc_auc))

Epoch [1/100], Training Loss: 0.4046, Val Loss: 0.2899, AUC: 0.9532
Epoch [2/100], Training Loss: 0.3146, Val Loss: 0.2889, AUC: 0.9604
Epoch [3/100], Training Loss: 0.2871, Val Loss: 0.2782, AUC: 0.9630
Epoch [4/100], Training Loss: 0.2691, Val Loss: 0.2521, AUC: 0.9636
Epoch [5/100], Training Loss: 0.2566, Val Loss: 0.3119, AUC: 0.9649
Epoch [6/100], Training Loss: 0.2495, Val Loss: 0.2541, AUC: 0.9655
Epoch [7/100], Training Loss: 0.2407, Val Loss: 0.2507, AUC: 0.9642
Epoch [8/100], Training Loss: 0.2353, Val Loss: 0.2584, AUC: 0.9662
Epoch [9/100], Training Loss: 0.2306, Val Loss: 0.2858, AUC: 0.9664
Epoch [10/100], Training Loss: 0.2258, Val Loss: 0.2580, AUC: 0.9661
Epoch [11/100], Training Loss: 0.2221, Val Loss: 0.2582, AUC: 0.9676
Epoch [12/100], Training Loss: 0.2178, Val Loss: 0.2489, AUC: 0.9676
Epoch [13/100], Training Loss: 0.2140, Val Loss: 0.3151, AUC: 0.9678
Epoch [14/100], Training Loss: 0.2097, Val Loss: 0.2705, AUC: 0.9678
Epoch [15/100], Training Loss: 0.2089, Val 

In [40]:
torch.save(best_model, TEMP_DATA_DIR + 'best_model.pt')

In [41]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = list(range(len(aucs))),
    y = aucs,
    mode='lines',
    name='AUCs'
))

In [42]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = list(range(len(training_loss))),
    y = training_loss,
    mode='lines',
    name='Training Loss'
))
fig.add_trace(go.Scatter(
    x = list(range(len(validation_loss))),
    y = validation_loss,
    mode='lines',
    name='Validation Loss'
))
fig.show()

In [43]:
with torch.no_grad():
    preds = []
    actual = []
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device).unsqueeze(1)
        outputs = best_model(x)
        preds.extend(list(outputs.numpy()[:, 0]))
        actual.extend(list(y.numpy()[:, 0]))

In [32]:
print('FPR-TRP Curve')

fpr, tpr, thresh = roc_curve(actual, preds)
roc_auc = auc(fpr, tpr)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = fpr,
    y = tpr,
    text = thresh,
    mode='lines'
))
fig.show()
print(roc_auc)


print('P-R Curve')
precision, recall, thresholds = precision_recall_curve(actual, preds)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=recall,
    y=precision,
    text=np.array(thresholds).astype(str)
))
fig.show()
print('AUC: ', auc(recall, precision))

FPR-TRP Curve


0.9732000658438521
P-R Curve


AUC:  0.9788685326999276
