In [1]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


# Creating an Author Idenfication Model
A second-generation model of the main "originality score" algorithm: preprocessing a sample paper, performing analytics, saving the document's hash, and returning a score.  Uses embedding to improve understanding.

In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np

## Read in dataset
Load the [Reuter 50_50 training dataset](https://archive.ics.uci.edu/ml/datasets/Reuter_50_50).

In [3]:
DATASET_PATH = 'data/C50/C50all/'

In [4]:
# load text
authors = os.listdir(DATASET_PATH)
X = []
y = []

for author in authors:
    texts = os.listdir(DATASET_PATH + author + '/')
    for text in texts:
        f=open(DATASET_PATH + author + '/' + text, 'r')
        X.append(f.read())
        y.append(author)
        f.close()

print("# of text:", len(X))
print("# of authors:", len(np.unique(y)))

# of text: 5018
# of authors: 50


## Preprocess data

In [None]:
# pip install spacy
# python -m spacy download en_core_web_md
import spacy
from tqdm import tqdm

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load('en_core_web_md')
embed_size = 300

# convert text to vectors
X_vector = []
for text in tqdm(X):
    text_nlp = nlp(text)
    vectors = [word.vector for word in text_nlp]
    X_vector.append(vectors)

del X
del nlp

In [None]:
# save for quicker reload
with open('data/x_vector.npy','wb') as f:
    np.save(f, X_vector)

In [5]:
# load from file
embed_size = 300
with open('data/x_vector.npy','rb') as f:
    X_vector = np.load(f)

In [6]:
print("Shape of X array:", np.array(X_vector).shape)
print("Shape of y array:", np.array(y).shape)

Shape of X array: (5018,)
Shape of y array: (5018,)


## Windowing
Creating smaller windows of data to process

In [7]:
# Create 100 word chucks
WINDOW_SIZE = 200
WINDOW_SPACING = 50

def chunk(x, y):
    X_chunk = []
    y_chunk = []
    
    for i in range(0, len(x)-WINDOW_SIZE, WINDOW_SPACING):
        X_chunk.append(x[i:i+WINDOW_SIZE])
        y_chunk.append(y)
    
    return X_chunk, y_chunk

X_chunks = []
y_chunks = []
for i, x in enumerate(X_vector):
    xc, yc = chunk(x, y[i])
    X_chunks += xc
    y_chunks += yc
    
del X_vector

In [8]:
X_chunks = np.array(X_chunks)
y_chunks = np.array(y_chunks)
print("X_chunks shape:", X_chunks.shape)
print("y_chunks shape:", y_chunks.shape)

X_chunks shape: (43523, 200, 300)
y_chunks shape: (43523,)


## Training

### Create training, test, and "new" sets

In [9]:
def find_first(array, find):
    for i, val in enumerate(array):
        if val == find:
            return i
    return -1

_, idx = np.unique(y_chunks, return_index=True)
author_num = y_chunks[np.sort(idx)][40]
index = find_first(y_chunks, author_num)
print("Start index of 41st author:", index)

Start index of 41st author: 34784


In [10]:
from sklearn.model_selection import train_test_split

# Keeps some authors aside for hash testing
x_train, x_new, y_train, y_new = train_test_split(X_chunks, y_chunks, train_size=index, shuffle=False)

# Split remainder into 70% training and 30% testing and shuffle
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, train_size=0.9, random_state=1)

print("Train: {} text from {} authors".format(x_train.shape[0], len(np.unique(y_train, axis=0))))
print("Test:  {} text from {} authors".format(x_test.shape[0], len(np.unique(y_test, axis=0))))
print("New:   {} text from {} authors".format(x_new.shape[0], len(np.unique(y_new, axis=0))))

del X_chunks
del y_chunks

Train: 31305 text from 40 authors
Test:  3479 text from 40 authors
New:   8739 text from 10 authors


### One-hot encode labels (authors)

In [11]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

author = y_train[0]

encoder = LabelEncoder()
encoded = encoder.fit_transform(y_train)
y_train = to_categorical(encoded)
y_test = to_categorical(encoder.transform(y_test)) 

print("Author {} is one-hot encoded as: \n{}".format(author, y_train[0]))

del y

Author KirstinRidley is one-hot encoded as: 
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [12]:
output_shape = len(y_train[0])
print("Network output shape (# of trained authors):", output_shape)

Network output shape (# of trained authors): 40


### Train newtork

In [13]:
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, RepeatVector, TimeDistributed, Activation, Lambda, Input
import tensorflow.keras.backend as K

def model1():
    model = Sequential()
    model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.2, input_shape=(WINDOW_SIZE, embed_size), return_sequences=True))
    model.add(LSTM(128, dropout=0.2))
    model.add(Dense(20))
    model.add(Dense(output_shape, activation='softmax', name='output'))
    return model

def model2():
    model = Sequential()
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, input_shape=(WINDOW_SIZE, embed_size)))
    model.add(RepeatVector(WINDOW_SIZE))
    model.add(LSTM(128, dropout=0.2))
    model.add(Dense(30))
    model.add(Dense(output_shape, activation='softmax', name='output'))
    return model

def model3():
    model_inputs = Input(shape=(WINDOW_SIZE, embed_size))
    #inputs = Lambda(lambda x: K.expand_dims(x, -1))(model_inputs)
    encoded = LSTM(128, return_sequences=False, name='LSTM-encode')(model_inputs)

    decoded = RepeatVector(WINDOW_SIZE)(encoded)
    decoded = LSTM(1, return_sequences=True, name='LSTM-decode')(decoded)
    #decoded = Lambda(lambda x: K.squeeze(x, -1))(decoded)

    sequence_autoencoder = Model(model_inputs, decoded)
    #sequence_autoencoder.compile(loss='mse', optimizer='adam')

    earlyStopping = keras.callbacks.EarlyStopping(monitor='loss', patience=5, verbose=0, mode='auto')

    return sequence_autoencoder

model = model2()
model.compile(#loss='categorical_crossentropy',
              loss='mse',
              optimizer='adam',
              metrics=['accuracy'])

#model.summary()

In [14]:
model.fit(x_train, 
          y_train,
          batch_size=512,
          epochs=40,
          validation_split=0.2,
          shuffle=True)

Train on 25044 samples, validate on 6261 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7f84860e1da0>

In [16]:
score, acc = model.evaluate(x_test, y_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.004849352655152234
Test accuracy: 0.8643288301921295


In [15]:
# save model
model.save("data/5-300-lstm128-128-20-50-model.h5")

### Create and compare hashes

In [17]:
# copy model and remove last layer
model_copy = model2()
model_copy.set_weights(model.get_weights())
model_copy.pop()

In [18]:
model_copy.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 128)               219648    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 200, 128)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 30)                3870      
Total params: 355,102
Trainable params: 355,102
Non-trainable params: 0
_________________________________________________________________


In [19]:
from sklearn.metrics.pairwise import cosine_similarity

def get_author(index):
    return encoder.inverse_transform(y_hash[index])

def get_hash(text):
    return model_copy.predict(text)

def get_similarity(hash1, hash2):
    return cosine_similarity(hash1, hash2)

In [28]:
x_new[0:2]

array([[[-0.55408 ,  0.32141 , -0.082956, ...,  0.17642 , -0.3941  ,
          0.23898 ],
        [-0.27846 ,  0.58525 , -0.29158 , ..., -0.21882 , -0.16092 ,
          0.12867 ],
        [-0.18567 ,  0.066008, -0.25209 , ..., -0.023452,  0.12302 ,
          0.3312  ],
        ...,
        [ 0.27204 , -0.06203 , -0.1884  , ...,  0.13015 , -0.18317 ,
          0.1323  ],
        [-0.8994  ,  0.58613 , -0.19851 , ..., -0.2976  ,  0.58026 ,
         -0.038478],
        [-0.86201 ,  0.27651 , -0.11209 , ..., -0.029787,  0.30704 ,
         -0.019168]],

       [[ 0.01751 ,  0.56925 , -0.16269 , ...,  0.56086 , -0.28723 ,
          0.95187 ],
        [-0.042501,  0.090773, -0.11918 , ..., -0.56068 ,  0.19177 ,
         -0.029525],
        [-0.15502 ,  0.35825 ,  0.12037 , ..., -0.22542 ,  0.037579,
          0.21992 ],
        ...,
        [ 0.08396 ,  0.049921, -0.20207 , ..., -0.25511 ,  0.11579 ,
          0.03228 ],
        [ 0.072554,  0.42846 ,  0.16527 , ...,  0.23156 ,  0.41411 ,
   

In [20]:
x_hash = get_hash(x_new)

In [21]:
np.var(x_hash, axis=0)

array([2.855928 , 2.0440075, 1.7792742, 3.2562096, 3.4413583, 3.7530243,
       3.7717354, 1.3561691, 3.5164473, 3.6343184, 6.751772 , 6.6856446,
       3.5441523, 1.984597 , 3.7207649, 2.463945 , 2.24198  , 5.104289 ,
       2.9089975, 2.421086 , 4.496727 , 1.8888315, 1.8361448, 6.008152 ,
       3.902635 , 1.5374885, 0.9063362, 2.2372754, 4.726219 , 1.5870391],
      dtype=float32)

In [23]:
#[np.sort(x_hash[i])[::-1][:1][0] for i in range(23,8000,37)]

In [24]:
import warnings
warnings.filterwarnings('ignore')

def print_similarity(i, j):
    similarity = get_similarity([x_hash[i]], [x_hash[j]])
    similarity = float(similarity)
    if y_new[i] == y_new[j]:
        print("Comparision of text {} and {} \tfor same author {} is: \t\t".format(
             i, j, y_new[i]), end=' ')
        print("{:0.10f}".format(similarity))
    else:
        print("Comparision of text {} and {} \tfor authors {} and {} is: \t".format(
             i, j, y_new[i], y_new[j]), end=' ')
        print("{:0.10f}".format(similarity))

for i in range(1, len(x_hash), 37):
    print_similarity(45, i)

Comparision of text 45 and 1 	for same author KevinDrawbaugh is: 		 0.8114759922
Comparision of text 45 and 38 	for same author KevinDrawbaugh is: 		 0.5325340033
Comparision of text 45 and 75 	for same author KevinDrawbaugh is: 		 0.5046958923
Comparision of text 45 and 112 	for same author KevinDrawbaugh is: 		 0.4234870672
Comparision of text 45 and 149 	for same author KevinDrawbaugh is: 		 0.8637467623
Comparision of text 45 and 186 	for same author KevinDrawbaugh is: 		 0.6951338053
Comparision of text 45 and 223 	for same author KevinDrawbaugh is: 		 0.7119156122
Comparision of text 45 and 260 	for same author KevinDrawbaugh is: 		 0.4131043255
Comparision of text 45 and 297 	for same author KevinDrawbaugh is: 		 0.6975989342
Comparision of text 45 and 334 	for same author KevinDrawbaugh is: 		 0.4887762666
Comparision of text 45 and 371 	for same author KevinDrawbaugh is: 		 0.5291176438
Comparision of text 45 and 408 	for same author KevinDrawbaugh is: 		 0.5005991459
Comparis

In [25]:
from tqdm import tqdm

true_positive, true_negative, false_positive, false_negative = 0,0,0,0
margin = 0.5
num_texts = len(x_hash)

for i in tqdm(range(num_texts - 1)):
    similarity = get_similarity([x_hash[i]], x_hash)
    
    for j in range(i, num_texts):
        if similarity[0][j] >= margin:
            if y_new[i] == y_new[j]:
                true_positive += 1
            else:
                false_positive += 1
        else:
            if y_new[i] == y_new[j]:
                false_negative += 1
            else:
                true_negative += 1

print("True positives ", true_positive)
print("False positives", false_positive)
print("True negatives ", true_negative)
print("False negatives", false_negative)

True positives  2562837
False positives 2745230
True negatives  31572218
False negatives 1309144


In [27]:
# Comparison just for the same author
new_authors = {name:{"correct":0, "incorrect":0} for name in np.unique(y_new)}
margin = 0.5

for i in tqdm(range(num_texts - 1)):
    similarity = get_similarity([x_hash[i]], x_hash)
    
    for j in range(i, num_texts):
        if y_new[i] == y_new[j]:
            if similarity[0][j] >= margin:
                new_authors[y_new[i]]["correct"] += 1
            else:
                new_authors[y_new[i]]["incorrect"] += 1

print("Number of correctly identified text belonging to each author:")             
new_authors

100%|██████████| 8738/8738 [00:48<00:00, 178.80it/s]

Number of correctly identified text belonging to each author:





{'EdnaFernandes': {'correct': 175283, 'incorrect': 114658},
 'FumikoFujisaki': {'correct': 205649, 'incorrect': 256592},
 'JanLopatka': {'correct': 184053, 'incorrect': 68351},
 'KevinDrawbaugh': {'correct': 176737, 'incorrect': 146069},
 'MureDickie': {'correct': 324074, 'incorrect': 43579},
 'PierreTran': {'correct': 251964, 'incorrect': 195967},
 'SamuelPerry': {'correct': 389246, 'incorrect': 86554},
 'SarahDavison': {'correct': 267909, 'incorrect': 271332},
 'SimonCowell': {'correct': 277280, 'incorrect': 56056},
 'ToddNissen': {'correct': 310642, 'incorrect': 69986}}