# Creating a model
A second-generation model of the main "originality score" algorithm: preprocessing a sample paper, performing analytics, saving the document's hash, and returning a score.  Uses embedding to improve understanding.

In [124]:
import warnings
warnings.filterwarnings('ignore')

import os
import nltk
import numpy as np
import pandas as pd

In [125]:
DATASET_PATH = 'data/C50/C50all/'
PROCESSED_DATA_PATH = 'data/inputs.pickle'

## Read in dataset
Load the [Reuter 50_50 training dataset](https://archive.ics.uci.edu/ml/datasets/Reuter_50_50).

TODO:  download and extract directly from website

In [130]:
# source modified from:
# https://github.com/devanshdalal/Author-Identification-task/blob/master/learner.py
authors = os.listdir(DATASET_PATH)
data = []

for author in authors:
  texts = os.listdir(path + author + '/')
  for text in texts:
    f=open(path + author + '/' + text, 'r')
    data.append([author, f.read()])
    f.close()
    
df = pd.DataFrame(data, columns=["author", "text"])
df.head()

Unnamed: 0,author,text
0,RobinSidel,Drugstore giant Revco D.S. Inc. said Monday it...
1,RobinSidel,"Mattel Inc., seeking to expand in the market f..."
2,RobinSidel,A financial agreement between Barney's Inc and...
3,RobinSidel,ITT Corp. met with financial advisers on Thurs...
4,RobinSidel,An independent shareholder advisory firm recom...


In [131]:
y = df["author"]
del data

## Preprocess data

In [99]:
# download 'punkt' if this is first time in notebook
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/kahlil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [100]:
# change text to lower case, replace new lines, and tokenize
X = df["text"].str.lower().replace('\n', ' ')
X = [nltk.word_tokenize(x) for x in X]
print("First 100 tokens in text:\n", X[0][:100])

First 100 tokens in text:
 ['drugstore', 'giant', 'revco', 'd.s', '.', 'inc.', 'said', 'monday', 'it', 'agreed', 'to', 'buy', 'regional', 'chain', 'big', 'b', 'inc.', 'in', 'a', 'sweetened', 'takeover', 'valued', 'at', '$', '380', 'million', '.', 'the', 'transaction', 'calls', 'for', 'twinsburg', ',', 'ohio-based', 'revco', 'to', 'buy', 'all', 'outstanding', 'shares', 'of', 'big', 'b', 'common', 'stock', 'for', '$', '17.25', 'per', 'share', ',', 'up', 'from', 'revco', "'s", 'unsolicited', 'offer', 'of', '$', '15', 'per', 'share', ',', 'which', 'big', 'b', 'rejected', 'last', 'month', '.', '``', 'we', 'are', 'very', 'excited', 'about', 'the', 'combination', 'of', 'revco', 'and', 'big', 'b.', 'i', 'am', 'pleased', 'we', 'were', 'able', 'to', 'bring', 'this', 'process', 'to', 'a', 'fast', 'and', 'successful', 'conclusion', ',']


## Create vocabulary

### Reuters dataset vocabulary

In [101]:
all_text = [y for x in X for y in x]
print("Reuters dataset")
print("Total words: {}".format(len(all_text)))
print("Unique words: {}".format(len(set(all_text))))

Reuters dataset
Total words: 2902829
Unique words: 53207


In [102]:
# Create a vocab of the Reuters dataset, ordered by frequency (decending)
f = nltk.FreqDist(all_text)
reuters_most_common = [w for (w,_) in f.most_common()]
print("Top 100 Reuters words:\n", reuters_most_common[:100])

Top 100 Reuters words:
 ['the', ',', '.', 'to', 'of', 'a', 'in', 'and', 'said', "'s", "''", '``', 'on', 'for', 'that', 'is', 'it', 'with', '$', 'be', 'at', 'by', 'its', 'was', 'as', 'from', 'he', 'will', 'but', 'has', 'have', 'percent', 'would', 'are', 'million', 'not', 'which', 'an', 'year', '(', ')', 'we', 'this', 'company', 'had', 'they', 'new', 'were', 'market', 'china', 'billion', 'up', 'more', 'been', '--', 'one', 'also', 'or', 'about', 'after', 'last', 'analysts', 'than', 'their', 'over', 'some', 'u.s.', 'hong', 'there', 'kong', 'could', 'who', 'two', 'i', 'group', 'business', 'share', 'first', 'other', 'his', 'government', 'companies', 'industry', 'bank', 'if', 'stock', 'expected', 'into', 'out', 'years', 'sales', 'shares', 'analyst', 'told', 'chinese', 'no', 'when', 'all', 'people', ';']


### Google's top 10,000 word vocabulary

In [103]:
# Load top 10,000 English words, according to Google
#    Source:  https://github.com/first20hours/google-10000-english
with open('data/google-10000-english-usa-no-swears.txt', 'r') as f:
  google_most_common = f.read().replace('\n', ' ')
google_most_common = nltk.word_tokenize(google_most_common)
print("Top 100 Google words:\n", google_most_common[:100])

Top 100 Google words:
 ['the', 'of', 'and', 'to', 'a', 'in', 'for', 'is', 'on', 'that', 'by', 'this', 'with', 'i', 'you', 'it', 'not', 'or', 'be', 'are', 'from', 'at', 'as', 'your', 'all', 'have', 'new', 'more', 'an', 'was', 'we', 'will', 'home', 'can', 'us', 'about', 'if', 'page', 'my', 'has', 'search', 'free', 'but', 'our', 'one', 'other', 'do', 'no', 'information', 'time', 'they', 'site', 'he', 'up', 'may', 'what', 'which', 'their', 'news', 'out', 'use', 'any', 'there', 'see', 'only', 'so', 'his', 'when', 'contact', 'here', 'business', 'who', 'web', 'also', 'now', 'help', 'get', 'pm', 'view', 'online', 'c', 'e', 'first', 'am', 'been', 'would', 'how', 'were', 'me', 's', 'services', 'some', 'these', 'click', 'its', 'like', 'service', 'x', 'than', 'find']


### Compare vocabs and choose one for this project

In [104]:
# Look through the differences between the two vocabs
unique_reuters_words = [x for x in reuters_most_common[:10000] if x not in google_most_common]
print("Reuters unique words: {}. Here's the top 100.".format(len(unique_reuters_words)))
print(unique_reuters_words[:100])

unique_google_words = [x for x in google_most_common if x not in reuters_most_common[:10000]]
print("\nGoogle unique words: {}. Here's the top 100.".format(len(unique_google_words)))
print(unique_google_words[:100])

Reuters unique words: 4390. Here's the top 100.
[',', '.', "'s", "''", '``', '$', '(', ')', '--', 'u.s.', ';', '&', "n't", '1996', "'", 'corp.', '1997', 'inc.', '...', '10', 'tonnes', '1995', '20', "'re", 'pence', '30', 'bre-x', '1', 'deng', '50', 'mci', ':', 'newsroom', '15', 'tung', '100', 'boeing', 'wang', 'co.', '25', 'takeover', '12', 'francs', 'traders', 'yuan', "'ve", '40', 'labour', 'rival', 'conrail', 'margins', 'busang', 'airbus', 'shareholder', 'ltd.', 'nomura', '14', '1995.', 'cocoa', 'stg', 'jiang', '-', '11', 'handover', '1994', 'long-term', '1996/97', '300', '2', '60', 'klaus', 'speculation', 'uaw', 'crowns', 'privatisation', '1998', '1997.', 'regulators', '90', 'barrick', 'long-distance', 'thomson-csf', 'brokerage', '1996.', 'spokeswoman', '18', '?', '200', 'tonne', 'jumped', '171', 'csx', '31', 'rivals', '16', "'ll", '13', 'natwest', '=', 'telecoms']

Google unique words: 4279. Here's the top 100.
['pm', 'click', 'x', 're', 'info', 'ebay', 'dvd', 'website', 'v', 'descr

Let's take a look at some of these unique words, in order of frequency, to see if they are domain-specific.  For the Reuters vocab, some are punctuation, many are numbers, and the remainder are mostly domain-specific (international-business) related words, such as *privitisation, pre-tax*, and *conglomerate* or names such as *murdoch* and *monsanto*.  For the Google vocab, some are letters, some computer-related such as *forums* and *login*, and some are more general such as *color* and *thank*.

Let's use the Google vocab and add punctuations and contractions.

In [105]:
# Extend common vocab to include punctuation + contractions
from string import punctuation
vocab = google_most_common + list(punctuation)

## Convert text and authors to network-ready input
### Download embedding model to represent words

In [106]:
import gensim.downloader as api

# https:/github.com/RaRe-Technologies/gensim-data
# glove-twitter-25
# word2vec-google-news-300
info = api.info()
embed_model = api.load("glove-twitter-100")

In [107]:
# print sample data
print("Embedding for 'house':\n", embed_model.wv['house'])
print("\nSimilar words to 'house':\n", embed_model.most_similar("house"))

Embedding for 'house':
 [-1.3345e-01  3.4688e-01  3.0748e-01 -2.1794e-03  7.1898e-01 -2.8725e-03
  9.5989e-02  5.5276e-01  1.2153e-01 -2.6555e-01 -1.0277e+00  7.2278e-01
 -4.2767e+00 -9.0406e-02  1.1909e-01 -5.0647e-02 -3.3165e-01 -1.8213e-01
 -3.6218e-01  6.9813e-03  2.0147e-01 -2.9150e-01 -1.6417e-01 -2.8022e-01
  5.4800e-01 -5.8081e-01  3.8146e-01 -5.5519e-01  1.6094e-01 -5.2039e-02
 -1.4798e-01  1.0892e-03 -2.6702e-01 -1.7885e-01  5.1449e-02  6.7434e-02
  9.5654e-02  5.6137e-01  7.1208e-03  4.7000e-01 -3.1460e-01  1.0552e+00
  5.2215e-01 -4.8432e-01  2.8615e-01  7.9474e-02  6.4211e-01  6.5274e-01
 -2.6493e-01 -8.9566e-02 -2.6298e-01 -3.4906e-01  3.3645e-02  2.1278e-01
 -1.0738e+00 -3.6867e-01  1.8473e-01  3.3821e-01  5.7516e-01  1.7559e-01
 -1.5436e-01  5.2836e-02 -9.8523e-02 -4.0975e-01 -8.5839e-02 -3.1527e-01
  1.7936e-01 -2.0953e-01  6.6424e-01 -5.7412e-02  2.4528e-01 -2.2577e-01
 -3.3233e-01  2.1225e-01  2.3743e-01  1.3298e-01 -4.4889e-01  4.9577e-01
  4.3360e-01  2.4248e-01  1

### Convert text to embedding vectors

In [108]:
# convert text to vectors based on frequency
embed_size = embed_model.vector_size
embed_zeros = np.zeros(embed_size)

for i, doc in enumerate(X):
    X[i] = [embed_model[word] \
                    if word in vocab and word in embed_model else embed_zeros \
                    for word in doc]

print("Embedded vector representation of some words of the first text:\n", X[0][0:3])

Embedded vector representation of some words of the first text:
 [array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([ 0.25888  ,  0.20283  ,  0.45292  ,  0.051316 , -0.59831  ,
        0.17218  ,  0.43064  , -0.47245  , -0.11205  , -0.51921  ,
        0.20608  ,  0.33019  , -2.7551   , -0.28471  ,  0.78403  ,
        0.95814  ,  0.74391  , -0.40147  , -0.052915 , -0.38103  ,
       -0.80142  , -0.2495   ,  0.12538  , -0.29809  ,  0.28059  ,
        0.25785  , -0.2339   ,  0.7638   , -0.040205 ,  0.017227 ,
        0.72153  ,  0.023785 ,  0.80918  , -0.095793 ,  0.44504  ,
       -0.47478  ,  0

### Windowing
Creating smaller windows of data to process

In [109]:
# Create 100 word chucks
WINDOW_SIZE = 300
WINDOW_SPACING = 100

def chunk(x, y):
    X_chunk = []
    y_chunk = []
    
    for i in range(0, len(x)-WINDOW_SIZE, WINDOW_SPACING):
        X_chunk.append(x[i:i+WINDOW_SIZE])
        y_chunk.append(y)
    
    return X_chunk, y_chunk

X_chunks = []
y_chunks = []
for i, _ in enumerate(X):
    xc, yc = chunk(X[i], y[i])
    X_chunks += xc
    y_chunks += yc

In [110]:
X_chunks = np.array(X_chunks)
y_chunks = np.array(y_chunks)
print("X_chunks shape:", X_chunks.shape)
print("y_chunks shape:", y_chunks.shape)

X_chunks shape: (16545, 300, 100)
y_chunks shape: (16545,)


## Training

### Create training, test, and "new" sets

In [111]:
from sklearn.model_selection import train_test_split

# Keeps some authors aside for hash testing
x_train, x_new, y_train, y_new = train_test_split(X_chunks, y_chunks, train_size=13138, shuffle=False)

# Split remainder into 70% training and 30% testing and shuffle
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, train_size=0.9, random_state=1)

print("New:   {} text from {} authors".format(x_new.shape[0], len(np.unique(y_new, axis=0))))
print("Train: {} text from {} authors".format(x_train.shape[0], len(np.unique(y_train, axis=0))))
print("Test:  {} text from {} authors".format(x_test.shape[0], len(np.unique(y_test, axis=0))))

New:   3407 text from 10 authors
Train: 11824 text from 40 authors
Test:  1314 text from 40 authors


### One-hot encode labels (authors)

In [112]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

#encoder = LabelEncoder()
#encoded = encoder.fit_transform(df["author"])
#y = to_categorical(encoded)

encoder = LabelEncoder()
encoded = encoder.fit_transform(y_train)
y_train = to_categorical(encoded)
y_test = to_categorical(encoder.transform(y_test))

print("Author {} is one-hot encoded as:\n".format(df["author"][0]), y_train[0])

Author RobinSidel is one-hot encoded as:
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [113]:
len(y_train[0])

40

### Save preprocessed data

In [134]:
import pickle

obj = [x_train, x_test, x_new, y_train, y_test, y_new, embed_size]
obj = [x_train]
with open(PROCESSED_DATA_PATH, "wb") as handle:
    pickle.dump(x_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

OSError: [Errno 22] Invalid argument

In [91]:
# free up memory
del X 
del y 
del embed_model 
del df
del X_chunks
del y_chunks

### Create network model

#### Reload data

In [None]:
with open(TOKENIZER, 'rb') as handle:
                tokenizer = pickle.load(handle)
        

In [116]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional
    
model = Sequential()
model.add(LSTM(128, dropout=0.2, input_shape=(WINDOW_SIZE, embed_size), return_sequences=True))
model.add(LSTM(64, dropout=0.2, input_shape=(WINDOW_SIZE, embed_size)))
model.add(Dense(40, activation='softmax', name='output'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
#model.summary()

In [117]:
RUN_NAME = "run 3 with LSTM 64"
logger = keras.callbacks.TensorBoard(
    log_dir='logs/{}'.format(RUN_NAME),
    write_graph=True,
    histogram_freq=5
)

model.fit(x_train, 
          y_train,
          batch_size=128,
          epochs=25,
          validation_split=0.2,
          #callbacks=[logger],
          shuffle=True)

Train on 9459 samples, validate on 2365 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25

KeyboardInterrupt: 

In [None]:
score, acc = model.evaluate(x_test, y_test)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
# save model
model.save("data/5-lstm64-model.h5")