In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
import nltk
import numpy as np
import pandas as pd

# Read in dataset
Load the [Reuter 50_50 training dataset](https://archive.ics.uci.edu/ml/datasets/Reuter_50_50).

TODO:  download and extract directly from website

In [3]:
# source modified from:
# https://github.com/devanshdalal/Author-Identification-task/blob/master/learner.py
path = 'data/C50/C50train/'

# Every sub-directory of `path` is treated as one author whose files are that
# author's texts.  Non-directory entries are skipped so that a stray file
# (e.g. '.DS_Store') is not mistaken for an author and passed to os.listdir.
authors = [name for name in os.listdir(path)
           if os.path.isdir(os.path.join(path, name))]
data = []

for author in authors:
    author_dir = os.path.join(path, author)
    texts = os.listdir(author_dir)
    for text in texts:
        # `with` closes the file even when reading it raises
        with open(os.path.join(author_dir, text), 'r') as f:
            data.append([author, f.read()])

# One row per text: the author's directory name and the full file contents
df = pd.DataFrame(data, columns=["author", "text"])
df.head()

# TODO: add more author, text pairs

Unnamed: 0,author,text
0,RobinSidel,Drugstore giant Revco D.S. Inc. said Monday it...
1,RobinSidel,"Mattel Inc., seeking to expand in the market f..."
2,RobinSidel,A financial agreement between Barney's Inc and...
3,RobinSidel,An independent shareholder advisory firm recom...
4,RobinSidel,Raising the stakes in the escalating battle fo...


## Preprocess data

In [4]:
# nltk.download('punkt')
# Uncomment and run the line above the first time this notebook is used:
# it downloads 'punkt', which the tokenization step in this notebook needs.

In [5]:
# Change text to lower case, replace new lines with spaces, and tokenize.
# Both steps go through the `.str` accessor: a bare Series.replace('\n', ' ')
# only replaces cells whose *entire* value equals '\n', so it would leave the
# newlines embedded inside each text untouched.
X = df["text"].str.lower().str.replace('\n', ' ')
X = [nltk.word_tokenize(x) for x in X]
print("First 100 tokens in text:\n", X[0][:100])

First 100 tokens in text:
 ['drugstore', 'giant', 'revco', 'd.s', '.', 'inc.', 'said', 'monday', 'it', 'agreed', 'to', 'buy', 'regional', 'chain', 'big', 'b', 'inc.', 'in', 'a', 'sweetened', 'takeover', 'valued', 'at', '$', '380', 'million', '.', 'the', 'transaction', 'calls', 'for', 'twinsburg', ',', 'ohio-based', 'revco', 'to', 'buy', 'all', 'outstanding', 'shares', 'of', 'big', 'b', 'common', 'stock', 'for', '$', '17.25', 'per', 'share', ',', 'up', 'from', 'revco', "'s", 'unsolicited', 'offer', 'of', '$', '15', 'per', 'share', ',', 'which', 'big', 'b', 'rejected', 'last', 'month', '.', '``', 'we', 'are', 'very', 'excited', 'about', 'the', 'combination', 'of', 'revco', 'and', 'big', 'b.', 'i', 'am', 'pleased', 'we', 'were', 'able', 'to', 'bring', 'this', 'process', 'to', 'a', 'fast', 'and', 'successful', 'conclusion', ',']


## Create vocabulary

### Reuters dataset vocabulary

In [6]:
# Flatten the per-text token lists into one corpus-wide list of tokens
all_text = []
for tokens in X:
    all_text.extend(tokens)

# Report the corpus size and the number of distinct tokens
print("Reuters dataset")
print("Total words: {}".format(len(all_text)))
print("Unique words: {}".format(len(set(all_text))))

Reuters dataset
Total words: 1435370
Unique words: 37268


In [7]:
# Build the Reuters vocabulary, ordered by frequency (descending):
# most_common() yields (word, count) pairs and only the word is kept.
f = nltk.FreqDist(all_text)
reuters_most_common = [pair[0] for pair in f.most_common()]
print("Top 100 Reuters words:\n", reuters_most_common[:100])

Top 100 Reuters words:
 ['the', ',', '.', 'to', 'of', 'a', 'in', 'and', 'said', "'s", "''", '``', 'for', 'on', 'that', 'is', 'it', 'with', 'be', '$', 'at', 'by', 'its', 'as', 'was', 'from', 'he', 'will', 'but', 'has', 'have', 'would', 'percent', 'are', 'million', 'not', 'which', 'an', 'year', '(', ')', 'this', 'we', 'company', 'had', 'new', 'they', 'market', 'were', 'china', 'billion', 'up', 'been', 'more', 'one', '--', 'also', 'or', 'about', 'analysts', 'after', 'u.s.', 'last', 'their', 'than', 'some', 'over', 'there', 'could', 'who', 'group', 'two', 'share', 'first', 'i', 'companies', 'hong', 'industry', 'business', 'kong', 'other', 'his', 'if', 'bank', 'into', 'stock', 'government', 'expected', 'years', 'out', 'shares', 'analyst', 'sales', 'no', ';', 'all', 'told', 'when', 'chinese', 'next']


### Google's top 10,000 word vocabulary

In [8]:
# Load the top 10,000 English words, according to Google
#    Source:  https://github.com/first20hours/google-10000-english
with open('data/google-10000-english-usa-no-swears.txt', 'r') as f:
    google_words_text = f.read()
# Newlines become spaces so the whole file is tokenized as a single string
google_most_common = nltk.word_tokenize(google_words_text.replace('\n', ' '))
print("Top 100 Google words:\n", google_most_common[:100])

Top 100 Google words:
 ['the', 'of', 'and', 'to', 'a', 'in', 'for', 'is', 'on', 'that', 'by', 'this', 'with', 'i', 'you', 'it', 'not', 'or', 'be', 'are', 'from', 'at', 'as', 'your', 'all', 'have', 'new', 'more', 'an', 'was', 'we', 'will', 'home', 'can', 'us', 'about', 'if', 'page', 'my', 'has', 'search', 'free', 'but', 'our', 'one', 'other', 'do', 'no', 'information', 'time', 'they', 'site', 'he', 'up', 'may', 'what', 'which', 'their', 'news', 'out', 'use', 'any', 'there', 'see', 'only', 'so', 'his', 'when', 'contact', 'here', 'business', 'who', 'web', 'also', 'now', 'help', 'get', 'pm', 'view', 'online', 'c', 'e', 'first', 'am', 'been', 'would', 'how', 'were', 'me', 's', 'services', 'some', 'these', 'click', 'its', 'like', 'service', 'x', 'than', 'find']


### Compare vocabs and choose one for this project

In [9]:
# Look through the differences between the two vocabs.
# The Reuters top-10,000 slice is taken once, and membership is tested against
# sets: `in` on a list is a linear scan, which made each comparison quadratic.
reuters_top_words = reuters_most_common[:10000]
reuters_top_set = set(reuters_top_words)
google_word_set = set(google_most_common)

# List comprehensions (rather than set differences) keep the frequency order
unique_reuters_words = [x for x in reuters_top_words if x not in google_word_set]
print("Reuters unique words: {}. Here's the top 100.".format(len(unique_reuters_words)))
print(unique_reuters_words[:100])

unique_google_words = [x for x in google_most_common if x not in reuters_top_set]
print("\nGoogle unique words: {}. Here's the top 100.".format(len(unique_google_words)))
print(unique_google_words[:100])

Reuters unique words: 4503. Here's the top 100.
[',', '.', "'s", "''", '``', '$', '(', ')', '--', 'u.s.', ';', '&', "n't", 'corp.', "'", '1996', '10', '1997', '...', 'inc.', 'tonnes', '1995', 'pence', '20', '30', 'wang', "'re", 'mci', '1', ':', 'newsroom', 'boeing', '15', '50', 'bre-x', '100', 'airbus', 'co.', 'tung', 'francs', 'takeover', '12', 'traders', '40', '25', 'rival', "'ve", 'uaw', 'klaus', 'stg', '14', 'cocoa', 'yuan', 'barrick', 'shareholder', '-', 'labour', '11', 'ltd.', 'conrail', '1996/97', '60', 'speculation', 'margins', '1997.', '1995.', '300', 'regulators', 'long-term', 'automaker', '1998', 'tibet', '1994', 'long-distance', 'murdoch', '2', 'exporters', '90', 'jiang', 'handover', 'telecoms', '1989', '?', 'eurotunnel', 'crowns', 'privatisation', '16', '17', '171', 'rivals', 'jumped', 'adm', 'dissident', 'csx', 'deng', 'profitable', '80', '200', '18', '13']

Google unique words: 4392. Here's the top 100.
['pm', 'e', 'click', 'x', 're', 'info', 'm', 'ebay', 'dvd', 'website

Let's take a look at some of these unique words, in order of frequency, to see if they are domain-specific.  For the Reuters vocab, some are punctuation, many are numbers, and the remainder are mostly domain-specific (international business) words, such as *privatisation*, *pre-tax*, and *conglomerate*, or names such as *murdoch* and *monsanto*.  For the Google vocab, some are single letters, some are computer-related, such as *forums* and *login*, and some are more general, such as *color* and *thank*.

Let's use the Google vocab and add punctuation and contractions.

In [10]:
# Vocabulary = the Google common words, followed by every character of
# string.punctuation, followed by the multi-character punctuation and
# contraction tokens listed in extra_tokens.
from string import punctuation
extra_tokens = ['--', "'s", "n't", '...', "'re", "'ve"]
vocab = [*google_most_common, *punctuation, *extra_tokens]

## Convert text and authors to network-ready input
### Download embedding model to represent words

In [11]:
import gensim.downloader as api

# Model catalogue: https://github.com/RaRe-Technologies/gensim-data
# Other model names noted here (presumably alternatives that were considered):
#   glove-twitter-25
#   word2vec-google-news-300
# NOTE(review): `info` is not used again in the visible cells — confirm it is still needed.
info = api.info()
# Load the "glove-twitter-100" model through the gensim downloader API
embed_model = api.load("glove-twitter-100")

In [12]:
# Print sample data.
# The model is indexed directly instead of through `.wv`, so both lines use the
# same access style; `.wv` on a loaded KeyedVectors object is deprecated in
# gensim 3.x and no longer exists in gensim 4.
print("Embedding for 'house':\n", embed_model['house'])
print("\nSimilar words to 'house':\n", embed_model.most_similar("house"))

Embedding for 'house':
 [-1.3345e-01  3.4688e-01  3.0748e-01 -2.1794e-03  7.1898e-01 -2.8725e-03
  9.5989e-02  5.5276e-01  1.2153e-01 -2.6555e-01 -1.0277e+00  7.2278e-01
 -4.2767e+00 -9.0406e-02  1.1909e-01 -5.0647e-02 -3.3165e-01 -1.8213e-01
 -3.6218e-01  6.9813e-03  2.0147e-01 -2.9150e-01 -1.6417e-01 -2.8022e-01
  5.4800e-01 -5.8081e-01  3.8146e-01 -5.5519e-01  1.6094e-01 -5.2039e-02
 -1.4798e-01  1.0892e-03 -2.6702e-01 -1.7885e-01  5.1449e-02  6.7434e-02
  9.5654e-02  5.6137e-01  7.1208e-03  4.7000e-01 -3.1460e-01  1.0552e+00
  5.2215e-01 -4.8432e-01  2.8615e-01  7.9474e-02  6.4211e-01  6.5274e-01
 -2.6493e-01 -8.9566e-02 -2.6298e-01 -3.4906e-01  3.3645e-02  2.1278e-01
 -1.0738e+00 -3.6867e-01  1.8473e-01  3.3821e-01  5.7516e-01  1.7559e-01
 -1.5436e-01  5.2836e-02 -9.8523e-02 -4.0975e-01 -8.5839e-02 -3.1527e-01
  1.7936e-01 -2.0953e-01  6.6424e-01 -5.7412e-02  2.4528e-01 -2.2577e-01
 -3.3233e-01  2.1225e-01  2.3743e-01  1.3298e-01 -4.4889e-01  4.9577e-01
  4.3360e-01  2.4248e-01  1

### Convert text to integers

In [15]:
# Convert each token to an integer id: its 1-based position in `vocab`,
# or 0 when the token is not in the vocab.
vocab_to_int = {c: i for i, c in enumerate(vocab, 1)}

# dict.get is a constant-time lookup that yields the same ids as testing
# `x in vocab`, which scanned the whole vocab list for every token.
# NOTE: X is overwritten in place, so re-running this cell without re-running
# the tokenization cell first maps every (already integer) entry to 0.
for i, tokens in enumerate(X):
    X[i] = [vocab_to_int.get(x, 0) for x in tokens]

print("Integer representation of first text:\n", X[0])

Integer representation of first text:
 [0, 4096, 0, 0, 9904, 0, 185, 1009, 16, 2751, 4, 130, 1033, 2222, 329, 124, 0, 6, 5, 0, 0, 8494, 22, 9894, 0, 662, 9904, 1, 3411, 1868, 7, 0, 9902, 0, 0, 4, 130, 25, 3459, 2807, 2, 329, 124, 733, 433, 7, 9894, 0, 322, 658, 9902, 54, 21, 0, 9924, 0, 616, 2, 9894, 0, 322, 658, 9902, 57, 329, 124, 6605, 126, 636, 9904, 0, 31, 20, 174, 6285, 36, 1, 2787, 2, 0, 3, 329, 0, 14, 84, 4956, 31, 88, 724, 4, 1418, 12, 430, 4, 5, 848, 3, 1885, 3994, 9902, 0, 185, 0, 0, 9902, 686, 3, 1781, 1098, 1687, 2, 0, 9904, 1, 1270, 32, 4982, 1, 1837, 9924, 0, 3, 0, 1014, 218, 6151, 9904, 1, 665, 185, 329, 124, 9924, 344, 2, 2774, 0, 1574, 1, 587, 616, 3, 1567, 10, 329, 124, 7339, 6665, 58, 2807, 9904, 1, 424, 2, 0, 9902, 0, 329, 124, 3687, 1, 587, 3759, 6, 1, 5367, 0, 0, 478, 9904, 0, 1642, 0, 2088, 29, 705, 12, 636, 4, 130, 443, 1714, 2222, 0, 0, 0, 7, 9894, 0, 2501, 9902, 3, 0, 1014, 0, 9902, 5, 851, 2, 0, 0, 0, 0, 9902, 2751, 6, 559, 4, 130, 0, 9924, 0, 7, 36, 9894, 0

### Convert integers to vectors

In [18]:
# Create embedding matrix covering the dataset vocab: one entry per word of
# reuters_most_common, in that order.  A word gets its pre-trained vector only
# when it is known to the embedding model AND present in `vocab`; every other
# word gets a zero vector of the same length.
# NOTE(review): rows follow reuters_most_common order, whereas the integer ids
# in vocab_to_int follow `vocab` order (starting at 1) — confirm the alignment
# before using this matrix as Embedding-layer weights.
embed_vector_size = embed_model.vector_size
vocab_set = set(vocab)  # constant-time membership instead of scanning the list

# `word in embed_model` and `embed_model[word]` replace the `.vocab` / `.wv`
# accessors, which are deprecated in gensim 3.x and removed in gensim 4.
embedding_matrix = [
    embed_model[word] if word in embed_model and word in vocab_set
    else np.zeros(embed_vector_size)
    for word in reuters_most_common
]

print("Embedding matrix for first word in the first text\n", embedding_matrix[0])

Embedding matrix for first word in the first text
 [ 9.5152e-02  3.7024e-01  5.4291e-01  1.9621e-01  4.8205e-02  3.2033e-01
 -5.9638e-01  1.5868e-02 -1.2989e-01 -6.3028e-01  8.1944e-02  2.4164e-01
 -6.0990e+00 -6.8557e-01  5.0354e-01 -3.4089e-02  1.1705e-01 -7.7403e-03
 -8.6512e-02  4.3617e-01 -4.3982e-01  2.6125e-01 -4.0348e-02 -1.9194e-01
  8.3204e-02 -5.8246e-01 -3.1923e-02  1.2630e-01  4.0120e-01  6.8906e-02
 -1.0517e-01 -2.0804e-01 -4.2554e-01  4.7799e-01  3.4651e-01  2.4057e-01
  5.0244e-02 -7.2587e-02 -2.4347e-03 -5.0342e-01 -1.0601e+00 -3.1586e-01
 -3.2457e-02 -7.6317e-02  7.9045e-01  8.6367e-02 -1.9632e-01  5.7566e-02
  8.4129e-01 -4.2020e-01 -1.1335e-03 -8.5632e-02  6.1910e-02  2.1423e-01
 -1.0356e-01 -3.6946e-02 -2.6005e-01 -3.5657e-01  5.4321e-02  3.0875e-02
  1.4092e-01 -9.1998e-02 -4.1841e-01 -3.1135e-01 -1.4937e-01 -2.2699e-04
 -3.3454e-01 -1.4848e-01 -1.1944e-01 -2.7174e-01  3.1320e-01 -1.0998e-01
 -4.7524e-01  1.4056e-01  3.9641e-01 -4.9413e-02 -4.2601e-01 -2.3576e-01


### One-hot encode labels (authors)

In [31]:
# to_categorical is imported from keras.utils directly: the np_utils submodule
# it was previously reached through is not available in newer Keras releases.
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Enumerate authors (one integer per distinct author name), then expand the
# integers into one-hot rows.
encoder = LabelEncoder()
encoded_y = encoder.fit_transform(df["author"].values)
y = to_categorical(encoded_y)

print("Author {} is encoded as:\n".format(df["author"][0]), y[0])
print("Author {} is encoded as:\n".format(df["author"][100]), y[100])

Author RobinSidel is encoded as:
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]
Author KouroshKarimkhany is encoded as:
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]


### Mapping text to padded sequences

## Training

### Create training, validation, and test sets

In [None]:
# Sample and split dataframe: 60% training, 20% validation, and 20% test.
# The 40% of rows not drawn for training are halved into validation and test.
train = df.sample(frac=0.6, replace=False, random_state=1)
holdout = df.drop(train.index)
val = holdout.sample(frac=0.5, replace=False, random_state=1)
test = holdout.drop(val.index)

In [None]:
def get_values(dataframe):
    """Extract the features and labels of a dataframe as NumPy arrays.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a "tokens" column and an "author" column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, y)`` where ``x`` is built from the "tokens" column and ``y``
        from the "author" column.  When the token rows cannot be stacked into
        a regular array (rows of unequal length), ``x`` is a 1-D object array
        holding one row per element.

    Raises
    ------
    KeyError
        If either column is missing from ``dataframe``.
    """
    token_rows = list(dataframe["tokens"])
    authors = list(dataframe["author"])

    try:
        x = np.array(token_rows)
    except ValueError:
        # NumPy >= 1.24 refuses to build an array from rows of unequal length.
        # Store each row as one element of an object array, which is what
        # older NumPy versions returned for the same input.
        x = np.empty(len(token_rows), dtype=object)
        for position, row in enumerate(token_rows):
            x[position] = row

    return x, np.array(authors)

# Turn each split into (features, labels) arrays
x_train, y_train = get_values(train)
x_val, y_val = get_values(val)
x_test, y_test = get_values(test)

# Source below from:  https://github.com/udacity/deep-learning/blob/master/sentiment-rnn/Sentiment_RNN_Solution.ipynb
print("\t\t\tFeature Shapes:")
# The three pieces are passed to one print call, so they are joined by spaces
shape_report = [
    "Train set: \t\t{}".format(x_train.shape),
    "\nValidation set: \t{}".format(x_val.shape),
    "\nTest set: \t\t{}".format(x_test.shape),
]
print(*shape_report)

In [None]:
from sklearn.model_selection import train_test_split

# Split into 60% training, 20% validation, and 20% test sets.
# Both splits shuffle (with a fixed seed) and stratify on the labels so that
# every author is represented in all three sets.  The rows are grouped by
# author, so an unshuffled split left the test set with authors that were
# never seen in training.
# NOTE(review): `x` is not defined in the visible cells — presumably the padded
# integer sequences from the "Mapping text to padded sequences" step; confirm.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=1, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, train_size=0.75, random_state=1, stratify=y_train)

### Create network model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM, SimpleRNN

max_features = 20000
timesteps = 10

# Embedding -> LSTM -> 50-way softmax, declared as a single list of layers
model = Sequential([
    Embedding(max_features, 128),
    LSTM(512, dropout=0.2),
    Dense(50, activation='softmax', name='output'),
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Fit on the training split, validating against the validation split
fit_options = dict(batch_size=100,
                   epochs=10,
                   validation_data=(x_val, y_val))
model.fit(x_train, y_train, **fit_options)


In [None]:
# Evaluate on x_test / y_test and print the two values that evaluate returns
score, acc = model.evaluate(x_test, y_test, batch_size=200)
for label, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(label, value)