In [None]:
# Install the third-party packages this notebook imports further down:
# keras (network layers / preprocessing) and gensim (pretrained embeddings).
!pip install keras gensim

In [23]:
import warnings
warnings.filterwarnings('ignore')

import os
import nltk
import numpy as np
import pandas as pd

# Read in dataset
Load the [Reuter 50_50 training dataset](https://archive.ics.uci.edu/ml/datasets/Reuter_50_50).

TODO:  download and extract directly from website

In [24]:
# source modified from:
# https://github.com/devanshdalal/Author-Identification-task/blob/master/learner.py
#
# Every sub-directory of `path` is treated as one author; every file inside
# it becomes one [author, text] row of the dataframe.
path = 'data/C50/C50train/'
authors = os.listdir(path)
data = []

for author in authors:
  author_dir = os.path.join(path, author)
  if not os.path.isdir(author_dir):
    # Ignore stray files sitting next to the author folders; calling
    # os.listdir on them would raise.
    continue
  texts = os.listdir(author_dir)
  for text in texts:
    # `with` closes the handle even when read() raises.
    with open(os.path.join(author_dir, text), 'r') as f:
      data.append([author, f.read()])

df = pd.DataFrame(data, columns=["author", "text"])
df.head()

Unnamed: 0,author,text
0,WilliamKazer,China on Tuesday announced a ban on poultry an...
1,WilliamKazer,China said on Thursday the highest-level U.S. ...
2,WilliamKazer,China has tightened safety measures after a fa...
3,WilliamKazer,China on Thursday tried to play down friction ...
4,WilliamKazer,China is preparing to tap overseas capital mar...


## Preprocess data

In [25]:
# nltk.download('punkt')
# Uncomment the line above the first time you run this notebook:
# nltk.word_tokenize needs the 'punkt' tokenizer models.

In [26]:
# Normalise the raw articles (lower-case, newlines flattened to spaces),
# then split every article into word tokens.
df["text"] = df["text"].str.lower().str.replace('\n', ' ')
df["tokens"] = df["text"].map(nltk.word_tokenize)
df.head()

Unnamed: 0,author,text,tokens
0,WilliamKazer,china on tuesday announced a ban on poultry an...,"[china, on, tuesday, announced, a, ban, on, po..."
1,WilliamKazer,china said on thursday the highest-level u.s. ...,"[china, said, on, thursday, the, highest-level..."
2,WilliamKazer,china has tightened safety measures after a fa...,"[china, has, tightened, safety, measures, afte..."
3,WilliamKazer,china on thursday tried to play down friction ...,"[china, on, thursday, tried, to, play, down, f..."
4,WilliamKazer,china is preparing to tap overseas capital mar...,"[china, is, preparing, to, tap, overseas, capi..."


## Gather vocabulary and create embeddings

In [27]:
# Join all articles into one string before tokenizing. An explicit space
# separator keeps the last word of one article from being glued to the
# first word of the next one.
all_text = df["text"].str.cat(sep=' ')
all_text = nltk.word_tokenize(all_text)

print("Reuters dataset")
print("Total words: {}".format(len(all_text)))
print("Unique words: {}".format(len(set(all_text))))

Reuters dataset
Total words: 1435269
Unique words: 37279


In [28]:
# Build the Reuters vocabulary, most frequent token first (descending order).
f = nltk.FreqDist(all_text)
reuters_most_common = []
for word, _count in f.most_common():
  reuters_most_common.append(word)
print("Top 100 Reuters words:\n", reuters_most_common[:100])

Top 100 Reuters words:
 ['the', ',', '.', 'to', 'of', 'a', 'in', 'and', 'said', "'s", '``', "''", 'for', 'on', 'that', 'is', 'it', 'with', 'be', '$', 'at', 'by', 'its', 'as', 'was', 'from', 'he', 'will', 'but', 'has', 'have', 'would', 'percent', 'are', 'million', 'not', 'which', 'an', 'year', '(', ')', 'this', 'we', 'company', 'had', 'new', 'they', 'market', 'were', 'china', 'billion', 'up', 'been', 'more', 'one', '--', 'also', 'or', 'about', 'analysts', 'after', 'u.s.', 'last', 'their', 'than', 'some', 'over', 'there', 'could', 'who', 'group', 'two', 'share', 'first', 'i', 'companies', 'hong', 'industry', 'kong', 'business', 'other', 'his', 'if', 'bank', 'into', 'stock', 'government', 'expected', 'years', 'out', 'shares', 'analyst', 'sales', 'no', ';', 'all', 'told', 'when', 'chinese', 'next']


In [29]:
# Read the 10,000 most common English words (Google list) and tokenize them
# the same way as the Reuters text.
#    Source:  https://github.com/first20hours/google-10000-english
with open('data/google-10000-english-usa-no-swears.txt', 'r') as f:
  google_most_common = f.read()
# The file is newline-separated; turn it into one space-separated string
# before handing it to the tokenizer.
google_most_common = nltk.word_tokenize(google_most_common.replace('\n', ' '))
print("Top 100 Google words:\n", google_most_common[:100])

Top 100 Google words:
 ['the', 'of', 'and', 'to', 'a', 'in', 'for', 'is', 'on', 'that', 'by', 'this', 'with', 'i', 'you', 'it', 'not', 'or', 'be', 'are', 'from', 'at', 'as', 'your', 'all', 'have', 'new', 'more', 'an', 'was', 'we', 'will', 'home', 'can', 'us', 'about', 'if', 'page', 'my', 'has', 'search', 'free', 'but', 'our', 'one', 'other', 'do', 'no', 'information', 'time', 'they', 'site', 'he', 'up', 'may', 'what', 'which', 'their', 'news', 'out', 'use', 'any', 'there', 'see', 'only', 'so', 'his', 'when', 'contact', 'here', 'business', 'who', 'web', 'also', 'now', 'help', 'get', 'pm', 'view', 'online', 'c', 'e', 'first', 'am', 'been', 'would', 'how', 'were', 'me', 's', 'services', 'some', 'these', 'click', 'its', 'like', 'service', 'x', 'than', 'find']


In [30]:
# Look through the differences between the two vocabs.
# Slice the Reuters top-10,000 once and use sets for the membership tests;
# the comprehensions still walk the ordered lists, so the frequency order
# of the results is preserved.
reuters_top = reuters_most_common[:10000]
reuters_top_set = set(reuters_top)
google_word_set = set(google_most_common)

unique_reuters_words = [x for x in reuters_top if x not in google_word_set]
print("Reuters unique words: {}. Here's the top 100.".format(len(unique_reuters_words)))
print(unique_reuters_words[:100])

unique_google_words = [x for x in google_most_common if x not in reuters_top_set]
print("\nGoogle unique words: {}. Here's the top 100.".format(len(unique_google_words)))
print(unique_google_words[:100])

Reuters unique words: 4530. Here's the top 100.
[',', '.', "'s", '``', "''", '$', '(', ')', '--', 'u.s.', ';', '&', "n't", 'corp.', "'", '1996', '10', '...', '1997', 'inc.', 'tonnes', '1995', 'pence', '20', '30', 'wang', "'re", 'mci', '1', ':', 'newsroom', 'boeing', '15', '50', 'bre-x', '100', 'airbus', 'co.', 'tung', 'francs', 'takeover', '12', 'traders', '40', '25', 'rival', "'ve", 'uaw', 'klaus', 'stg', '14', 'cocoa', 'yuan', 'barrick', 'shareholder', '-', 'labour', '11', 'ltd.', '1997.', 'conrail', '1996/97', '60', '1995.', 'speculation', 'margins', '300', 'regulators', 'long-term', 'automaker', 'tibet', '1998', 'long-distance', 'murdoch', '2', '1994', '90', 'exporters', 'jiang', 'handover', 'telecoms', '?', '1989', 'eurotunnel', 'crowns', 'privatisation', '16', '17', '171', 'rivals', 'jumped', 'adm', 'dissident', 'csx', 'deng', 'profitable', '200', '80', '18', '13']

Google unique words: 4419. Here's the top 100.
['pm', 'e', 'click', 'x', 're', 'info', 'm', 'ebay', 'dvd', 'website

Let's take a look at some of these unique words, in order of frequency, to see if they are domain-specific.  For the Reuters vocab, some are punctuation, many are numbers, and the remainder are mostly domain-specific (international business) words such as *privatisation*, *pre-tax*, and *conglomerate*, or names such as *murdoch* and *monsanto*.  For the Google vocab, some are single letters, some are computer-related such as *forums* and *login*, and some are more general such as *color* and *thank*.

Let's use the Google vocab and add punctuation and contractions.

In [31]:
# Common vocab = Google word list + every ASCII punctuation character
# + the multi-character tokens the tokenizer emits for dashes, ellipses
# and contractions.
from string import punctuation

extra_tokens = ['--', "'s", "n't", '...', "'re", "'ve"]
vocab = [*google_most_common, *punctuation, *extra_tokens]

### Downloading embedding model

In [32]:
import gensim.downloader as api

# Model catalogue: https://github.com/RaRe-Technologies/gensim-data
# Model names noted by the author (presumably from that catalogue);
# only the first one is loaded below:
#   glove-twitter-25
#   word2vec-google-news-300
info = api.info()  # result is not referenced again in the cells shown here
embed_model = api.load("glove-twitter-25")  # embeddings used by the following cells

In [33]:
# Sanity check: length of a single word vector. The loaded model is indexed
# directly; the `.wv` indirection is deprecated on gensim KeyedVectors and
# removed in gensim 4.
len(embed_model['house'])

25

In [34]:
# Sanity check: list the model's nearest neighbours of "house" with their similarity scores.
embed_model.most_similar("house")

[('home', 0.9537084102630615),
 ('room', 0.9411842226982117),
 ('at', 0.9270982146263123),
 ('out', 0.9157506227493286),
 ('here', 0.915216326713562),
 ('street', 0.9097360968589783),
 ("'m", 0.9017621874809265),
 ('going', 0.9013768434524536),
 ('town', 0.9007171988487244),
 ('party', 0.8996413946151733)]

### Convert text tokens to vectors

In [35]:
def word_to_vector(tokens, model=None):
    """Map each token to its embedding vector.

    Args:
        tokens: iterable of word strings.
        model: embedding lookup supporting ``word in model``, ``model[word]``
            and a ``vector_size`` attribute (gensim ``KeyedVectors`` provides
            all three). Defaults to the notebook-level ``embed_model``.

    Returns:
        A list with one vector per token, in order. Tokens the model does not
        know are mapped to a fresh zero vector of length ``model.vector_size``.
    """
    if model is None:
        # Resolved at call time so the notebook-level model is picked up.
        model = embed_model
    dim = model.vector_size  # replaces the hard-coded 25 of glove-twitter-25
    # Index the model directly: `.wv` / `.vocab` are deprecated on
    # KeyedVectors and removed in gensim 4.
    return [model[word] if word in model else np.zeros(dim)
            for word in tokens]
    
# Store, for every article, the list of embedding vectors of its tokens.
token_vectors = df["tokens"].map(word_to_vector)
df["vectors"] = token_vectors
df.head()

Unnamed: 0,author,text,tokens,vectors
0,WilliamKazer,china on tuesday announced a ban on poultry an...,"[china, on, tuesday, announced, a, ban, on, po...","[[0.02694, -0.39687, -0.11095, -0.79269, -0.22..."
1,WilliamKazer,china said on thursday the highest-level u.s. ...,"[china, said, on, thursday, the, highest-level...","[[0.02694, -0.39687, -0.11095, -0.79269, -0.22..."
2,WilliamKazer,china has tightened safety measures after a fa...,"[china, has, tightened, safety, measures, afte...","[[0.02694, -0.39687, -0.11095, -0.79269, -0.22..."
3,WilliamKazer,china on thursday tried to play down friction ...,"[china, on, thursday, tried, to, play, down, f...","[[0.02694, -0.39687, -0.11095, -0.79269, -0.22..."
4,WilliamKazer,china is preparing to tap overseas capital mar...,"[china, is, preparing, to, tap, overseas, capi...","[[0.02694, -0.39687, -0.11095, -0.79269, -0.22..."


## Convert text and authors to network-ready input
### Mapping authors to one-hot encoding

In [36]:
# `to_categorical` is imported from keras.utils directly; the
# `keras.utils.np_utils` module is not available in newer Keras releases.
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# enumerate authors and create one-hot encodings
encoder = LabelEncoder()
# fit + transform in one step: author name -> integer class id
encoded_y = encoder.fit_transform(df["author"].values)
# integer class ids -> one-hot rows, in the same row order as `df`
y = to_categorical(encoded_y)
y

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

### Mapping text to padded sequences

In [37]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# `num_words` is the current name of the vocabulary cap; `nb_words` is the
# deprecated Keras 1 spelling and is only accepted with a warning.
tokenizer = Tokenizer(num_words=20000, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                   lower=True,split=' ')

# Learn the word index from the articles, turn every article into a list of
# word ids, then pad all of them to a common length (no maxlen is given, so
# the padded length is chosen by pad_sequences from the data).
tokenizer.fit_on_texts(df["text"].values)
x = tokenizer.texts_to_sequences(df["text"].values)
x = pad_sequences(x)
x

array([[   0,    0,    0, ...,    2,    1, 1027],
       [   0,    0,    0, ...,    1,   61,  979],
       [   0,    0,    0, ...,  584,  947,   68],
       ...,
       [   0,    0,    0, ...,   66, 2601, 2407],
       [   0,    0,    0, ..., 8347,  340, 4595],
       [   0,    0,    0, ...,   66, 2601, 2407]], dtype=int32)

## Training

### Create training, validation, and test sets

In [38]:
# Three-way split of the dataframe: 60% training, 20% validation, 20% test.
# First draw the training rows, then halve whatever is left.
train = df.sample(frac=0.6, replace=False, random_state=1)
holdout = df.drop(train.index)
val = holdout.sample(frac=0.5, replace=False, random_state=1)
test = holdout.drop(val.index)

In [39]:
def get_values(dataframe):
    """Extract features and labels from a dataframe as NumPy arrays.

    Args:
        dataframe: frame with a "tokens" column (one token list per row)
            and an "author" column.

    Returns:
        Tuple ``(x, y)``: ``x`` is a 1-D object array holding one token list
        per row, ``y`` is an array of the author values, both in row order.
    """
    token_lists = list(dataframe["tokens"])
    # Fill an object array element by element. Calling np.array() on the
    # ragged token lists raises on NumPy >= 1.24, and on equal-length lists
    # it would silently produce a 2-D array instead.
    features = np.empty(len(token_lists), dtype=object)
    for row, tokens in enumerate(token_lists):
        features[row] = tokens
    labels = np.array(list(dataframe["author"]))
    return features, labels

# Pull token/author arrays out of each of the three dataframe splits.
x_train, y_train = get_values(train)
x_val, y_val = get_values(val)
x_test, y_test = get_values(test)

# Source below from:  https://github.com/udacity/deep-learning/blob/master/sentiment-rnn/Sentiment_RNN_Solution.ipynb
print("\t\t\tFeature Shapes:")
shape_report = [
    "Train set: \t\t{}".format(x_train.shape),
    "\nValidation set: \t{}".format(x_val.shape),
    "\nTest set: \t\t{}".format(x_test.shape),
]
# Unpacked so print() joins the pieces with its default single space.
print(*shape_report)

			Feature Shapes:
Train set: 		(1500,) 
Validation set: 	(500,) 
Test set: 		(500,)


In [40]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the padded sequences `x` / one-hot labels `y` for testing.
# NOTE: this rebinds x_train, x_test, y_train and y_test, replacing the
# token-based arrays produced by get_values() earlier in the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

### Create network model

In [53]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM, SimpleRNN

max_features = 20000  # embedding input size; keep in sync with the tokenizer's vocabulary cap
timesteps = 10  # NOTE(review): not referenced anywhere in this cell
# One output unit per author, derived from the one-hot labels rather than
# hard-coded, so the network follows the dataset.
num_classes = y_train.shape[1]

# word ids -> 128-d embeddings -> single LSTM layer -> softmax over authors
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(256, dropout=0.2))
#model.add(Dense(100, activation='softmax', name='hash'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train, batch_size=50, epochs=10)
score = model.evaluate(x_test, y_test, batch_size=200)
# Report the held-out result; with metrics=['accuracy'] evaluate() returns
# [loss, accuracy].
print("Test loss: {:.4f}  Test accuracy: {:.4f}".format(score[0], score[1]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
