In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import nltk
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

Using TensorFlow backend.


# Read in dataset
Load the [Reuter 50_50 training dataset](https://archive.ics.uci.edu/ml/datasets/Reuter_50_50).

TODO:  download and extract directly from website

In [2]:
# source modified from:
# https://github.com/devanshdalal/Author-Identification-task/blob/master/learner.py
path = 'data/C50/C50train/'
authors = os.listdir(path)
data = []

for author in authors:
  texts = os.listdir(path + author + '/')
  for text in texts:
    f=open(path + author + '/' + text, 'r')
    data.append([author, f.read()])
    f.close()
    
df = pd.DataFrame(data, columns=["author", "text"])
df.head()

Unnamed: 0,author,text
0,RobinSidel,Drugstore giant Revco D.S. Inc. said Monday it...
1,RobinSidel,"Mattel Inc., seeking to expand in the market f..."
2,RobinSidel,A financial agreement between Barney's Inc and...
3,RobinSidel,An independent shareholder advisory firm recom...
4,RobinSidel,Raising the stakes in the escalating battle fo...


## Preprocess data

In [3]:
#nltk.download()
# download 'punkt' if this is first time in notebook

In [4]:
df["text"] = df["text"].str.lower()
df["text"] = df["text"].str.replace('\n', ' ')
df["tokens"] = df["text"].apply(nltk.word_tokenize)
df.head()

Unnamed: 0,author,text,tokens
0,RobinSidel,drugstore giant revco d.s. inc. said monday it...,"[drugstore, giant, revco, d.s, ., inc., said, ..."
1,RobinSidel,"mattel inc., seeking to expand in the market f...","[mattel, inc., ,, seeking, to, expand, in, the..."
2,RobinSidel,a financial agreement between barney's inc and...,"[a, financial, agreement, between, barney, 's,..."
3,RobinSidel,an independent shareholder advisory firm recom...,"[an, independent, shareholder, advisory, firm,..."
4,RobinSidel,raising the stakes in the escalating battle fo...,"[raising, the, stakes, in, the, escalating, ba..."


## Gather vocabulary

In [5]:
all_text = df["text"].str.cat()
all_text = nltk.word_tokenize(all_text)

print("Reuters dataset")
print("Total words: {}".format(len(all_text)))
print("Unique words: {}".format(len(set(all_text))))

Reuters dataset
Total words: 1435270
Unique words: 37279


In [6]:
# Create a vocab of the Reuters dataset, ordered by frequency (decending)
f = nltk.FreqDist(all_text)
reuters_most_common = [w for (w,_) in f.most_common()]
print("Top 100 Reuters words:\n", reuters_most_common[:100])

Top 100 Reuters words:
 ['the', ',', '.', 'to', 'of', 'a', 'in', 'and', 'said', "'s", '``', "''", 'for', 'on', 'that', 'is', 'it', 'with', 'be', '$', 'at', 'by', 'its', 'as', 'was', 'from', 'he', 'will', 'but', 'has', 'have', 'would', 'percent', 'are', 'million', 'not', 'which', 'an', 'year', '(', ')', 'this', 'we', 'company', 'had', 'new', 'they', 'market', 'were', 'china', 'billion', 'up', 'been', 'more', 'one', '--', 'also', 'or', 'about', 'analysts', 'after', 'u.s.', 'last', 'their', 'than', 'some', 'over', 'there', 'could', 'who', 'group', 'two', 'share', 'first', 'i', 'companies', 'hong', 'industry', 'business', 'kong', 'other', 'his', 'if', 'bank', 'into', 'stock', 'government', 'expected', 'years', 'out', 'shares', 'analyst', 'sales', 'no', ';', 'all', 'told', 'when', 'chinese', 'next']


In [7]:
# Load top 10,000 English words, according to Google
#    Source:  https://github.com/first20hours/google-10000-english
with open('data/google-10000-english-usa-no-swears.txt', 'r') as f:
  google_most_common = f.read().replace('\n', ' ')
google_most_common = nltk.word_tokenize(google_most_common)
print("Top 100 Google words:\n", google_most_common[:100])

Top 100 Google words:
 ['the', 'of', 'and', 'to', 'a', 'in', 'for', 'is', 'on', 'that', 'by', 'this', 'with', 'i', 'you', 'it', 'not', 'or', 'be', 'are', 'from', 'at', 'as', 'your', 'all', 'have', 'new', 'more', 'an', 'was', 'we', 'will', 'home', 'can', 'us', 'about', 'if', 'page', 'my', 'has', 'search', 'free', 'but', 'our', 'one', 'other', 'do', 'no', 'information', 'time', 'they', 'site', 'he', 'up', 'may', 'what', 'which', 'their', 'news', 'out', 'use', 'any', 'there', 'see', 'only', 'so', 'his', 'when', 'contact', 'here', 'business', 'who', 'web', 'also', 'now', 'help', 'get', 'pm', 'view', 'online', 'c', 'e', 'first', 'am', 'been', 'would', 'how', 'were', 'me', 's', 'services', 'some', 'these', 'click', 'its', 'like', 'service', 'x', 'than', 'find']


In [8]:
# Look through the differences between the two vocabs
unique_reuters_words = [x for x in reuters_most_common[:10000] if x not in google_most_common]
print("Reuters unique words: {}. Here's the top 100.".format(len(unique_reuters_words)))
print(unique_reuters_words[:100])

unique_google_words = [x for x in google_most_common if x not in reuters_most_common[:10000]]
print("\nGoogle unique words: {}. Here's the top 100.".format(len(unique_google_words)))
print(unique_google_words[:100])

Reuters unique words: 4503. Here's the top 100.
[',', '.', "'s", '``', "''", '$', '(', ')', '--', 'u.s.', ';', '&', "n't", 'corp.', "'", '1996', '10', '...', '1997', 'inc.', 'tonnes', '1995', 'pence', '20', '30', 'wang', "'re", 'mci', '1', ':', 'newsroom', 'boeing', '15', '50', 'bre-x', '100', 'airbus', 'co.', 'tung', 'francs', 'takeover', '12', 'traders', '40', '25', 'rival', "'ve", 'uaw', 'klaus', 'stg', '14', 'cocoa', 'yuan', 'barrick', 'shareholder', '-', 'labour', '11', '1997.', 'ltd.', 'conrail', '1996/97', '60', 'speculation', '1995.', 'margins', '300', 'regulators', 'long-term', 'automaker', 'tibet', '1998', 'long-distance', 'murdoch', '2', '1994', 'exporters', '90', 'jiang', 'handover', 'telecoms', '?', '1989', 'eurotunnel', 'crowns', 'privatisation', '16', '17', '171', 'rivals', 'jumped', 'adm', 'dissident', 'csx', 'deng', 'profitable', '80', '200', '18', '13']

Google unique words: 4392. Here's the top 100.
['pm', 'e', 'click', 'x', 're', 'info', 'm', 'ebay', 'dvd', 'website

Let's take a look at some of these unique words, in order of frequency, to see if they are domain-specific.  For the Reuters vocab, some are punctuation, many are numbers, and the remainder are mostly domain-specific (international-business) related words, such as *privitisation, pre-tax*, and *conglomerate* or names such as *murdoch* and *monsanto*.  For the Google vocab, some are letters, some computer-related such as *forums* and *login*, and some are more general such as *color* and *thank*.

Let's use the Google vocab and add punctuations and contractions.

In [9]:
# Extend common vocab to include punctuation + contractions
from string import punctuation
vocab = google_most_common + list(punctuation) + ['--', "'s", "n't", '...', "'re", "'ve"]

## Convert text and authors to integers

### Mapping authors to integers

In [10]:
# encode authors and labels
author_to_int = {c: i for i, c in enumerate(authors, 1)}
df["labels"] = [author_to_int[y] for y in df["author"].values]
df.head()

Unnamed: 0,author,text,tokens,labels
0,RobinSidel,drugstore giant revco d.s. inc. said monday it...,"[drugstore, giant, revco, d.s, ., inc., said, ...",1
1,RobinSidel,"mattel inc., seeking to expand in the market f...","[mattel, inc., ,, seeking, to, expand, in, the...",1
2,RobinSidel,a financial agreement between barney's inc and...,"[a, financial, agreement, between, barney, 's,...",1
3,RobinSidel,an independent shareholder advisory firm recom...,"[an, independent, shareholder, advisory, firm,...",1
4,RobinSidel,raising the stakes in the escalating battle fo...,"[raising, the, stakes, in, the, escalating, ba...",1


### Downloading embedding model

In [11]:
#!pip install gensim

In [12]:
import gensim.downloader as api

# https:/github.com/RaRe-Technologies/gensim-data
# glove-twitter-25
# word2vec-google-news-300
info = api.info()
embed_model = api.load("glove-twitter-25")

In [13]:
len(embed_model.wv['house'])

25

In [14]:
embed_model.most_similar("house")

[('home', 0.9537084102630615),
 ('room', 0.9411841630935669),
 ('at', 0.9270982146263123),
 ('out', 0.9157505631446838),
 ('here', 0.9152163863182068),
 ('street', 0.9097360968589783),
 ("'m", 0.9017621874809265),
 ('going', 0.9013768434524536),
 ('town', 0.9007171988487244),
 ('party', 0.8996413350105286)]

### Convert text tokens to vectors

In [15]:
def word_to_vector(tokens):
    return [embed_model.wv[word] if word in embed_model.vocab else np.zeros(25) \
            for word in tokens]
    
df["vectors"] = df["tokens"].apply(word_to_vector)
df.head()

Unnamed: 0,author,text,tokens,labels,vectors
0,RobinSidel,drugstore giant revco d.s. inc. said monday it...,"[drugstore, giant, revco, d.s, ., inc., said, ...",1,"[[-0.98918, 0.10242, -0.53404, -0.78541, 1.101..."
1,RobinSidel,"mattel inc., seeking to expand in the market f...","[mattel, inc., ,, seeking, to, expand, in, the...",1,"[[-0.57181, -0.35047, 0.11173, -0.52608, -0.12..."
2,RobinSidel,a financial agreement between barney's inc and...,"[a, financial, agreement, between, barney, 's,...",1,"[[0.21294, 0.31035, 0.17694, 0.87498, 0.067926..."
3,RobinSidel,an independent shareholder advisory firm recom...,"[an, independent, shareholder, advisory, firm,...",1,"[[0.54244, 0.45753, -0.45787, 0.066361, 0.5891..."
4,RobinSidel,raising the stakes in the escalating battle fo...,"[raising, the, stakes, in, the, escalating, ba...",1,"[[-0.96966, 1.1775, -0.76991, 0.53168, 0.60541..."


## Create sequences and batches

## Training

### Create training, validation, and test sets

In [16]:
# Sample and split dataframe:  60% training, 20% validation, and 20% test
train = df.sample(frac=0.6, replace=False, random_state=1)
test = df.drop(train.index)
val = test.sample(frac=0.5, replace=False, random_state=1)
test = test.drop(val.index)

In [39]:
def get_values(dataframe):
    x = [x for x in dataframe["tokens"]]
    y = [y for y in dataframe["labels"]]
    return np.array(x), np.array(y)

x_train, y_train = get_values(train)
x_val, y_val = get_values(val)
x_test, y_test = get_values(test)

# Source below from:  https://github.com/udacity/deep-learning/blob/master/sentiment-rnn/Sentiment_RNN_Solution.ipynb
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(x_train.shape), 
      "\nValidation set: \t{}".format(x_val.shape),
      "\nTest set: \t\t{}".format(x_test.shape))

			Feature Shapes:
Train set: 		(1500,) 
Validation set: 	(500,) 
Test set: 		(500,)


### Create network model

In [48]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(nb_words=2000, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                   lower=True,split=' ')

tokenizer.fit_on_texts(df["text"].values)
x = tokenizer.texts_to_sequences(df["text"].values)
x = pad_sequences(X)

In [49]:
y = df["labels"].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

In [51]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM, SimpleRNN

max_features = 30000
timesteps = 10

model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(x_train, y_train, batch_size=10, epochs=10)
score = model.evaluate(x_test, y_test, batch_size=16)

Epoch 1/10
 250/1750 [===>..........................] - ETA: 2:38 - loss: -258.8098 - acc: 0.0280

KeyboardInterrupt: 