In [63]:
import warnings
warnings.filterwarnings('ignore')

import os
import nltk
import numpy as np
import pandas as pd

import tensorflow as tf
import shutil
import tensorflow.contrib.learn as tflearn
import tensorflow.contrib.layers as tflayers
from tensorflow.contrib.learn.python.learn import learn_runner
from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils
import tensorflow.contrib.rnn as rnn

# Read in dataset
Load the [Reuter 50_50 training dataset](https://archive.ics.uci.edu/ml/datasets/Reuter_50_50).

TODO:  download and extract directly from website

In [64]:
# source modified from:
# https://github.com/devanshdalal/Author-Identification-task/blob/master/learner.py
path = 'data/C50/C50train/'
authors = os.listdir(path)
data = []

for author in authors:
  texts = os.listdir(path + author + '/')
  for text in texts:
    f=open(path + author + '/' + text, 'r')
    data.append([author, f.read()])
    f.close()
    
df = pd.DataFrame(data, columns=["author", "text"])
df.head()

Unnamed: 0,author,text
0,WilliamKazer,China on Tuesday announced a ban on poultry an...
1,WilliamKazer,China said on Thursday the highest-level U.S. ...
2,WilliamKazer,China has tightened safety measures after a fa...
3,WilliamKazer,China on Thursday tried to play down friction ...
4,WilliamKazer,China is preparing to tap overseas capital mar...


## Preprocess data

In [65]:
#nltk.download()
# download 'punkt' if this is first time in notebook

In [66]:
df["text"] = df["text"].str.lower()
df["text"] = df["text"].str.replace('\n', ' ')
df["tokens"] = df["text"].apply(nltk.word_tokenize)
df.head()

Unnamed: 0,author,text,tokens
0,WilliamKazer,china on tuesday announced a ban on poultry an...,"[china, on, tuesday, announced, a, ban, on, po..."
1,WilliamKazer,china said on thursday the highest-level u.s. ...,"[china, said, on, thursday, the, highest-level..."
2,WilliamKazer,china has tightened safety measures after a fa...,"[china, has, tightened, safety, measures, afte..."
3,WilliamKazer,china on thursday tried to play down friction ...,"[china, on, thursday, tried, to, play, down, f..."
4,WilliamKazer,china is preparing to tap overseas capital mar...,"[china, is, preparing, to, tap, overseas, capi..."


## Gather vocabulary

In [84]:
all_text = df["text"].str.cat()
all_text = nltk.word_tokenize(all_text)

print("Reuters dataset")
print("Total words: {}".format(len(all_text)))
print("Unique words: {}".format(len(set(all_text))))

Reuters dataset
Total words: 1435269
Unique words: 37279


In [80]:
# Create a vocab of the Reuters dataset, ordered by frequency (decending)
f = nltk.FreqDist(all_text)
reuters_most_common = [w for (w,_) in f.most_common()]
print("Top 100 Reuters words:\n", reuters_most_common[:100])

Top 100 Reuters words:
 ['the', ',', '.', 'to', 'of', 'a', 'in', 'and', 'said', "'s", '``', "''", 'for', 'on', 'that', 'is', 'it', 'with', 'be', '$', 'at', 'by', 'its', 'as', 'was', 'from', 'he', 'will', 'but', 'has', 'have', 'would', 'percent', 'are', 'million', 'not', 'which', 'an', 'year', '(', ')', 'this', 'we', 'company', 'had', 'new', 'they', 'market', 'were', 'china', 'billion', 'up', 'been', 'more', 'one', '--', 'also', 'or', 'about', 'analysts', 'after', 'u.s.', 'last', 'their', 'than', 'some', 'over', 'there', 'could', 'who', 'two', 'group', 'share', 'first', 'i', 'companies', 'hong', 'industry', 'business', 'kong', 'other', 'his', 'if', 'bank', 'into', 'stock', 'government', 'expected', 'years', 'out', 'shares', 'analyst', 'sales', 'no', ';', 'all', 'told', 'when', 'chinese', 'next']


In [85]:
# Load top 20,000 English words, according to Google
#    Source:  https://github.com/first20hours/google-10000-english
with open('data/google-10000-english-usa-no-swears.txt', 'r') as f:
  google_most_common = f.read().replace('\n', ' ')
google_most_common = nltk.word_tokenize(google_most_common)

print("Top 100 Google words:\n", google_most_common[:100])

Top 100 Google words:
 ['the', 'of', 'and', 'to', 'a', 'in', 'for', 'is', 'on', 'that', 'by', 'this', 'with', 'i', 'you', 'it', 'not', 'or', 'be', 'are', 'from', 'at', 'as', 'your', 'all', 'have', 'new', 'more', 'an', 'was', 'we', 'will', 'home', 'can', 'us', 'about', 'if', 'page', 'my', 'has', 'search', 'free', 'but', 'our', 'one', 'other', 'do', 'no', 'information', 'time', 'they', 'site', 'he', 'up', 'may', 'what', 'which', 'their', 'news', 'out', 'use', 'any', 'there', 'see', 'only', 'so', 'his', 'when', 'contact', 'here', 'business', 'who', 'web', 'also', 'now', 'help', 'get', 'pm', 'view', 'online', 'c', 'e', 'first', 'am', 'been', 'would', 'how', 'were', 'me', 's', 'services', 'some', 'these', 'click', 'its', 'like', 'service', 'x', 'than', 'find']


In [86]:
# Look through the differences between the two vocabs
unique_reuters_words = [x for x in reuters_most_common[:20000] if x not in google_most_common]
print("Reuters unique words: {}. Here's the top 100.".format(len(unique_reuters_words)))
print(unique_reuters_words[:100])

unique_google_words = [x for x in google_most_common if x not in reuters_most_common[:20000]]
print("\nGoogle unique words: {}. Here's the top 100.".format(len(unique_google_words)))
print(unique_google_words[:100])

Reuters unique words: 12974. Here's the top 100.
[',', '.', "'s", '``', "''", '$', '(', ')', '--', 'u.s.', ';', '&', "n't", 'corp.', "'", '1996', '10', '...', '1997', 'inc.', 'tonnes', '1995', 'pence', '20', '30', 'wang', "'re", 'mci', '1', ':', 'newsroom', 'boeing', '15', '50', 'bre-x', '100', 'airbus', 'co.', 'tung', 'francs', 'takeover', '12', 'traders', '40', '25', 'rival', "'ve", 'uaw', 'klaus', 'stg', '14', 'cocoa', 'yuan', 'barrick', 'shareholder', '-', 'labour', '11', '1997.', 'ltd.', 'conrail', '1996/97', '60', '1995.', 'speculation', 'margins', '300', 'regulators', 'long-term', 'automaker', 'tibet', '1998', 'long-distance', 'murdoch', '2', '1994', '90', 'exporters', 'jiang', 'handover', 'telecoms', '1989', '?', 'crowns', 'privatisation', 'eurotunnel', '17', '16', '171', 'rivals', 'jumped', 'adm', 'dissident', 'csx', 'deng', 'profitable', '80', '200', '18', '13']

Google unique words: 2861. Here's the top 100.
['pm', 'info', 'ebay', 'k', 'y', 'teen', 'gay', 'forums', 'blog', '

Let's take a look at some of these unique words, in order of frequency, to see if they are domain-specific.  For the Reuters vocab, some are punctuation, many are numbers, and the remainder are mostly domain-specific (international-business) related words, such as *privitisation, pre-tax*, and *conglomerate* or names such as *murdoch* and *monsanto*.  For the Google vocab, some are letters, some computer-related such as *forums* and *login*, and some are more general such as *color* and *thank*.

Let's use the Google vocab and add punctuations and contractions.

In [87]:
# Extend common vocab to include punctuation + contractions
from string import punctuation
vocab = most_common + list(punctuation) + ['--', "'s", "n't", '...', "'re", "'ve"]

## Convert text and authors to integers
.

### Mapping authors to integers

In [15]:
# encode authors and labels
author_to_int = {c: i for i, c in enumerate(authors, 1)}
df["labels"] = [author_to_int[name] for name in df["author"].values]
df.head()

Unnamed: 0,author,text,tokens,encoded,labels
0,WilliamKazer,china on tuesday announced a ban on poultry an...,"[china, on, tuesday, announced, a, ban, on, po...","[630, 10, 1150, 2773, 6, 5431, 10, 9878, 4, 98...",1
1,WilliamKazer,china said on thursday the highest-level u.s. ...,"[china, said, on, thursday, the, highest-level...","[630, 187, 10, 1184, 2, 1, 1, 400, 7, 117, 173...",1
2,WilliamKazer,china has tightened safety measures after a fa...,"[china, has, tightened, safety, measures, afte...","[630, 41, 1, 715, 2030, 151, 6, 8196, 7680, 90...",1
3,WilliamKazer,china on thursday tried to play down friction ...,"[china, on, thursday, tried, to, play, down, f...","[630, 10, 1184, 2353, 5, 517, 312, 13825, 14, ...",1
4,WilliamKazer,china is preparing to tap overseas capital mar...,"[china, is, preparing, to, tap, overseas, capi...","[630, 9, 5228, 5, 6831, 4625, 1164, 1905, 12, ...",1


### Downloading embedding model

In [None]:
#!pip install gensim

In [None]:
import gensim.downloader as api

# https://github.com/RaRe-Technologies/gensim-data
info = api.info()
model = api.load("word2vec-google-news-300")



In [None]:
model.wv['woman']

In [None]:
model.most_similar("cat")

## Create sequences and batches

## Training

### Create training, validation, and test sets

In [17]:
# Sample and split dataframe:  60% training, 20% validation, and 20% test
train = df.sample(frac=0.6, replace=False, random_state=1)
test = df.drop(train.index)
val = test.sample(frac=0.5, replace=False, random_state=1)
test = test.drop(val.index)

In [18]:
def get_values(dataframe):
  x = [model.wv(x) for x in dataframe["tokens"].values]
  y = [author_to_int(y) for y in dataframe["authors"].values]
  return np.array(x), np.array(y)

train_x, train_y = get_values(train)
val_x, val_y = get_values(val)
test_x, test_y = get_values(test)

# Source below from:  https://github.com/udacity/deep-learning/blob/master/sentiment-rnn/Sentiment_RNN_Solution.ipynb
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(1500, 50) 
Validation set: 	(500, 50) 
Test set: 		(500, 50)
