# End-to-end NLP: News Headline classifier

### Setup execution role and session

In [1]:
import numpy as np
import pandas as pd

In [37]:
%%time
import sagemaker
from sagemaker import get_execution_role

role = get_execution_role()
print(role)
sess = sagemaker.Session()

arn:aws:iam::349934754982:role/service-role/AmazonSageMaker-ExecutionRole-20190123T091078
CPU times: user 499 ms, sys: 61.9 ms, total: 561 ms
Wall time: 2.48 s


### Download News Aggregator Dataset available at the public UCI dataset repository

In [2]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip

--2019-01-24 18:09:38--  https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29224203 (28M) [application/zip]
Saving to: ‘NewsAggregatorDataset.zip’


2019-01-24 18:09:40 (21.0 MB/s) - ‘NewsAggregatorDataset.zip’ saved [29224203/29224203]



In [3]:
!unzip NewsAggregatorDataset.zip

Archive:  NewsAggregatorDataset.zip
  inflating: 2pageSessions.csv       
   creating: __MACOSX/
  inflating: __MACOSX/._2pageSessions.csv  
  inflating: newsCorpora.csv         
  inflating: __MACOSX/._newsCorpora.csv  
  inflating: readme.txt              
  inflating: __MACOSX/._readme.txt   


In [4]:
!rm -rf __MACOSX/

In [5]:
#ls

#### Let's visualize the dataset

In [5]:
import pandas as pd
import tensorflow as tf
import re
import numpy as np
import os

In [6]:
column_names = ["TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"]
news_dataset = pd.read_csv('newsCorpora.csv', names=column_names, header=None, delimiter='\t')
news_dataset.head()

Unnamed: 0,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


#### For this exercice we'll only use the title (Headline) of the news story and the category as our target variable

In [7]:
df=news_dataset[['TITLE',"CATEGORY"]]

In [8]:
from collections import Counter
Counter(df['CATEGORY'])

Counter({'b': 115967, 'e': 152469, 'm': 45639, 't': 108344})

The dataset has four categories: Business (b), Science & Technology (t), Entertainment (e) and Health & Medicine (m).

In [9]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
leMapped=le.fit_transform(df["CATEGORY"].values)
list(le.classes_)

['b', 'e', 'm', 't']

#### Dummy encode the labels

In [35]:
from sklearn import preprocessing
from keras.utils.np_utils import to_categorical
encoder = preprocessing.LabelEncoder()

docs = df["TITLE"].values

encoder.fit(df["CATEGORY"].values)
encoded_Y = encoder.transform(df["CATEGORY"].values)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = to_categorical(encoded_Y)

In [41]:
#bucket = <bucket> # custom bucket name.
s3_bucket = sess.default_bucket()
s3_prefix = 'news'

In [None]:
list(encoder.classes_)

In [10]:
encoded_Y

array([0, 0, 0, ..., 2, 2, 2])

#### Tokenize documents and set fixed sequence lengths for input feature dimension.

In [129]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(vocab_size)
# pad documents to a max length of 4 words
max_length = 40
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(len(padded_docs))

75287
422419


In [115]:
docs[0]

'Fed official says weak data caused by weather, should not slow taper'

### Import word embeddings

In [19]:
!wget http://nlp.stanford.edu/data/glove.6B.zip && unzip glove.6B.zip

--2019-01-24 18:10:54--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-01-24 18:10:54--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-01-24 18:11:10 (54.0 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [20]:
rm 2pageSessions.csv glove.6B.200d.txt glove.6B.50d.txt glove.6B.300d.txt glove.6B.zip

##### Create embedding matrix

In [21]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [143]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('../blazingtext_word2vec_text8_2019-01-24/vectors.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 71291 word vectors.


In [118]:
#embeddings_index

In [23]:
#print(t.word_index)

In [147]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [148]:
#embedding_matrix.dump("ingredients-embedding-matrix.dat")
np.save(file="./data/embeddings/docs-embedding-matrix",
        arr=embedding_matrix,
        allow_pickle=False)
print(embedding_matrix.shape)

(75287, 100)


### Train, test split

In this section we will prep the data for ingestion for the algortihm. Split the data set in train and test samples and uplad the data to S3

In [149]:
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.2, random_state=42)

In [44]:
!mkdir data/train/ data/test/ data/embeddings/

In [46]:
np.save('./data/train/train_X.npy', X_train)
np.save('./data/train/train_Y.npy', y_train)
np.save('./data/test/test_X.npy', X_test)
np.save('./data/test/test_Y.npy', y_test)

In [153]:
import sagemaker
from sagemaker.tensorflow import TensorFlow

### Define hyperparameters to push to algorithm

In [130]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import KFold

# fix random seed for reproducibility
seed = 42
np.random.seed(seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.2, random_state=42)
# define 10-fold cross validation test harness

saveBestModelWeights = ModelCheckpoint("news_model_weights.h5",
                                       monitor='val_acc',
                                       verbose=1, 
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto',
                                       period=1)

In [131]:


    # define the model
model = Sequential()
model.add(Embedding(vocab_size, 100, 
                        weights=[embedding_matrix],
                        input_length=40, 
                        trainable=False, 
                        name="embed"))
model.add(Conv1D(filters=128, 
                     kernel_size=3, 
                     activation='relu',
                     name="conv_1"))
model.add(MaxPooling1D(pool_size=5,
                           name="maxpool_1"))
model.add(Flatten(name="flat_1"))
model.add(Dropout(0.3,
                     name="dropout_1"))
model.add(Dense(128, 
                    activation='relu',
                    name="dense_1"))
model.add(Dense(le.classes_.size,
                    activation='softmax',
                    name="out_1"))
    
    # compile the model
model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['acc'])
    

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed (Embedding)            (None, 40, 100)           7528700   
_________________________________________________________________
conv_1 (Conv1D)              (None, 38, 128)           38528     
_________________________________________________________________
maxpool_1 (MaxPooling1D)     (None, 7, 128)            0         
_________________________________________________________________
flat_1 (Flatten)             (None, 896)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 896)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               114816    
_________________________________________________________________
out_1 (Dense)                (None, 4)                 516       
Total para

In [132]:
        
    # fit the model
model.fit(X_train,
              y_train,
              batch_size=16,
              epochs=5, # no benefit from additional epochs
              verbose=1,
              callbacks=[saveBestModelWeights])
    
scores = model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
acc: 95.40%


In [141]:
example_doc=['Senate prepares to vote on dueling plans to end shutdown']
# integer encode the document
encoded_example = t.texts_to_sequences(example_doc)

# pad documents to a max length of 4 words
max_length = 40
padded_example = pad_sequences(encoded_example, maxlen=max_length, padding='post')

In [142]:
model.predict(padded_example)

array([[0.42525288, 0.13664994, 0.26215264, 0.17594457]], dtype=float32)

Some of these files will be needed in the object store to support scoring service