In [1]:
import keras

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
import tensorflow as tf

In [3]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [4]:
import pandas as pd

In [5]:
doc_info = pd.read_csv('data/titles_and_types.csv', sep='\t')
doc_info.head()

Unnamed: 0,post_id,type,title
0,000005df-11ab-4f4d-b5f9-7a7d5cc0614e,Regulation,"7 CFR, PART 774—EMERGENCY LOAN FOR SEED PRODUC..."
1,00000d0e-7b29-4965-a81d-258f4ad96cac,News,Reichley will provide Clear Springs with consu...
2,00001e8e-481d-4cd1-9baa-eb326801c168,Regulation,Council Regulation (EEC) No 1750/92 of 30 June...
3,0000b797-5013-49db-89ea-bbb8bfe5bd74,News,Timely to the Chenghua District Food and Drug ...
4,0001362f-c61a-4540-96b1-3dbf70d82b0a,News,askFSIS AS A RESOURCE


In [6]:
docs = doc_info['post_id'].values.astype('U')

In [7]:
print(len(docs))

115256


In [8]:
labels = doc_info['type'].values.astype('U')
print(len(labels), labels[:5])

115256 ['Regulation' 'News' 'Regulation' 'News' 'News']


In [9]:
def add_label(category):
    if category=='Regulation':
        return 0
    elif category=='News':
        return 1
    elif category=='Guidance':
        return 2
    elif category=='Scientific':
        return 3

In [10]:
labels = doc_info['type'].apply(add_label)
print(labels[:5])

0    0
1    1
2    0
3    1
4    1
Name: type, dtype: int64


In [11]:
labels = labels.values

In [12]:
labels.shape

(115256,)

In [13]:
labels = keras.utils.np_utils.to_categorical(labels)

In [14]:
vocab_size = 15000
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [15]:
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[ 8997  1573 10195  5521]
 [ 1872 11699  8036  2439]
 [ 9690  7606 13147  8955]
 ..., 
 [12050  6292  3036  2973]
 [ 4043 12246  2659 11039]
 [10457  5123  3778 10161]]


In [16]:
padded_docs.shape

(115256, 4)

In [17]:
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(4, activation='sigmoid'))

In [18]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [19]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 8)              120000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 132       
Total params: 120,132
Trainable params: 120,132
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
# !pip install tensorboard

In [20]:
from keras.callbacks import TensorBoard

In [21]:
tb_callback = TensorBoard(log_dir='tf_log')

In [22]:
model.fit(padded_docs, labels, epochs=10, verbose=1, validation_split=0.2, callbacks=[tb_callback])

Train on 92204 samples, validate on 23052 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff5ddf0fcc0>

In [23]:
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 73.720240


In [76]:
# !pip3 install h5py

Collecting h5py
  Downloading h5py-2.7.1-cp35-cp35m-manylinux1_x86_64.whl (5.3MB)
[K    100% |████████████████████████████████| 5.3MB 294kB/s eta 0:00:01
[?25hCollecting numpy>=1.7 (from h5py)
  Using cached numpy-1.13.3-cp35-cp35m-manylinux1_x86_64.whl
Collecting six (from h5py)
  Using cached six-1.11.0-py2.py3-none-any.whl
Installing collected packages: numpy, six, h5py
Successfully installed h5py-2.7.1 numpy-1.13.3 six-1.11.0
[33mYou are using pip version 8.1.1, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [24]:
import h5py

In [25]:
model.save('titles_classify')

In [26]:
doc_info.head()

Unnamed: 0,post_id,type,title
0,000005df-11ab-4f4d-b5f9-7a7d5cc0614e,Regulation,"7 CFR, PART 774—EMERGENCY LOAN FOR SEED PRODUC..."
1,00000d0e-7b29-4965-a81d-258f4ad96cac,News,Reichley will provide Clear Springs with consu...
2,00001e8e-481d-4cd1-9baa-eb326801c168,Regulation,Council Regulation (EEC) No 1750/92 of 30 June...
3,0000b797-5013-49db-89ea-bbb8bfe5bd74,News,Timely to the Chenghua District Food and Drug ...
4,0001362f-c61a-4540-96b1-3dbf70d82b0a,News,askFSIS AS A RESOURCE


In [27]:
docs_regulation = doc_info[doc_info['type']=='Regulation']
docs_regulation.head()

Unnamed: 0,post_id,type,title
0,000005df-11ab-4f4d-b5f9-7a7d5cc0614e,Regulation,"7 CFR, PART 774—EMERGENCY LOAN FOR SEED PRODUC..."
2,00001e8e-481d-4cd1-9baa-eb326801c168,Regulation,Council Regulation (EEC) No 1750/92 of 30 June...
15,00058951-6857-446c-8518-5ebafb8f4071,Regulation,NOM-069-FITO-1995-FOR THE ESTABLISHMENT AND RE...
19,0006fb05-397a-4f14-b016-3e5459226f88,Regulation,Rules on quality of meat of sloughtered animal...
23,000ad58f-d62e-46b1-a8b4-707600bf1076,Regulation,Commission Regulation (EEC) No 1729/93 of 30 J...


In [28]:
docs_news = doc_info[doc_info['type']=='News']
docs_news.head()

Unnamed: 0,post_id,type,title
1,00000d0e-7b29-4965-a81d-258f4ad96cac,News,Reichley will provide Clear Springs with consu...
3,0000b797-5013-49db-89ea-bbb8bfe5bd74,News,Timely to the Chenghua District Food and Drug ...
4,0001362f-c61a-4540-96b1-3dbf70d82b0a,News,askFSIS AS A RESOURCE
5,0001bcb8-74c0-4274-97bf-625da505caa1,News,Heilongjiang Province Food and Drug Administra...
6,0001c118-5e95-48f0-8d61-eb80b393364e,News,"Roquefort Société®, between tradition and mode..."


In [29]:
docs_scientific = doc_info[doc_info['type']=='Scientific']
docs_scientific.head()

Unnamed: 0,post_id,type,title
26,000bc1a0-da9c-4fac-8181-5c4e4b5525f0,Scientific,Reasoned opinion on the review of the existing...
125,00490aff-9d99-4b34-b15d-398dc1cb2efc,Scientific,Technical specifications for monitoring Commun...
132,004b24f1-bc15-4c8a-a51f-b3872954640d,Scientific,Chemosensory characterization of virgin olive ...
136,004d4752-383c-4c5f-87a5-be84e3315410,Scientific,Scientific Opinion on a quantitative estimatio...
221,00859669-2c19-4226-8f9f-f3bd61cc1689,Scientific,Risk/Benefit Communication About Food? A Syste...


In [30]:
docs_guidance = doc_info[doc_info['type']=='Guidance']
docs_guidance.head()

Unnamed: 0,post_id,type,title
14,000567fe-880a-4630-8cd7-fd9995cf7b02,Guidance,Standard 1.2.11 Information requirements—count...
16,0005d3a0-2779-4740-bd09-b1e9cbf6c50c,Guidance,Plant Exports Management System (PEMS) Authori...
21,00081804-3fc2-4a57-8771-d669ca629888,Guidance,Guide To Plant Breeders' Rights In Canada
24,000b0232-b57a-478a-9f81-92f476e4b0cb,Guidance,What will this application do for you?
30,000d9bf3-02d9-45e5-ae37-639065bb3d13,Guidance,Draft Guidance for Industry: Dietary Supplemen...
