In [1]:
#import libs
import pandas as pd
import numpy as np

#load Datasets
articles = pd.read_csv('bbc_news_mixed.csv')
articles.head()

Unnamed: 0,text,label
0,Cairn shares slump on oil setback\n\nShares in...,business
1,Egypt to sell off state-owned bank\n\nThe Egyp...,business
2,Cairn shares up on new oil find\n\nShares in C...,business
3,Low-cost airlines hit Eurotunnel\n\nChannel Tu...,business
4,"Parmalat to return to stockmarket\n\nParmalat,...",business


In [3]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

lblencod = LabelEncoder()
articles.label = lblencod.fit_transform(articles.label)
y = to_categorical(articles.label)

print (y)

[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]


In [6]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

#split the data
articles_train, articles_test, label_train, label_test = train_test_split(articles['text'], y, test_size=0.2)
articles_all = articles_train.append(articles_test)

#tokenize text
tokenize = Tokenizer()
tokenize.fit_on_texts(articles_all)

#convert text to sequence of Tokens

articles_train_tokens = tokenize.texts_to_sequences(articles_train)
articles_test_tokens = tokenize.texts_to_sequences (articles_test)


#calculate the max size of token and vocab size

max_len = articles_all.str.split().apply(lambda x : len(x)).max()
vocab_size = len(tokenize.word_index) + 1

print ("Max length :{} \nVocab size:{}".format(max_len,vocab_size))

Max length :4432 
Vocab size:32360


In [8]:
#Padding sequences to make it max length

from keras_preprocessing.sequence import pad_sequences

articles_train_pad = pad_sequences(articles_train_tokens, maxlen=max_len, padding='post')
articles_test_pad = pad_sequences(articles_test_tokens, maxlen=max_len, padding='post')

print(articles_train_pad.shape)

(1780, 4432)


In [9]:
# Building a neural network model 

from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten

# Embeding size
EMBEDDING_SIZE=100
vocab_100 =  int (vocab_size/100)


#intialize sequential model

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=max_len))
model.add(Dense(500,activation='relu'))
model.add(Dense(vocab_100,activation='relu'))
model.add(Flatten())

# Add final layer that will classify into 5 lables.
model.add(Dense(5, activation='softmax'))

#compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4432, 100)         3236000   
_________________________________________________________________
dense (Dense)                (None, 4432, 500)         50500     
_________________________________________________________________
dense_1 (Dense)              (None, 4432, 323)         161823    
_________________________________________________________________
flatten (Flatten)            (None, 1431536)           0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 7157685   
Total params: 10,606,008
Trainable params: 10,606,008
Non-trainable params: 0
_________________________________________________________________


In [11]:
model.fit(articles_train_pad,label_train,epochs=11, validation_data=(articles_test_pad,label_test))

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


<tensorflow.python.keras.callbacks.History at 0x29f8621cb08>