## Importing Utilities

In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.datasets import fetch_20newsgroups
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
np.random.seed(400)

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

Using TensorFlow backend.


In [3]:
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))

## Reading Data

In [5]:
#Reading data
newsgroup_data = fetch_20newsgroups(shuffle=True)

In [5]:
data = pd.DataFrame({'News':newsgroup_data.data,'Topic':newsgroup_data.target})

## Cleaning/Preprocessing Data

In [16]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

In [24]:
data['News'] = data['News'].apply(clean_text)

## Checking average length of news text

In [None]:
data['length']=data.apply(lambda row:len(row['News'].split()),axis=1)

In [None]:
np.mean(data['length'].values)

## Tokenizing and creating vectors

In [25]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 30000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 160
# This is fixed.
EMBEDDING_DIM = 50

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data['News'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 268996 unique tokens.


In [26]:
X = tokenizer.texts_to_sequences(data['News'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (11314, 160)


In [27]:
Y = pd.get_dummies(data['Topic']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (11314, 20)


## Train test split

In [28]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.25, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(8485, 160) (8485, 20)
(2829, 160) (2829, 20)


## Building LSTM architecture

In [100]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(rate = 0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(20, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 160, 50)           1500000   
_________________________________________________________________
spatial_dropout1d_11 (Spatia (None, 160, 50)           0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 100)               60400     
_________________________________________________________________
dense_11 (Dense)             (None, 20)                2020      
Total params: 1,562,420
Trainable params: 1,562,420
Non-trainable params: 0
_________________________________________________________________
None


## Running

In [101]:
epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Train on 7636 samples, validate on 849 samples
Epoch 1/5

## Evaluation model on Test set

In [102]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 1.341
  Accuracy: 0.566
