In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib

import sys
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.preprocessing import OneHotEncoder


In [4]:
## dataset provided
# Load the raw dataset from the working directory; later cells read its
# 'category' (label) and 'text' (document body) columns.
data = pd.read_csv("bbc-text.csv")


In [5]:
## Size of a sequential 70/30 train/test split, plus the helper that applies it

# The first 70% of the rows (in file order) form the training portion;
# int() truncates, so any fractional row goes to the test portion.
train_size = int(len(data) * .7)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data) - train_size))

def train_test_conversion(data, train_size):
    """Split ``data`` positionally into a leading and a trailing part.

    Args:
        data: Any sliceable object (list, string, pandas Series, ...).
        train_size: Number of leading items that go into the first part.

    Returns:
        A ``(train, test)`` tuple where ``train`` is ``data[:train_size]``
        and ``test`` is everything from ``train_size`` onwards. No shuffling
        is performed.
    """
    head, tail = data[:train_size], data[train_size:]
    return head, tail

Train size: 1557
Test size: 668


In [6]:
## Persist the sequential (unshuffled) train/test split to disk

# Slice both columns at the same row boundary so labels and texts stay aligned.
train_cat, test_cat = train_test_conversion(data['category'], train_size)
train_text, test_text = train_test_conversion(data['text'], train_size)

# Re-assemble each portion as a two-column frame: category first, then text.
convert_train = pd.concat([train_cat, train_text], axis=1)
convert_test = pd.concat([test_cat, test_text], axis=1)

# Written to the working directory; the row index is kept as the first column.
convert_train.to_csv(r'train.csv')
convert_test.to_csv(r'test.csv')

In [7]:
## creating embeddings

print('Indexing word vectors.')
# Maps each word to its pre-trained vector (float32 numpy array).
embeddings_index = {}

# Each line of the file is "<word> <v1> <v2> ... <vN>".
# The `with` block guarantees the handle is closed even if parsing a line
# raises (the previous open()/close() pair leaked the handle in that case).
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.'%len(embeddings_index))


Indexing word vectors.
Found 400000 word vectors.


In [8]:
def clean_text(doc, remove_stopwords=False):
    """Normalise a raw document into lower-case, space-separated words.

    Processing steps, in order: strip HTML markup, replace every character
    that is not an ASCII letter with a space, lower-case, split on
    whitespace and, optionally, drop NLTK English stop words.

    Args:
        doc: Raw document text (may contain HTML markup).
        remove_stopwords: When True, words found in NLTK's English stop-word
            list are removed. Defaults to False.

    Returns:
        The cleaned words joined by single spaces (an empty string when no
        letters remain).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines.
    doc_text = BeautifulSoup(doc, "html.parser").get_text()
    # Replace non-letters (digits, punctuation, non-ASCII) with spaces.
    doc_text = re.sub("[^a-zA-Z]", " ", doc_text)
    # str.split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing whitespace, so no separate squeeze/strip pass
    # is needed.
    words = doc_text.lower().split()
    # Optionally remove stop words.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return " ".join(words)

In [9]:
# Raw documents and their category labels, taken from the loaded frame.
texts, labels = data['text'], data['category']

# Vocabulary cap: only this many features are kept by the vectorizer.
MAX_NB_WORDS = 2000

In [10]:
## Clean the documents, then tokenize and vectorize them

# Normalise every document (stop words are kept: default arguments).
processed_texts = list(map(clean_text, texts))

# Bag-of-words counts restricted to MAX_NB_WORDS features.
vectorizer = CountVectorizer(max_features=MAX_NB_WORDS)
vectorizer_fit = vectorizer.fit_transform(processed_texts)

words = vectorizer.get_feature_names()
# Total occurrences of each vocabulary word across the whole corpus.
counts = vectorizer_fit.toarray().sum(axis=0)

# (count, word) pairs ordered alphabetically by word.
counts_words = sorted(zip(counts, words), key=lambda pair: pair[1])

In [11]:
## defining padding_sequence and One-hot encoder

def pad_sequences(seq, maxlen):
    """Force a sequence of integer ids to exactly ``maxlen`` entries.

    Sequences that are too long keep only their LAST ``maxlen`` items;
    sequences that are too short are left-padded with zeros.

    Args:
        seq: Sequence of integer ids (list or 1-D array); may be empty.
        maxlen: Target length.

    Returns:
        A 1-D ``int32`` numpy array of length ``maxlen``.
    """
    n_items = len(seq)
    if n_items >= maxlen:
        # Slice from an explicit start offset rather than seq[-maxlen:],
        # because seq[-0:] would return the whole sequence when maxlen == 0.
        return np.array(seq[n_items - maxlen:]).astype('int32')
    # Here n_items < maxlen, so the number of leading zeros is simply the
    # difference between the two lengths.
    return np.pad(seq, (maxlen - n_items, 0), 'constant').astype('int32')
    
def one_hot(x):
    """One-hot encode a 1-D array of class values.

    Args:
        x: Numpy array of class values; it is reshaped to a single column
            before encoding.

    Returns:
        A dense 2-D numpy array with one row per element of ``x`` and one
        column per distinct value.
    """
    column = x.reshape(-1, 1)
    encoded = OneHotEncoder().fit_transform(column)
    return np.array(encoded.todense())

In [12]:
# Every document is truncated / left-padded to this many word ids.
MAX_SEQUENCE_LENGTH = 1000
vocabulary = [str(w[1]) for w in counts_words]

# Index 0 is reserved for padding: pad_sequences fills with zeros, so a real
# word mapped to 0 would be indistinguishable from padding. Ids therefore
# start at 1 and stop at MAX_NB_WORDS - 1 so they remain valid rows of an
# embedding matrix with MAX_NB_WORDS rows (row 0 stays all-zero).
word_index = {
    word: idx
    for idx, word in enumerate(vocabulary[:MAX_NB_WORDS - 1], start=1)
}

# Translate each cleaned document into its list of word ids, silently
# skipping out-of-vocabulary words.
sequences = [
    [word_index[word] for word in doc.split() if word in word_index]
    for doc in processed_texts
]

# NOTE: `data` is rebound here from the loaded DataFrame to the padded
# (n_docs, MAX_SEQUENCE_LENGTH) id matrix, and `labels` becomes a numpy array.
data = np.vstack([pad_sequences(s, MAX_SEQUENCE_LENGTH) for s in sequences])
labels = np.asarray(labels)

In [13]:
# Fixed seed so the stratified split (and therefore every downstream metric)
# is reproducible across kernel restarts.
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, stratify=labels, test_size=0.3, random_state=RANDOM_STATE)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y_train.shape)

print('Preparing embedding matrix.')

# Must match the dimensionality of the vectors stored in embeddings_index.
EMBEDDING_DIM = 100
# Prepare the embedding matrix: row i holds the pre-trained vector of the
# word whose id is i.
num_words = MAX_NB_WORDS
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words without a pre-trained vector keep an all-zero row.
        embedding_matrix[i] = embedding_vector

Shape of data tensor: (2225, 1000)
Shape of label tensor: (1557,)
Preparing embedding matrix.


In [14]:
# Fit on the complete label array that the stratified split was drawn from,
# not on `train_cat` (the unrelated sequential slice saved to train.csv), so
# every class that can appear in y_train / y_test is guaranteed to be known.
encoder = LabelEncoder()
encoder.fit(labels)
# NOTE: y_train / y_test are overwritten with integer class ids here, so this
# cell is meant to be run once after the split cell.
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [15]:
## Function to build the model.
## Accepts the standard model_fn inputs: features, labels and mode (e.g. TRAIN, INFER).
## Returns a ModelFnOps object that will be passed to learn.Estimator.


def cnn_model_fn(features, labels, mode):
    """Build the 1-D CNN text classifier graph for ``learn.Estimator``.

    Architecture: frozen pre-trained embedding lookup, three
    conv1d -> max-pool -> batch-norm stages, a 128-unit dense layer with
    dropout, and a 20-unit logits layer.

    Args:
        features: Word-id tensor; reshaped to ``[-1, MAX_SEQUENCE_LENGTH]``
            and used as indices into the embedding matrix.
        labels: Class ids; cast to int32 and one-hot encoded with depth 20.
            Not used when ``mode`` is ``learn.ModeKeys.INFER``.
        mode: One of the ``learn.ModeKeys`` values.

    Returns:
        A ``ModelFnOps`` carrying the predictions ("classes" and
        "probabilities"), the loss (None in INFER mode) and the train op
        (only set in TRAIN mode).
    """
    # Batch norm and dropout must behave differently while training.
    is_training = mode == learn.ModeKeys.TRAIN

    # Input Layer
    input_layer = tf.reshape(features, [-1, MAX_SEQUENCE_LENGTH])

    # Embedding layer and look-up; the pre-trained vectors are kept frozen.
    embeddings = tf.get_variable(name="embeddings", shape=embedding_matrix.shape,
                                 initializer=tf.constant_initializer(embedding_matrix), trainable=False)
    embed = tf.nn.embedding_lookup(embeddings, input_layer)

    # 3 convolution, pooling and batch normalization stages.
    # `training=is_training` is required for real batch normalization: without
    # it the layer always runs in inference mode and never uses batch
    # statistics or updates its moving averages. The moving-average update ops
    # land in the UPDATE_OPS collection, which optimize_loss (below) runs by
    # default.
    conv1 = tf.layers.conv1d(inputs=embed, filters=128,kernel_size=5, padding="VALID", activation=tf.nn.relu)
    # one could add l2 regularization as:
    # conv1 = tf.layers.conv1d(inputs=embed, filters=128,kernel_size=5, padding="VALID",
    #     activation=tf.nn.relu, activity_regularizer=tf.contrib.layers.l2_regularizer(0.001))
    pool1 = tf.layers.max_pooling1d(inputs=conv1, pool_size=5, strides=5, padding="VALID")
    bn1 = tf.layers.batch_normalization(pool1, training=is_training)

    conv2 = tf.layers.conv1d(inputs=bn1, filters=128, kernel_size=5, padding="VALID", activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling1d(inputs=conv2, pool_size=5, strides=5, padding="VALID")
    bn2 = tf.layers.batch_normalization(pool2, training=is_training)

    conv3 = tf.layers.conv1d(inputs=bn2, filters=128, kernel_size=5, padding="VALID", activation=tf.nn.relu)
    # With MAX_SEQUENCE_LENGTH == 1000 the time axis shrinks
    # 1000 -> 996 -> 199 -> 195 -> 39 -> 35, so a pool of 35 collapses it to a
    # single step. A different sequence length needs a different pool size.
    pool3 = tf.layers.max_pooling1d(inputs=conv3, pool_size=35, strides=35, padding="VALID")
    bn3 = tf.layers.batch_normalization(pool3, training=is_training)

    # Dense Layer
    bn3_flat = tf.reshape(bn3, [-1, 128])
    dense = tf.layers.dense(inputs=bn3_flat, units=128, activation=tf.nn.relu)
    dropout = tf.layers.dropout(inputs=dense, rate=0.5, training=is_training)

    # Logits Layer
    # NOTE(review): 20 output units / one-hot depth 20 are hard-coded; confirm
    # this matches the number of label classes in the dataset.
    logits = tf.layers.dense(inputs=dropout, units=20)

    # Loss and train op stay None unless the mode requires them.
    loss = None
    train_op = None

    # Loss is needed for every mode except pure inference.
    if mode != learn.ModeKeys.INFER:
        onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=20)
        loss = tf.losses.softmax_cross_entropy(
            onehot_labels=onehot_labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if is_training:
        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.contrib.framework.get_global_step(),
            learning_rate=0.001,
            optimizer="Adam")

    # Generate Predictions
    predictions = {
        "classes": tf.argmax(
            input=logits, axis=1),
        "probabilities": tf.nn.softmax(
            logits, name="softmax_tensor")
    }

    # Return a ModelFnOps object
    return model_fn_lib.ModelFnOps(
        mode=mode, predictions=predictions, loss=loss, train_op=train_op)


In [17]:
    ## Build the estimator around cnn_model_fn and train it
    
    # INFO-level logging makes TensorFlow report its progress in the cell output.
    tf.logging.set_verbosity(tf.logging.INFO)
    # Checkpoints are written to (and later restored from) ./model_layer.
    text_classifier = learn.Estimator(
        model_fn=cnn_model_fn, model_dir="./model_layer")

    # Train the model for 1250 steps using mini-batches of 128 examples.
    text_classifier.fit(x=x_train,y=y_train,batch_size=128,steps=1250)

    

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000020174CC6DD8>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_eval_distribute': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_protocol': None, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': './model_layer'}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./model_layer\mode

Estimator(params=None)

In [18]:
# Accuracy metric for evaluation: compares the "classes" prediction
# (arg-max of the logits) against the true labels.
metrics = dict(
    accuracy=learn.MetricSpec(
        metric_fn=tf.metrics.accuracy,
        prediction_key="classes",
    )
)

# Evaluate the trained model on the held-out split and show the results.
eval_results = text_classifier.evaluate(x=x_test, y=y_test, metrics=metrics)
print(eval_results)

Instructions for updating:
Use tf.estimator.EstimatorSpec.eval_metric_ops.
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
INFO:tensorflow:Starting evaluation at 2019-07-05-04:28:59
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./model_layer\model.ckpt-1250
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-07-05-04:29:00
INFO:te