In [35]:
import numpy as np
import pandas as pd 
import itertools
import os
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
layers = keras.layers
models = keras.models


In [36]:
data = pd.read_csv("bbc-text.csv")
data.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [37]:
data['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [38]:
train_size = int(len(data) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data) - train_size))

def train_test_split(data, train_size):
    train = data[:train_size]
    test = data[train_size:]
    return train, test

Train size: 1780
Test size: 445


## Data preparation

In [39]:
train_cat, test_cat = train_test_split(data['category'], train_size)
train_text, test_text = train_test_split(data['text'], train_size)

max_words = 1000
tokenize = keras.preprocessing.text.Tokenizer(num_words=max_words, 
                                              char_level=False)

tokenize.fit_on_texts(train_text) # fit tokenizer to our training text data
x_train = tokenize.texts_to_matrix(train_text)
x_test = tokenize.texts_to_matrix(test_text)

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_cat)
y_train = encoder.transform(train_cat)
y_test = encoder.transform(test_cat)

# Converts the labels to a one-hot representation
num_classes = np.max(y_train) + 1
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (1780, 1000)
x_test shape: (445, 1000)
y_train shape: (1780, 5)
y_test shape: (445, 5)


## Train the model

In [40]:
batch_size = 32
epochs = 2
drop_ratio = 0.5

# Build the model
model = models.Sequential()
model.add(layers.Dense(512, input_shape=(max_words,)))
model.add(layers.Activation('relu'))
model.add(layers.Dense(num_classes))
model.add(layers.Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Epoch 1/2
Epoch 2/2
Test loss: 0.14432784914970398
Test accuracy: 0.9550561904907227


## Hyperparameter tuning

In [63]:
def run_experiment(batch_size, epochs, drop_ratio):
  model = models.Sequential()
  model.add(layers.Dense(512, input_shape=(max_words,)))
  model.add(layers.Activation('relu'))
  model.add(layers.Dropout(drop_ratio))
  model.add(layers.Dense(num_classes))
  model.add(layers.Activation('softmax'))

  model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
  history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=0,
                    validation_split=0.1)
  score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=0)
  print('Test loss:', score[0])
  print('Test accuracy:', score[1])

batch_size = 16
epochs = 4
drop_ratio = 0.4
run_experiment(batch_size, epochs, drop_ratio)

Test loss: 0.1307440549135208
Test accuracy: 0.9595505595207214


## Text Summarization


In [42]:
from nltk.tokenize import sent_tokenize, word_tokenize 
def get_sentences(article):
  extracts=sent_tokenize(article)
  sentences=[]
  for extract in extracts:
    clean_sentence=extract.replace("[^a-zA-Z0-9]"," ")   ## Removing special characters
    obtained=word_tokenize(clean_sentence) 
    sentences.append(obtained)
  return sentences


from nltk.cluster.util import cosine_distance
def get_similarity(sent_1,sent_2,stop_words):
  
  sent_1=[w.lower() for w in sent_1]
  sent_2=[w.lower() for w in sent_2]

  total=list(set(sent_1+sent_2)) ## Removing duplicate words in total set

  vec_1= [0] * len(total)
  vec_2= [0] * len(total)


  ## Count Vectorization of two sentences
  for w in sent_1:
    if w not in stop_words:
      vec_1[total.index(w)]+=1

  for w in sent_2:
    if w not in stop_words:
      vec_2[total.index(w)]+=1

  return 1-cosine_distance(vec_1,vec_2)

from nltk.corpus import stopwords
import numpy as np
def build_matrix(sentences):
  stop_words = stopwords.words('english')

  sim_matrix=np.zeros((len(sentences),len(sentences)))
  ## Adjacency matrix

  for id1 in range(len(sentences)):
    for id2 in range(len(sentences)):
      if id1==id2:  #escaping diagonal elements
        continue
      else:
        sim_matrix[id1][id2]=get_similarity(sentences[id1],sentences[id2],stop_words)

  return sim_matrix

def textrank(text, eps=0.000001, d=0.85):
    score_mat = np.ones(len(text)) / len(text)
    delta=1
    while delta>eps:
        score_mat_new = np.ones(len(text)) * (1 - d) / len(text) + d * text.T.dot(score_mat)
        delta = abs(score_mat_new - score_mat).sum()
        score_mat = score_mat_new

    return score_mat_new


def summarizer(article):
  summarized=[]

  sentence=get_sentences(article)

  sim_matrix=build_matrix(sentence)

  score=textrank(sim_matrix)

  ranked_sentence = sorted(((score[i],s) for i,s in enumerate(sentence)), reverse=True)

  req = len(ranked_sentence)//4

  if req == 0: req = 1
  
  for i in range(req):
      summarized.append(" ".join(ranked_sentence[i][1]))

  return summarized


import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Working on Test Data and checking results

In [62]:
# Here's how to generate a prediction on individual examples
text_labels = encoder.classes_ 

for i in range(0, 15):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    Summary=summarizer(test_text.iloc[i])
    print("Predicted label: " + predicted_label)  
    print("Summary: \n" + " ".join(Summary) + "\n")


Predicted label: entertainment
Summary: 
jackson is thought to have secured the most lucrative film directing deal in history to remake king kong which is currently in production in wellington . jackson who is currently filming a remake of hollywood classic king kong said he thought that the sale of mgm studios to the sony corporation would cast further uncertainty on the project .

Predicted label: tech
Summary: 
those people live in any city and village and so we need ordinary people people with interesting faces . there are hundreds and hundreds of characters in a typical bioware game said shauna perry bioware s audio and external resources producer . the company which makes role playing games such as knights of the old republic and neverwinter nights is seeking people aged 18 to 99. the canada-based company says it was looking for a wide variety of people to use as face models for characters .

Predicted label: politics
Summary: 
mr clarke who will unveil his plans on monday said e

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Predicted label: sport
Summary: 
radcliffe will compete in london paula radcliffe will compete in the flora london marathon this year after deciding her schedule for 2005. the 31-year-old won the race in 2002 on her marathon debut defended her title 12 months later and will now seek a third title in the 17 april race . it doesn t get any better than this for the 25th anniversary said race director david bedford . three years ago radcliffe smashed the women s world record in two hours 18 minutes 15 seconds .

Predicted label: sport
Summary: 
her rise means australia have a player in the top 10 of the men s and women s rankings for the first time in 21 years .

Predicted label: tech
Summary: 
ultimate game award for doom 3 sci-fi shooter doom 3 has blasted away the competition at a major games ceremony the golden joystick awards . it was the only title to win twice winning ultimate game of the year and best pc game at the awards presented by little britain star matt lucas . the much-anti