In [None]:
sentence=[
          "I really like this book",
          "I love this place"
]

In [None]:
import tensorflow
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot

In [None]:
# Encode each sentence as a list of integer word indices (hashed into a vocabulary of size 100).
one_hot_rep = [one_hot(text, 100) for text in sentence]

In [None]:
one_hot_rep

[[73, 81, 86, 62, 78], [73, 12, 62, 76]]

In [None]:
# Left-pad ('pre') every encoded sentence with zeros to a common length of 8.
length = 8
embedded_doc = pad_sequences(one_hot_rep, maxlen=length, padding='pre')
print(embedded_doc)

[[ 0  0  0 73 81 86 62 78]
 [ 0  0  0  0 73 12 62 76]]


In [None]:
# A model consisting of a single Embedding layer: every one of the `vocab_size`
# word indices is mapped to a trainable `dim`-dimensional vector.
dim = 10
vocab_size = 100
model = Sequential([Embedding(vocab_size, dim, input_length=length)])

In [None]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 10)             1000      
Total params: 1,000
Trainable params: 1,000
Non-trainable params: 0
_________________________________________________________________


In [None]:
pred= model.predict(embedded_doc)

In [None]:
pred

array([[[-0.04287918, -0.02877076, -0.02354172, -0.03075363,
         -0.0178406 , -0.03285166, -0.01275345, -0.01005472,
         -0.00189047, -0.02027624],
        [-0.04287918, -0.02877076, -0.02354172, -0.03075363,
         -0.0178406 , -0.03285166, -0.01275345, -0.01005472,
         -0.00189047, -0.02027624],
        [-0.04287918, -0.02877076, -0.02354172, -0.03075363,
         -0.0178406 , -0.03285166, -0.01275345, -0.01005472,
         -0.00189047, -0.02027624],
        [ 0.02780923,  0.01220452, -0.03800594,  0.04233992,
         -0.00432057,  0.03962095, -0.04240117, -0.03157319,
          0.01541325,  0.03793525],
        [-0.04040948, -0.00493157, -0.02408328, -0.03020506,
         -0.04969352,  0.00653899, -0.03759919, -0.00504839,
          0.0105625 , -0.04016125],
        [ 0.00627334, -0.02896903,  0.03973602, -0.0031049 ,
         -0.00641809,  0.01902397,  0.01207932,  0.04797149,
         -0.04365454,  0.02577943],
        [-0.01120736,  0.00095668, -0.02445908,  0.0

In [None]:
pred.shape

(2, 8, 10)

In [None]:
pred[0][0] # sentence 1, position 0 (a left-padding slot, word index 0)

array([-0.04287918, -0.02877076, -0.02354172, -0.03075363, -0.0178406 ,
       -0.03285166, -0.01275345, -0.01005472, -0.00189047, -0.02027624],
      dtype=float32)

In [None]:
pred[0][1] # still sentence 1: position 1 is also padding, hence the same vector as pred[0][0]; sentence 2 is pred[1]

array([-0.04287918, -0.02877076, -0.02354172, -0.03075363, -0.0178406 ,
       -0.03285166, -0.01275345, -0.01005472, -0.00189047, -0.02027624],
      dtype=float32)

# **CountVectorizer**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer()

bow= cv.fit_transform(sentence)
print(bow)

  (0, 4)	1
  (0, 1)	1
  (0, 5)	1
  (0, 0)	1
  (1, 5)	1
  (1, 2)	1
  (1, 3)	1


In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version has it and fall back to the old method otherwise.
# .tolist() turns the returned array into a plain list of Python strings.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()

In [None]:
print(feature_names)

['book', 'like', 'love', 'place', 'really', 'this']


In [None]:
import pandas as pd
pd.DataFrame(bow.toarray(), columns= feature_names)

Unnamed: 0,book,like,love,place,really,this
0,1,1,0,0,1,1
1,0,0,1,1,0,1


# **TFIDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv= TfidfVectorizer()

tv_vector= tv.fit_transform(sentence)
print(tv_vector)

  (0, 0)	0.534046329052269
  (0, 5)	0.37997836159100784
  (0, 1)	0.534046329052269
  (0, 4)	0.534046329052269
  (1, 3)	0.6316672017376245
  (1, 2)	0.6316672017376245
  (1, 5)	0.4494364165239821


In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version has it and fall back to the old method otherwise.
# .tolist() turns the returned array into a plain list of Python strings.
if hasattr(tv, "get_feature_names_out"):
    feature_names = tv.get_feature_names_out().tolist()
else:
    feature_names = tv.get_feature_names()

In [None]:
import pandas as pd
pd.DataFrame(tv_vector.toarray(), columns= feature_names)

Unnamed: 0,book,like,love,place,really,this
0,0.534046,0.534046,0.0,0.0,0.534046,0.379978
1,0.0,0.0,0.631667,0.631667,0.0,0.449436


# **BBC News data**

Multiclass classification using Word2Vec and LSTM

In [None]:
import pandas as pd
data=pd.read_csv('/content/drive/MyDrive/bbc_news_mixed (1).csv')

In [None]:
data.head()

Unnamed: 0,text,label
0,Cairn shares slump on oil setback\n\nShares in...,business
1,Egypt to sell off state-owned bank\n\nThe Egyp...,business
2,Cairn shares up on new oil find\n\nShares in C...,business
3,Low-cost airlines hit Eurotunnel\n\nChannel Tu...,business
4,"Parmalat to return to stockmarket\n\nParmalat,...",business


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
data.label.value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: label, dtype: int64

In [None]:
label2= pd.get_dummies(data["label"])

In [None]:
label2.head()

Unnamed: 0,business,entertainment,politics,sport,tech
0,1,0,0,0,0
1,1,0,0,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0


In [None]:
# One-hot encode the raw string labels directly instead of re-binarizing the
# already one-hot `label2` frame. Columns come out in sorted class order
# (business, entertainment, politics, sport, tech), i.e. the same layout as label2.
y = LabelBinarizer().fit_transform(data["label"])

In [None]:
y[:5]

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]])

In [None]:
z= pd.DataFrame(y)
z.value_counts()

0  1  2  3  4
0  0  0  1  0    511
1  0  0  0  0    510
0  0  1  0  0    417
      0  0  1    401
   1  0  0  0    386
dtype: int64

In [None]:
data.head()

Unnamed: 0,text,label
0,Cairn shares slump on oil setback\n\nShares in...,business
1,Egypt to sell off state-owned bank\n\nThe Egyp...,business
2,Cairn shares up on new oil find\n\nShares in C...,business
3,Low-cost airlines hit Eurotunnel\n\nChannel Tu...,business
4,"Parmalat to return to stockmarket\n\nParmalat,...",business


In [None]:
from gensim.utils import simple_preprocess
# Tokenize every article with gensim's simple_preprocess (one token list per row).
preprocessed_bbc = data.text.apply(simple_preprocess)
preprocessed_bbc.head()

0    [cairn, shares, slump, on, oil, setback, share...
1    [egypt, to, sell, off, state, owned, bank, the...
2    [cairn, shares, up, on, new, oil, find, shares...
3    [low, cost, airlines, hit, eurotunnel, channel...
4    [parmalat, to, return, to, stockmarket, parmal...
Name: text, dtype: object

In [None]:
# import word2vec
from gensim.models import Word2Vec

# Train a Word2Vec model on the tokenized articles: 300-dimensional vectors,
# skip-gram training (sg=1), and words seen fewer than 2 times are dropped (min_count=2).
# NOTE(review): `size=` is the gensim 3.x keyword (renamed `vector_size` in gensim 4) —
# confirm the installed gensim version before re-running.
w2v_model = Word2Vec(preprocessed_bbc, size=300, min_count=2, sg=1)

In [None]:
print('vocabulary size:', len(w2v_model.wv.vocab))

vocabulary size: 18588


In [None]:
w2v_model.wv.most_similar('oil')

[('gas', 0.8566049337387085),
 ('telecoms', 0.8119033575057983),
 ('costs', 0.8108860850334167),
 ('giant', 0.8099585175514221),
 ('unit', 0.7962621450424194),
 ('fuel', 0.7939218282699585),
 ('energy', 0.7836349606513977),
 ('industrial', 0.7809122800827026),
 ('steel', 0.7744253873825073),
 ('exports', 0.7730951309204102)]

In [None]:
w2v_model.save('/content/drive/MyDrive/bbc_w2v_model.h5')

In [None]:
def get_embedding_w2v(doc_tokens, pre_trained):
    """Return the mean Word2Vec vector of a document's in-vocabulary tokens.

    Parameters
    ----------
    doc_tokens : iterable of str
        Tokens of a single document.
    pre_trained : bool or int
        Truthy selects the global ``w2vec`` model; falsy selects the global
        ``w2v_model`` trained in this notebook.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.wv.vector_size``. If none of the tokens is
        in the vocabulary a zero vector is returned, rather than the scalar NaN
        (plus RuntimeWarning) that ``np.mean([])`` would give.
    """
    # NOTE(review): `w2vec` is not defined in the visible cells; it has to be
    # loaded before this function is called with a truthy `pre_trained`.
    model = w2vec if pre_trained else w2v_model
    keyed_vectors = model.wv
    embeddings = [
        keyed_vectors.word_vec(tok)
        for tok in doc_tokens
        if tok in keyed_vectors.vocab
    ]
    if not embeddings:
        # Keep the output shape stable so callers can always reshape/stack it.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(embeddings, axis=0)

In [None]:
import numpy as np
X_w2v_model = preprocessed_bbc.apply(lambda x: get_embedding_w2v(x, pre_trained=0))
X_w2v_model = pd.DataFrame(X_w2v_model.tolist())
print('X shape:', X_w2v_model.shape)

X shape: (2225, 300)


In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed so the split, and therefore the scores reported below, can be reproduced.
X_train_wm, X_test_wm, y_train_wm, y_test_wm = train_test_split(
    X_w2v_model, y, random_state=42
)

In [None]:
X_train_wm.shape,X_test_wm.shape

((1668, 300), (557, 300))

In [None]:
y_train_wm.shape, y_test_wm.shape

((1668, 5), (557, 5))

In [None]:
# Add a trailing feature axis, (n_samples, 300) -> (n_samples, 300, 1), to match
# the LSTM's input_shape=(300, 1). -1 infers the sample count, so the cell keeps
# working when the dataset or the split size changes.
X_train_wm = np.array(X_train_wm).reshape(-1, 300, 1)
X_train_wm.shape

(1668, 300, 1)

In [None]:
# Add a trailing feature axis, (n_samples, 300) -> (n_samples, 300, 1), to match
# the LSTM's input_shape=(300, 1). -1 infers the sample count, so the cell keeps
# working when the dataset or the split size changes.
X_test_wm = np.array(X_test_wm).reshape(-1, 300, 1)
X_test_wm.shape

(557, 300, 1)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense

# LSTM classifier over the (300, 1)-shaped document vectors with a 5-way softmax head.
model1 = Sequential([
    LSTM(100, input_shape=(300, 1)),
    Dense(8, activation="relu"),
    Dense(5, activation="softmax"),
])

model1.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
model1.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_7 (LSTM)                (None, 100)               40800     
_________________________________________________________________
dense_21 (Dense)             (None, 8)                 808       
_________________________________________________________________
dense_22 (Dense)             (None, 5)                 45        
Total params: 41,653
Trainable params: 41,653
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 100 epochs. The test split is passed as validation_data, so the
# per-epoch validation metrics are computed on the same data that
# model1.evaluate() is called on further down.
model1.fit(X_train_wm, y_train_wm,validation_data=(X_test_wm, y_test_wm), epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f56b02b3eb8>

In [None]:
model1.save('/content/drive/MyDrive/bbc_news_model.h5')

In [None]:
model1.evaluate(X_test_wm,y_test_wm)



[0.47802647948265076, 0.8348294496536255]

In [None]:
#classes=["business",	"entertainment",	"politics",	"sport",	"tech"]

In [None]:
def prediction(doc):
  """Classify a raw text with the in-memory `model1` and `w2v_model`.

  The text is tokenized, averaged into a single Word2Vec vector, reshaped to
  (1, 300, 1) and passed to `model1`; the name of the highest-scoring class
  is returned.
  """
  label_names = ("business", "entertainment", "politics", "sport", "tech")
  tokens = simple_preprocess(doc)
  doc_vector = get_embedding_w2v(tokens, pre_trained=0)
  scores = model1.predict(doc_vector.reshape(1, 300, 1))
  best_index = np.argmax(scores)
  return label_names[best_index]

In [None]:
doc1= "Pankaj Tripathi, currently seen on Criminal Justice: Behind Closed Doors, opens up about dealing with fame"
print(f"The article belongs to {prediction(doc1)} category",)

The article belongs to tech category


In [None]:
doc2= "OnePlus 9 Alleged Live Images Tip Flat Hole-Punch Display, Reverse Wireless Charging Support"
print(f"The article belongs to {prediction(doc2)} category",)

The article belongs to tech category


In [None]:
doc3= "Tesla public company duties are a much bigger factor, but going private is impossible now (sigh),” Musk said in response to a tweet saying he should optimize his time in areas such as innovation"
print(f"The article belongs to {prediction(doc3)} category",)

The article belongs to politics category


In [None]:
doc4= "PSG sack Tuchel, Pochettino set to become new manager - reports."
print(f"The article belongs to {prediction(doc4)} category",)

The article belongs to sport category


In [None]:
doc5= "In a press conference on Tuesday, Kejriwal said the development of Uttar Pradesh has been held back by 'corrupt' leaders in the state"
print(f"The article belongs to {prediction(doc5)} category",)

The article belongs to business category


In [None]:
doc6= "RIL plans to rebrand the IMG Reliance as its completely owned subsidiary post-acquisition of 50 per cent shares held by the wholly-owned subsidiary of IMG, the MG Singapore Pte. Ltd."
print(f"The article belongs to {prediction(doc6)} category",)

The article belongs to business category


 **Loading the saved model**

In [None]:
from tensorflow.keras.models import load_model
bbc = load_model('/content/drive/MyDrive/bbc_news_model.h5')

In [None]:
# Despite its .h5 suffix, this file was written by gensim (w2v_model.save earlier
# in the notebook), so it is read back with Word2Vec.load rather than Keras' load_model.
import gensim
embed = gensim.models.Word2Vec.load('/content/drive/MyDrive/bbc_w2v_model.h5')

In [None]:
def get_embedding_w2v_bbc(doc_tokens):
    """Return the mean vector of a document's tokens using the reloaded `embed` model.

    Parameters
    ----------
    doc_tokens : iterable of str
        Tokens of a single document.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``embed.wv.vector_size``. If none of the tokens is
        in the vocabulary a zero vector is returned, rather than the scalar NaN
        (plus RuntimeWarning) that ``np.mean([])`` would give.
    """
    keyed_vectors = embed.wv
    embeddings = [
        keyed_vectors.word_vec(tok)
        for tok in doc_tokens
        if tok in keyed_vectors.vocab
    ]
    if not embeddings:
        # Keep the output shape stable so callers can always reshape it.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(embeddings, axis=0)

In [None]:
def predictionbbc(doc):
  """Classify a raw text with the models reloaded from disk (`bbc` and `embed`).

  The text is tokenized, averaged into a single Word2Vec vector, reshaped to
  (1, 300, 1) and passed to `bbc`; the name of the highest-scoring class is
  returned.
  """
  label_names = ("business", "entertainment", "politics", "sport", "tech")
  tokens = simple_preprocess(doc)
  doc_vector = get_embedding_w2v_bbc(tokens)
  scores = bbc.predict(doc_vector.reshape(1, 300, 1))
  best_index = np.argmax(scores)
  return label_names[best_index]

In [None]:
# `test_doc` was never assigned anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Reuse the football headline (doc4) defined earlier
# as the sample input for the reloaded model.
test_doc = doc4
print(f"The article belongs to {predictionbbc(test_doc)} category")

The article belongs to sport category
