In [1]:
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
import numpy as np 
import matplotlib.pyplot as plt
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.layers import Embedding
import seaborn as sns


In [9]:
df = pd.read_json('../../../data/data.json', lines=True)
df.drop(['authors', 'link', 'date'], axis = 1, inplace = True) 
df.head()

Unnamed: 0,category,headline,short_description
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,She left her husband. He killed their children...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Of course it has a song.
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,The actor and his longtime girlfriend Anna Ebe...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,The actor gives Dems an ass-kicking for not fi...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,"The ""Dietland"" actress said using the bags is ..."


In [10]:
categories = df['category'].value_counts().index

def groupper(grouplist,name):
    for ele in categories:
        if ele in grouplist:
            df.loc[df['category'] == ele, 'category'] = name

In [11]:
groupper( grouplist= ['WELLNESS', 'HEALTHY LIVING','HOME & LIVING','STYLE & BEAUTY' ,'STYLE'] , name =  'LIFESTYLE AND WELLNESS')

groupper( grouplist= [ 'PARENTING', 'PARENTS' ,'EDUCATION' ,'COLLEGE'] , name =  'PARENTING AND EDUCATION')

groupper( grouplist= ['SPORTS','ENTERTAINMENT' , 'COMEDY','WEIRD NEWS','ARTS'] , name =  'SPORTS AND ENTERTAINMENT')

groupper( grouplist= ['TRAVEL', 'ARTS & CULTURE','CULTURE & ARTS','FOOD & DRINK', 'TASTE'] , name =  'TRAVEL-TOURISM & ART-CULTURE')

groupper( grouplist= ['WOMEN','QUEER VOICES', 'LATINO VOICES', 'BLACK VOICES'] , name =  'EMPOWERED VOICES')

groupper( grouplist= ['BUSINESS' ,  'MONEY'] , name =  'BUSINESS-MONEY')

groupper( grouplist= ['THE WORLDPOST' , 'WORLDPOST' , 'WORLD NEWS'] , name =  'WORLDNEWS')

groupper( grouplist= ['ENVIRONMENT' ,'GREEN'] , name =  'ENVIRONMENT')

groupper( grouplist= ['TECH', 'SCIENCE'] , name =  'SCIENCE AND TECH')

groupper( grouplist= ['FIFTY' , 'IMPACT' ,'GOOD NEWS','CRIME'] , name =  'GENERAL')

groupper( grouplist= ['WEDDINGS', 'DIVORCE',  'RELIGION','MEDIA'] , name =  'MISC')

In [12]:
print("We have a total of {} categories now".format(df['category'].nunique()))
df['category'].value_counts()

We have a total of 12 categories now


LIFESTYLE AND WELLNESS          40619
POLITICS                        32739
SPORTS AND ENTERTAINMENT        30296
TRAVEL-TOURISM & ART-CULTURE    20578
EMPOWERED VOICES                15461
PARENTING AND EDUCATION         14780
MISC                            12448
GENERAL                          9663
WORLDNEWS                        8420
BUSINESS-MONEY                   7644
SCIENCE AND TECH                 4260
ENVIRONMENT                      3945
Name: category, dtype: int64

In [13]:
df2 = df.copy()

In [14]:
print(len(df2[df2['short_description'] == ""]))

19712


In [15]:
df2['text'] = df2['headline'].astype(str)+"-"+df2['short_description']
df2.drop(columns =['headline','short_description'],axis = 1, inplace=True)
df2.astype(str)
df2

Unnamed: 0,category,text
0,GENERAL,There Were 2 Mass Shootings In Texas Last Week...
1,SPORTS AND ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,SPORTS AND ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 5...
3,SPORTS AND ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,SPORTS AND ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...
...,...,...
200848,SCIENCE AND TECH,RIM CEO Thorsten Heins' 'Significant' Plans Fo...
200849,SPORTS AND ENTERTAINMENT,Maria Sharapova Stunned By Victoria Azarenka I...
200850,SPORTS AND ENTERTAINMENT,"Giants Over Patriots, Jets Over Colts Among M..."
200851,SPORTS AND ENTERTAINMENT,Aldon Smith Arrested: 49ers Linebacker Busted ...


In [16]:
from sklearn.utils import shuffle
df2 = shuffle(df2)
df2.reset_index(inplace=True, drop=True) 

In [17]:
X = df2['text']
Y= df2['category']
#80% to train , 10% for validation , 10% for testing


X_train, X_val, y_train, y_val = train_test_split(X,Y, test_size=0.2, random_state=42)
X_val, X_test , y_val, y_test= train_test_split(X_val,y_val, test_size=0.5, random_state=42)

In [18]:
vocab_size =20000
max_length = 150
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [19]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index

X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train,maxlen= max_length,padding=padding_type, truncating=trunc_type)
y_train = np.asarray(y_train)
y_train = pd.get_dummies(y_train)

X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val,maxlen= max_length,padding=padding_type, truncating=trunc_type)
y_val = np.asarray(y_val)
y_val = pd.get_dummies(y_val)

train_set = np.array(X_train)
val_set = np.array(X_val)

train_label = np.array(y_train)
val_label = np.array(y_val)


y_test = pd.get_dummies(y_test)
y_test = np.asarray(y_test)
y_test = np.argmax(y_test,axis=1)   #this would be our ground truth label while testing

print(train_set.shape)
print(train_label.shape)


print(val_set.shape)
print(val_label.shape)

(160682, 150)
(160682, 12)
(20085, 150)
(20085, 12)


In [20]:
path_to_glove_file =  '../../../../../glove.6B.100d.txt'

In [22]:
#Initialising the embedding matrix with glove vec embeddings

num_tokens = len(tokenizer.word_index.items()) + 2
embedding_dim = 100
hits = 0
misses = 0


embeddings_index = {}
with open(path_to_glove_file, encoding="utf8" ) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))


# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Found 400001 word vectors.
Converted 66004 words (38804 misses)


In [26]:
early_stop=tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                            patience=3, min_delta=0.0001)


tf.keras.backend.clear_session()
embed_size = 100
model = keras.models.Sequential([
                                 
        Embedding(num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        mask_zero=True,input_shape=[None],trainable=False),
        keras.layers.Bidirectional(keras.layers.LSTM(32, dropout = 0.4)),
        keras.layers.Dense(12, activation="softmax")
            
        ])


model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         10481000  
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                34048     
_________________________________________________________________
dense (Dense)                (None, 12)                780       
Total params: 10,515,828
Trainable params: 34,828
Non-trainable params: 10,481,000
_________________________________________________________________


In [27]:
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
history = model.fit( train_set,train_label,
                     batch_size = 32,
                     steps_per_epoch=len(X_train) // 32, 
                     validation_data = (val_set , val_label),
                     validation_steps = len(val_set)//32, epochs=20,
                     callbacks=  early_stop )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [41]:
model.save('./best_model2')

In [31]:
model3 = keras.models.load_model('./best_model2')


In [25]:
text_1 = 'Pelosi: Biden Can’t Cancel Student Debt '
text_2 = "Feds Take on Student Homelessness"
score = model3.evaluate([text_1, text_2])
print("%s: %.2f%%" % (model3.metrics_names[1], score[1]*100))

accuracy: 0.00%


In [35]:
val_set[0]

array([ 230,   34,  100, 4933,  716, 6732,   89,  588,   11, 2709,  522,
       1961,   49,  150,   89,  147,   72,   42,   58,    1,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [36]:
model3.predict(val_set)


array([[5.4872001e-04, 7.5347912e-01, 3.0899842e-05, ..., 2.0094652e-02,
        3.6069824e-04, 2.7039887e-03],
       [7.4096411e-03, 4.3876776e-01, 3.2646044e-03, ..., 1.1931347e-01,
        1.8968061e-02, 5.4024428e-02],
       [1.4406631e-02, 1.2106130e-02, 2.3615919e-03, ..., 1.3777937e-01,
        6.7087388e-01, 1.9020332e-02],
       ...,
       [5.2680145e-03, 4.7150757e-02, 2.5488217e-03, ..., 2.5438381e-02,
        7.8933220e-03, 7.8599136e-03],
       [1.2766710e-02, 3.6938712e-02, 4.6631144e-04, ..., 3.1692013e-02,
        1.0604068e-03, 8.8037336e-03],
       [1.0493499e-01, 1.3349617e-01, 1.8987173e-02, ..., 4.9183499e-02,
        2.8089635e-02, 1.0951228e-01]], dtype=float32)

test accuracy 0.0
