In [1]:
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
import numpy as np 
import matplotlib.pyplot as plt
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.layers import Embedding
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_json('../../../data/data.json', lines=True)
df.drop(['authors', 'link', 'date'], axis = 1, inplace = True) 
df.head()

Unnamed: 0,category,headline,short_description
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,She left her husband. He killed their children...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Of course it has a song.
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,The actor and his longtime girlfriend Anna Ebe...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,The actor gives Dems an ass-kicking for not fi...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,"The ""Dietland"" actress said using the bags is ..."


In [3]:
categories = df['category'].value_counts().index

def groupper(grouplist,name):
    for ele in categories:
        if ele in grouplist:
            df.loc[df['category'] == ele, 'category'] = name

In [4]:
groupper( grouplist= ['WELLNESS', 'HEALTHY LIVING','HOME & LIVING','STYLE & BEAUTY' ,'STYLE'] , name =  'LIFESTYLE AND WELLNESS')

groupper( grouplist= [ 'PARENTING', 'PARENTS' ,'EDUCATION' ,'COLLEGE'] , name =  'PARENTING AND EDUCATION')

groupper( grouplist= ['SPORTS','ENTERTAINMENT' , 'COMEDY','WEIRD NEWS','ARTS'] , name =  'SPORTS AND ENTERTAINMENT')

groupper( grouplist= ['TRAVEL', 'ARTS & CULTURE','CULTURE & ARTS','FOOD & DRINK', 'TASTE'] , name =  'TRAVEL-TOURISM & ART-CULTURE')

groupper( grouplist= ['WOMEN','QUEER VOICES', 'LATINO VOICES', 'BLACK VOICES'] , name =  'EMPOWERED VOICES')

groupper( grouplist= ['BUSINESS' ,  'MONEY'] , name =  'BUSINESS-MONEY')

groupper( grouplist= ['THE WORLDPOST' , 'WORLDPOST' , 'WORLD NEWS'] , name =  'WORLDNEWS')

groupper( grouplist= ['ENVIRONMENT' ,'GREEN'] , name =  'ENVIRONMENT')

groupper( grouplist= ['TECH', 'SCIENCE'] , name =  'SCIENCE AND TECH')

groupper( grouplist= ['FIFTY' , 'IMPACT' ,'GOOD NEWS','CRIME'] , name =  'GENERAL')

groupper( grouplist= ['WEDDINGS', 'DIVORCE',  'RELIGION','MEDIA'] , name =  'MISC')

In [5]:
print("We have a total of {} categories now".format(df['category'].nunique()))
df['category'].value_counts()

We have a total of 12 categories now


LIFESTYLE AND WELLNESS          40619
POLITICS                        32739
SPORTS AND ENTERTAINMENT        30296
TRAVEL-TOURISM & ART-CULTURE    20578
EMPOWERED VOICES                15461
PARENTING AND EDUCATION         14780
MISC                            12448
GENERAL                          9663
WORLDNEWS                        8420
BUSINESS-MONEY                   7644
SCIENCE AND TECH                 4260
ENVIRONMENT                      3945
Name: category, dtype: int64

In [6]:
df2 = df.copy()

In [7]:
print(len(df2[df2['short_description'] == ""]))

19712


In [8]:
df2['text'] = df2['headline'].astype(str)+"-"+df2['short_description']
df2.drop(columns =['headline','short_description'],axis = 1, inplace=True)
df2.astype(str)
df2

Unnamed: 0,category,text
0,GENERAL,There Were 2 Mass Shootings In Texas Last Week...
1,SPORTS AND ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,SPORTS AND ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 5...
3,SPORTS AND ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,SPORTS AND ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...
...,...,...
200848,SCIENCE AND TECH,RIM CEO Thorsten Heins' 'Significant' Plans Fo...
200849,SPORTS AND ENTERTAINMENT,Maria Sharapova Stunned By Victoria Azarenka I...
200850,SPORTS AND ENTERTAINMENT,"Giants Over Patriots, Jets Over Colts Among M..."
200851,SPORTS AND ENTERTAINMENT,Aldon Smith Arrested: 49ers Linebacker Busted ...


In [9]:
from sklearn.utils import shuffle
df2 = shuffle(df2)
df2.reset_index(inplace=True, drop=True) 

In [10]:
X = df2['text']
Y= df2['category']
#80% to train , 10% for validation , 10% for testing


X_train, X_val, y_train, y_val = train_test_split(X,Y, test_size=0.2, random_state=42)
X_val, X_test , y_val, y_test= train_test_split(X_val,y_val, test_size=0.5, random_state=42)

In [21]:
vocab_size =20000
max_length = 150
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [22]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index

X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train,maxlen= max_length,padding=padding_type, truncating=trunc_type)
y_train = np.asarray(y_train)
y_train = pd.get_dummies(y_train)

X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val,maxlen= max_length,padding=padding_type, truncating=trunc_type)
y_val = np.asarray(y_val)
y_val = pd.get_dummies(y_val)

train_set = np.array(X_train)
val_set = np.array(X_val)

train_label = np.array(y_train)
val_label = np.array(y_val)


y_test = pd.get_dummies(y_test)
y_test = np.asarray(y_test)
y_test = np.argmax(y_test,axis=1)   #this would be our ground truth label while testing

print(train_set.shape)
print(train_label.shape)


print(val_set.shape)
print(val_label.shape)

In [26]:
path_to_glove_file =  '../../../../../glove.6B.100d.txt'

In [24]:
#Initialising the embedding matrix with glove vec embeddings

num_tokens = len(tokenizer.word_index.items()) + 2
embedding_dim = 100
hits = 0
misses = 0


embeddings_index = {}
with open(path_to_glove_file, encoding="utf8" ) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))


# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 0 words (0 misses)


In [7]:
early_stop=tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                            patience=3, min_delta=0.0001)


tf.keras.backend.clear_session()
embed_size = 100
model = keras.models.Sequential([
                                 
        Embedding(num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        mask_zero=True,input_shape=[None],trainable=False),
        keras.layers.Bidirectional(keras.layers.LSTM(32, dropout = 0.4)),
        keras.layers.Dense(12, activation="softmax")
            
        ])


model.summary()

NameError: name 'num_tokens' is not defined

In [None]:
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
history = model.fit( train_set,train_label,
                     batch_size = 32,
                     steps_per_epoch=len(X_train) // 32, 
                     validation_data = (val_set , val_label),
                     validation_steps = len(val_set)//32, epochs=20,
                     callbacks=  early_stop )

In [None]:
model.save('./best_model2')

In [6]:
model3 = keras.models.load_model('./best_model2')


In [None]:
text_1 = 'Pelosi: Biden Can’t Cancel Student Debt '
text_2 = "Feds Take on Student Homelessness"
score = model3.evaluate([text_1, text_2])
print("%s: %.2f%%" % (model3.metrics_names[1], score[1]*100))

In [14]:
val_set[0]

array([ 5760,  7945,    34,     3,    66,    32,  2286,  3942,  2108,
        1969,    54,  1888,    58,     3,  9125,    19,  2108,    11,
          40, 16411,   978,     3,    66,  2108,    11,   190,  2286,
         239,    34,     3,    53,    15,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [15]:
model3.predict(val_set)


array([[3.6910813e-02, 1.1493210e-02, 1.5198196e-02, ..., 6.9558094e-03,
        1.1811847e-02, 5.1052584e-03],
       [2.2438861e-02, 5.8973189e-02, 1.1863653e-02, ..., 5.9264308e-01,
        2.7157020e-02, 1.4335324e-02],
       [7.5351402e-02, 6.0673542e-02, 3.2407050e-03, ..., 8.2748748e-02,
        2.4171911e-02, 5.3345613e-02],
       ...,
       [1.8171513e-02, 1.7380415e-02, 2.7777815e-02, ..., 3.9915524e-02,
        9.5636500e-03, 3.6668838e-03],
       [3.2689201e-03, 2.6595858e-01, 5.9076177e-04, ..., 6.9552744e-03,
        2.1038502e-02, 1.0439575e-03],
       [4.0602931e-03, 1.8098781e-02, 4.6208080e-02, ..., 1.1442822e-02,
        7.4463710e-02, 4.1517881e-03]], dtype=float32)

In [2]:
df2 = pd.read_csv('cleaned_news.csv')
df2

Unnamed: 0,category,text
0,GENERAL,2 mass shooting texas last week 1 tvshe left h...
1,SPORTS AND ENTERTAINMENT,smith join diplo nicky jam 2018 world cup offi...
2,SPORTS AND ENTERTAINMENT,hugh grant marries first time age 57the actor ...
3,SPORTS AND ENTERTAINMENT,jim carrey blast castrato adam schiff democrat...
4,SPORTS AND ENTERTAINMENT,julianna margulies us donald trump poop bag pi...
...,...,...
200848,SCIENCE AND TECH,rim ceo thorsten heins significant plan blackb...
200849,SPORTS AND ENTERTAINMENT,maria sharapova stun victoria azarenka austral...
200850,SPORTS AND ENTERTAINMENT,giant patriot jet colt among improbable super ...
200851,SPORTS AND ENTERTAINMENT,aldon smith arrest 49ers linebacker bust duico...


In [3]:
df2.dropna(subset=['text'], inplace=True)

In [4]:
X= df2.text
y= df2.category

X_tr, X_test, y_tr, y_test = train_test_split(X, y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, random_state=42)

In [5]:
train_set = np.array(X_train)
val_set = np.array(X_val)

train_label = np.array(y_train)
val_label = np.array(y_val)


y_test = pd.get_dummies(y_test)
y_test = np.asarray(y_test)
y_test = np.argmax(y_test,axis=1)   #this would be our ground truth label while testing

print(train_set.shape)
print(train_label.shape)


print(val_set.shape)
print(val_label.shape)

(112972,)
(112972,)
(37658,)
(37658,)


In [16]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from imblearn.pipeline import make_pipeline

model = keras.models.Sequential([
        keras.layers.Bidirectional(keras.layers.LSTM(32, dropout = 0.4)),
        keras.layers.Dense(12, activation="softmax")])

early_stop=tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                            patience=3, min_delta=0.0001)

pipe = make_pipeline(CountVectorizer(max_features=10000, ngram_range=(1, 2)),
                        TfidfTransformer(),
                        SMOTE(),
                        model)

opt = keras.optimizers.Adam(learning_rate=0.001)

model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

history = model.fit( train_set,train_label,
                     batch_size = 32,
                     steps_per_epoch=len(train_set) // 32, 
                     validation_data = (val_set , val_label),
                     validation_steps = len(val_set)//32, epochs=20,
                     callbacks=  early_stop )
#pipe.fit(X_train, y_train)
#y_pred = pipe.predict(X_test)
#accuracy = accuracy_score(y_pred, y_test)
    
#names.append(name)
#results.append(accuracy)
#model.append(pipe)
    
msg = "%s: %f" % (name, accuracy)
print(msg)

Epoch 1/20


ValueError: in user code:

    C:\Users\kylev\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    C:\Users\kylev\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\keras\engine\training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\kylev\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\kylev\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\kylev\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\kylev\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\keras\engine\training.py:789 run_step  **
        outputs = model.train_step(data)
    C:\Users\kylev\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\keras\engine\training.py:747 train_step
        y_pred = self(x, training=True)
    C:\Users\kylev\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:975 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs,
    C:\Users\kylev\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:176 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer sequential_3 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, 1]


array([['top gop strategist bash donald trump try get electedthis republican cave'],
       ['gayle king address charlie rise allegationswhen think anguish woman report news'],
       ['jimmy kimmel break michael flynn news perfect star war analogybut who darth vader'],
       ...,
       ['exhusband deplete life saving videowhile rind ask husband write iou never collect couple get divorce partially due unpaid'],
       ['land two homemakeover tv showspeople constantly ask get show answer always combination luck logistics five tip help get home makeover dream'],
       ['9 place traveler blow steam photosthe question blow steam without hurt people generally wreak havoc also near']],
      dtype=object)