In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, Dropout, LSTM, BatchNormalization
from tensorflow.keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder

import numpy as np
import pandas as pd
from tensorflow.keras.callbacks import ModelCheckpoint
from collections import Counter
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import gensim
from sklearn.model_selection import train_test_split
import re
nltk.download('stopwords')
decode_map = {0: "NEGATIVE", 1: "POSITIVE", 4: "POSITIVE"}
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
def decode_sentiment(label):
    return decode_map[int(label)]
def preprocess(text, stem=False):
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SEQUENCE_LENGTH = 150
SENTIMENT_THRESHOLDS = (0.4, 0.6)
TRAIN_SIZE=0.55
EPOCHS = 8
BATCH_SIZE = 64
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [71]:
df = pd.read_csv('news_data.csv')
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")
df.titles = df.titles.apply(lambda x: preprocess(x))
df.target = df.target.apply(lambda x: decode_sentiment(x))

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))
counts = dict(Counter(df_train.target))
print(counts)

TRAIN size: 153
TEST size: 39
{'POSITIVE': 78, 'NEGATIVE': 75}


In [0]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!pip -q install gensim --upgrade
!rm -r sample_data
!kaggle datasets download rtatman/glove-global-vectors-for-word-representation
!unzip -q glove-global-vectors-for-word-representation

[K     |████████████████████████████████| 24.2MB 158kB/s 
[?25hDownloading glove-global-vectors-for-word-representation.zip to /content
 99% 454M/458M [00:12<00:00, 41.7MB/s]
100% 458M/458M [00:12<00:00, 39.4MB/s]


In [0]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="glove.6B.200d.txt", word2vec_output_file="model.w2v")
w2v_model = KeyedVectors.load_word2vec_format('model.w2v')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [35]:
documents = [_text.split() for _text in df_train.titles] 
W2V_SIZE = 200
W2V_WINDOW = 7
W2V_EPOCH = 2
W2V_MIN_COUNT = 10

words = w2v_model.wv.vocab.keys()
vocab_size = len(words)

  import sys


In [0]:
import pickle
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.titles)
vocab_size = len(tokenizer.word_index) + 1
x_train = pad_sequences(tokenizer.texts_to_sequences(df_train.titles), maxlen=SEQUENCE_LENGTH)
x_test = pad_sequences(tokenizer.texts_to_sequences(df_test.titles), maxlen=SEQUENCE_LENGTH)
labels = df_train.target.unique().tolist()
labels.append(NEUTRAL)
encoder = LabelEncoder()
encoder.fit(df_train.target.tolist())

y_train = encoder.transform(df_train.target.tolist())
y_test = encoder.transform(df_test.target.tolist())

y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

In [74]:
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print()
print("x_test", x_test.shape)
print("y_test", y_test.shape)

x_train (153, 150)
y_train (153, 1)

x_test (39, 150)
y_test (39, 1)


In [77]:
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
from tensorflow.keras.optimizers import Adam, Nadam

for word, i in tokenizer.word_index.items():
  if word in w2v_model.wv:
    embedding_matrix[i] = w2v_model.wv[word]

embedding_layer = Embedding(vocab_size, W2V_SIZE, weights=[embedding_matrix], input_length=SEQUENCE_LENGTH, trainable=False)

model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(20, dropout=0.35, recurrent_dropout=0.35, return_sequences=False)))
model.add(Dense(2, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Nadam(lr=1e-15),
              metrics=['accuracy'])
model.summary()

  """
  


Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 150, 200)          778800    
_________________________________________________________________
dropout_16 (Dropout)         (None, 150, 200)          0         
_________________________________________________________________
bidirectional_30 (Bidirectio (None, 40)                35360     
_________________________________________________________________
dense_15 (Dense)             (None, 2)                 82        
Total params: 814,242
Trainable params: 35,442
Non-trainable params: 778,800
_________________________________________________________________


In [78]:
model.fit(x_train,y_train,batch_size=10,epochs=6)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7fd0098510b8>

In [80]:
model.evaluate(x_test,y_test)
one_good = 0
one_bad = 0
good = 0
bad=0
for i,j in zip(x_test,y_test):
  true = j[0]
  predict = np.argmax(model.predict(np.array([i])))
  if true == 0:
    if predict == 0:
      good+=1
    else:
      bad +=1
  else:
    if predict == 1:
      one_good+=1
    else:
      one_bad+=1
print(good,bad)
print(one_good,one_bad)

12 9
10 8


In [0]:
model.save('news_model.h5')