In [None]:
import json
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
train=pd.read_csv("/content/drive/My Drive/fnews/train.csv")
val=pd.read_csv("/content/drive/My Drive/fnews/val.csv")

In [None]:
test=pd.read_csv("/content/drive/My Drive/fnews/test.csv")
sub=pd.read_csv("/content/sample_submission.csv")

In [None]:
train.head()

Unnamed: 0,text,label
0,The court granted by a 5-4 vote a request made...,real
1,""" Pennsylvania was a crucial swing state in th...",real
2,The company today is rolling out an update to ...,fake
3,"When it comes to trade policy, Hillary Clinton...",real
4,S. stocks had their worst April start since 19...,real


In [None]:
train_text=train["text"]
train_label=train["label"].map({"fake":0,"real":1})
val_text=val["text"]
val_label=val["label"].map({"fake":0,"real":1})

In [None]:
test_text=test["text"]

In [None]:
vocab_size = 10000
embedding_dim = 16
max_length = 1288
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [None]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_text)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(train_text)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

val_sequences = tokenizer.texts_to_sequences(val_text)
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
vocab_size=len(word_index)
len(word_index)

154673

In [None]:
training_padded = np.array(training_padded)
#training_labels = np.array(training_labels)
val_padded = np.array(val_padded)
#testing_labels = np.array(testing_labels)

In [None]:
model1 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=5, min_lr=0.01)
Earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [None]:
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1288, 16)          2474768   
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 2,475,201
Trainable params: 2,475,201
Non-trainable params: 0
_________________________________________________________________


In [None]:
num_epochs = 30
history = model1.fit(training_padded, train_label, epochs=num_epochs, validation_data=(val_padded, val_label), verbose=2,callbacks=[Earlystop,reduce_lr],shuffle=False)

Epoch 1/30
7251/7251 - 306s - loss: 0.3193 - accuracy: 0.8653 - val_loss: 0.1836 - val_accuracy: 0.9315
Epoch 2/30
7251/7251 - 310s - loss: 0.1611 - accuracy: 0.9392 - val_loss: 0.1544 - val_accuracy: 0.9411
Epoch 3/30
7251/7251 - 308s - loss: 0.1383 - accuracy: 0.9481 - val_loss: 0.1444 - val_accuracy: 0.9446
Epoch 4/30
7251/7251 - 313s - loss: 0.1284 - accuracy: 0.9522 - val_loss: 0.1385 - val_accuracy: 0.9472
Epoch 5/30
7251/7251 - 310s - loss: 0.1224 - accuracy: 0.9548 - val_loss: 0.1355 - val_accuracy: 0.9486
Epoch 6/30
7251/7251 - 306s - loss: 0.1184 - accuracy: 0.9563 - val_loss: 0.1337 - val_accuracy: 0.9492
Epoch 7/30
7251/7251 - 309s - loss: 0.1154 - accuracy: 0.9573 - val_loss: 0.1329 - val_accuracy: 0.9496
Epoch 8/30
7251/7251 - 312s - loss: 0.1127 - accuracy: 0.9584 - val_loss: 0.1320 - val_accuracy: 0.9504
Epoch 9/30
7251/7251 - 309s - loss: 0.1100 - accuracy: 0.9593 - val_loss: 0.1308 - val_accuracy: 0.9506
Epoch 10/30
7251/7251 - 315s - loss: 0.1075 - accuracy: 0.9600 -

In [None]:
model3= tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model3.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model3.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1288, 16)          2474768   
_________________________________________________________________
conv1d (Conv1D)              (None, 1284, 128)         10368     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 24)                3096      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 2,488,257
Trainable params: 2,488,257
Non-trainable params: 0
_________________________________________________________________


In [None]:
num_epochs = 40
history = model3.fit(training_padded, train_label, epochs=num_epochs, validation_data=(val_padded, val_label), verbose=2,callbacks=[Earlystop,reduce_lr],shuffle=False)

Epoch 1/40
7251/7251 - 174s - loss: 0.1389 - accuracy: 0.9470 - val_loss: 0.0875 - val_accuracy: 0.9676
Epoch 2/40
7251/7251 - 172s - loss: 0.0573 - accuracy: 0.9800 - val_loss: 0.0825 - val_accuracy: 0.9713
Epoch 3/40
7251/7251 - 171s - loss: 0.0316 - accuracy: 0.9895 - val_loss: 0.1136 - val_accuracy: 0.9677
Epoch 4/40
7251/7251 - 172s - loss: 0.0195 - accuracy: 0.9934 - val_loss: 0.1205 - val_accuracy: 0.9697
Epoch 5/40
7251/7251 - 174s - loss: 0.0137 - accuracy: 0.9951 - val_loss: 0.1316 - val_accuracy: 0.9701


In [None]:
model4 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model4.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model4.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1288, 16)          2474768   
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                12544     
_________________________________________________________________
dense (Dense)                (None, 24)                1560      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 2,488,897
Trainable params: 2,488,897
Non-trainable params: 0
_________________________________________________________________


In [None]:
num_epochs=40
history = model4.fit(training_padded, train_label, epochs=num_epochs, validation_data=(val_padded, val_label), verbose=2,callbacks=[Earlystop,reduce_lr],shuffle=False)

Epoch 1/40
7251/7251 - 688s - loss: 0.1344 - accuracy: 0.9498 - val_loss: 0.1038 - val_accuracy: 0.9631
Epoch 2/40
7251/7251 - 701s - loss: 0.0759 - accuracy: 0.9725 - val_loss: 0.0876 - val_accuracy: 0.9700
Epoch 3/40
7251/7251 - 706s - loss: 0.0525 - accuracy: 0.9816 - val_loss: 0.0916 - val_accuracy: 0.9699
Epoch 4/40
7251/7251 - 705s - loss: 0.0363 - accuracy: 0.9875 - val_loss: 0.1094 - val_accuracy: 0.9683
Epoch 5/40
7251/7251 - 702s - loss: 0.0246 - accuracy: 0.9922 - val_loss: 0.0946 - val_accuracy: 0.9732


In [None]:
val_pred=model4.predict(val_padded)

In [None]:
pred=[]
for i in val_pred:
  if(i<0.5):
    pred.append(0)
  else:
    pred.append(1)

In [None]:
from sklearn.metrics import f1_score
f1_score(val_label, pred )

0.9811650379428947

In [None]:
test_sequences = tokenizer.texts_to_sequences(test_text)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
test_pred=model4.predict(test_padded)

In [None]:
pred1=[]
for i in test_pred:
  if(i<0.5):
    pred1.append("fake")
  else:
    pred1.append("real")

In [None]:
len(pred1)

115999

In [None]:
sub["label"]=pred1
sub.to_csv("sub6.csv",index=False)

# WITHOUT STOP WORDS

In [None]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
train_sentences=[]
for row in train_text:
  for word in stop_words:
    token = " " + word + " "
    row = row.replace(token, " ")
  train_sentences.append(row)

In [None]:
val_sentences=[]
for row in val_text:
  for word in stop_words:
    token = " " + word + " "
    row = row.replace(token, " ")
  val_sentences.append(row)

In [None]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(train_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

val_sequences = tokenizer.texts_to_sequences(val_sentences)
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
vocab_size=len(word_index)
len(word_index)

154671

In [None]:
training_padded = np.array(training_padded)
#training_labels = np.array(training_labels)
val_padded = np.array(val_padded)
#testing_labels = np.array(testing_labels)

In [None]:
model4 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model4.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model4.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1288, 16)          2474736   
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                12544     
_________________________________________________________________
dense_2 (Dense)              (None, 24)                1560      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 25        
Total params: 2,488,865
Trainable params: 2,488,865
Non-trainable params: 0
_________________________________________________________________


In [None]:
num_epochs = 30
history = model4.fit(training_padded, train_label, epochs=num_epochs, validation_data=(val_padded, val_label), verbose=2,callbacks=[Earlystop,reduce_lr],shuffle=False)

Epoch 1/30
7251/7251 - 837s - loss: 0.1519 - accuracy: 0.9425 - val_loss: 0.1099 - val_accuracy: 0.9593
Epoch 2/30
7251/7251 - 868s - loss: 0.0871 - accuracy: 0.9680 - val_loss: 0.0973 - val_accuracy: 0.9652
Epoch 3/30
7251/7251 - 870s - loss: 0.0632 - accuracy: 0.9777 - val_loss: 0.1054 - val_accuracy: 0.9653
Epoch 4/30
7251/7251 - 870s - loss: 0.0446 - accuracy: 0.9850 - val_loss: 0.1228 - val_accuracy: 0.9629
Epoch 5/30
7251/7251 - 867s - loss: 0.0313 - accuracy: 0.9897 - val_loss: 0.1289 - val_accuracy: 0.9634
