In [1]:
# Importing required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Loading data (df, train_data, test_data, train_labels, test_labels)

import pickle

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as the object has been read, instead of being left open until garbage
    collection.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by this project.
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


df = _load_pickle('pklFiles/df.pkl')
train_data = _load_pickle('pklFiles/train_data.pkl')
test_data = _load_pickle('pklFiles/test_data.pkl')
train_labels = _load_pickle('pklFiles/train_labels.pkl')
test_labels = _load_pickle('pklFiles/test_labels.pkl')

In [3]:
# Defining variables for num_words, maxlen, output_dim

MAX_WORDS = 35000           # num_words: used as the Embedding layer's input_dim
MAX_SEQUENCE_LENGTH = 500   # maxlen: used as the Embedding layer's input_length
EMBEDDING_DIM = 200         # output_dim: size of each embedding vector

In [4]:
# Required libraries

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, Conv1D, MaxPooling1D, BatchNormalization, Flatten

In [5]:
# CNN model

# The network is declared as a single layer list: an embedding, two
# Conv1D + MaxPooling1D stages (each followed by dropout and batch
# normalisation), and a dense head ending in a 2-unit softmax.
cnn_model = Sequential([
    # Embedding with dropout on its output
    Embedding(input_dim=MAX_WORDS,
              output_dim=EMBEDDING_DIM,
              input_length=MAX_SEQUENCE_LENGTH),
    Dropout(rate=0.5),

    # First convolution stage
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=10),
    Dropout(rate=0.5),
    BatchNormalization(),

    # Second convolution stage
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=10),
    Dropout(rate=0.5),
    BatchNormalization(),

    # Dense classification head
    Flatten(),
    Dense(units=128, activation='relu'),
    Dense(units=2, activation='softmax'),
])

cnn_model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [6]:
# Training CNN model

# Fit for 5 epochs in mini-batches of 128; the test split is passed as
# validation data, so it is evaluated at the end of every epoch.
cnn_model.fit(
    x=train_data,
    y=train_labels,
    validation_data=(test_data, test_labels),
    epochs=5,
    batch_size=128,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1f450e9cc40>

In [7]:
# Predictions

# Softmax scores for every test sample, printed rounded to 0/1 per class.
predicted_labels = cnn_model.predict(x=test_data)
print(np.round(predicted_labels))

[[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 0.]]


## Model evaluation

In [8]:
# Required Libraries

from sklearn.metrics import precision_recall_fscore_support, classification_report

In [9]:
# Precision, Recall, F-score, Support

# Per-class metrics computed from the rounded (0/1) model outputs.
precision, recall, fscore, support = precision_recall_fscore_support(
    y_true=test_labels,
    y_pred=predicted_labels.round(),
)

print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {fscore}')
print(f'Support   : {support}')

Precision : [0.98906819 0.9265233 ]
Recall    : [0.92054264 0.98994734]
F-score   : [0.95357591 0.95718584]
Support   : [2064 2089]


In [10]:
# Classification report

# Text summary of precision / recall / f1 / support for the rounded outputs.
report_text = classification_report(y_true=test_labels, y_pred=predicted_labels.round())
print(report_text)

              precision    recall  f1-score   support

           0       0.99      0.92      0.95      2064
           1       0.93      0.99      0.96      2089

   micro avg       0.96      0.96      0.96      4153
   macro avg       0.96      0.96      0.96      4153
weighted avg       0.96      0.96      0.96      4153
 samples avg       0.96      0.96      0.96      4153



## Saving CNN model

In [12]:
# Create the output folder first so saving does not fail on a fresh checkout
# where "Trained Models" has not been created yet.
import os

os.makedirs("Trained Models", exist_ok=True)
cnn_model.save("Trained Models/cnn_model.h5")

## Testing on user input

In [18]:
# User input

# Sample article used to exercise the prediction function below.
title = "This security flaw puts millions of computers at risk for a serious (but unlikely) hack"
author = "Clare Duffy"
# The article body is written as two adjacent string literals inside
# parentheses: a plain double-quoted literal cannot span physical lines, and
# implicit concatenation keeps the text on one logical string without
# inserting any extra characters.
text = (
    "New York (CNN Business)A new report from a Dutch security researcher details a hacking mechanism that targets a common feature on millions of computers: the Thunderbolt port. Bjorn Ruytenberg, a researcher at Eindhoven University in the Netherlands, identified a security flaw in the Thunderbolt port that could allow a hacker to break into a computer and access all of its data in a matter of minutes, even if the computer's owner has taken security precautions. If your computer has such a port, an attacker who gets brief physical access to it can read and copy all your data, even if your drive is encrypted and your computer is locked or set to sleep, Ruytenberg said in the report. He dubbed the hacking technique Thunderspy. Thunderspy is stealth, meaning that you cannot find any traces of the attack, he said. The attack also does not require any engagement on the part of the computer's user, unlike other types of attacks such as phishing. Developed by Intel (INTC) in 2011, the Thunderbolt port enables fast data transfers. It is present on many PC and Apple laptops and — increasingly — some desktops. Although Intel recently developed a tool to address security concerns with the port, it isn't available on computers manufactured before 2019. Ruytenberg demonstrated the attack, which took just about five minutes, in a YouTube video published along with the report. For its part, Intel says that if users take normal security precautions and don't leave their computers somewhere a hacker could access them for even a few minutes — even if they have encrypted drives — they shouldn't be too worried about this type of hack. "
    "While the Thunderspy attack is technically possible on many computers with a Thunderbolt port, it requires that the hacker gains physical access to the computer for several minutes — enough time to unscrew the back panel of a laptop, plug in a device to the Thunderbolt and override security features, reattach the back of the laptop and then access the computer's data. Most people likely do not have valuable enough data on their computers for a hacker to want to carry out such a targeted attack. Even beyond Thunderspy, security experts have long warned of risks that could come from letting a hacker gain physical access to a computer. A group of security researchers last year identified several vulnerabilities related to Thunderbolt ports. In response, Intel created a tool called Kernel Direct Memory Access (DMA) to mitigate such attacks, which was implemented into major operating systems from Windows, Linux and Mac in 2019, Jerry Bryant, Intel's director of communications for product assurance and security, said in a blog post Sunday. The underlying vulnerability identified by Ruytenberg's Thunderspy technique is the same as those addressed by that mitigation tool, Byrant said in the post. The company added that Ruytenberg did not demonstrate successful attacks against machines with the DMA tool enabled."
)

The above news article has been taken from [CNN Business News](https://edition.cnn.com/2020/05/12/tech/intel-thunderbolt-security-vulnerability/index.html).

In [1]:
# Function to predict news type 

def predict_news_cnn(title, author, text):
    """Classify one news article with the saved CNN model and print the verdict.

    The title, author and body are merged into a single string, lower-cased,
    stripped of ASCII punctuation and English stop words, converted to a padded
    integer sequence with the pickled tokenizer, and scored by the model stored
    in ``Trained Models/cnn_model.h5``.

    Parameters
    ----------
    title : str
        Headline of the article.
    author : str
        Author name.
    text : str
        Body of the article.

    Returns
    -------
    None
        The verdict is printed: ``News is 'Reliable'.`` when the model's first
        output unit rounds to 1, otherwise ``News is 'Unreliable'.``.
    """
    # All dependencies are imported locally (pickle included) so the function
    # does not rely on an earlier notebook cell having been executed.
    import pickle

    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from tensorflow.keras.models import load_model
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    # Padding length for the input sequence; same value as the notebook-level
    # MAX_SEQUENCE_LENGTH that the model's Embedding layer was built with.
    MAX_SEQUENCE_LENGTH = 500

    # Lower case
    total_info = (title + ' ' + author + ' ' + text).lower()

    # Removing punctuations in a single pass. The raw literal holds exactly
    # the same characters as before (including the backslash) without relying
    # on the invalid escape sequence "\,".
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    total_info = total_info.translate(str.maketrans('', '', punctuations))

    # Eliminating extra spaces
    total_info = total_info.replace('   ', ' ')
    total_info = total_info.replace('  ', ' ')

    # Removing stopwords and re-joining the remaining tokens with single spaces
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(total_info)
    total_info = ' '.join(w for w in word_tokens if w not in stop_words)

    # Loading tokenizer; the with-block closes the file handle right away.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open('pklFiles/tokenizer.pkl', 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)

    test_sequence = tokenizer.texts_to_sequences([total_info])
    test_data = pad_sequences(test_sequence, maxlen=MAX_SEQUENCE_LENGTH)

    # Loading CNN model
    cnn_model = load_model('Trained Models/cnn_model.h5')

    # Prediction: one row holding the two softmax outputs
    predicted_label = cnn_model.predict(test_data)

    # Result: the first output unit rounding to 1 is reported as 'Reliable'
    if int(predicted_label.round()[0][0]) == 1:
        print("News is 'Reliable'.")
    else:
        print("News is 'Unreliable'.")

In [20]:
predict_news_cnn(title, author, text)

News is 'Reliable'.
