In [None]:
# Import dependencies
from pathlib import Path
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split



In [None]:
# Mount Google Drive into the runtime at /content/drive so files stored under
# MyDrive can be opened by ordinary filesystem paths.
# NOTE(review): google.colab is presumably only importable inside Colab —
# confirm before running this notebook in another environment.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the cleaned news dataset on the mounted Google Drive
news__tokenized_file = Path('/content/drive/MyDrive/Colab Notebooks/Resources/nlp_cleaned_news.csv')

# Read the CSV into a DataFrame and discard the 'Unnamed: 0' column in one chained expression
news_tokenized_df = (
    pd.read_csv(news__tokenized_file)
    .drop(columns=['Unnamed: 0'])
)

# Preview the first rows
news_tokenized_df.head()

Unnamed: 0,text,class
0,budget fight loom republicans flip fiscal scri...,1
1,military accept transgender recruit monday pen...,1
2,senior republican senator let mr mueller job w...,1
3,fbi russia probe help australian diplomat tip ...,1
4,trump want postal service charge amazon shipme...,1
...,...,...
44682,mcpain john mccain furious iran treat sailor c...,0
44683,justice yahoo settle e mail privacy class acti...,0
44684,sunnistan ally safe zone plan territorial boot...,0
44685,blow million al jazeera america finally call q...,0


In [None]:
# Discard every row that contains a missing value, rebinding the name to the filtered frame
news_tokenized_df = news_tokenized_df.dropna()

# (rows, columns) remaining after the drop
news_tokenized_df.shape

(44678, 2)

In [None]:
# Build the word -> integer vocabulary from the article text so the model receives numbers, not strings.
# num_words limits the vocabulary to the most frequent words; oov_token is the placeholder
# used for any word that falls outside that vocabulary ("Out Of Vocabulary").
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(news_tokenized_df.loc[:, 'text'])



In [None]:
# Replace every word of every article with its integer index from the fitted tokenizer,
# giving one list of integers per article
sequences = tokenizer.texts_to_sequences(news_tokenized_df.loc[:, 'text'])


In [None]:
# Zero-pad the end of each sequence so all articles share one length and can be batched.
# NOTE(review): no maxlen is given, so every row is padded to the longest article
# (4456 positions according to the model summary) — consider capping it.
padded_sequences = pad_sequences(
    sequences,
    padding='post',
    value=0,
)
padded_sequences

array([[ 393,  185, 3943, ...,    0,    0,    0],
       [  92,  624, 1355, ...,    0,    0,    0],
       [ 321,   11,  118, ...,    0,    0,    0],
       ...,
       [   1,  348,  790, ...,    0,    0,    0],
       [1165,   61,  272, ...,    0,    0,    0],
       [1518, 4934,  101, ...,    0,    0,    0]], dtype=int32)

In [None]:
# Target array: the class label (0 or 1) of each article
y = news_tokenized_df["class"].values

# Feature array: pad_sequences already returned a 2-D int32 NumPy array, so use it as is.
# Round-tripping through .tolist() would materialise every token as a Python int and then
# rebuild the array with NumPy's default integer dtype, costing time and memory for no gain.
X = np.asarray(padded_sequences)


In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Define the model

# Number of token positions in every padded input sequence
sequence_length = X_train.shape[1]

# Stack: learned word embedding -> two LSTM layers -> single sigmoid unit
model = Sequential([

    # Embedding Layer
    Embedding(input_dim=tokenizer.num_words,  # Vocabulary size, read from the tokenizer so the two cannot drift apart
              output_dim=32,  # Size of the vector space in which words will be embedded
              input_length=sequence_length,  # Length of input sequences
              mask_zero=True),  # Index 0 is the padding value: mask it so the LSTMs skip padded time steps

    # LSTM Layer 1 with 64 memory units. return_sequences=True means it will return the full sequence to the next layer.
    LSTM(64, return_sequences=True),

    # LSTM Layer 2 with 64 memory units. It returns only its final output; with the padding mask
    # propagated from the embedding, that is the state after the last real token rather than
    # after the trailing run of padding zeros.
    LSTM(64),

    # Dense output layer with one neuron. Sigmoid activation function is used to output values between 0 and 1 (binary classification)
    Dense(1, activation='sigmoid')
])



In [None]:
# Print the layer-by-layer architecture: each layer's output shape and parameter count
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4456, 32)          320000    
                                                                 
 lstm (LSTM)                 (None, 4456, 64)          24832     
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 377921 (1.44 MB)
Trainable params: 377921 (1.44 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy reported as the metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the model for 10 epochs.
# validation_split=0.1 holds back the last 10% of the training rows (they were already shuffled
# by train_test_split) so val_loss / val_accuracy are reported every epoch and overfitting can be
# spotted without touching the test set.
# The returned History is kept in `history` for later inspection instead of being echoed as cell output.
history = model.fit(X_train, y_train, epochs=10, validation_split=0.1)

In [None]:
# Score the trained model on the held-out test split; the result unpacks into loss and accuracy
test_metrics = model.evaluate(x=X_test, y=y_test, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")
