# Compulsory Assignment 3

Leonard Brenk, Finn Federsan, Felix Wltschek

## Question 1

In [38]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [39]:
# read csvs

true_csv = pd.read_csv("True.csv")
fake_csv = pd.read_csv("Fake.csv")

In [40]:
def load_datasets(fake, true):
    fake_csv = pd.read_csv(fake)
    true__csv = pd.read_csv(true)
    fake_csv['label'] = 0
    true_csv['label'] = 1
    df = pd.concat([fake_csv, true_csv], ignore_index=True)
    return df

### 1.1. Preprocess

In [41]:
# Combine title and text, drop unused columns, clean punctuation, extra spaces, and make lowercase

def preprocess_df(df):
    df['text'] = df['title'] + ' ' + df['text']
    df = df.drop(['title', 'subject', 'date'], axis=1)
    df['text'] = df['text'].apply(lambda x: re.sub(f"[{re.escape(string.punctuation)}]", " ", x))
    df['text'] = df['text'].apply(lambda x: re.sub("\s+", " ", x))
    df['text'] = df['text'].apply(lambda x: x.lower())

    return df


### 1.2 Stoppwords

In [42]:
# Remove stopwords and short words from the 'text' column in the DataFrame

def remove_stops(df):
    stop_words = set(stopwords.words('english'))
    df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words and len(word) > 2]))

### 1.3 Split

In [43]:
# Split data

def split_df(df):
    X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

### 1.4 Tokenize

In [44]:
# Tokenize the text data in train and test sets, pad sequences to a fixed length, and determine the vocabulary size

def tok_pad_df(X_train, X_test, max_len=250):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)

    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
    X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

    vocab_size = len(tokenizer.word_index) + 1

    return X_train_padded, X_test_padded, vocab_size

### 1.5 Model 

In [45]:
# Bi-LSTM model with embedding layer, two bidirectional LSTM layers, dense layers with Adam optimizer, binary cross-entropy loss, and accuracy metric

def build_model(vocab_size, max_len=250):
    model = Sequential() # for adding the layers 
    model.add(Embedding(vocab_size, 128, input_length=max_len)) # turns tokens into layers of vectors 
    model.add(Bidirectional(LSTM(64, return_sequences=True))) # bidirectional to learn from both future and past 
    model.add(Bidirectional(LSTM(64))) #
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

### 1.6 Training

In [46]:
def train_model(model, X_train, y_train, batch_size=64, epochs=2):
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)
    return history

### 1.7 Evaluate accuracy

In [47]:
# Evaluate the model on test data, print the accuracy, and return the loss and accuracy values
def evaluate_model(model, X_test, y_test):
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    return loss, accuracy

### Run everything 

In [48]:
# Load datasets
fake_file = 'Fake.csv'
true_file = 'True.csv'
df = load_datasets(fake_file, true_file)
df.head()  

# Preprocessing
preprocessed_df = preprocess_df(df)
preprocessed_df.head()  

# Splitting
X_train, X_test, y_train, y_test = split_df(preprocessed_df)

# Tokenizing and padding (all inputs same sized)
X_train_padded, X_test_padded, vocab_size = tok_pad_df(X_train, X_test)

# Building model
model = build_model(vocab_size)

# Training model
train_model(model, X_train_padded, y_train)

# Evaluating
evaluate_model(model, X_test_padded, y_test)




Epoch 1/2
Epoch 2/2
Model Accuracy: 98.85%


(0.03386697918176651, 0.9885300993919373)