# **Model Preprocessing**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import unicodedata
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.combine import SMOTETomek
import joblib

In [None]:
# Colab-only: mount Google Drive at /content/drive so the files under MyDrive can be read and written
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the two source CSVs on the mounted Drive
true_filepath = "/content/drive/MyDrive/fake_news/data/True.csv"
fake_filepath = "/content/drive/MyDrive/fake_news/data/Fake.csv"

# Load each file and discard the metadata columns that are not used below
true_df = pd.read_csv(true_filepath).drop(columns = ["title", "subject", "date"])
fake_df = pd.read_csv(fake_filepath).drop(columns = ["title", "subject", "date"])


In [None]:
# Preview the first five rows of the real-news frame (metadata columns were dropped at load time)
true_df.head()

Unnamed: 0,text
0,WASHINGTON (Reuters) - The head of a conservat...
1,WASHINGTON (Reuters) - Transgender people will...
2,WASHINGTON (Reuters) - The special counsel inv...
3,WASHINGTON (Reuters) - Trump campaign adviser ...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...


In [None]:
# Preview the first five rows of the fake-news frame (metadata columns were dropped at load time)
fake_df.head()

Unnamed: 0,text
0,Donald Trump just couldn t wish all Americans ...
1,House Intelligence Committee Chairman Devin Nu...
2,"On Friday, it was revealed that former Milwauk..."
3,"On Christmas day, Donald Trump announced that ..."
4,Pope Francis used his annual Christmas Day mes...


In [None]:
# Count rows of true_df that exactly repeat an earlier row
true_df.duplicated().sum()

np.int64(225)

In [None]:
# Remove the repeated rows, keeping the first occurrence of each (pandas default); index labels are not reset
true_df = true_df.drop_duplicates()

In [None]:
# Re-count to confirm that no duplicated rows remain in true_df (expected: 0)
true_df.duplicated().sum()

np.int64(0)

In [None]:
# Count rows of fake_df that exactly repeat an earlier row
fake_df.duplicated().sum()

np.int64(6026)

In [None]:
# Remove the repeated rows, keeping the first occurrence of each (pandas default); index labels are not reset
fake_df = fake_df.drop_duplicates()

In [None]:
# Re-count to confirm that no duplicated rows remain in fake_df (expected: 0)
fake_df.duplicated().sum()

np.int64(0)

## Text Cleaning

In [None]:

def clean_text(text):
    """Normalise a real-news article body before tokenisation.

    Steps, in order:
      1. Lowercase the text.
      2. Strip a leading wire dateline such as ``"washington (reuters) - "``.
      3. Delete every character that is not ``a-z`` or whitespace
         (this covers digits and punctuation alike).
      4. Collapse whitespace runs to a single space and trim both ends.

    Args:
        text: Raw article text. Non-string values (e.g. NaN coming from a
            pandas column) are treated as empty.

    Returns:
        The cleaned string: lowercase ASCII letters separated by single
        spaces, or ``""`` when nothing usable remains.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Strip the dateline only when it sits at the very start of the article.
    # Splitting on the first dash anywhere in the text would throw away the
    # opening of any article that merely contains a hyphen.
    text = re.sub(r"^.{0,100}?\(reuters\)\s*-\s*", "", text, count=1, flags=re.DOTALL)

    # Digits are not in a-z, so a separate digit pass is unnecessary.
    text = re.sub(r"[^a-z\s]", "", text)

    # Collapse whitespace last: deleting characters above can leave doubled,
    # leading or trailing spaces behind.
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [None]:
# Run clean_text over every real-news article and store the result next to the raw text
true_df['cleaned_text'] = true_df['text'].apply(clean_text)

In [None]:
# Spot-check the cleaned text against the raw text
true_df.head()

Unnamed: 0,text,cleaned_text
0,WASHINGTON (Reuters) - The head of a conservat...,the head of a conservative republican faction ...
1,WASHINGTON (Reuters) - Transgender people will...,transgender people will be allowed for the fir...
2,WASHINGTON (Reuters) - The special counsel inv...,the special counsel investigation of links bet...
3,WASHINGTON (Reuters) - Trump campaign adviser ...,trump campaign adviser george papadopoulos tol...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,president donald trump called on the us postal...


In [None]:

def clean_text2(text):
    """Normalise a fake-news article body before tokenisation.

    Lowercases the text, deletes every character that is not ``a-z`` or
    whitespace (digits and punctuation alike), then collapses whitespace
    runs to a single space and trims both ends. No dateline stripping is
    performed.

    Args:
        text: Raw article text. Non-string values (e.g. NaN coming from a
            pandas column) are treated as empty.

    Returns:
        The cleaned string: lowercase ASCII letters separated by single
        spaces, or ``""`` when nothing usable remains.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()

    # Digits are not in a-z, so this single pass removes them as well.
    letters_only = re.sub(r"[^a-z\s]", "", lowered)

    # Collapse whitespace last: deleting characters above can leave doubled,
    # leading or trailing spaces behind.
    return re.sub(r"\s+", " ", letters_only).strip()


In [None]:
# Run clean_text2 over every fake-news article and store the result next to the raw text
fake_df['cleaned_text'] = fake_df['text'].apply(clean_text2)

In [None]:
# Spot-check the cleaned text against the raw text
fake_df.head()

Unnamed: 0,text,cleaned_text
0,Donald Trump just couldn t wish all Americans ...,donald trump just couldn t wish all americans ...
1,House Intelligence Committee Chairman Devin Nu...,house intelligence committee chairman devin nu...
2,"On Friday, it was revealed that former Milwauk...",on friday it was revealed that former milwauke...
3,"On Christmas day, Donald Trump announced that ...",on christmas day donald trump announced that h...
4,Pope Francis used his annual Christmas Day mes...,pope francis used his annual christmas day mes...


## Add Target

In [None]:
# Tag every row with its ground-truth label before the two frames are merged
fake_df["class"] = "FAKE"
true_df["class"] = "TRUE"

## Combine Data

In [None]:
# Stack the real and fake articles into one frame; ignore_index=True gives the result a fresh 0..n-1 index
true_fake_df = pd.concat([true_df, fake_df], ignore_index = True)

In [None]:
# Preview the combined frame (true_df is concatenated first, so its rows appear at the top)
true_fake_df.head()

Unnamed: 0,text,cleaned_text,class
0,WASHINGTON (Reuters) - The head of a conservat...,the head of a conservative republican faction ...,True
1,WASHINGTON (Reuters) - Transgender people will...,transgender people will be allowed for the fir...,True
2,WASHINGTON (Reuters) - The special counsel inv...,the special counsel investigation of links bet...,True
3,WASHINGTON (Reuters) - Trump campaign adviser ...,trump campaign adviser george papadopoulos tol...,True
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,president donald trump called on the us postal...,True


## Tokenize the text

In [None]:
# Build the word index from the cleaned articles. num_words caps how many of
# the most frequent words are kept when encoding; everything else is mapped
# to the '<OOV>' token.
# NOTE(review): the tokenizer is fitted on the full corpus, i.e. before the
# train/test split, so test-set text contributes to the vocabulary - confirm
# this is acceptable.
cleaned_corpus = true_fake_df['cleaned_text']
tokenizer = Tokenizer(oov_token = '<OOV>', num_words = 20000)
tokenizer.fit_on_texts(cleaned_corpus)

# convert each article to its sequence of integer word indices
sequences = tokenizer.texts_to_sequences(cleaned_corpus)


## Pad sequences

In [None]:
# Bring every sequence to exactly max_length tokens: longer articles lose
# their tail, shorter ones are padded at the end
max_length = 256
padded_sequences = pad_sequences(
    sequences,
    maxlen = max_length,
    padding = 'post',
    truncating = 'post',
)
print(padded_sequences)

[[    2   426     4 ...   397     3 11269]
 [ 1427    47    39 ...    58  5419   308]
 [    2   510  1290 ...   598   447     7]
 ...
 [ 4867     7     3 ...     0     0     0]
 [  812  1037  1012 ...  4430     6  2075]
 [  812  1037  1012 ...  6050    17     2]]


## Train-test split

In [None]:
# Features are the padded index matrix; targets are the string class labels
X = padded_sequences
y = true_fake_df['class'].values

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable
# NOTE(review): the split is not stratified on y - confirm the class ratio
# in each half is close enough for your purposes
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(30917, 256) (30917,)
(7730, 256) (7730,)


In [None]:
# Class balance of the training labels
pd.Series(y_train).value_counts()

Unnamed: 0,count
TRUE,16948
FAKE,13969


In [None]:
# Peek at the training labels before encoding (still strings at this point)
pd.Series(y_train).head()

Unnamed: 0,0
0,FAKE
1,FAKE
2,FAKE
3,FAKE
4,TRUE


In [None]:
# Learn the string -> integer mapping from the training labels only, then
# apply that same mapping to both splits
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [None]:
# Peek at the training labels after encoding (now integers)
pd.Series(y_train).head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,1


## Save Preprocessing Data & Tokenizer

In [None]:
# Bundle the four arrays into one tuple and serialise it to Drive with joblib
save_filepath = "/content/drive/MyDrive/fake_news/data/preprocessed_data.pkl"
preprocessed_splits = (X_train, X_test, y_train, y_test)
joblib.dump(preprocessed_splits, save_filepath)

['/content/drive/MyDrive/fake_news/data/preprocessed_data.pkl']

In [None]:
# Serialise the fitted tokenizer to Drive with joblib
save_filepath2 = "/content/drive/MyDrive/fake_news/tokenizer.pkl"
joblib.dump(value = tokenizer, filename = save_filepath2)

['/content/drive/MyDrive/fake_news/tokenizer.pkl']