In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import os


# Fetch the NLTK corpora this notebook relies on: the stop-word lists
# (read through stopwords.words) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
# Last expression of the cell, so its return value is shown as the cell output.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...


True

In [2]:
# Locations of the two raw CSV exports (real and fake articles), relative to
# the notebook's working directory.
true_path, fake_path = "../data/raw/True.csv", "../data/raw/Fake.csv"

# Read each file into its own frame.
df_true, df_fake = pd.read_csv(true_path), pd.read_csv(fake_path)

# Preview the first rows of both frames, one after the other.
print(df_true.head(), df_fake.head(), sep="\n")

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   
                                               title  \
0   Donald Trump Sends Out Embarrassing Ne

In [3]:
# Tag each source frame with its class before merging them.
df_true['label'] = 0 # real news
df_fake['label'] = 1 # fake news


# Stack both classes into a single frame, then shuffle the rows. The shuffle is
# seeded so a fresh kernel run produces the same row order every time; without
# a seed, the displayed samples and the rows that land in any later seeded
# split would change from run to run.
df = pd.concat([df_true, df_fake], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# Keep only the label and the article wording; 'subject' and 'date' are dropped.
df = df.drop(columns=['subject', 'date'])
# Merge headline and body into one text field. A missing title or text would
# turn the whole concatenation into NaN (str + NaN is NaN), which is not a
# string, so missing values are replaced with "" first.
df['content'] = df['title'].fillna("") + " " + df['text'].fillna("")
df = df.drop(columns=['title', 'text'])
# Last expression of the cell: preview the resulting frame.
df.head()

Unnamed: 0,label,content
0,1,SAY GOOD BYE TO LONDON: Radical Muslim WINS Lo...
1,1,2016 Campaign Coverage WRECKS ‘Liberal Media ...
2,1,BOMBSHELL: FBI REVEALS LYING OBAMA Used A Pseu...
3,0,"Latest gun control bid falters in Congress, De..."
4,0,Trump urges Congress to pass short-term spendi...


In [6]:
# Module-level helpers shared by preprocess(): one lemmatizer instance and a
# set of English stop words (a set, so membership checks are hash lookups).
lemmatizer = WordNetLemmatizer()
stop_words = {word for word in stopwords.words('english')}


def preprocess(text):
    """Normalise one raw document into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase the text, strip ``http...`` URLs, replace every
    non-letter character with a space, split on whitespace, drop tokens found
    in the module-level ``stop_words`` set, and lemmatise the remaining tokens
    with the module-level ``lemmatizer``.

    Args:
        text: The raw document. Any value that is not a ``str`` (for example
            the float NaN pandas uses for missing values, or ``None``) is
            treated as an empty document instead of raising.

    Returns:
        str: The cleaned tokens joined by single spaces; ``""`` when no token
        survives cleaning.
    """
    # Missing values reach this function as float NaN / None when it is applied
    # to a DataFrame column; they have no .lower(), so short-circuit here.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Remove URLs before punctuation stripping, otherwise their fragments
    # ("http", "www", "com", ...) would survive as tokens.
    text = re.sub(r"http\S+", "", text)
    # Digits, punctuation and any other non-letter become token separators.
    text = re.sub(r"[^a-zA-Z]", " ", text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

In [7]:
# Run the text-cleaning function over every document and keep the result
# alongside the original wording.
df['clean_text'] = df['content'].map(preprocess)
# Last expression of the cell: preview the frame with the new column.
df.head()

Unnamed: 0,label,content,clean_text
0,1,SAY GOOD BYE TO LONDON: Radical Muslim WINS Lo...,say good bye london radical muslim win london ...
1,1,2016 Campaign Coverage WRECKS ‘Liberal Media ...,campaign coverage wreck liberal medium bias my...
2,1,BOMBSHELL: FBI REVEALS LYING OBAMA Used A Pseu...,bombshell fbi reveals lying obama used pseudon...
3,0,"Latest gun control bid falters in Congress, De...",latest gun control bid falter congress democra...
4,0,Trump urges Congress to pass short-term spendi...,trump urge congress pas short term spending bi...


In [8]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the seed fixes which rows are
# chosen for a given row order.
train, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Make sure the output folder exists, then write both splits without the index.
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)
train.to_csv(f"{processed_dir}/train.csv", index=False)
test.to_csv(f"{processed_dir}/test.csv", index=False)

print("Saved processed train.csv and test.csv!")

Saved processed train.csv and test.csv!
