# Natural Language Processing Final Assignment

## Real - Fake News Classification: A Comparison of Natural Language Models for Classification

In [37]:
# Import Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [14]:
# Load Data
# The path is relative to the directory the notebook is run from.
df_raw = pd.read_csv(filepath_or_buffer="data/news_dataset.csv")

In [15]:
# Preview the first five rows of the raw data.
df_raw.head(5)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [16]:
# Combine the title and text into a single "feature" column so that each
# article is represented by one text field.

df = df_raw.copy()

# Fill missing values *before* concatenating: a string plus NaN is NaN, so
# filling afterwards would throw away the title whenever the text is missing
# (and the text whenever the title is missing).
df["feature"] = df["title"].fillna('') + ' ' + df["text"].fillna('')

In [17]:
# Preview the first five rows, now including the combined "feature" column.
df.head(5)

Unnamed: 0,title,text,subject,date,label,feature
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0,Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0,Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0,Pope Francis Just Called Out Donald Trump Dur...


### Preprocessing Function

In [38]:
# Fetch the NLTK resources needed for tokenising, stop-word removal and
# lemmatising.
nltk.download('punkt')
# Newer NLTK releases load the word_tokenize models from 'punkt_tab' rather
# than 'punkt'; requesting both keeps the notebook runnable on either side of
# that change.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/jakobhren/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jakobhren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jakobhren/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [40]:
def preprocess_text(text: pd.Series) -> pd.Series:
    """Normalise a Series of raw documents into cleaned, lemmatised strings.

    Steps, in order: lowercase; remove URLs, e-mail addresses and @handles;
    remove ASCII punctuation and digits; collapse whitespace; tokenise with
    ``nltk.word_tokenize``; drop English stop words; lemmatise each token
    with ``WordNetLemmatizer`` (default part of speech); re-join the tokens
    with single spaces.

    Parameters
    ----------
    text : pd.Series
        One document per element. Missing values are treated as empty text.

    Returns
    -------
    pd.Series
        The cleaned documents, one string per input element.
    """
    # NaN is not a string and cannot be tokenised, so blank it out up front.
    text = text.fillna('').str.lower()

    # Remove links, e-mail addresses and Twitter handles.
    text = text.str.replace(r'http\S+|www\S+|https\S+', '', regex=True)
    text = text.str.replace(r'\S+@\S+', '', regex=True)
    text = text.str.replace(r'@\w+', '', regex=True)

    # Escape the punctuation before placing it inside a character class:
    # unescaped, the "\]" pair in string.punctuation is read as an escaped "]"
    # and the backslash itself is never removed.
    text = text.str.replace(f'[{re.escape(string.punctuation)}]', '', regex=True)

    # Remove digits.
    text = text.str.replace(r'\d+', '', regex=True)

    # Collapse runs of whitespace and trim the ends.
    text = text.str.replace(r'\s+', ' ', regex=True).str.strip()

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _normalise(document):
        """Tokenise one cleaned document, drop stop words, lemmatise, re-join."""
        tokens = nltk.word_tokenize(document)
        return ' '.join(
            lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words
        )

    # One pass per document instead of separate tokenise/filter/lemmatise/join passes.
    return text.apply(_normalise)


In [41]:
# Replace the combined raw text with its cleaned form. This reads and writes
# the same column, so re-running the cell feeds already-cleaned text back in.
df['feature'] = preprocess_text(text=df['feature'])

### Train - Test Split

In [47]:
# Model input (the cleaned text) and the target column.
X, y = df["feature"], df["label"]

In [48]:
# 70 / 15 / 15 split: hold out 30% first, then halve the hold-out into
# validation and test sets. Both splits are stratified on the label and use
# the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=7
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=7
)

# Drop the original row labels so that every split is indexed 0..n-1.
X_train, X_val, X_test, y_train, y_val, y_test = (
    part.reset_index(drop=True)
    for part in (X_train, X_val, X_test, y_train, y_val, y_test)
)

In [49]:
# Report the size of the train, validation and test splits, one per line.
print(X_train.shape, X_val.shape, X_test.shape, sep="\n")

(27110,)
(5809,)
(5810,)
