## Data Processing

In [None]:
pip install pandas nltk sklearn

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from nltk.stem.porter import PorterStemmer
from nltk.lm import Vocabulary
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [20]:
# read data from csv files
fake = pd.read_csv("./dataset/fake.csv")
real = pd.read_csv("./dataset/real.csv")

# inserting binary labels for real and fake news
fake['class'] = 0
real['class'] = 1

##### Helper functions

In [21]:
def remove_empty_rows(dataset):
    '''
    Helper function: 
    1. Takes in a dataset, remove any rows with empty content in it
    2. Returns it.
    '''
    empy_texts_ids = dataset[dataset['text'] == ' '].append(dataset[dataset['text'] == '  ']).index
    dataset = dataset.drop(empy_texts_ids).reset_index(drop=True)
    return dataset

In [22]:
# removing rows with empty content
fake = remove_empty_rows(fake)
real = remove_empty_rows(real)

# removing unnecessary columns
fake = fake.drop(columns=['subject','date'])
real = real.drop(columns=['subject','date'])

# combine the two datasets into one, then shuffle and reorder the indexes
dataset = pd.concat([fake, real]).sample(frac=1)
dataset.reset_index(drop=True,inplace=True)
dataset.head()

Unnamed: 0,title,text,class
0,U.S. to vote against U.N. resolution calling f...,UNITED NATIONS (Reuters) - The United States w...,1
1,Trump FURIOUS As Latest Obamacare Numbers Pro...,Donald Trump s approval rating plummeted after...,0
2,Trump Fans Are Asked About ‘P*ssy Grabbing’ R...,Donald Trump s lewd and predatory remarks abou...,0
3,Russia's Putin tells Palestinians' Abbas he su...,MOSCOW (Reuters) - Russian President Vladimir ...,1
4,Detroit school system's manager to step down t...,DETROIT (Reuters) - Detroit Public Schools’ em...,1


### Tokenizing

This process includes several steps to tokenize the data for a more efficient training set.
1. Eliminating null values
2. Applying regex conditions: 
    * deleting punctuations, lower case conversion, removing stopwords, removing numerical values.
3. Lemmatizing
4. Vectorizing data to numerical form
    * vectorizing using TF-IDF

Tokenizing the content in the dataset and apply regex conditions.

In [24]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def tokenize(content):
    '''
    Function to apply stemming to the news content.
    Apply regex conditions.
    Content is one row; i.e text/title of article.
    '''
    # tokenize the sentence
    tokenized_content = re.sub('[^a-zA-Z]',' ',content)
    tokenized_content = tokenized_content.lower()
    tokenized_content = tokenized_content.split()
    # tokenized_content = [ps.stem(word) for word in tokenized_content if word not in stopwords.words('english')]
    # tokenized_content = [word for word in tokenized_content if word not in stopwords.words('english')]
    tokenized_content = [lemmatizer.lemmatize(word) for word in tokenized_content if word not in stopwords.words('english')]
    tokenized_content = ' '.join(tokenized_content)
    return tokenized_content

    # filtering the sentence
    # filtered_content = token_filter(tokenized_content, 5)
    # filtered_content = ' '.join(filtered_content)
    # return filtered_content

# def token_filter(tokenized_content, thresh=5):
#     '''
#     Takes in tokenized_content (one row; i.e text/title of article)
#     Function to filter out any unknown english words.
#     '''
#     words = tokenized_content.split(' ')
#     # print(words)
#     vocab = Vocabulary(words, unk_cutoff=thresh)
#     filtered_content = []

#     for word in words:
#         # check for unknown words
#         if vocab.lookup(word) == '<UNK>':
#             # print(word)
#             filtered_content.append('<unk>')
#             print(filtered_content)
#         else:
#             filtered_content.append(word)
    
#     # print(filtered_content)
#     return filtered_content

##### Before tokenizing:

In [59]:
dataset.head()

Unnamed: 0,title,text,class
0,U.S. to vote against U.N. resolution calling f...,UNITED NATIONS (Reuters) - The United States w...,1
1,Trump FURIOUS As Latest Obamacare Numbers Pro...,Donald Trump s approval rating plummeted after...,0
2,Trump Fans Are Asked About ‘P*ssy Grabbing’ R...,Donald Trump s lewd and predatory remarks abou...,0
3,Russia's Putin tells Palestinians' Abbas he su...,MOSCOW (Reuters) - Russian President Vladimir ...,1
4,Detroit school system's manager to step down t...,DETROIT (Reuters) - Detroit Public Schools’ em...,1


In [62]:
%%time
# apply regex tokenizing conditions directly to dataset
dataset['title'] = dataset['title'].apply(tokenize)
dataset['text'] = dataset['text'].apply(tokenize)

CPU times: user 28min 53s, sys: 5min 45s, total: 34min 39s
Wall time: 35min 10s


##### After tokenizing:

In [63]:
dataset.head()

Unnamed: 0,title,text,class
0,u vote u n resolution calling end cuba embargo,united nation reuters united state vote u n ge...,1
1,trump furious latest obamacare number prove go...,donald trump approval rating plummeted disastr...,0
2,trump fan asked p ssy grabbing remark get real...,donald trump lewd predatory remark woman causi...,0
3,russia putin tell palestinian abbas support ta...,moscow reuters russian president vladimir puti...,1
4,detroit school system manager step month,detroit reuters detroit public school emergenc...,1


* Only taking news title

In [79]:
dataset_title_only = dataset.drop(columns=['text'])
dataset_title_only

Unnamed: 0,title,class
0,u vote u n resolution calling end cuba embargo,1
1,trump furious latest obamacare number prove go...,0
2,trump fan asked p ssy grabbing remark get real...,0
3,russia putin tell palestinian abbas support ta...,1
4,detroit school system manager step month,1
...,...,...
44262,hillary scold major contributor terrorist grou...,0
44263,watch jon stewart come retirement warn america...,0
44264,senator urge trump administration act myanmar ...,1
44265,charlottesville violence test trump presidenti...,1


### Train, test, validate, split

g

In [32]:
# g's custom function
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [80]:
trn, val, tst = train_validate_test_split(dataset_title_only, train_percent=0.7, validate_percent=0.2, seed=1)
# true_trn, true_val, true_tst = train_validate_test_split(real, train_percent=0.7, validate_percent=0.2, seed=1)

In [81]:
print("Training set: " + str(len(trn)))
print("Validation set: " + str(len(val)))
print("Test set: " + str(len(tst)))

Training set: 30986
Validation set: 8853
Test set: 4428


In [82]:
trn

Unnamed: 0,title,class
33764,funny moron crooked hillary frighten every ame...,0
28339,u judge throw texas voter id law supported trump,1
26407,former gop rep think adam schiff recuse russia...,0
42900,thai junta set firm date election many false s...,1
32967,first person killed terrorist speeding truck a...,0
...,...,...
39501,saudi arabia something truly disgusting attemp...,0
26996,jockeying cash north korea allows racetrack ga...,1
7320,egypt declares three day mourning attack north...,1
38938,trump indonesian business partner see conflict...,1


### TO CSV

The exported files will contain the processed dataset that will be used for further exploration.

In [83]:
### Save texts
trn.to_csv('./preprocessed_data/trn.csv', header=False, index=False)
val.to_csv('./preprocessed_data/val.csv', header=False, index=False)
tst.to_csv('./preprocessed_data/tst.csv', header=False, index=False)

In [86]:
trn['title'][0]

'u vote u n resolution calling end cuba embargo'

end of g

In [None]:
# from sklearn.model_selection import train_test_split

# # Let's say we want to split the data in 80:10:10 for train:valid:test dataset
# train_size=0.8

# X = tokenized_title
# y = classes

# # In the first step we will split the data in training and remaining dataset
# X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8)

# # Now since we want the valid and test size to be equal (10% each of overall data). 
# # we have to define valid_size=0.5 (that is 50% of remaining data)
# test_size = 0.5
# X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)
# print(X_train.shape), print(y_train.shape)
# print(X_valid.shape), print(y_valid.shape)
# print(X_test.shape), print(y_test.shape)

In [30]:
# # getting the seperated datasets (train, test, validate) with both x and y
# train_title = pd.merge(X_train, y_train, right_index=True, left_index=True)
# validate_title = pd.merge(X_valid, y_valid, right_index=True, left_index=True)
# test_title = pd.merge(X_test, y_test, right_index=True, left_index=True)

# train_title.reset_index(inplace=True, drop=True)
# validate_title.reset_index(inplace=True, drop=True)
# test_title.reset_index(inplace=True, drop=True)

### TO CSV

The exported files will contain the processed dataset that will be used for further exploration.

In [None]:
# train_title.to_csv('./preprocessed_data/train_title.csv', header=True, index=True)
# validate_title.to_csv('./preprocessed_data/validate_title.csv', header=True, index=True)
# test_title.to_csv('./preprocessed_data/test_title.csv', header=True, index=True)