In [1]:
import pandas as pd

In [2]:
dataset = '../datasets/ISOT'
real = pd.read_csv(f'{dataset}/True.csv')
fake = pd.read_csv(f'{dataset}/Fake.csv')

#### Real News

In [3]:
real.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [4]:
# to lowercase
real['title'] = real['title'].str.lower()
real['text'] = real['text'].str.lower()
real['subject'] = real['subject'].str.lower()
real['date'] = real['date'].str.lower()

In [5]:
# duplicates
real.duplicated().sum(), real['text'].duplicated().sum(), real['title'].duplicated().sum()

(np.int64(206), np.int64(225), np.int64(592))

In [6]:
# drop rows w/ duplicated text
real.drop_duplicates(subset='text', keep='first', inplace=True)
real.reset_index(drop=True, inplace=True)
len(real)

21192

In [8]:
# check source
from_reuters = real['text'].apply(lambda t: 'reuters' in t)
from_reuters.sum(), len(real) - from_reuters.sum()

(np.int64(21154), np.int64(38))

In [9]:
# add source column
real['source'] = from_reuters.map({True: 'reuters', False: 'other'}) 

In [11]:
real[real['source']!='reuters'].head()

Unnamed: 0,title,text,subject,date,source
103,democratic u.s. senator seeks audit of epa chi...,washington () - the top democrat on the senate...,politicsnews,"december 18, 2017",other
425,factbox: republicans to watch in u.s. senate t...,washington - some key u.s. senators still had ...,politicsnews,"november 30, 2017",other
1134,gao opens door for congress to review leverage...,new york (ifr) - the investigative arm of cong...,politicsnews,"october 19, 2017",other
3472,white house unveils list of ex-lobbyists grant...,the white house on wednesday disclosed a group...,politicsnews,"june 1, 2017",other
4331,factbox: trump supreme court appointee to affe...,"neil gorsuch, president donald trump’s appoint...",politicsnews,"april 7, 2017",other


In [12]:
# save cleaned real dataset
real.to_csv(f'{dataset}/real_clean.csv', index=False)

#### Fake News

In [13]:
fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


In [14]:
# to lowercase
fake['title'] = fake['title'].str.lower()
fake['text'] = fake['text'].str.lower()
fake['subject'] = fake['subject'].str.lower()
fake['date'] = fake['date'].str.lower()

In [15]:
# duplicates
fake.duplicated().sum(), fake['text'].duplicated().sum(), fake['title'].duplicated().sum()

(np.int64(3), np.int64(6026), np.int64(5578))

In [16]:
# drop rows w/ duplicated text
fake.drop_duplicates(subset='text', keep='first', inplace=True)
fake.reset_index(drop=True, inplace=True)
len(fake)

17455

In [None]:
# find sources
# pd.set_option('display.max_colwidth', None)
# fake.sample(5)

In [455]:
sources = [
    '21st century wire', 'reason.com', 'hammond news', 'alternate current radio', '21wire',
    'fox news', 'news360', 'the guardian', 'wfb', 'dispatch', 'the american mirror', 'ijreview',
    'cnsnews', 'gateway pundit', 'daily mail', 'washington examiner', 'express uk', 'ktar news',
    'cnn', 'conservative treehouse', 'daily caller', 'the blaze', 'cbc', 'tmz', 'vulture', 'kmov',
    'the hayride', 'breitbart', 'brietbart', 'gp', 'mr. conservative', 'fox 2', 'chron', 'ap', 'abc news',
    'the olympian', 'the hill', 'deadline', 'tampa bay', 'politico', 'wt', 'zero hedge', 'nyp',
    'hollywood reporter', 'wxyz', 'examiner.com', 'bbc', 'la times', 'getty', 'flickr', 'screengrab',
    'youtube', 'twitter', 'facebook', 'wall street journal', 'nbcdfw', 'nyt', 'fortune',
    'washington free beacon', 'huffington post', 'bizpac review', 'washington times', 'sltrb',
    'the college fix', 'eag news', 'cnbc', 'krtv', 'bpr', 'whitehouse.gov', 'mbr', 'wesh.com',
    'screenshot', 'boston herald', 'wnd', 'wikimedia', 'politically short', 'biz pac', 'kcs', 'espn',
    'washington post', 'national review', 'reuters', 'downtrend', 'yahoo news', 'weasel zippers',
    'dfp', 'npr', 'page six', 'rcp', 'the federalist', 'tpm', 'the detroit news', 'wbrz', 
    'ny daily news', 'myfox8', 'palm beach post', 'mrctv', 'the bureau', 'detroit free press', 
    'moonbattery', 'radar online', 'gatestone institute', 'star tribune', 'business insider', 
    'the lonely conservative', 'mediaite', 'national enquirer', 'public domain', 'ai archives',
    'the lid', 'ws', 'stars and stripes', 
]

In [456]:
len(sources)

112

In [457]:
def get_source(text):
    # source mentioned at end of article?
    for source in sources:
            if source in text[-100:]:
                return source
    return 'other'

fake['source'] = fake['text'].apply(get_source)
fake['source'].value_counts()

source
other           4476
getty           3767
ap              2308
screenshot       706
twitter          680
                ... 
examiner.com       1
wesh.com           1
news360            1
kcs                1
reason.com         1
Name: count, Length: 112, dtype: int64

In [None]:
fake[fake['source']=='other'].sample(3)

In [462]:
# save cleaned fake data
fake.to_csv(f'{dataset}/fake_clean.csv', index=False)