In [1]:
import sys

In [2]:
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install regex



In [3]:
import os
import pandas as pd
import re
import string
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize, TweetTokenizer
from nltk import FreqDist
from nltk.corpus import stopwords

Importing the dataset

In [4]:
# Resolve the dataset folder relative to the current working directory;
# dataset_dir is reused at the end of the notebook when saving the result.
cwd = os.getcwd()
dataset_dir = os.path.join(cwd,'Dataset')
# Load train_news_cleaned.csv into the working DataFrame.
df = pd.read_csv(os.path.join(dataset_dir,'train_news_cleaned.csv'))

In [5]:
print("Dataset shape:", df.shape)

Dataset shape: (62197, 9)


In [6]:
df.head()

Unnamed: 0,title,text,label,headline_len,news_len,caps_in_headline,norm_caps_in_headline,caps_in_news,norm_caps_in_news
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,130,5049,46,0.353846,161,0.031888
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,137,216,91,0.664234,5,0.023148
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,105,8010,4,0.038095,246,0.030712
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,95,1916,16,0.168421,123,0.064196
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,78,1530,13,0.166667,97,0.063399


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62197 entries, 0 to 62196
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   title                  62197 non-null  object 
 1   text                   62197 non-null  object 
 2   label                  62197 non-null  int64  
 3   headline_len           62197 non-null  int64  
 4   news_len               62197 non-null  int64  
 5   caps_in_headline       62197 non-null  int64  
 6   norm_caps_in_headline  62197 non-null  float64
 7   caps_in_news           62197 non-null  int64  
 8   norm_caps_in_news      62197 non-null  float64
dtypes: float64(2), int64(5), object(2)
memory usage: 4.3+ MB


In [8]:
# Names of the raw article-body and headline columns, used to index df below.
news = 'text'
headline = 'title'

Clean Headline and News data.

Helper Functions

In [9]:
def concat_lists_of_strings(df, column):
    """Flatten a dataframe column whose cells are lists into one flat list."""
    flattened = []
    for cell in df[column].values:
        # Each cell contributes its items in order; empty cells add nothing.
        flattened.extend(cell)
    return flattened

In [10]:
def find_strings(string_, regex):
    """
    Find and return every match of ``regex`` in ``string_``.

    ``re.findall`` yields tuples when the pattern has two or more capture
    groups and plain strings otherwise. For tuples the first group is
    returned (e.g. the whole URL for a pattern whose outermost group wraps
    the full match); plain strings are returned unchanged instead of being
    truncated to their first character.

    Returns a list of strings, empty when nothing matches.
    """
    matches = re.findall(regex, string_)
    return [m[0] if isinstance(m, tuple) else m for m in matches]

In [11]:
def freq_dist_of_col(df, col):
    """
    Build a FreqDist over every token of a list-valued dataframe column.

    Prints the number of distinct tokens as a side effect and returns the
    distribution.
    """
    corpus_freq_dist = FreqDist(concat_lists_of_strings(df, col))
    print(f'The number of unique tokens in the corpus is {len(corpus_freq_dist)}')
    return corpus_freq_dist

In [12]:
def review_freq_dis(df, col, n):
    """
    Show the n most frequent tokens of a list-valued dataframe column.

    The distribution is built with freq_dist_of_col (which also prints the
    unique-token count) and the top entries are rendered with display.
    """
    top_tokens = freq_dist_of_col(df, col).most_common(n)
    display(top_tokens)

In [13]:
def remove_punctuation(word_list, punctuation_list):
    """Return the tokens of word_list that are not listed in punctuation_list"""
    kept = []
    for token in word_list:
        if token in punctuation_list:
            continue
        kept.append(token)
    return kept

In [14]:
def remove_single_characters(word_list, exception_list):
    """Drop tokens of at most one character unless they are in exception_list"""
    def keep(token):
        # Multi-character tokens always survive; shorter ones need an exemption.
        if len(token) > 1:
            return True
        return token in exception_list

    return list(filter(keep, word_list))

In [15]:
def remove_words(word_list, words_to_remove):
    """Filter word_list, discarding every token found in words_to_remove"""
    return list(filter(lambda token: token not in words_to_remove, word_list))

Token Frequency Distribution

In [16]:
# Tokenizer pattern; the alternatives are tried left to right at each position:
#   \w+            a run of word characters
#   \$[\d\.]+      a dollar sign followed by digits and/or dots
#   \([@\w\d]+\)   a parenthesised run of word characters and/or '@'
tknzr = RegexpTokenizer(r'\w+|\$[\d\.]+|\([@\w\d]+\)')
# Tokenize the raw article text. Punctuation marks do not appear among the
# resulting tokens (see the frequency list printed below).
df['news_tokens'] = df[news].apply(tknzr.tokenize)

In [17]:
corpus_freq_dist = freq_dist_of_col(df, 'news_tokens')

The number of unique tokens in the corpus is 298642


In [18]:
corpus_freq_dist.most_common(150)

[('the', 1749040),
 ('to', 964919),
 ('of', 864335),
 ('and', 769986),
 ('a', 755940),
 ('in', 622785),
 ('that', 446468),
 ('s', 399921),
 ('is', 323625),
 ('on', 312139),
 ('for', 309562),
 ('said', 220238),
 ('The', 218304),
 ('with', 215886),
 ('was', 215567),
 ('it', 201268),
 ('Trump', 194347),
 ('he', 193258),
 ('as', 190320),
 ('by', 166996),
 ('his', 162349),
 ('have', 157764),
 ('be', 156843),
 ('has', 156573),
 ('not', 148535),
 ('are', 147062),
 ('from', 146347),
 ('I', 143822),
 ('at', 140324),
 ('an', 126035),
 ('who', 124911),
 ('this', 114086),
 ('they', 109317),
 ('would', 94761),
 ('t', 94324),
 ('about', 92479),
 ('had', 92454),
 ('will', 91631),
 ('their', 90125),
 ('you', 86010),
 ('or', 84985),
 ('but', 81789),
 ('been', 80584),
 ('were', 77728),
 ('we', 77545),
 ('more', 75436),
 ('people', 74962),
 ('which', 72055),
 ('Mr', 70739),
 ('her', 70547),
 ('one', 67166),
 ('U', 66070),
 ('S', 64201),
 ('all', 63893),
 ('out', 63656),
 ('Clinton', 61876),
 ('she', 6165

Tokens used only once

In [19]:
# Count the distinct tokens that occur exactly once in the corpus.
sum(1 for count in corpus_freq_dist.values() if count == 1)

140253

Tokens used 5 times or fewer

In [20]:
# Count the distinct tokens that occur five times or fewer in the corpus.
sum(1 for count in corpus_freq_dist.values() if count <= 5)

219253

At the top of the frequency distribution, the usual stop words are present, along with words associated with politics or the names of political figures, institutions or countries.

The number of words that are used only once, or five times or fewer, is relatively small given the size of the corpus.

Investigate if URLs are present in the news article text

In [21]:
# Case-insensitive URL pattern. The outermost capture group wraps the whole
# URL, so it is the first element of each tuple returned by re.findall.
URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

In [22]:
df['news_urls'] = df[news].apply(lambda x: find_strings(x, URL_REGEX))

In [23]:
urls_in_news = concat_lists_of_strings(df, 'news_urls')

In [24]:
urls_in_news

['https://t.co/oXiLXcBRly',
 'https://t.co/H6BwjHzokH',
 'https://t.co/CzpGfL6u4M',
 'https://t.co/DaD4XaNvI5',
 'https://t.co/SboVZmOuu2',
 'https://t.co/ScXDGFcbGp',
 'https://t.co/d4orBrHJMw',
 'https://t.co/5JsyVAKQRL',
 'https://t.co/NQluVs1KvA',
 'https://t.co/VKNvDUXLtT',
 'https://t.co/Adlj9AvNPR',
 'https://t.co/XDTeATVGSe',
 'https://t.co/bNHSlf1uOx',
 'https://t.co/ToREd7VwDM',
 'https://t.co/L09bBy8gHh',
 'https://t.co/fsRl25AD12',
 'pic.twitter.com/GMco1PkJiL',
 'pic.twitter.com/LD65yMUMVn',
 'pic.twitter.com/Hm1QkJi5Tp',
 'pic.twitter.com/MjxKrdL7zS',
 'tmsnrt.rs/2A1LfXV',
 'https://t.co/2CcD02mXzl',
 'https://t.co/UCIqhqGnXu',
 'https://www.youtube.com/watch?v=KegE285GQucThe',
 'tmsnrt.rs/2nm68H0',
 'pic.twitter.com/XXWSoXJqLh',
 'https://t.co/ts3f4mtX13',
 'https://t.co/QYNtcVGlKJ',
 'https://t.co/kmModmdsJ8',
 'https://t.co/c7t8TpF39o',
 'pic.twitter.com/37j1z6CnjD',
 'pic.twitter.com/Qdg276pSmL',
 'pic.twitter.com/YpdLq7yaTI',
 'pic.twitter.com/4JfoYRbz4u',
 'https://

In [25]:
len(urls_in_news)

12721

In [26]:
url_freq_dist = FreqDist(urls_in_news)

In [27]:
url_freq_dist.most_common(150)

[('http://www.infowarsstore.com/health-and-wellness/infowars-life/brain-force.html?ims=tzrwu&utm_campaign=Infowars+Placement&utm_source=Infowars.com&utm_medium=Widget&utm_content=Brain+Force',
  180),
 ('bit.ly/2jBh4LU', 98),
 ('bit.ly/2jpEXYR', 98),
 ('http://www.infowars.com/wp-content/uploads/2015/10/brainforce-25-200-e1476824046577.jpg',
  90),
 ('http://www.voltairenet.org/article1', 76),
 ('connect.facebook.net/en_US/sdk.js#xfbml=1&#038;version=v2.3', 73),
 ('www.TheDailySheeple.com', 46),
 ('www.voltairenet.org/article1', 32),
 ('https://facebook.com/LukeWeAreChange', 31),
 ('https://twitter.com/Lukewearechange', 31),
 ('http://instagram.com/lukewearechange', 31),
 ('connect.facebook.net/en_GB/sdk.js#xfbml=1&#038;version=v2.3', 19),
 ('connect.facebook.net/en_US/sdk.js#xfbml=1&version=v2.3', 19),
 ('https://t.co/6OZtrfIwim', 16),
 ('http://link-address.com', 12),
 ('www.zerohedge.com', 12),
 ('www.1100kfnx.com.LISTEN', 10),
 ('http://www.claritypress.com/LendmanIII.html', 10),
 

The first two links are mentioned many times, but when I tried to open them they were no longer accessible. As time goes on, more of the links will stop working, so it is better to replace them with a placeholder, {link}.

In [28]:
df['clean_news'] = df[news].apply(lambda x: re.sub(URL_REGEX, '{link}', x))

Investigate if URLs are present in the headline text

In [29]:
df['headline_urls'] = df[headline].apply(lambda x: find_strings(x, URL_REGEX))

In [30]:
urls_in_headline = concat_lists_of_strings(df, 'headline_urls')

In [31]:
urls_in_headline

['https://fedup.wpengine.com/wp-content/uploads/2015/04/entitled.jpg',
 'https://fedup.wpengine.com/wp-content/uploads/2015/04/hillarystreetart.jpg',
 'https://youtu.be/BFpFCy_b2SM',
 'http://journal-neo.org/2016/11/07/israel-is-becoming-pivotal-to-china-s-mid-eastern-calculus/',
 'https://100percentfedup.com/served-roy-moore-vietnamletter-veteran-sets-record-straight-honorable-decent-respectable-patriotic-commander-soldier/',
 'https://100percentfedup.com/12-yr-old-black-conservative-whose-video-to-obama-went-viral-do-you-really-love-america-receives-death-threats-from-left/',
 'https://100percentfedup.com/video-hillary-asked-about-trump-i-just-want-to-eat-some-pie/']

Replacing the links in the headlines with a placeholder as well.

In [32]:
df['clean_headline'] = df[headline].apply(lambda x: re.sub(URL_REGEX, '{link}', x))

Investigate Twitter handles in news articles.

In [33]:
# Matches '@' plus 1-15 word characters ending on a word boundary, but only
# when the '@' is at the start of the string or follows a non-word character.
# The handle itself is the single capture group.
TWITTER_HANDLE_REGEX = r'(?<=^|(?<=[^\w]))(@\w{1,15})\b'

In [34]:
df['twitter_handles'] = df['clean_news'].apply(lambda x: re.findall(TWITTER_HANDLE_REGEX, x))

In [35]:
twitter_handles = concat_lists_of_strings(df, 'twitter_handles')

In [36]:
twitter_freq_dist = FreqDist(twitter_handles)

In [37]:
twitter_freq_dist.most_common(50)

[('@realDonaldTrump', 3270),
 ('@POTUS', 411),
 ('@21WIRE', 295),
 ('@HillaryClinton', 270),
 ('@FoxNews', 254),
 ('@pamkeyNEN', 238),
 ('@CNN', 200),
 ('@seanhannity', 194),
 ('@nytimes', 143),
 ('@joelpollak', 130),
 ('@warnerthuston', 121),
 ('@IanHanchett', 119),
 ('@dznussbaum', 116),
 ('@foxandfriends', 107),
 ('@jeff_poor', 107),
 ('@NBCNews', 106),
 ('@AWRHawkins', 105),
 ('@jeromeehudson', 105),
 ('@MagnifiTrent', 102),
 ('@AnnCoulter', 100),
 ('@elizabethforma', 99),
 ('@ABC', 96),
 ('@BobPriceBBTX', 94),
 ('@WalshFreedom', 91),
 ('@ben_kew', 89),
 ('@MrNashington', 87),
 ('@JordanUhl', 87),
 ('@tonyposnanski', 86),
 ('@PressSec', 83),
 ('@FLOTUS', 77),
 ('@wikileaks', 75),
 ('@JxhnBinder', 74),
 ('@bessbell', 69),
 ('@BernieSanders', 64),
 ('@KellyannePolls', 63),
 ('@kurteichenwald', 63),
 ('@BraddJaffy', 62),
 ('@IvankaTrump', 61),
 ('@CNNPolitics', 61),
 ('@SpeakerRyan', 60),
 ('@megynkelly', 60),
 ('@ABFalecbaldwin', 60),
 ('@JeromeEHudson', 59),
 ('@CBSNews', 57),
 ('@P

In [38]:
len(twitter_handles)

30552

In [39]:
len(twitter_freq_dist)

10408

In [40]:
# Mask every Twitter handle with a fixed placeholder.
# NOTE(review): not idempotent - the placeholder itself begins with '@twitter',
# which the pattern matches again, so re-running this cell on the same column
# would turn '@twitter-handle' into '@twitter-handle-handle'.
df['clean_news'] = df['clean_news'].apply(lambda x: re.sub(TWITTER_HANDLE_REGEX, '@twitter-handle', x))

Capitalization

Because words in all caps are an important way of adding emphasis online, we will keep words that are in all caps while making all the letters in other words lowercase. Words of length one will be made lowercase, though, since they are likely A or I, which can be lowercased without losing much emphasis.

In [41]:
def lower_unless_all_caps(string_):
    """
    Lowercase each whitespace-separated word of the input, leaving words of
    two or more characters untouched when they are entirely upper case.
    The words are re-joined with single spaces.
    """
    def normalise(word):
        # Upper case is only preserved for words longer than one character.
        if len(word) > 1 and word.isupper():
            return word
        return word.lower()

    return ' '.join(map(normalise, string_.split()))

In [42]:
df['clean_news'] = df['clean_news'].apply(lower_unless_all_caps)

In [43]:
df['clean_headline'] = df['clean_headline'].apply(lower_unless_all_caps)

In [44]:
df.head()

Unnamed: 0,title,text,label,headline_len,news_len,caps_in_headline,norm_caps_in_headline,caps_in_news,norm_caps_in_news,news_tokens,news_urls,clean_news,headline_urls,clean_headline,twitter_handles
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,130,5049,46,0.353846,161,0.031888,"[No, comment, is, expected, from, Barack, Obam...",[],no comment is expected from barack obama membe...,[],LAW ENFORCEMENT ON HIGH ALERT following threat...,[@LOLatWhiteFear]
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,137,216,91,0.664234,5,0.023148,"[Now, most, of, the, demonstrators, gathered, ...",[],"now, most of the demonstrators gathered last n...",[],UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,[]
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,105,8010,4,0.038095,246,0.030712,"[A, dozen, politically, active, pastors, came,...",[],a dozen politically active pastors came here f...,[],"bobby jindal, raised hindu, uses story of chri...",[]
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,95,1916,16,0.168421,123,0.064196,"[The, RS, 28, Sarmat, missile, dubbed, Satan, ...",[],"the RS-28 sarmat missile, dubbed satan 2, will...",[],SATAN 2: russia unvelis an image of its terrif...,[]
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,78,1530,13,0.166667,97,0.063399,"[All, we, can, say, on, this, one, is, it, s, ...",[],all we can say on this one is it s about time ...,[],about time! christian group sues amazon and SP...,[]


Number in data

I will replace the numbers with a space because some of the sentences run together and end with a number. Replacing the number with a space will split the sentences.

In [45]:
# Spell out "9/11" first so that it survives the digit stripping applied in the next cell.
df['clean_news'] = df['clean_news'].apply(lambda x: re.sub(r'9\/11', 'nine-eleven', x))

In [46]:
df['clean_news'] = df['clean_news'].apply(lambda x: re.sub(r'\d+', ' ', x))

In [47]:
df['clean_headline'] = df['clean_headline'].apply(lambda x: re.sub(r'9\/11', 'nine-eleven', x))

In [48]:
df['clean_headline'] = df['clean_headline'].apply(lambda x: re.sub(r'\d+', ' ', x))

In [49]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kanai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Tokens in the current clean news articles

In [50]:
df['clean_news_tokens'] = df['clean_news'].apply(word_tokenize)

In [51]:
review_freq_dis(df, 'clean_news_tokens', 150)

The number of unique tokens in the corpus is 355196


[('the', 1955877),
 (',', 1860077),
 ('.', 1368159),
 ('to', 969992),
 ('of', 867265),
 ('and', 798399),
 ('a', 783138),
 ('in', 670604),
 ('that', 467505),
 ('s', 377218),
 ('’', 347273),
 ('is', 327245),
 ('on', 323676),
 ('for', 320482),
 ('it', 258259),
 ('“', 249921),
 ('”', 248432),
 ('he', 244797),
 ('with', 222023),
 ('said', 218556),
 ('was', 216659),
 ('as', 206678),
 ('trump', 189910),
 ('by', 172675),
 ('his', 170830),
 ('have', 158607),
 ('not', 158372),
 ('has', 157155),
 ('be', 156879),
 ('at', 150158),
 ('from', 149333),
 ('are', 149075),
 ('this', 145597),
 ('they', 134693),
 ('an', 130245),
 ('but', 127712),
 ('who', 126948),
 ('i', 120379),
 ('we', 117449),
 (')', 113267),
 (':', 112691),
 ('(', 112290),
 ('you', 102938),
 ('would', 95406),
 ('about', 94005),
 ('will', 93313),
 ('had', 92790),
 ('their', 92438),
 ('or', 86236),
 ('t', 82593),
 ('been', 80598),
 ('people', 78789),
 ('more', 78648),
 ('she', 78287),
 ('were', 78072),
 ('president', 76154),
 ('one', 749

Removing all of the Punctuation tokens except for the exclamation point, because it seems like it may be an indicator of Fake news. Also removing all the single characters except for i.

In [52]:
# Drop one-character tokens (single letters and single punctuation marks),
# keeping only 'i' and '!'.
# NOTE(review): multi-character punctuation tokens are not affected by this
# filter - '``' and "''" still show up in the frequent-token list further down.
df['clean_news_tokens'] = df['clean_news_tokens'].apply(lambda x: remove_single_characters(x, ['i', '!']))

In [53]:
review_freq_dis(df, 'clean_news_tokens', 150)

The number of unique tokens in the corpus is 354881


[('the', 1955877),
 ('to', 969992),
 ('of', 867265),
 ('and', 798399),
 ('in', 670604),
 ('that', 467505),
 ('is', 327245),
 ('on', 323676),
 ('for', 320482),
 ('it', 258259),
 ('he', 244797),
 ('with', 222023),
 ('said', 218556),
 ('was', 216659),
 ('as', 206678),
 ('trump', 189910),
 ('by', 172675),
 ('his', 170830),
 ('have', 158607),
 ('not', 158372),
 ('has', 157155),
 ('be', 156879),
 ('at', 150158),
 ('from', 149333),
 ('are', 149075),
 ('this', 145597),
 ('they', 134693),
 ('an', 130245),
 ('but', 127712),
 ('who', 126948),
 ('i', 120379),
 ('we', 117449),
 ('you', 102938),
 ('would', 95406),
 ('about', 94005),
 ('will', 93313),
 ('had', 92790),
 ('their', 92438),
 ('or', 86236),
 ('been', 80598),
 ('people', 78789),
 ('more', 78648),
 ('she', 78287),
 ('were', 78072),
 ('president', 76154),
 ('one', 74990),
 ('her', 73450),
 ('which', 72788),
 ('if', 69407),
 ('mr.', 69342),
 ('what', 68493),
 ('all', 67775),
 ('there', 65488),
 ('can', 63572),
 ('when', 63521),
 ('out', 62572

Tokens in the current clean headline

In [54]:
df['clean_headline_tokens'] = df['clean_headline'].apply(word_tokenize)

In [55]:
review_freq_dis(df, 'clean_headline_tokens', 150)

The number of unique tokens in the corpus is 43720


[('’', 20614),
 ('to', 20369),
 ('the', 17371),
 (':', 17333),
 ('trump', 16159),
 (',', 15668),
 ('in', 11976),
 ('of', 10359),
 ('-', 9604),
 ('for', 8883),
 ('new', 8612),
 ('on', 8319),
 ('s', 8277),
 ('a', 6589),
 ('york', 6563),
 ('times', 6488),
 ('and', 5691),
 ('‘', 4993),
 ("'s", 4913),
 ('U.S.', 4719),
 (')', 4629),
 ('(', 4628),
 ('with', 4496),
 ('is', 4425),
 ('VIDEO', 4205),
 ('says', 3877),
 ('”', 3748),
 ('“', 3513),
 ("'", 3144),
 ('[', 3138),
 (']', 3138),
 ('at', 2976),
 ('after', 2877),
 ('!', 2819),
 ('clinton', 2774),
 ('as', 2757),
 ('?', 2720),
 ('obama', 2651),
 ('by', 2609),
 ('from', 2520),
 ('hillary', 2486),
 ('breitbart', 2437),
 ('over', 2345),
 ('it', 2246),
 ('house', 2205),
 ('video', 2187),
 ('about', 2045),
 ('will', 2028),
 ('his', 1998),
 ('.', 1982),
 ('he', 1976),
 ('not', 1957),
 ('donald', 1818),
 ('be', 1784),
 ('white', 1752),
 ('t', 1715),
 ('russia', 1678),
 ('president', 1545),
 ('that', 1516),
 ('are', 1514),
 ('just', 1463),
 ('election

Remove Punctuation and Single Letter Tokens from Clean Headline

In [56]:
df['clean_headline_tokens'] = df['clean_headline_tokens'].apply(lambda x: remove_single_characters(x, ['i', '!']))

In [57]:
review_freq_dis(df, 'clean_headline_tokens', 150)

The number of unique tokens in the corpus is 43606


[('to', 20369),
 ('the', 17371),
 ('trump', 16159),
 ('in', 11976),
 ('of', 10359),
 ('for', 8883),
 ('new', 8612),
 ('on', 8319),
 ('york', 6563),
 ('times', 6488),
 ('and', 5691),
 ("'s", 4913),
 ('U.S.', 4719),
 ('with', 4496),
 ('is', 4425),
 ('VIDEO', 4205),
 ('says', 3877),
 ('at', 2976),
 ('after', 2877),
 ('!', 2819),
 ('clinton', 2774),
 ('as', 2757),
 ('obama', 2651),
 ('by', 2609),
 ('from', 2520),
 ('hillary', 2486),
 ('breitbart', 2437),
 ('over', 2345),
 ('it', 2246),
 ('house', 2205),
 ('video', 2187),
 ('about', 2045),
 ('will', 2028),
 ('his', 1998),
 ('he', 1976),
 ('not', 1957),
 ('donald', 1818),
 ('be', 1784),
 ('white', 1752),
 ('russia', 1678),
 ('president', 1545),
 ('that', 1516),
 ('are', 1514),
 ('just', 1463),
 ('election', 1406),
 ('this', 1383),
 ('you', 1375),
 ('WATCH', 1340),
 ('bill', 1333),
 ('state', 1289),
 ('out', 1287),
 ('who', 1217),
 ('up', 1202),
 ('what', 1196),
 ('has', 1191),
 ('how', 1182),
 ('republican', 1147),
 ('north', 1135),
 ('again

Removing "'s"

While the fake news frequently or always had the apostrophe removed from 's, it doesn't look like that was done to the true news. 's will need to be removed so that it doesn't become a false indicator of true news.

In [58]:
df['clean_headline_tokens'] = df['clean_headline_tokens'].apply(lambda x: remove_words(x, ["'s"]))
df['clean_news_tokens'] = df['clean_news_tokens'].apply(lambda x: remove_words(x, ["'s"]))

Remove Date Words

To help the models generalize better, all the date words are removed.

In [59]:
# Lower-case weekday and month names to strip from the token lists.
# NOTE(review): remove_words does an exact, case-sensitive membership test, so
# all-caps forms kept by lower_unless_all_caps (e.g. 'MAY') are not removed,
# while 'may' and 'march' are also dropped when they are used as ordinary words.
date_words = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 
              'saturday', 'sunday', 'january', 'february', 'march', 'april',
             'may', 'june', 'july', 'august', 'september', 'october',
             'november', 'december']

In [60]:
df['clean_headline_tokens'] = df['clean_headline_tokens'].apply(lambda x: remove_words(x, date_words))
df['clean_news_tokens'] = df['clean_news_tokens'].apply(lambda x: remove_words(x, date_words))

In [61]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kanai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Remove Stop Words

In [62]:
stop_words = stopwords.words('english')

In [63]:
display(stop_words)

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [64]:
# Keep only the token text of the 150 most frequent clean-news tokens.
most_freq_clean_news = [token for token, _count in freq_dist_of_col(df, 'clean_news_tokens').most_common(150)]

The number of unique tokens in the corpus is 354861


In [65]:
most_freq_clean_news

['the',
 'to',
 'of',
 'and',
 'in',
 'that',
 'is',
 'on',
 'for',
 'it',
 'he',
 'with',
 'said',
 'was',
 'as',
 'trump',
 'by',
 'his',
 'have',
 'not',
 'has',
 'be',
 'at',
 'from',
 'are',
 'this',
 'they',
 'an',
 'but',
 'who',
 'i',
 'we',
 'you',
 'would',
 'about',
 'will',
 'had',
 'their',
 'or',
 'been',
 'people',
 'more',
 'she',
 'were',
 'president',
 'one',
 'her',
 'which',
 'if',
 'mr.',
 'what',
 'all',
 'there',
 'can',
 'when',
 'out',
 'after',
 'new',
 'clinton',
 'its',
 'also',
 'up',
 'no',
 'state',
 'so',
 'U.S.',
 'than',
 'over',
 'our',
 'some',
 'other',
 'like',
 'just',
 'do',
 'him',
 'into',
 'could',
 'states',
 'them',
 'government',
 'obama',
 'because',
 'time',
 'now',
 'house',
 'donald',
 'united',
 'two',
 'even',
 'campaign',
 'how',
 'many',
 'only',
 'against',
 'first',
 'last',
 'republican',
 'any',
 'most',
 'years',
 'told',
 'white',
 'news',
 'election',
 'year',
 'did',
 'those',
 'party',
 'while',
 'before',
 'country',
 'bei

In [66]:
def intersection(lst1, lst2):
    """
    Return the items of lst1 that also occur in lst2.

    The order and duplicates of lst1 are preserved.
    """
    members = set(lst2)
    shared = []
    for item in lst1:
        if item in members:
            shared.append(item)
    return shared

In [67]:
common_words = intersection(stop_words, most_freq_clean_news)

In [68]:
common_words

['i',
 'my',
 'we',
 'our',
 'you',
 'your',
 'he',
 'him',
 'his',
 'she',
 'her',
 'it',
 'its',
 'they',
 'them',
 'their',
 'what',
 'which',
 'who',
 'this',
 'that',
 'these',
 'those',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'do',
 'did',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'into',
 'during',
 'before',
 'after',
 'to',
 'from',
 'up',
 'in',
 'out',
 'on',
 'over',
 'then',
 'here',
 'there',
 'when',
 'where',
 'how',
 'all',
 'any',
 'more',
 'most',
 'other',
 'some',
 'no',
 'not',
 'only',
 'so',
 'than',
 'very',
 'can',
 'will',
 'just',
 'don',
 'should',
 'now',
 're']

In [69]:
len(common_words)

87

In [70]:
len(stop_words)

179

In [71]:
def difference(lst1, lst2):
    """
    Return the items of lst1 that do not occur in lst2.

    The order and duplicates of lst1 are preserved.
    """
    excluded = set(lst2)
    return list(filter(lambda item: item not in excluded, lst1))

In [72]:
words_in_nltk_not_news = difference(stop_words, most_freq_clean_news)

In [73]:
words_in_nltk_not_news

['me',
 'myself',
 'ours',
 'ourselves',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'yours',
 'yourself',
 'yourselves',
 'himself',
 "she's",
 'hers',
 'herself',
 "it's",
 'itself',
 'theirs',
 'themselves',
 'whom',
 "that'll",
 'am',
 'having',
 'does',
 'doing',
 'a',
 'until',
 'between',
 'through',
 'above',
 'below',
 'down',
 'off',
 'under',
 'again',
 'further',
 'once',
 'why',
 'both',
 'each',
 'few',
 'such',
 'nor',
 'own',
 'same',
 'too',
 's',
 't',
 "don't",
 "should've",
 'd',
 'll',
 'm',
 'o',
 've',
 'y',
 'ain',
 'aren',
 "aren't",
 'couldn',
 "couldn't",
 'didn',
 "didn't",
 'doesn',
 "doesn't",
 'hadn',
 "hadn't",
 'hasn',
 "hasn't",
 'haven',
 "haven't",
 'isn',
 "isn't",
 'ma',
 'mightn',
 "mightn't",
 'mustn',
 "mustn't",
 'needn',
 "needn't",
 'shan',
 "shan't",
 'shouldn',
 "shouldn't",
 'wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 "won't",
 'wouldn',
 "wouldn't"]

In [74]:
words_in_news_not_nltk = difference(most_freq_clean_news, stop_words)

In [75]:
words_in_news_not_nltk

['said',
 'trump',
 'would',
 'people',
 'president',
 'one',
 'mr.',
 'new',
 'clinton',
 'also',
 'state',
 'U.S.',
 'like',
 'could',
 'states',
 'government',
 'obama',
 'time',
 'house',
 'donald',
 'united',
 'two',
 'even',
 'campaign',
 'many',
 'first',
 'last',
 'republican',
 'years',
 'told',
 'white',
 'news',
 'election',
 'year',
 'party',
 'country',
 'twitter-handle',
 'hillary',
 'american',
 'get',
 'reuters',
 '``',
 'going',
 'make',
 'former',
 'world',
 '!',
 'say',
 'political',
 "''",
 'made',
 'percent',
 'national',
 'back',
 'since',
 'media',
 'way',
 'think',
 'know',
 'law',
 'police',
 'security',
 'much']

Looking at the remaining frequent words from the news text, they are all heavily concentrated on political news.

Saving data

In [76]:
# Write the frame, including the derived token/URL columns, to the same
# Dataset folder the input was read from.
# NOTE(review): several of those columns hold Python lists; presumably they are
# stored in their string form in the CSV and need parsing on reload - confirm.
df.to_csv(os.path.join(dataset_dir,'train_news_preprocessed.csv'),index=False)