In [1]:
import sys

In [2]:
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install regex



In [3]:
import os
import pandas as pd
import re
import string
from nltk.tokenize import RegexpTokenizer, word_tokenize, TweetTokenizer
from nltk import FreqDist
from nltk.corpus import stopwords

Importing the dataset

In [4]:
cwd = os.getcwd()
dataset_dir = os.path.join(cwd,'Dataset')
df = pd.read_csv(os.path.join(dataset_dir,'train_news_cleaned.csv'))

In [5]:
print("Dataset shape:", df.shape)

Dataset shape: (19865, 11)


In [6]:
df.head()

Unnamed: 0,id,headline,written_by,news,label,headline_len,news_len,caps_in_headline,norm_caps_in_headline,caps_in_news,norm_caps_in_news
0,9653,Ethics Questions Dogged Agriculture Nominee as...,Eric Lipton and Steve Eder,"WASHINGTON — In Sonny Perdue’s telling, Geo...",0,84,7936,11,0.130952,227,0.028604
1,10041,U.S. Must Dig Deep to Stop Argentina’s Lionel ...,David Waldstein,HOUSTON — Venezuela had a plan. It was a ta...,0,72,6112,13,0.180556,256,0.041885
2,19113,Cotton to House: ’Do Not Walk the Plank and Vo...,Pam Key,"Sunday on ABC’s “This Week,” while discussing ...",0,100,425,12,0.12,28,0.065882
3,6868,"Paul LePage, Besieged Maine Governor, Sends Co...",Jess Bidgood,"AUGUSTA, Me. — The beleaguered Republican g...",0,100,6516,15,0.15,196,0.03008
4,7596,A Digital 9/11 If Trump Wins,Finian Cunningham,Finian Cunningham has written extensively on...,1,28,9164,5,0.178571,309,0.033719


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19865 entries, 0 to 19864
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     19865 non-null  int64  
 1   headline               19865 non-null  object 
 2   written_by             18013 non-null  object 
 3   news                   19865 non-null  object 
 4   label                  19865 non-null  int64  
 5   headline_len           19865 non-null  int64  
 6   news_len               19865 non-null  int64  
 7   caps_in_headline       19865 non-null  int64  
 8   norm_caps_in_headline  19865 non-null  float64
 9   caps_in_news           19865 non-null  int64  
 10  norm_caps_in_news      19865 non-null  float64
dtypes: float64(2), int64(6), object(3)
memory usage: 1.7+ MB


Clean Headline and News data.

Helper Functions

In [8]:
def concat_lists_of_strings(df, column):
    """Concatenate a series of lists of strings from a column in a dataframe"""
    return [x for list_ in df[column].values for x in list_]

In [9]:
def find_strings(string_, regex):
    """Find and Return a list of URLs in the input string"""
    list_ = re.findall(regex, string_)
    return [s[0] for s in list_]

In [10]:
def freq_dist_of_col(df, col):
    """Return a Frequency Distribution of a column"""
    corpus_tokens = concat_lists_of_strings(df, col)
    corpus_freq_dist = FreqDist(corpus_tokens)
    print(f'The number of unique tokens in the corpus is {len(corpus_freq_dist)}')
    return corpus_freq_dist

In [11]:
def review_freq_dis(df, col, n):
    """
    Create a Frequency Distribution of a column of a dataframe and display
    the n most common tokens.
    """
    corpus_freq_dist = freq_dist_of_col(df, col)
    display(corpus_freq_dist.most_common(n))

In [12]:
def remove_punctuation(word_list, punctuation_list):
    """Remove punctuation tokens from a list of tokens"""
    return [w for w in word_list if w not in punctuation_list]

In [13]:
def remove_single_characters(word_list, exception_list):
    """Remove all the single characters, except those on the exception list"""
    return [w for w in word_list if (len(w) > 1 or w in exception_list)]

In [14]:
def remove_words(word_list, words_to_remove):
    """Remove all the words in the words_to_remove list from the words_list"""
    return [w for w in word_list if w not in words_to_remove]

Token Frequency Distribution

In [15]:
tknzr = RegexpTokenizer(r'\w+|\$[\d\.]+|\([@\w\d]+\)')
df['news_tokens'] = df['news'].apply(tknzr.tokenize)

In [16]:
corpus_freq_dist = freq_dist_of_col(df, 'news_tokens')

The number of unique tokens in the corpus is 219721


In [17]:
corpus_freq_dist.most_common(150)

[('the', 809536),
 ('to', 415552),
 ('of', 410410),
 ('and', 361382),
 ('a', 344505),
 ('in', 279789),
 ('that', 204884),
 ('s', 160759),
 ('is', 152331),
 ('for', 135665),
 ('on', 122564),
 ('was', 100542),
 ('The', 100296),
 ('with', 97674),
 ('it', 93607),
 ('as', 88298),
 ('said', 79656),
 ('he', 77802),
 ('I', 77382),
 ('by', 72935),
 ('are', 71315),
 ('have', 71134),
 ('be', 69933),
 ('at', 67049),
 ('Mr', 66235),
 ('from', 65745),
 ('not', 65662),
 ('his', 64657),
 ('has', 63214),
 ('an', 58260),
 ('Trump', 55234),
 ('who', 54265),
 ('this', 51743),
 ('they', 51196),
 ('you', 47380),
 ('had', 45127),
 ('or', 43330),
 ('their', 43301),
 ('about', 40663),
 ('will', 39611),
 ('t', 38894),
 ('but', 36872),
 ('we', 36858),
 ('were', 36611),
 ('would', 36589),
 ('been', 36551),
 ('more', 34965),
 ('her', 34452),
 ('people', 33515),
 ('which', 33069),
 ('one', 32695),
 ('all', 30616),
 ('she', 30110),
 ('It', 29956),
 ('can', 28938),
 ('out', 28017),
 ('In', 27831),
 ('what', 27143),
 

Tokens used only once

In [18]:
len([w for w in corpus_freq_dist.most_common() if w[1] == 1])

97002

Tokens used less than 5 times

In [19]:
len([w for w in corpus_freq_dist.most_common() if w[1] <= 5])

159116

At the top of the frequency distribution, the usual stop words are present, along with with words associated with politics or the names of political figures, institutions or countries.

The amount of words that are used only once or 5 or less times is relatively small given the size of the corpus.

Investigate if URLs are present in the news article text

In [20]:
URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

In [21]:
df['news_urls'] = df['news'].apply(lambda x: find_strings(x, URL_REGEX))

In [22]:
urls_in_news = concat_lists_of_strings(df, 'news_urls')

In [23]:
urls_in_news

['https://t.co/XrZsEHOXwb',
 'http://archive.org/web/',
 'www.TheDailySheeple.com',
 'www.TheDailySheeple.com',
 'pic.twitter.com/hEKPQ7R1x3',
 'pic.twitter.com/Ue0qlhqT5w',
 'https://t.co/n82d9jXopX',
 'pic.twitter.com/FVEieSYj5w',
 'pic.twitter.com/w9FjK9KD6A',
 'pic.twitter.com/oiAcUgqkKK',
 'www.SHTFplan.com',
 'http://ar.rt.com/i5hy',
 'http://humansarefree.com/2016/11/rise-of-divine-human.html',
 'pic.twitter.com/aWeHSFh1VP',
 'http://www.politico.com/story/2016/10/donald-trump-campaign-lobbyist-russian-pipeline-229264',
 'http://www.politico.com/story/2016/10/donald-trump-campaign-lobbyist-russian-pipeline-229264',
 'http://maglobal.com/about-us/our-team/john-negroponte',
 'https://www.lewrockwell.com/lrc-blog/trumps-prospects-improve/',
 'http://www.medicalchoice.news/',
 'http://www.naturalnews.com/hospitals.html',
 'http://science.naturalnews.com/hospitals.htm',
 'http://www.naturalnews.com/hospital.html',
 'www.washingtonpost.com/news/to-your-health/w',
 'https://www.lewrock

In [24]:
len(urls_in_news)

2863

In [25]:
url_freq_dist = FreqDist(urls_in_news)

In [26]:
url_freq_dist.most_common(150)

[('http://www.infowarsstore.com/health-and-wellness/infowars-life/brain-force.html?ims=tzrwu&utm_campaign=Infowars+Placement&utm_source=Infowars.com&utm_medium=Widget&utm_content=Brain+Force',
  156),
 ('http://www.infowars.com/wp-content/uploads/2015/10/brainforce-25-200-e1476824046577.jpg',
  78),
 ('http://www.voltairenet.org/article1', 76),
 ('www.TheDailySheeple.com', 41),
 ('www.voltairenet.org/article1', 32),
 ('https://facebook.com/LukeWeAreChange', 31),
 ('https://twitter.com/Lukewearechange', 31),
 ('http://instagram.com/lukewearechange', 31),
 ('http://link-address.com', 12),
 ('www.zerohedge.com', 12),
 ('http://ruptly.tv', 9),
 ('http://corp.kaltura.com/products/video-platform-features', 9),
 ('http://corp.kaltura.com/Products/Features/Video-Management', 9),
 ('http://corp.kaltura.com/Video-Solutions', 9),
 ('http://corp.kaltura.com/Products/Features/Video-Player', 9),
 ('http://www.claritypress.com/LendmanIII.html', 9),
 ('http://wearechange.org/', 9),
 ('https://www.patr

The first two links are mentioned multiple times but when I tried to check them out they were sealed. As more time goes on more of the links will stop working. So it would be better to replace them with a placeholder {link}.

In [27]:
df['clean_news'] = df['news'].apply(lambda x: re.sub(URL_REGEX, '{link}', x))

Investigate if URLs are present in the headline text

In [28]:
df['headline_urls'] = df['headline'].apply(lambda x: find_strings(x, URL_REGEX))

In [29]:
urls_in_headline = concat_lists_of_strings(df, 'headline_urls')

In [30]:
urls_in_headline

['http://journal-neo.org/2016/11/07/israel-is-becoming-pivotal-to-china-s-mid-eastern-calculus/',
 'https://youtu.be/BFpFCy_b2SM']

Replacing the links in the headlines with placeholder aswell.

In [31]:
df['clean_headline'] = df['headline'].apply(lambda x: re.sub(URL_REGEX, '{link}', x))

Investigate Twitter handles in news articles.

In [32]:
TWITTER_HANDLE_REGEX = r'(?<=^|(?<=[^\w]))(@\w{1,15})\b'

In [33]:
df['twitter_handles'] = df['clean_news'].apply(lambda x: re.findall(TWITTER_HANDLE_REGEX, x))

In [34]:
twitter_handles = concat_lists_of_strings(df, 'twitter_handles')

In [35]:
twitter_freq_dist = FreqDist(twitter_handles)

In [36]:
twitter_freq_dist.most_common(50)

[('@realDonaldTrump', 263),
 ('@pamkeyNEN', 238),
 ('@joelpollak', 128),
 ('@warnerthuston', 121),
 ('@IanHanchett', 119),
 ('@dznussbaum', 116),
 ('@jeff_poor', 107),
 ('@jeromeehudson', 105),
 ('@AWRHawkins', 105),
 ('@MagnifiTrent', 102),
 ('@BobPriceBBTX', 93),
 ('@ben_kew', 89),
 ('@MrNashington', 87),
 ('@JxhnBinder', 72),
 ('@HillaryClinton', 65),
 ('@POTUS', 59),
 ('@JeromeEHudson', 59),
 ('@AaronKleinShow', 57),
 ('@MichelleDiana', 54),
 ('@tdwilliamsrome', 54),
 ('@LucasNolan_', 53),
 ('@themightygwinn', 48),
 ('@tciccotta', 47),
 ('@charliespiering', 44),
 ('@Get2Church', 40),
 ('@wikileaks', 40),
 ('@ABFalecbaldwin', 31),
 ('@kurteichenwald', 31),
 ('@WalshFreedom', 31),
 ('@AnnCoulter', 29),
 ('@DanaBrunetti', 29),
 ('@CNN', 28),
 ('@NickGuthe', 28),
 ('@megynkelly', 27),
 ('@jaketapper', 25),
 ('@AdelleNaz', 25),
 ('@TatianaSiegel27', 25),
 ('@NikkiReed_I_Am', 25),
 ('@TomlinsonCJ', 24),
 ('@es_snipes', 23),
 ('@RT_com', 22),
 ('@CNNPolitics', 21),
 ('@markknoller', 21),


In [37]:
len(twitter_handles)

7952

In [38]:
len(twitter_freq_dist)

2915

In [39]:
df['clean_news'] = df['clean_news'].apply(lambda x: re.sub(TWITTER_HANDLE_REGEX, '@twitter-handle', x))

Capitalization

Because words with all caps are an import way that emphasis is made online, we will keep words that are in all caps while making all the letters in other words lower case. Words of length of one will be made lower case though since they are likely A or I which can be made lowercase without losing much emphasis.

In [40]:
def lower_unless_all_caps(string_):
    """
    Make all words in the input string lowercase unless that 
    word is in all caps
    """
    words = string_.split()
    processed_words = [w.lower() if not (w.isupper() and len(w) > 1) else w for w in words]
    return ' '.join(processed_words)

In [41]:
df['clean_news'] = df['clean_news'].apply(lower_unless_all_caps)

In [42]:
df['clean_headline'] = df['clean_headline'].apply(lower_unless_all_caps)

In [43]:
df.head()

Unnamed: 0,id,headline,written_by,news,label,headline_len,news_len,caps_in_headline,norm_caps_in_headline,caps_in_news,norm_caps_in_news,news_tokens,news_urls,clean_news,headline_urls,clean_headline,twitter_handles
0,9653,Ethics Questions Dogged Agriculture Nominee as...,Eric Lipton and Steve Eder,"WASHINGTON — In Sonny Perdue’s telling, Geo...",0,84,7936,11,0.130952,227,0.028604,"[WASHINGTON, In, Sonny, Perdue, s, telling, Ge...",[],"WASHINGTON — in sonny perdue’s telling, georgi...",[],ethics questions dogged agriculture nominee as...,[]
1,10041,U.S. Must Dig Deep to Stop Argentina’s Lionel ...,David Waldstein,HOUSTON — Venezuela had a plan. It was a ta...,0,72,6112,13,0.180556,256,0.041885,"[HOUSTON, Venezuela, had, a, plan, It, was, a,...",[],HOUSTON — venezuela had a plan. it was a tacti...,[],U.S. must dig deep to stop argentina’s lionel ...,[]
2,19113,Cotton to House: ’Do Not Walk the Plank and Vo...,Pam Key,"Sunday on ABC’s “This Week,” while discussing ...",0,100,425,12,0.12,28,0.065882,"[Sunday, on, ABC, s, This, Week, while, discus...",[],"sunday on abc’s “this week,” while discussing ...",[],cotton to house: ’do not walk the plank and vo...,[@pamkeyNEN]
3,6868,"Paul LePage, Besieged Maine Governor, Sends Co...",Jess Bidgood,"AUGUSTA, Me. — The beleaguered Republican g...",0,100,6516,15,0.15,196,0.03008,"[AUGUSTA, Me, The, beleaguered, Republican, go...",[],"AUGUSTA, me. — the beleaguered republican gove...",[],"paul lepage, besieged maine governor, sends co...",[]
4,7596,A Digital 9/11 If Trump Wins,Finian Cunningham,Finian Cunningham has written extensively on...,1,28,9164,5,0.178571,309,0.033719,"[Finian, Cunningham, has, written, extensively...",[],finian cunningham has written extensively on i...,[],a digital 9/11 if trump wins,[]


Number in data

I will replace the numbers with a space because some of the sentences run together and end with a number. Replacing the number with a space will split the sentences.

In [44]:
df['clean_news'] = df['clean_news'].apply(lambda x: re.sub(r'9\/11', 'nine-eleven', x))

In [45]:
df['clean_news'] = df['clean_news'].apply(lambda x: re.sub(r'\d+', ' ', x))

In [46]:
df['clean_headline'] = df['clean_headline'].apply(lambda x: re.sub(r'9\/11', 'nine-eleven', x))

In [47]:
df['clean_headline'] = df['clean_headline'].apply(lambda x: re.sub(r'\d+', ' ', x))

Tokens in the current clean news articles

In [48]:
df['clean_news_tokens'] = df['clean_news'].apply(word_tokenize)

In [49]:
review_freq_dis(df, 'clean_news_tokens', 150)

The number of unique tokens in the corpus is 213246


[('the', 908780),
 (',', 907207),
 ('.', 683188),
 ('to', 419097),
 ('of', 412385),
 ('and', 376927),
 ('a', 356351),
 ('in', 306643),
 ('’', 241461),
 ('that', 215020),
 ('“', 174464),
 ('”', 173562),
 ('s', 155174),
 ('is', 154332),
 ('for', 142357),
 ('on', 128880),
 ('it', 122767),
 ('he', 101402),
 ('with', 101188),
 ('was', 101013),
 ('as', 97553),
 ('said', 79708),
 ('by', 77116),
 ('at', 72481),
 ('are', 72478),
 ('have', 71651),
 ('not', 70405),
 ('be', 70373),
 ('his', 68949),
 ('this', 68041),
 ('from', 67597),
 ('mr.', 65556),
 ('they', 64318),
 ('has', 63496),
 ('i', 62330),
 ('but', 61462),
 ('an', 60262),
 ('you', 56454),
 ('who', 55327),
 ('trump', 54765),
 ('we', 54726),
 (':', 53425),
 ('—', 47147),
 ('had', 45319),
 ('their', 44617),
 ('or', 44172),
 ('about', 41627),
 ('will', 40636),
 ('she', 38822),
 (')', 38057),
 ('one', 37469),
 ('(', 37427),
 ('would', 36925),
 ('were', 36770),
 ('more', 36658),
 ('been', 36598),
 ('her', 36252),
 ('t', 36136),
 ('people', 357

Removing all of the Punctuation tokens except for the exclamation point, because it seems like it may be an indicator of Fake news. Also removing all the single characters except for i.

In [50]:
df['clean_news_tokens'] = df['clean_news_tokens'].apply(lambda x: remove_single_characters(x, ['i', '!']))

In [51]:
review_freq_dis(df, 'clean_news_tokens', 150)

The number of unique tokens in the corpus is 212944


[('the', 908780),
 ('to', 419097),
 ('of', 412385),
 ('and', 376927),
 ('in', 306643),
 ('that', 215020),
 ('is', 154332),
 ('for', 142357),
 ('on', 128880),
 ('it', 122767),
 ('he', 101402),
 ('with', 101188),
 ('was', 101013),
 ('as', 97553),
 ('said', 79708),
 ('by', 77116),
 ('at', 72481),
 ('are', 72478),
 ('have', 71651),
 ('not', 70405),
 ('be', 70373),
 ('his', 68949),
 ('this', 68041),
 ('from', 67597),
 ('mr.', 65556),
 ('they', 64318),
 ('has', 63496),
 ('i', 62330),
 ('but', 61462),
 ('an', 60262),
 ('you', 56454),
 ('who', 55327),
 ('trump', 54765),
 ('we', 54726),
 ('had', 45319),
 ('their', 44617),
 ('or', 44172),
 ('about', 41627),
 ('will', 40636),
 ('she', 38822),
 ('one', 37469),
 ('would', 36925),
 ('were', 36770),
 ('more', 36658),
 ('been', 36598),
 ('her', 36252),
 ('people', 35776),
 ('what', 33749),
 ('which', 33415),
 ('all', 33108),
 ('there', 32436),
 ('if', 32097),
 ('can', 32094),
 ('when', 30450),
 ('new', 29384),
 ('out', 27979),
 ('so', 27045),
 ('its',

Tokens in the current clean headline

In [52]:
df['clean_headline_tokens'] = df['clean_headline'].apply(word_tokenize)

In [53]:
review_freq_dis(df, 'clean_headline_tokens', 150)

The number of unique tokens in the corpus is 25212


[('the', 11576),
 ('’', 9720),
 ('-', 9155),
 ('new', 7084),
 ('york', 6369),
 ('times', 6345),
 (',', 6005),
 (':', 5520),
 ('to', 5347),
 ('in', 3740),
 ('of', 3713),
 ('trump', 3487),
 ('s', 3297),
 ('a', 2878),
 ('breitbart', 2401),
 ('for', 2381),
 ('and', 2368),
 ('on', 2285),
 ('‘', 1994),
 ('is', 1744),
 ('?', 1230),
 ('with', 1191),
 ('hillary', 1165),
 ('clinton', 1110),
 ('.', 1086),
 ('at', 1054),
 ('by', 1024),
 ('donald', 866),
 ('from', 833),
 ('as', 827),
 ('it', 743),
 ('after', 650),
 ('are', 636),
 ('!', 624),
 ('will', 591),
 ('”', 584),
 ('“', 583),
 ('you', 549),
 ('election', 547),
 ('what', 546),
 ('obama', 544),
 ('not', 540),
 ('be', 540),
 ('that', 528),
 ('–', 528),
 ('over', 526),
 ('your', 518),
 ('how', 512),
 ('t', 504),
 ('U.S.', 498),
 ('russia', 489),
 ('news', 483),
 ('about', 470),
 ('this', 420),
 ('says', 409),
 (')', 409),
 ('(', 408),
 ('his', 378),
 ('an', 377),
 ('war', 371),
 ('who', 370),
 ('america', 368),
 ('have', 363),
 ("'s", 360),
 ('w

Remove Punctuation and Single Letter Tokens from Clean Headline

In [54]:
df['clean_headline_tokens'] = df['clean_headline_tokens'].apply(lambda x: remove_single_characters(x, ['i', '!']))

In [55]:
review_freq_dis(df, 'clean_headline_tokens', 150)

The number of unique tokens in the corpus is 25105


[('the', 11576),
 ('new', 7084),
 ('york', 6369),
 ('times', 6345),
 ('to', 5347),
 ('in', 3740),
 ('of', 3713),
 ('trump', 3487),
 ('breitbart', 2401),
 ('for', 2381),
 ('and', 2368),
 ('on', 2285),
 ('is', 1744),
 ('with', 1191),
 ('hillary', 1165),
 ('clinton', 1110),
 ('at', 1054),
 ('by', 1024),
 ('donald', 866),
 ('from', 833),
 ('as', 827),
 ('it', 743),
 ('after', 650),
 ('are', 636),
 ('!', 624),
 ('will', 591),
 ('you', 549),
 ('election', 547),
 ('what', 546),
 ('obama', 544),
 ('not', 540),
 ('be', 540),
 ('that', 528),
 ('over', 526),
 ('your', 518),
 ('how', 512),
 ('U.S.', 498),
 ('russia', 489),
 ('news', 483),
 ('about', 470),
 ('this', 420),
 ('says', 409),
 ('his', 378),
 ('an', 377),
 ('war', 371),
 ('who', 370),
 ('america', 368),
 ('have', 363),
 ("'s", 360),
 ('we', 359),
 ('US', 358),
 ('up', 355),
 ('president', 350),
 ('has', 347),
 ('no', 345),
 ('why', 342),
 ('world', 341),
 ('FBI', 338),
 ('out', 338),
 ('he', 336),
 ('can', 329),
 ('state', 329),
 ('comme

Removing "'s"

While the fake news frequently or always didn't removed the apostrophe from 's, it doesn't look like that was done to the true news. 's will need to be removed so that it doesn't become a false indicator of true news.

In [56]:
df['clean_headline_tokens'] = df['clean_headline_tokens'].apply(lambda x: remove_words(x, ["'s"]))
df['clean_news_tokens'] = df['clean_news_tokens'].apply(lambda x: remove_words(x, ["'s"]))

Remove Date Words

To better generalize the models removing all the date words.

In [57]:
date_words = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 
              'saturday', 'sunday', 'january', 'february', 'march', 'april',
             'may', 'june', 'july', 'august', 'september', 'october',
             'november', 'december']

In [58]:
df['clean_headline_tokens'] = df['clean_headline_tokens'].apply(lambda x: remove_words(x, date_words))
df['clean_news_tokens'] = df['clean_news_tokens'].apply(lambda x: remove_words(x, date_words))

Remove Stop Words

In [59]:
stop_words = stopwords.words('english')

In [60]:
display(stop_words)

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [61]:
most_freq_clean_news = [x[0] for x in list(freq_dist_of_col(df, 'clean_news_tokens').most_common(150))]

The number of unique tokens in the corpus is 212924


In [62]:
most_freq_clean_news

['the',
 'to',
 'of',
 'and',
 'in',
 'that',
 'is',
 'for',
 'on',
 'it',
 'he',
 'with',
 'was',
 'as',
 'said',
 'by',
 'at',
 'are',
 'have',
 'not',
 'be',
 'his',
 'this',
 'from',
 'mr.',
 'they',
 'has',
 'i',
 'but',
 'an',
 'you',
 'who',
 'trump',
 'we',
 'had',
 'their',
 'or',
 'about',
 'will',
 'she',
 'one',
 'would',
 'were',
 'more',
 'been',
 'her',
 'people',
 'what',
 'which',
 'all',
 'there',
 'if',
 'can',
 'when',
 'new',
 'out',
 'so',
 'its',
 'clinton',
 'no',
 'like',
 'after',
 'also',
 'up',
 'president',
 'than',
 'our',
 'some',
 'other',
 'into',
 'them',
 'do',
 'over',
 'just',
 'time',
 'now',
 'him',
 'state',
 'could',
 'many',
 'even',
 'because',
 'how',
 'years',
 'most',
 'states',
 'only',
 'first',
 'your',
 'two',
 'my',
 'government',
 'those',
 'world',
 'american',
 'these',
 'last',
 'united',
 'any',
 'news',
 'against',
 'where',
 'did',
 'hillary',
 'year',
 'obama',
 'before',
 'being',
 'while',
 'then',
 'get',
 'campaign',
 'ms.'

In [63]:
def intersection(lst1, lst2):
    """Return the intersection of two lists"""

    temp = set(lst2) 
    lst3 = [value for value in lst1 if value in temp] 
    return lst3

In [64]:
common_words = intersection(stop_words, most_freq_clean_news)

In [65]:
common_words

['i',
 'me',
 'my',
 'we',
 'our',
 'you',
 'your',
 'he',
 'him',
 'his',
 'she',
 'her',
 'it',
 'its',
 'they',
 'them',
 'their',
 'what',
 'which',
 'who',
 'this',
 'that',
 'these',
 'those',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'do',
 'did',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'into',
 'through',
 'during',
 'before',
 'after',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'over',
 'then',
 'here',
 'there',
 'when',
 'where',
 'how',
 'all',
 'any',
 'more',
 'most',
 'other',
 'some',
 'no',
 'not',
 'only',
 'so',
 'than',
 'very',
 'can',
 'will',
 'just',
 'don',
 'should',
 'now',
 're']

In [66]:
len(common_words)

90

In [67]:
len(stop_words)

179

In [68]:
def difference(lst1, lst2):
    """Return the difference of two lists"""

    temp = set(lst2) 
    lst3 = [value for value in lst1 if value not in temp] 
    return lst3

In [69]:
words_in_nltk_not_news = difference(stop_words, most_freq_clean_news)

In [70]:
words_in_nltk_not_news

['myself',
 'ours',
 'ourselves',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'yours',
 'yourself',
 'yourselves',
 'himself',
 "she's",
 'hers',
 'herself',
 "it's",
 'itself',
 'theirs',
 'themselves',
 'whom',
 "that'll",
 'am',
 'having',
 'does',
 'doing',
 'a',
 'until',
 'between',
 'above',
 'below',
 'off',
 'under',
 'again',
 'further',
 'once',
 'why',
 'both',
 'each',
 'few',
 'such',
 'nor',
 'own',
 'same',
 'too',
 's',
 't',
 "don't",
 "should've",
 'd',
 'll',
 'm',
 'o',
 've',
 'y',
 'ain',
 'aren',
 "aren't",
 'couldn',
 "couldn't",
 'didn',
 "didn't",
 'doesn',
 "doesn't",
 'hadn',
 "hadn't",
 'hasn',
 "hasn't",
 'haven',
 "haven't",
 'isn',
 "isn't",
 'ma',
 'mightn',
 "mightn't",
 'mustn',
 "mustn't",
 'needn',
 "needn't",
 'shan',
 "shan't",
 'shouldn',
 "shouldn't",
 'wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 "won't",
 'wouldn',
 "wouldn't"]

In [71]:
words_in_news_not_nltk = difference(most_freq_clean_news, stop_words)

In [72]:
words_in_news_not_nltk

['said',
 'mr.',
 'trump',
 'one',
 'would',
 'people',
 'new',
 'clinton',
 'like',
 'also',
 'president',
 'time',
 'state',
 'could',
 'many',
 'even',
 'years',
 'states',
 'first',
 'two',
 'government',
 'world',
 'american',
 'last',
 'united',
 'news',
 'hillary',
 'year',
 'obama',
 'get',
 'campaign',
 'ms.',
 'country',
 'going',
 'make',
 'election',
 'way',
 '!',
 'made',
 'house',
 'know',
 'back',
 'much',
 'think',
 'media',
 'white',
 'us',
 'say',
 'political',
 'day',
 'see',
 'war',
 'still',
 'told',
 'since',
 'national',
 'russia',
 'well',
 'public',
 'donald']

Looking at the remaining frequent words from the news text, that are all very concentrated on political news.

Saving data

In [74]:
df.to_csv(os.path.join(dataset_dir,'train_news_preprocessed.csv'),index=False)