In [1]:
import os
import pandas as pd
import re
import string
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize, TweetTokenizer
from nltk import FreqDist
from nltk.corpus import stopwords

Importing the dataset

In [2]:
cwd = os.getcwd()
dataset_dir = os.path.join(cwd,'Dataset')
df = pd.read_csv(os.path.join(dataset_dir,'train_news_cleaned.csv'))

In [3]:
print("Dataset shape:", df.shape)

Dataset shape: (101, 10)


In [4]:
df.head()

Unnamed: 0,id,news,headline,label,headline_len,news_len,caps_in_headline,norm_caps_in_headline,caps_in_news,norm_caps_in_news
0,107_Real.txt,See Liberal Facebook and Conservative Facebook...,Blue Feed Red Feed\n,0,19,1151,4,0.210526,32,0.027802
1,125_Real.txt,Contrary to the conventional wisdom saying tha...,"""It's Official """"Bernie Sanders Is Staying In ...",0,79,7740,13,0.164557,307,0.039664
2,152_Real.txt,An anonymous Jane Doe filed a federal lawsuit ...,Why The New Child Rape Case Filed Against Dona...,0,77,13675,14,0.181818,376,0.027495
3,153_Real.txt,"It came together in about a week. First, the i...",Pantsuit Power flashmob video for Hillary Clin...,0,86,5102,5,0.05814,186,0.036456
4,115_Real.txt,Donald Trumps new campaign manager once insist...,Donald Trump's campaign manager says rape woul...,0,81,2068,2,0.024691,94,0.045455


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     101 non-null    object 
 1   news                   101 non-null    object 
 2   headline               101 non-null    object 
 3   label                  101 non-null    int64  
 4   headline_len           101 non-null    int64  
 5   news_len               101 non-null    int64  
 6   caps_in_headline       101 non-null    int64  
 7   norm_caps_in_headline  101 non-null    float64
 8   caps_in_news           101 non-null    int64  
 9   norm_caps_in_news      101 non-null    float64
dtypes: float64(2), int64(5), object(3)
memory usage: 8.0+ KB


Clean Headline and News data.

Helper Functions

In [6]:
def concat_lists_of_strings(df, column):
    """Concatenate a series of lists of strings from a column in a dataframe"""
    return [x for list_ in df[column].values for x in list_]

In [7]:
def find_strings(string_, regex):
    """Find and Return a list of URLs in the input string"""
    list_ = re.findall(regex, string_)
    return [s[0] for s in list_]

In [8]:
def freq_dist_of_col(df, col):
    """Return a Frequency Distribution of a column"""
    corpus_tokens = concat_lists_of_strings(df, col)
    corpus_freq_dist = FreqDist(corpus_tokens)
    print(f'The number of unique tokens in the corpus is {len(corpus_freq_dist)}')
    return corpus_freq_dist

In [9]:
def review_freq_dis(df, col, n):
    """
    Create a Frequency Distribution of a column of a dataframe and display
    the n most common tokens.
    """
    corpus_freq_dist = freq_dist_of_col(df, col)
    display(corpus_freq_dist.most_common(n))

In [10]:
def remove_punctuation(word_list, punctuation_list):
    """Remove punctuation tokens from a list of tokens"""
    return [w for w in word_list if w not in punctuation_list]

In [11]:
def remove_single_characters(word_list, exception_list):
    """Remove all the single characters, except those on the exception list"""
    return [w for w in word_list if (len(w) > 1 or w in exception_list)]

In [12]:
def remove_words(word_list, words_to_remove):
    """Remove all the words in the words_to_remove list from the words_list"""
    return [w for w in word_list if w not in words_to_remove]

Token Frequency Distribution

In [13]:
tknzr = RegexpTokenizer(r'\w+|\$[\d\.]+|\([@\w\d]+\)')
df['news_tokens'] = df['news'].apply(tknzr.tokenize)

In [14]:
corpus_freq_dist = freq_dist_of_col(df, 'news_tokens')

The number of unique tokens in the corpus is 11461


In [15]:
corpus_freq_dist.most_common(150)

[('the', 3957),
 ('to', 2604),
 ('of', 2378),
 ('and', 2076),
 ('a', 2022),
 ('that', 1344),
 ('in', 1327),
 ('is', 1139),
 ('Trump', 863),
 ('for', 834),
 ('I', 646),
 ('on', 591),
 ('it', 591),
 ('he', 571),
 ('was', 563),
 ('not', 555),
 ('with', 550),
 ('his', 528),
 ('The', 510),
 ('as', 498),
 ('are', 492),
 ('be', 462),
 ('has', 436),
 ('have', 433),
 ('by', 422),
 ('Clinton', 382),
 ('this', 380),
 ('who', 356),
 ('they', 341),
 ('an', 336),
 ('from', 329),
 ('or', 329),
 ('about', 328),
 ('at', 327),
 ('you', 324),
 ('but', 313),
 ('Donald', 304),
 ('would', 301),
 ('people', 287),
 ('her', 285),
 ('s', 282),
 ('more', 266),
 ('will', 263),
 ('we', 258),
 ('all', 252),
 ('their', 235),
 ('she', 232),
 ('said', 225),
 ('Hillary', 224),
 ('one', 216),
 ('what', 210),
 ('He', 203),
 ('them', 198),
 ('But', 195),
 ('been', 192),
 ('like', 191),
 ('our', 189),
 ('so', 189),
 ('him', 188),
 ('than', 187),
 ('if', 186),
 ('had', 186),
 ('just', 184),
 ('were', 183),
 ('do', 179),
 ('

Tokens used only once

In [16]:
len([w for w in corpus_freq_dist.most_common() if w[1] == 1])

5498

Tokens used less than 5 times

In [17]:
len([w for w in corpus_freq_dist.most_common() if w[1] <= 5])

9322

At the top of the frequency distribution, the usual stop words are present, along with with words associated with politics or the names of political figures, institutions or countries.

The amount of words that are used only once or 5 or less times is relatively small given the size of the corpus.

Investigate if URLs are present in the news article text

In [18]:
URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

In [19]:
df['news_urls'] = df['news'].apply(lambda x: find_strings(x, URL_REGEX))

In [20]:
urls_in_news = concat_lists_of_strings(df, 'news_urls')

In [21]:
urls_in_news

['pic.twitter.com/kS8Z4dq9Qf']

In [22]:
len(urls_in_news)

1

In [23]:
url_freq_dist = FreqDist(urls_in_news)

In [24]:
url_freq_dist.most_common(150)

[('pic.twitter.com/kS8Z4dq9Qf', 1)]

The first two links are mentioned multiple times but when I tried to check them out they were sealed. As more time goes on more of the links will stop working. So it would be better to replace them with a placeholder {link}.

In [25]:
df['clean_news'] = df['news'].apply(lambda x: re.sub(URL_REGEX, '{link}', x))

Investigate if URLs are present in the headline text

In [26]:
df['headline_urls'] = df['headline'].apply(lambda x: find_strings(x, URL_REGEX))

In [27]:
urls_in_headline = concat_lists_of_strings(df, 'headline_urls')

In [28]:
urls_in_headline

[]

Replacing the links in the headlines with placeholder aswell.

In [29]:
df['clean_headline'] = df['headline'].apply(lambda x: re.sub(URL_REGEX, '{link}', x))

Investigate Twitter handles in news articles.

In [30]:
TWITTER_HANDLE_REGEX = r'(?<=^|(?<=[^\w]))(@\w{1,15})\b'

In [31]:
df['twitter_handles'] = df['clean_news'].apply(lambda x: re.findall(TWITTER_HANDLE_REGEX, x))

In [32]:
twitter_handles = concat_lists_of_strings(df, 'twitter_handles')

In [33]:
twitter_freq_dist = FreqDist(twitter_handles)

In [34]:
twitter_freq_dist.most_common(50)

[('@AnnCompton', 1),
 ('@pastpunditry', 1),
 ('@Miller_Center', 1),
 ('@cyvault', 1),
 ('@FoxNews', 1),
 ('@DRUDGE', 1),
 ('@TLProfessor', 1),
 ('@WrongThinkBlog', 1),
 ('@VoxRomani', 1),
 ('@CNY_KFieLd', 1),
 ('@haydenchad20', 1),
 ('@harpandjoseph', 1),
 ('@austin_klavins', 1)]

In [35]:
len(twitter_handles)

13

In [36]:
len(twitter_freq_dist)

13

In [37]:
df['clean_news'] = df['clean_news'].apply(lambda x: re.sub(TWITTER_HANDLE_REGEX, '@twitter-handle', x))

Capitalization

Because words with all caps are an import way that emphasis is made online, we will keep words that are in all caps while making all the letters in other words lower case. Words of length of one will be made lower case though since they are likely A or I which can be made lowercase without losing much emphasis.

In [38]:
def lower_unless_all_caps(string_):
    """
    Make all words in the input string lowercase unless that 
    word is in all caps
    """
    words = string_.split()
    processed_words = [w.lower() if not (w.isupper() and len(w) > 1) else w for w in words]
    return ' '.join(processed_words)

In [39]:
df['clean_news'] = df['clean_news'].apply(lower_unless_all_caps)

In [40]:
df['clean_headline'] = df['clean_headline'].apply(lower_unless_all_caps)

In [41]:
df.head()

Unnamed: 0,id,news,headline,label,headline_len,news_len,caps_in_headline,norm_caps_in_headline,caps_in_news,norm_caps_in_news,news_tokens,news_urls,clean_news,headline_urls,clean_headline,twitter_handles
0,107_Real.txt,See Liberal Facebook and Conservative Facebook...,Blue Feed Red Feed\n,0,19,1151,4,0.210526,32,0.027802,"[See, Liberal, Facebook, and, Conservative, Fa...",[],see liberal facebook and conservative facebook...,[],blue feed red feed,[]
1,125_Real.txt,Contrary to the conventional wisdom saying tha...,"""It's Official """"Bernie Sanders Is Staying In ...",0,79,7740,13,0.164557,307,0.039664,"[Contrary, to, the, conventional, wisdom, sayi...",[],contrary to the conventional wisdom saying tha...,[],"""it's official """"bernie sanders is staying in ...",[]
2,152_Real.txt,An anonymous Jane Doe filed a federal lawsuit ...,Why The New Child Rape Case Filed Against Dona...,0,77,13675,14,0.181818,376,0.027495,"[An, anonymous, Jane, Doe, filed, a, federal, ...",[],an anonymous jane doe filed a federal lawsuit ...,[],why the new child rape case filed against dona...,[]
3,153_Real.txt,"It came together in about a week. First, the i...",Pantsuit Power flashmob video for Hillary Clin...,0,86,5102,5,0.05814,186,0.036456,"[It, came, together, in, about, a, week, First...",[],"it came together in about a week. first, the i...",[],pantsuit power flashmob video for hillary clin...,[]
4,115_Real.txt,Donald Trumps new campaign manager once insist...,Donald Trump's campaign manager says rape woul...,0,81,2068,2,0.024691,94,0.045455,"[Donald, Trumps, new, campaign, manager, once,...",[],donald trumps new campaign manager once insist...,[],donald trump's campaign manager says rape woul...,[]


Number in data

I will replace the numbers with a space because some of the sentences run together and end with a number. Replacing the number with a space will split the sentences.

In [42]:
df['clean_news'] = df['clean_news'].apply(lambda x: re.sub(r'9\/11', 'nine-eleven', x))

In [43]:
df['clean_news'] = df['clean_news'].apply(lambda x: re.sub(r'\d+', ' ', x))

In [44]:
df['clean_headline'] = df['clean_headline'].apply(lambda x: re.sub(r'9\/11', 'nine-eleven', x))

In [45]:
df['clean_headline'] = df['clean_headline'].apply(lambda x: re.sub(r'\d+', ' ', x))

In [46]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kanai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Tokens in the current clean news articles

In [47]:
df['clean_news_tokens'] = df['clean_news'].apply(word_tokenize)

In [48]:
review_freq_dis(df, 'clean_news_tokens', 150)

The number of unique tokens in the corpus is 10511


[(',', 5318),
 ('.', 4516),
 ('the', 4463),
 ('to', 2626),
 ('of', 2386),
 ('and', 2206),
 ('a', 2092),
 ('in', 1463),
 ('that', 1404),
 ('is', 1163),
 ('for', 874),
 ('trump', 859),
 ('he', 782),
 ('it', 757),
 ('on', 624),
 ('not', 600),
 ('i', 595),
 ('his', 575),
 ('was', 575),
 ('with', 560),
 ('as', 550),
 ('are', 512),
 ('but', 506),
 ('this', 506),
 ('be', 466),
 ('they', 447),
 ('has', 443),
 ('have', 440),
 ('by', 433),
 ('we', 408),
 ('you', 400),
 ('clinton', 381),
 ('who', 368),
 ('an', 350),
 (':', 343),
 ('at', 340),
 ('or', 340),
 ('from', 337),
 ('about', 335),
 ('``', 317),
 ('would', 313),
 ('?', 312),
 ("''", 311),
 ('donald', 304),
 ('her', 300),
 ('she', 299),
 ('people', 298),
 ("'s", 279),
 ('more', 276),
 ('if', 274),
 ('what', 274),
 ('will', 268),
 ('all', 266),
 ('when', 246),
 ('their', 244),
 ('one', 233),
 ('so', 233),
 ('said', 228),
 ('hillary', 227),
 ('do', 226),
 ('former', 220),
 ('no', 215),
 ('our', 212),
 ('president', 206),
 ('were', 199),
 ('ca

Removing all of the Punctuation tokens except for the exclamation point, because it seems like it may be an indicator of Fake news. Also removing all the single characters except for i.

In [49]:
df['clean_news_tokens'] = df['clean_news_tokens'].apply(lambda x: remove_single_characters(x, ['i', '!']))

In [50]:
review_freq_dis(df, 'clean_news_tokens', 150)

The number of unique tokens in the corpus is 10466


[('the', 4463),
 ('to', 2626),
 ('of', 2386),
 ('and', 2206),
 ('in', 1463),
 ('that', 1404),
 ('is', 1163),
 ('for', 874),
 ('trump', 859),
 ('he', 782),
 ('it', 757),
 ('on', 624),
 ('not', 600),
 ('i', 595),
 ('his', 575),
 ('was', 575),
 ('with', 560),
 ('as', 550),
 ('are', 512),
 ('but', 506),
 ('this', 506),
 ('be', 466),
 ('they', 447),
 ('has', 443),
 ('have', 440),
 ('by', 433),
 ('we', 408),
 ('you', 400),
 ('clinton', 381),
 ('who', 368),
 ('an', 350),
 ('at', 340),
 ('or', 340),
 ('from', 337),
 ('about', 335),
 ('``', 317),
 ('would', 313),
 ("''", 311),
 ('donald', 304),
 ('her', 300),
 ('she', 299),
 ('people', 298),
 ("'s", 279),
 ('more', 276),
 ('if', 274),
 ('what', 274),
 ('will', 268),
 ('all', 266),
 ('when', 246),
 ('their', 244),
 ('one', 233),
 ('so', 233),
 ('said', 228),
 ('hillary', 227),
 ('do', 226),
 ('former', 220),
 ('no', 215),
 ('our', 212),
 ('president', 206),
 ('were', 199),
 ('can', 198),
 ('them', 198),
 ('just', 197),
 ('like', 196),
 ('been', 

Tokens in the current clean headline

In [51]:
df['clean_headline_tokens'] = df['clean_headline'].apply(word_tokenize)

In [52]:
review_freq_dis(df, 'clean_headline_tokens', 150)

The number of unique tokens in the corpus is 575


[('trump', 50),
 ('donald', 33),
 ("''", 32),
 ('the', 25),
 ("'s", 24),
 ('to', 24),
 ('clinton', 23),
 ('hillary', 19),
 ('``', 17),
 ('for', 17),
 ('is', 16),
 ('of', 13),
 ('in', 11),
 ('president', 11),
 ('.', 11),
 ('will', 10),
 ('and', 9),
 ('why', 8),
 ('says', 8),
 ('a', 8),
 ('he', 8),
 ('you', 7),
 ('i', 7),
 ('sanders', 6),
 ('be', 6),
 ('if', 6),
 ('america', 6),
 ('me', 6),
 ('not', 5),
 ('do', 5),
 ("n't", 5),
 ('american', 5),
 ('support', 5),
 ('campaign', 4),
 ('than', 4),
 ('time', 4),
 ('vote', 4),
 ('elected', 4),
 ('who', 4),
 ('rally', 4),
 ('this', 4),
 ('has', 4),
 ('!', 4),
 ('it', 3),
 ('rape', 3),
 ('against', 3),
 ('should', 3),
 ('women', 3),
 ('no', 3),
 ('would', 3),
 ('we', 3),
 ('more', 3),
 ('emails', 3),
 ('they', 3),
 ('leave', 3),
 ('CLINTON', 3),
 ('FOR', 3),
 ('how', 3),
 ('with', 3),
 ('as', 3),
 ('from', 3),
 ('$', 3),
 ('on', 3),
 ('just', 3),
 ('run', 3),
 ('election', 3),
 ('republican', 3),
 ('found', 3),
 ('email', 3),
 ('could', 3),
 ('I

Remove Punctuation and Single Letter Tokens from Clean Headline

In [53]:
df['clean_headline_tokens'] = df['clean_headline_tokens'].apply(lambda x: remove_single_characters(x, ['i', '!']))

In [54]:
review_freq_dis(df, 'clean_headline_tokens', 150)

The number of unique tokens in the corpus is 564


[('trump', 50),
 ('donald', 33),
 ("''", 32),
 ('the', 25),
 ("'s", 24),
 ('to', 24),
 ('clinton', 23),
 ('hillary', 19),
 ('``', 17),
 ('for', 17),
 ('is', 16),
 ('of', 13),
 ('in', 11),
 ('president', 11),
 ('will', 10),
 ('and', 9),
 ('why', 8),
 ('says', 8),
 ('he', 8),
 ('you', 7),
 ('i', 7),
 ('sanders', 6),
 ('be', 6),
 ('if', 6),
 ('america', 6),
 ('me', 6),
 ('not', 5),
 ('do', 5),
 ("n't", 5),
 ('american', 5),
 ('support', 5),
 ('campaign', 4),
 ('than', 4),
 ('time', 4),
 ('vote', 4),
 ('elected', 4),
 ('who', 4),
 ('rally', 4),
 ('this', 4),
 ('has', 4),
 ('!', 4),
 ('it', 3),
 ('rape', 3),
 ('against', 3),
 ('should', 3),
 ('women', 3),
 ('no', 3),
 ('would', 3),
 ('we', 3),
 ('more', 3),
 ('emails', 3),
 ('they', 3),
 ('leave', 3),
 ('CLINTON', 3),
 ('FOR', 3),
 ('how', 3),
 ('with', 3),
 ('as', 3),
 ('from', 3),
 ('on', 3),
 ('just', 3),
 ('run', 3),
 ('election', 3),
 ('republican', 3),
 ('found', 3),
 ('email', 3),
 ('could', 3),
 ('ISIS', 3),
 ('my', 3),
 ('ted', 3),

Removing "'s"

While the fake news frequently or always didn't removed the apostrophe from 's, it doesn't look like that was done to the true news. 's will need to be removed so that it doesn't become a false indicator of true news.

In [55]:
df['clean_headline_tokens'] = df['clean_headline_tokens'].apply(lambda x: remove_words(x, ["'s"]))
df['clean_news_tokens'] = df['clean_news_tokens'].apply(lambda x: remove_words(x, ["'s"]))

Remove Date Words

To better generalize the models removing all the date words.

In [56]:
date_words = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 
              'saturday', 'sunday', 'january', 'february', 'march', 'april',
             'may', 'june', 'july', 'august', 'september', 'october',
             'november', 'december']

In [57]:
df['clean_headline_tokens'] = df['clean_headline_tokens'].apply(lambda x: remove_words(x, date_words))
df['clean_news_tokens'] = df['clean_news_tokens'].apply(lambda x: remove_words(x, date_words))

In [58]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kanai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Remove Stop Words

In [59]:
stop_words = stopwords.words('english')

In [60]:
display(stop_words)

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [61]:
most_freq_clean_news = [x[0] for x in list(freq_dist_of_col(df, 'clean_news_tokens').most_common(150))]

The number of unique tokens in the corpus is 10446


In [62]:
most_freq_clean_news

['the',
 'to',
 'of',
 'and',
 'in',
 'that',
 'is',
 'for',
 'trump',
 'he',
 'it',
 'on',
 'not',
 'i',
 'his',
 'was',
 'with',
 'as',
 'are',
 'but',
 'this',
 'be',
 'they',
 'has',
 'have',
 'by',
 'we',
 'you',
 'clinton',
 'who',
 'an',
 'at',
 'or',
 'from',
 'about',
 '``',
 'would',
 "''",
 'donald',
 'her',
 'she',
 'people',
 'more',
 'if',
 'what',
 'will',
 'all',
 'when',
 'their',
 'one',
 'so',
 'said',
 'hillary',
 'do',
 'former',
 'no',
 'our',
 'president',
 'were',
 'can',
 'them',
 'just',
 'like',
 'been',
 'had',
 'him',
 'than',
 'how',
 'its',
 'mr.',
 'out',
 'new',
 'there',
 'because',
 'which',
 'party',
 'these',
 'even',
 'american',
 'republican',
 'time',
 'over',
 'know',
 'campaign',
 'other',
 'my',
 'could',
 "n't",
 'up',
 'did',
 'me',
 'some',
 'political',
 'after',
 'only',
 'also',
 'now',
 'state',
 'many',
 'those',
 'most',
 'any',
 'think',
 'very',
 'into',
 'against',
 'white',
 'years',
 'trumps',
 'first',
 'way',
 'country',
 'get'

In [63]:
def intersection(lst1, lst2):
    """Return the intersection of two lists"""

    temp = set(lst2) 
    lst3 = [value for value in lst1 if value in temp] 
    return lst3

In [64]:
common_words = intersection(stop_words, most_freq_clean_news)

In [65]:
common_words

['i',
 'me',
 'my',
 'we',
 'our',
 'you',
 'your',
 'he',
 'him',
 'his',
 'she',
 'her',
 'it',
 'its',
 'they',
 'them',
 'their',
 'what',
 'which',
 'who',
 'this',
 'that',
 'these',
 'those',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'do',
 'does',
 'did',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'into',
 'after',
 'to',
 'from',
 'up',
 'in',
 'out',
 'on',
 'over',
 'under',
 'then',
 'there',
 'when',
 'how',
 'all',
 'any',
 'more',
 'most',
 'other',
 'some',
 'no',
 'not',
 'only',
 'so',
 'than',
 'too',
 'very',
 'can',
 'will',
 'just',
 'now']

In [66]:
len(common_words)

84

In [67]:
len(stop_words)

179

In [68]:
def difference(lst1, lst2):
    """Return the difference of two lists"""

    temp = set(lst2) 
    lst3 = [value for value in lst1 if value not in temp] 
    return lst3

In [69]:
words_in_nltk_not_news = difference(stop_words, most_freq_clean_news)

In [70]:
words_in_nltk_not_news

['myself',
 'ours',
 'ourselves',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'yours',
 'yourself',
 'yourselves',
 'himself',
 "she's",
 'hers',
 'herself',
 "it's",
 'itself',
 'theirs',
 'themselves',
 'whom',
 "that'll",
 'am',
 'having',
 'doing',
 'a',
 'until',
 'between',
 'through',
 'during',
 'before',
 'above',
 'below',
 'down',
 'off',
 'again',
 'further',
 'once',
 'here',
 'where',
 'why',
 'both',
 'each',
 'few',
 'such',
 'nor',
 'own',
 'same',
 's',
 't',
 'don',
 "don't",
 'should',
 "should've",
 'd',
 'll',
 'm',
 'o',
 're',
 've',
 'y',
 'ain',
 'aren',
 "aren't",
 'couldn',
 "couldn't",
 'didn',
 "didn't",
 'doesn',
 "doesn't",
 'hadn',
 "hadn't",
 'hasn',
 "hasn't",
 'haven',
 "haven't",
 'isn',
 "isn't",
 'ma',
 'mightn',
 "mightn't",
 'mustn',
 "mustn't",
 'needn',
 "needn't",
 'shan',
 "shan't",
 'shouldn',
 "shouldn't",
 'wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 "won't",
 'wouldn',
 "wouldn't"]

In [71]:
words_in_news_not_nltk = difference(most_freq_clean_news, stop_words)

In [72]:
words_in_news_not_nltk

['trump',
 'clinton',
 '``',
 'would',
 "''",
 'donald',
 'people',
 'one',
 'said',
 'hillary',
 'former',
 'president',
 'like',
 'mr.',
 'new',
 'party',
 'even',
 'american',
 'republican',
 'time',
 'know',
 'campaign',
 'could',
 "n't",
 'political',
 'also',
 'state',
 'many',
 'think',
 'white',
 'years',
 'trumps',
 'first',
 'way',
 'country',
 'get',
 'presidential',
 'national',
 'sanders',
 'make',
 'politics',
 'candidate',
 'email',
 'support',
 'us',
 'much',
 'authoritarians',
 'secretary',
 'election',
 'say',
 'america',
 'emails',
 'says',
 'women',
 'dont',
 'want',
 'good',
 'told',
 'states',
 'bush',
 'clintons',
 'news',
 'bill',
 'see',
 'every',
 'representative']

Looking at the remaining frequent words from the news text, that are all very concentrated on political news.

Saving data

In [73]:
df.to_csv(os.path.join(dataset_dir,'train_news_preprocessed_bf.csv'),index=False)