In [1]:
import os
import pandas as pd
import re
import string
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize, TweetTokenizer
from nltk import FreqDist
from nltk.corpus import stopwords

Importing the dataset

In [2]:
# Locate the training CSV inside the "Dataset" folder under the current
# working directory and read it into a dataframe.
cwd = os.getcwd()
dataset_dir = os.path.join(cwd, 'Dataset')
train_csv_path = os.path.join(dataset_dir, 'train_news_cleaned.csv')
df = pd.read_csv(train_csv_path)

In [3]:
print("Dataset shape:", df.shape)

Dataset shape: (6059, 10)


In [4]:
df.head()

Unnamed: 0,id,headline,news,label,headline_len,news_len,caps_in_headline,norm_caps_in_headline,caps_in_news,norm_caps_in_news
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1,28,7518,5,0.178571,314,0.041766
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1,85,2646,18,0.211765,96,0.036281
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0,43,2543,2,0.046512,96,0.037751
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1,84,2660,6,0.071429,139,0.052256
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0,48,1840,8,0.166667,77,0.041848


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6059 entries, 0 to 6058
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     6059 non-null   int64  
 1   headline               6059 non-null   object 
 2   news                   6059 non-null   object 
 3   label                  6059 non-null   int64  
 4   headline_len           6059 non-null   int64  
 5   news_len               6059 non-null   int64  
 6   caps_in_headline       6059 non-null   int64  
 7   norm_caps_in_headline  6059 non-null   float64
 8   caps_in_news           6059 non-null   int64  
 9   norm_caps_in_news      6059 non-null   float64
dtypes: float64(2), int64(6), object(2)
memory usage: 473.5+ KB


Clean Headline and News data.

Helper Functions

In [6]:
def concat_lists_of_strings(df, column):
    """
    Flatten a dataframe column whose cells are lists of strings.

    The cells of df[column] are walked in row order and their items are
    appended, in order, to a single flat list which is returned.
    """
    flattened = []
    for cell in df[column].values:
        flattened.extend(cell)
    return flattened

In [7]:
def find_strings(string_, regex):
    """
    Find every match of a regular expression in the input string.

    Parameters
    ----------
    string_ : str
        Text to search.
    regex : str
        Regular expression pattern passed to re.findall.

    Returns
    -------
    list of str
        One entry per match. When the pattern has several capture groups
        (as URL_REGEX does) the first group of each match is returned;
        otherwise the matched string itself is returned.
    """
    matches = re.findall(regex, string_)
    # re.findall yields tuples only when the pattern has more than one group;
    # with zero or one group it yields plain strings, and indexing those with
    # [0] would return just the first character (or fail on an empty match).
    return [m[0] if isinstance(m, tuple) else m for m in matches]

In [8]:
def freq_dist_of_col(df, col):
    """
    Build a frequency distribution over all tokens in a list-valued column.

    The token lists in df[col] are flattened into one list, counted with
    FreqDist, and the number of distinct tokens is printed before the
    distribution is returned.
    """
    freq_dist = FreqDist(concat_lists_of_strings(df, col))
    unique_token_count = len(freq_dist)
    print(f'The number of unique tokens in the corpus is {unique_token_count}')
    return freq_dist

In [9]:
def review_freq_dis(df, col, n):
    """
    Show the n most common tokens of a list-valued dataframe column.

    A frequency distribution is built with freq_dist_of_col (which also
    prints the number of unique tokens) and its top n (token, count) pairs
    are displayed. Nothing is returned.
    """
    top_tokens = freq_dist_of_col(df, col).most_common(n)
    display(top_tokens)

In [10]:
def remove_punctuation(word_list, punctuation_list):
    """Return the tokens of word_list that do not appear in punctuation_list."""
    return list(filter(lambda token: token not in punctuation_list, word_list))

In [11]:
def remove_single_characters(word_list, exception_list):
    """
    Drop tokens of length one or less unless they are on the exception list.

    Tokens longer than one character are always kept; shorter tokens are
    kept only when they appear in exception_list.
    """
    kept = []
    for token in word_list:
        if len(token) <= 1 and token not in exception_list:
            continue
        kept.append(token)
    return kept

In [12]:
def remove_words(word_list, words_to_remove):
    """Return word_list without any token that appears in words_to_remove."""
    remaining = []
    for token in word_list:
        if token in words_to_remove:
            continue
        remaining.append(token)
    return remaining

Token Frequency Distribution

In [13]:
# Tokenizer pattern; the alternatives are tried in this order at each position:
#   \w+            - a run of word characters
#   \$[\d\.]+      - a dollar sign followed by digits and/or periods
#   \([@\w\d]+\)   - a parenthesised run of "@"/word characters, e.g. "(@KaydeeKing)"
# Text that matches none of these (whitespace, other punctuation) is skipped,
# so a word such as "It's" comes out as the two tokens "It" and "s".
tknzr = RegexpTokenizer(r'\w+|\$[\d\.]+|\([@\w\d]+\)')
# One list of raw tokens per article.
df['news_tokens'] = df['news'].apply(tknzr.tokenize)

In [14]:
corpus_freq_dist = freq_dist_of_col(df, 'news_tokens')

The number of unique tokens in the corpus is 85400


In [15]:
corpus_freq_dist.most_common(150)

[('the', 258386),
 ('to', 136807),
 ('of', 127908),
 ('and', 112658),
 ('a', 103118),
 ('in', 89443),
 ('that', 68081),
 ('s', 55998),
 ('is', 53228),
 ('for', 43781),
 ('on', 38349),
 ('The', 30202),
 ('it', 29934),
 ('with', 29492),
 ('as', 29104),
 ('was', 26891),
 ('he', 25024),
 ('are', 24971),
 ('be', 24293),
 ('have', 23969),
 ('by', 22823),
 ('Trump', 22182),
 ('has', 22157),
 ('I', 21897),
 ('not', 21701),
 ('said', 21033),
 ('his', 20942),
 ('from', 19913),
 ('at', 19901),
 ('this', 19107),
 ('Clinton', 17329),
 ('an', 17304),
 ('who', 17214),
 ('they', 16636),
 ('t', 15922),
 ('will', 14832),
 ('or', 14062),
 ('about', 13859),
 ('you', 13501),
 ('their', 13434),
 ('we', 13273),
 ('would', 12567),
 ('but', 12417),
 ('more', 12222),
 ('been', 11342),
 ('people', 10940),
 ('her', 10696),
 ('all', 10644),
 ('one', 10274),
 ('were', 10235),
 ('had', 10076),
 ('out', 9599),
 ('which', 9590),
 ('can', 9440),
 ('It', 9325),
 ('what', 9186),
 ('she', 9027),
 ('up', 8924),
 ('But', 87

Tokens used only once

In [16]:
# Number of distinct tokens whose count is exactly one.
sum(1 for _token, count in corpus_freq_dist.items() if count == 1)

32159

Tokens used 5 times or fewer

In [17]:
# Number of distinct tokens that occur five times or fewer.
sum(1 for count in corpus_freq_dist.values() if count <= 5)

57318

At the top of the frequency distribution, the usual stop words are present, along with words associated with politics or the names of political figures, institutions or countries.

The number of words that are used only once, or five times or fewer, is relatively small given the size of the corpus.

Investigate if URLs are present in the news article text

In [18]:
# Case-insensitive URL pattern. A match has to start with "http://" or
# "https://", with "www" (optionally followed by up to three digits) and a dot,
# or with a bare "domain.tld/" prefix. Balanced parentheses inside the URL are
# allowed, and the final character may not be whitespace or trailing
# punctuation such as a period, comma or quote.
# Capture group 1 spans the whole URL; find_strings keeps that element of each
# tuple returned by re.findall.
URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

In [19]:
# Collect the list of URLs found in each article body.
df['news_urls'] = df['news'].apply(find_strings, regex=URL_REGEX)

In [20]:
urls_in_news = concat_lists_of_strings(df, 'news_urls')

In [21]:
urls_in_news

['https://t.co/VyTT49YvoE',
 'pic.twitter.com/wCvSCg4a5I',
 'https://t.co/KHyOuUSrFS',
 'www.galacticconnection.com',
 'http://cytocosmos.com',
 'http://radio.offplanetmedia.net/',
 'http://earthempaths.net',
 'http://www.claritypress.com/LendmanIII.html',
 'https://t.co/BjGcFO0du5',
 'pic.twitter.com/nrMqnbW5UK',
 'pic.twitter.com/MrPUlSO1OE',
 'http://link-address.com',
 'http://www.zerohedge.com/news/2016-10-20/dear-janet-china-devalues-most-august-yuan-tumbles-lowest-sept-2010',
 'www.abovetopsecret.com',
 'www.youtube.com',
 'www.youtube.com',
 'https://t.co/P6WNdG36f5',
 'https://t.co/n21U1MfHYO',
 'https://t.co/rGADYMG5Op',
 'https://t.co/d6f9C9ALoR',
 'pic.twitter.com/ydJpV2NgXh',
 'https://t.co/bhzGRzgimg',
 'pic.twitter.com/dozNVXXVgT',
 'https://t.co/FkNiEUOZHH',
 'pic.twitter.com/dalY9KBWtj',
 'https://t.co/FfWiSCbiKf',
 'pic.twitter.com/4DRwNPkfZ9',
 'www.TheDailySheeple.com',
 'www.TheDailySheeple.com',
 'https://t.co/fay4GNLAzy',
 'http://link-address.com',
 'https://t.c

In [22]:
len(urls_in_news)

1000

In [23]:
url_freq_dist = FreqDist(urls_in_news)

In [24]:
url_freq_dist.most_common(150)

[('http://www.infowarsstore.com/health-and-wellness/infowars-life/brain-force.html?ims=tzrwu&utm_campaign=Infowars+Placement&utm_source=Infowars.com&utm_medium=Widget&utm_content=Brain+Force',
  84),
 ('http://www.infowars.com/wp-content/uploads/2015/10/brainforce-25-200-e1476824046577.jpg',
  42),
 ('www.TheDailySheeple.com', 13),
 ('www.adayattheracesblog.com', 9),
 ('http://link-address.com', 7),
 ('https://facebook.com/LukeWeAreChange', 7),
 ('https://twitter.com/Lukewearechange', 7),
 ('http://instagram.com/lukewearechange', 7),
 ('www.youtube.com', 6),
 ('http://www.claritypress.com/LendmanIII.html', 4),
 ('www.amazon.com', 4),
 ('www.zerohedge.com', 4),
 ('http://rt.com/on-air', 3),
 ('pic.twitter.com/eir8r0FJ8M', 3),
 ('pic.twitter.com/GO5Y9FCnYN', 3),
 ('www.BeforeItsNews.com', 3),
 ('www.thelastgreatstand.com', 3),
 ('http://bit.ly/1MgFbVy', 3),
 ('http://RTD.rt.com/', 3),
 ('http://twitter.com/RT_DOC', 3),
 ('http://www.facebook.com/RTDocumentary', 3),
 ('http://www.dailymot

The first two links appear many times, but when I tried to open them they were no longer accessible. As time goes on, more of the links will stop working, so it is better to replace every URL with the placeholder {link}.

In [25]:
# Start the cleaned article text by masking every URL with the {link} placeholder.
df['clean_news'] = df['news'].map(lambda article: re.sub(URL_REGEX, '{link}', article))

Investigate if URLs are present in the headline text

In [26]:
# Collect the list of URLs found in each headline.
df['headline_urls'] = df['headline'].apply(find_strings, regex=URL_REGEX)

In [27]:
urls_in_headline = concat_lists_of_strings(df, 'headline_urls')

In [28]:
urls_in_headline

[]

No URLs were found in the headlines, but the same placeholder replacement is applied to them as well for consistency.

In [29]:
# Start the cleaned headline text by masking every URL with the {link} placeholder.
df['clean_headline'] = df['headline'].map(lambda title: re.sub(URL_REGEX, '{link}', title))

Investigate Twitter handles in news articles.

In [30]:
# Twitter handle pattern: an "@" followed by 1-15 word characters and then a
# word boundary. The look-behind requires the "@" to sit at the start of the
# string or directly after a non-word character, so an "@" that follows a
# letter or digit (as in an e-mail address) is not treated as a handle.
# The handle, including the "@", is capture group 1.
TWITTER_HANDLE_REGEX = r'(?<=^|(?<=[^\w]))(@\w{1,15})\b'

In [31]:
# List the Twitter handles mentioned in each URL-masked article.
df['twitter_handles'] = df['clean_news'].map(re.compile(TWITTER_HANDLE_REGEX).findall)

In [32]:
twitter_handles = concat_lists_of_strings(df, 'twitter_handles')

In [33]:
twitter_freq_dist = FreqDist(twitter_handles)

In [34]:
twitter_freq_dist.most_common(50)

[('@realDonaldTrump', 37),
 ('@HillaryClinton', 29),
 ('@HowardKurtz', 27),
 ('@WalshFreedom', 21),
 ('@wikileaks', 13),
 ('@kurteichenwald', 13),
 ('@brianefallon', 11),
 ('@TheUnRealTimes', 11),
 ('@RT_com', 10),
 ('@BernieSanders', 9),
 ('@hooverwhalen', 9),
 ('@USATOpinion', 8),
 ('@POTUS', 8),
 ('@MarkRuffalo', 8),
 ('@KimDotcom', 8),
 ('@nickconfessore', 8),
 ('@MMFlint', 7),
 ('@rubycramer', 7),
 ('@ToddStarnes', 7),
 ('@FBI', 7),
 ('@DanScavino', 7),
 ('@sevyn', 7),
 ('@SpeakerRyan', 6),
 ('@nytimes', 6),
 ('@SenWarren', 6),
 ('@EdwardSzall', 6),
 ('@Cernovich', 6),
 ('@JDiamond1', 6),
 ('@realdonaldtrump', 5),
 ('@RT_America', 5),
 ('@seanhannity', 5),
 ('@MittRomney', 5),
 ('@jaketapper', 5),
 ('@MattAgorist', 5),
 ('@JamesOKeefeIII', 5),
 ('@jamiedupree', 5),
 ('@megynkelly', 5),
 ('@JohnGHendy', 5),
 ('@21WIRE', 4),
 ('@JaredWyand', 4),
 ('@JasonPatinkin', 4),
 ('@derekahunter', 4),
 ('@Canada', 4),
 ('@okcthunder', 4),
 ('@Sixers', 4),
 ('@XplodingUnicorn', 4),
 ('@SarahPa

In [35]:
len(twitter_handles)

1090

In [36]:
len(twitter_freq_dist)

612

In [37]:
# Mask every Twitter handle in the article text with a fixed placeholder.
df['clean_news'] = df['clean_news'].map(
    lambda article: re.sub(TWITTER_HANDLE_REGEX, '@twitter-handle', article)
)

Capitalization

Because words in all caps are an important way that emphasis is made online, we will keep words that are in all caps while making all the letters in other words lower case. Words of length one will be made lower case, though, since they are likely A or I, which can be made lowercase without losing much emphasis.

In [38]:
def lower_unless_all_caps(string_):
    """
    Lower-case every whitespace-separated word of the input string, except
    words longer than one character that are entirely upper case.

    The string is split on whitespace and re-joined with single spaces, so
    runs of whitespace are collapsed. Punctuation attached to a word counts
    towards its length (e.g. "I," is kept as is).
    """
    def _normalise(word):
        keep_as_is = word.isupper() and len(word) > 1
        return word if keep_as_is else word.lower()

    return ' '.join(map(_normalise, string_.split()))

In [39]:
# Normalise the casing of the article text, keeping all-caps words.
df['clean_news'] = df['clean_news'].map(lower_unless_all_caps)

In [40]:
# Normalise the casing of the headline text, keeping all-caps words.
df['clean_headline'] = df['clean_headline'].map(lower_unless_all_caps)

In [41]:
df.head()

Unnamed: 0,id,headline,news,label,headline_len,news_len,caps_in_headline,norm_caps_in_headline,caps_in_news,norm_caps_in_news,news_tokens,news_urls,clean_news,headline_urls,clean_headline,twitter_handles
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1,28,7518,5,0.178571,314,0.041766,"[Daniel, Greenfield, a, Shillman, Journalism, ...",[],"daniel greenfield, a shillman journalism fello...",[],you can smell hillary’s fear,[]
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1,85,2646,18,0.211765,96,0.036281,"[Google, Pinterest, Digg, Linkedin, Reddit, St...","[https://t.co/VyTT49YvoE, pic.twitter.com/wCvS...",google pinterest digg linkedin reddit stumbleu...,[],watch the exact moment paul ryan committed pol...,"[@SpeakerRyan, @realDonaldTrump, @ABCPolitics]"
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0,43,2543,2,0.046512,96,0.037751,"[U, S, Secretary, of, State, John, F, Kerry, s...",[],U.S. secretary of state john F. kerry said mon...,[],kerry to go to paris in gesture of sympathy,[]
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1,84,2660,6,0.071429,139,0.052256,"[Kaydee, King, (@KaydeeKing), November, 9, 201...",[https://t.co/KHyOuUSrFS],"— kaydee king (@twitter-handle) november 9, 20...",[],bernie supporters on twitter erupt in anger ag...,"[@KaydeeKing, @People4Bernie, @WalkerBragman, ..."
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0,48,1840,8,0.166667,77,0.041848,"[It, s, primary, day, in, New, York, and, fron...",[],it's primary day in new york and front-runners...,[],the battle of new york: why this primary matters,[]


Numbers in the data

Numbers are replaced with a space because some of the sentences run together and end with a number; replacing the number with a space splits those sentences. Before that, 9/11 is rewritten as nine-eleven so that it is not lost with the other digits.

In [42]:
# Spell out "9/11" so it survives the digit removal that follows.
df['clean_news'] = df['clean_news'].map(
    lambda article: re.sub(r'9\/11', 'nine-eleven', article)
)

In [43]:
# Replace every run of digits in the article text with a single space.
df['clean_news'] = df['clean_news'].map(lambda article: re.sub(r'\d+', ' ', article))

In [44]:
# Spell out "9/11" so it survives the digit removal that follows.
df['clean_headline'] = df['clean_headline'].map(
    lambda title: re.sub(r'9\/11', 'nine-eleven', title)
)

In [45]:
# Replace every run of digits in the headline text with a single space.
df['clean_headline'] = df['clean_headline'].map(lambda title: re.sub(r'\d+', ' ', title))

In [46]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kanai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Tokens in the current clean news articles

In [47]:
# Tokenize the cleaned article text with NLTK's word tokenizer.
df['clean_news_tokens'] = df['clean_news'].map(word_tokenize)

In [48]:
review_freq_dis(df, 'clean_news_tokens', 150)

The number of unique tokens in the corpus is 92419


[('the', 287770),
 (',', 257868),
 ('.', 205372),
 ('to', 137784),
 ('of', 128415),
 ('and', 118219),
 ('a', 106611),
 ('in', 97145),
 ('that', 71774),
 ('’', 58146),
 ('is', 54248),
 ('for', 45816),
 ('on', 40088),
 ('it', 38703),
 ('s', 36390),
 ('“', 34711),
 ('”', 34099),
 ('he', 32351),
 ('as', 32227),
 ('with', 30539),
 ('was', 27271),
 ('are', 25516),
 ('this', 24733),
 ('be', 24343),
 ('have', 24190),
 ('by', 24104),
 ('not', 23300),
 ('has', 22395),
 ('his', 22171),
 ('trump', 21725),
 ('at', 21422),
 ('said', 21043),
 ('but', 20977),
 ('from', 20476),
 ('they', 20434),
 ('``', 19885),
 ("'s", 19697),
 ("''", 19472),
 ('we', 19053),
 ('i', 18049),
 ('an', 17878),
 (':', 17543),
 ('who', 17536),
 ('clinton', 17104),
 ('you', 15886),
 ('will', 15143),
 ('or', 14222),
 ('about', 14113),
 ('their', 13843),
 (')', 13295),
 ('(', 12999),
 ('would', 12831),
 ('more', 12758),
 ('people', 11508),
 ('she', 11440),
 ('if', 11396),
 ('what', 11374),
 ('been', 11342),
 ('all', 11212),
 ('o

Remove all the single-character tokens, which covers most of the punctuation, except for the exclamation point (it seems like it may be an indicator of fake news) and the word i. Multi-character punctuation tokens such as `` and '' are not removed by this step.

In [49]:
# Drop one-character tokens from the articles, keeping only "i" and "!".
df['clean_news_tokens'] = df['clean_news_tokens'].apply(
    remove_single_characters, exception_list=['i', '!']
)

In [50]:
review_freq_dis(df, 'clean_news_tokens', 150)

The number of unique tokens in the corpus is 92250


[('the', 287770),
 ('to', 137784),
 ('of', 128415),
 ('and', 118219),
 ('in', 97145),
 ('that', 71774),
 ('is', 54248),
 ('for', 45816),
 ('on', 40088),
 ('it', 38703),
 ('he', 32351),
 ('as', 32227),
 ('with', 30539),
 ('was', 27271),
 ('are', 25516),
 ('this', 24733),
 ('be', 24343),
 ('have', 24190),
 ('by', 24104),
 ('not', 23300),
 ('has', 22395),
 ('his', 22171),
 ('trump', 21725),
 ('at', 21422),
 ('said', 21043),
 ('but', 20977),
 ('from', 20476),
 ('they', 20434),
 ('``', 19885),
 ("'s", 19697),
 ("''", 19472),
 ('we', 19053),
 ('i', 18049),
 ('an', 17878),
 ('who', 17536),
 ('clinton', 17104),
 ('you', 15886),
 ('will', 15143),
 ('or', 14222),
 ('about', 14113),
 ('their', 13843),
 ('would', 12831),
 ('more', 12758),
 ('people', 11508),
 ('she', 11440),
 ('if', 11396),
 ('what', 11374),
 ('been', 11342),
 ('all', 11212),
 ('one', 11197),
 ('her', 11186),
 ('there', 10791),
 ('were', 10337),
 ('can', 10225),
 ('had', 10155),
 ('which', 9724),
 ('when', 9466),
 ('out', 9311),
 

Tokens in the current clean headline

In [51]:
# Tokenize the cleaned headline text with NLTK's word tokenizer.
df['clean_headline_tokens'] = df['clean_headline'].map(word_tokenize)

In [52]:
review_freq_dis(df, 'clean_headline_tokens', 150)

The number of unique tokens in the corpus is 11071


[('the', 2016),
 ('to', 1630),
 (':', 1577),
 (',', 1414),
 ('’', 1344),
 ('in', 1116),
 ('of', 1046),
 ('trump', 1028),
 ('for', 766),
 ('on', 726),
 ('s', 712),
 ('a', 709),
 ('and', 663),
 ('is', 662),
 ('clinton', 661),
 ('hillary', 537),
 ('?', 536),
 ("'s", 512),
 ("'", 377),
 ('obama', 359),
 ('‘', 357),
 ('.', 326),
 ('with', 307),
 ('by', 269),
 ('donald', 265),
 ('as', 265),
 ('new', 264),
 ('”', 254),
 ('“', 250),
 ('it', 247),
 ('from', 245),
 ('at', 243),
 ('why', 239),
 ('election', 224),
 ('what', 220),
 ('GOP', 215),
 ('how', 214),
 ('are', 212),
 ('will', 206),
 ('about', 204),
 ('be', 203),
 (')', 201),
 ('(', 200),
 ('after', 195),
 ('US', 189),
 ('–', 184),
 ('over', 184),
 ('!', 179),
 ('that', 175),
 ('-', 172),
 ('not', 164),
 ('this', 160),
 ('campaign', 158),
 ('america', 155),
 ('t', 154),
 ('says', 154),
 ('debate', 154),
 ('you', 146),
 ('russia', 146),
 ('has', 144),
 ('his', 133),
 ('FBI', 132),
 ('he', 130),
 ('house', 130),
 ('state', 127),
 ('sanders', 

Remove Punctuation and Single Letter Tokens from Clean Headline

In [53]:
# Drop one-character tokens from the headlines, keeping only "i" and "!".
df['clean_headline_tokens'] = df['clean_headline_tokens'].apply(
    remove_single_characters, exception_list=['i', '!']
)

In [54]:
review_freq_dis(df, 'clean_headline_tokens', 150)

The number of unique tokens in the corpus is 10999


[('the', 2016),
 ('to', 1630),
 ('in', 1116),
 ('of', 1046),
 ('trump', 1028),
 ('for', 766),
 ('on', 726),
 ('and', 663),
 ('is', 662),
 ('clinton', 661),
 ('hillary', 537),
 ("'s", 512),
 ('obama', 359),
 ('with', 307),
 ('by', 269),
 ('donald', 265),
 ('as', 265),
 ('new', 264),
 ('it', 247),
 ('from', 245),
 ('at', 243),
 ('why', 239),
 ('election', 224),
 ('what', 220),
 ('GOP', 215),
 ('how', 214),
 ('are', 212),
 ('will', 206),
 ('about', 204),
 ('be', 203),
 ('after', 195),
 ('US', 189),
 ('over', 184),
 ('!', 179),
 ('that', 175),
 ('not', 164),
 ('this', 160),
 ('campaign', 158),
 ('america', 155),
 ('says', 154),
 ('debate', 154),
 ('you', 146),
 ('russia', 146),
 ('has', 144),
 ('his', 133),
 ('FBI', 132),
 ('he', 130),
 ('house', 130),
 ('state', 127),
 ('sanders', 125),
 ('up', 125),
 ('war', 123),
 ('can', 121),
 ('news', 120),
 ('out', 119),
 ('we', 113),
 ('have', 110),
 ('white', 109),
 ('against', 108),
 ('comment', 106),
 ('just', 106),
 ('cruz', 105),
 ('more', 105

Removing "'s"

While the fake news frequently or always had the apostrophe removed from 's, it doesn't look like that was done to the true news. The 's token will need to be removed so that it doesn't become a false indicator of true news.

In [55]:
# Strip the "'s" token from the headline tokens first, then the article tokens.
for token_col in ('clean_headline_tokens', 'clean_news_tokens'):
    df[token_col] = df[token_col].apply(remove_words, words_to_remove=["'s"])

Remove Date Words

To help the models generalize better, all the day and month names are removed. Note that this also removes the ordinary words may and march.

In [56]:
# Lower-case weekday names followed by month names; these tokens are removed
# from the headline and article token lists.
date_words = (
    ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    + ['january', 'february', 'march', 'april', 'may', 'june',
       'july', 'august', 'september', 'october', 'november', 'december']
)

In [57]:
# Strip the date words from the headline tokens first, then the article tokens.
for token_col in ('clean_headline_tokens', 'clean_news_tokens'):
    df[token_col] = df[token_col].apply(remove_words, words_to_remove=date_words)

In [58]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kanai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Compare NLTK Stop Words with the Most Frequent Tokens

In [59]:
stop_words = stopwords.words('english')

In [60]:
display(stop_words)

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [61]:
# The 150 most frequent article tokens, most common first (counts dropped).
most_freq_clean_news = [
    token
    for token, _count in freq_dist_of_col(df, 'clean_news_tokens').most_common(150)
]

The number of unique tokens in the corpus is 92230


In [62]:
most_freq_clean_news

['the',
 'to',
 'of',
 'and',
 'in',
 'that',
 'is',
 'for',
 'on',
 'it',
 'he',
 'as',
 'with',
 'was',
 'are',
 'this',
 'be',
 'have',
 'by',
 'not',
 'has',
 'his',
 'trump',
 'at',
 'said',
 'but',
 'from',
 'they',
 '``',
 "''",
 'we',
 'i',
 'an',
 'who',
 'clinton',
 'you',
 'will',
 'or',
 'about',
 'their',
 'would',
 'more',
 'people',
 'she',
 'if',
 'what',
 'been',
 'all',
 'one',
 'her',
 'there',
 'were',
 'can',
 'had',
 'which',
 'when',
 'out',
 'new',
 'so',
 'state',
 'do',
 'up',
 'president',
 'no',
 'than',
 'our',
 'also',
 'obama',
 'other',
 'campaign',
 'after',
 'some',
 'just',
 'over',
 'hillary',
 'its',
 'like',
 'them',
 'into',
 'could',
 'even',
 'now',
 'time',
 'states',
 'only',
 'because',
 'how',
 'him',
 'most',
 'many',
 'republican',
 'those',
 'party',
 "n't",
 'first',
 'these',
 'two',
 'any',
 'against',
 'political',
 'government',
 'election',
 'U.S.',
 'years',
 'get',
 'did',
 'american',
 'while',
 'world',
 'house',
 'going',
 'you

In [63]:
def intersection(lst1, lst2):
    """
    Return the items of lst1 that also occur in lst2.

    The order of lst1 is preserved, and an item repeated in lst1 is repeated
    in the result.
    """
    members = set(lst2)
    shared = []
    for value in lst1:
        if value in members:
            shared.append(value)
    return shared

In [64]:
common_words = intersection(stop_words, most_freq_clean_news)

In [65]:
common_words

['i',
 'my',
 'we',
 'our',
 'you',
 'your',
 'he',
 'him',
 'his',
 'she',
 'her',
 'it',
 'its',
 'they',
 'them',
 'their',
 'what',
 'which',
 'who',
 'this',
 'that',
 'these',
 'those',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'do',
 'did',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'into',
 'before',
 'after',
 'to',
 'from',
 'up',
 'in',
 'out',
 'on',
 'over',
 'then',
 'there',
 'when',
 'where',
 'how',
 'all',
 'any',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'not',
 'only',
 'so',
 'than',
 'very',
 'can',
 'will',
 'just',
 'should',
 'now']

In [66]:
len(common_words)

84

In [67]:
len(stop_words)

179

In [68]:
def difference(lst1, lst2):
    """
    Return the items of lst1 that do not occur in lst2.

    The order of lst1 is preserved, and an item repeated in lst1 is repeated
    in the result.
    """
    excluded = set(lst2)
    return list(filter(lambda value: value not in excluded, lst1))

In [69]:
words_in_nltk_not_news = difference(stop_words, most_freq_clean_news)

In [70]:
words_in_nltk_not_news

['me',
 'myself',
 'ours',
 'ourselves',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'yours',
 'yourself',
 'yourselves',
 'himself',
 "she's",
 'hers',
 'herself',
 "it's",
 'itself',
 'theirs',
 'themselves',
 'whom',
 "that'll",
 'am',
 'having',
 'does',
 'doing',
 'a',
 'until',
 'between',
 'through',
 'during',
 'above',
 'below',
 'down',
 'off',
 'under',
 'again',
 'further',
 'once',
 'here',
 'why',
 'both',
 'each',
 'few',
 'nor',
 'own',
 'same',
 'too',
 's',
 't',
 'don',
 "don't",
 "should've",
 'd',
 'll',
 'm',
 'o',
 're',
 've',
 'y',
 'ain',
 'aren',
 "aren't",
 'couldn',
 "couldn't",
 'didn',
 "didn't",
 'doesn',
 "doesn't",
 'hadn',
 "hadn't",
 'hasn',
 "hasn't",
 'haven',
 "haven't",
 'isn',
 "isn't",
 'ma',
 'mightn',
 "mightn't",
 'mustn',
 "mustn't",
 'needn',
 "needn't",
 'shan',
 "shan't",
 'shouldn',
 "shouldn't",
 'wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 "won't",
 'wouldn',
 "wouldn't"]

In [71]:
words_in_news_not_nltk = difference(most_freq_clean_news, stop_words)

In [72]:
words_in_news_not_nltk

['trump',
 'said',
 '``',
 "''",
 'clinton',
 'would',
 'people',
 'one',
 'new',
 'state',
 'president',
 'also',
 'obama',
 'campaign',
 'hillary',
 'like',
 'could',
 'even',
 'time',
 'states',
 'many',
 'republican',
 'party',
 "n't",
 'first',
 'two',
 'political',
 'government',
 'election',
 'U.S.',
 'years',
 'get',
 'american',
 'world',
 'house',
 'going',
 'percent',
 '--',
 'last',
 'make',
 'country',
 'presidential',
 'think',
 'news',
 'white',
 'way',
 'say',
 'sanders',
 'donald',
 'told',
 'much',
 'democratic',
 'voters',
 'know',
 'war',
 'back',
 'US',
 'year',
 'us',
 'united',
 'republicans',
 'support',
 'national',
 'media',
 'america',
 'right']

Apart from a few tokenizer artifacts (``, '', --, n't), the remaining frequent words from the news text are heavily concentrated on political news.

Saving data

In [73]:
# Write the dataframe, including every column added in this notebook, to the
# Dataset folder without the index.
# NOTE: the list-valued columns (e.g. clean_news_tokens) are stored in the CSV
# as their text representation and must be parsed back into lists when read.
df.to_csv(os.path.join(dataset_dir,'train_news_preprocessed_mc.csv'),index=False)