In [37]:
import pandas as pd

# Load the Reuters articles CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where that exact folder exists.
df = pd.read_csv("C:\\MAIN\\NYCDSA\\Web_Scraping_Project\\Reuters\\2020_01_23_reuters_news_900_articles.csv")

In [38]:
# How many observations? df.shape is (rows, columns).
# There are 4 columns: body (text), classification, timestamp, and title
print("Dimensions of our dataframe: ",df.shape)
print("Our 4 columns are: ",df.columns)

Dimensions of our dataframe:  (910, 4)
Our 4 columns are:  Index(['body', 'classification', 'timestamp', 'title'], dtype='object')


In [39]:
# Preview the first 5 rows of the frame
df.head()

Unnamed: 0,body,classification,timestamp,title
0,VANCOUVER (Reuters) - The first phase of battl...,Technology News,1/23/2020 12:52,U.S. extradition battle over Huawei's Meng end...
1,PARIS (Reuters) - French prosecutors investiga...,Business News,1/23/2020 18:42,French investigators to move ahead with Ghosn ...
2,BRUSSELS (Reuters) - iPhone maker Apple on Thu...,Technology News,1/23/2020 16:05,"Apple pushes back against EU common charger, w..."
3,"(Reuters) - Waymo, the self-driving unit of Al...",Technology News,1/23/2020 13:57,"Waymo to test autonomous trucks, vans in Texas..."
4,LONDON (Reuters) - Britain wants a trade deal ...,Technology News,1/23/2020 8:04,Britain to United States: We want a trade deal...


In [40]:
# The timestamp column comes out of the CSV as plain strings
print("First timestamp", df["timestamp"][0], "is type", type(df["timestamp"][0]))

# Parse the month/day/year hour:minute strings into pandas datetimes
df["timestamp"] = pd.to_datetime(df["timestamp"], format='%m/%d/%Y %H:%M')
print("\nAfter conversion")
print("First timestamp", df["timestamp"][0], "is type", type(df["timestamp"][0]))

First timestamp 1/23/2020 12:52 is type <class 'str'>

After conversion
First timestamp 2020-01-23 12:52:00 is type <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [51]:
# Sort by timestamp, newest first (ascending=False gives reverse chronological order).
# The original row labels are kept, so label lookups such as df.timestamp[0]
# no longer point at the first row once this cell has run.
df = df.sort_values(by = "timestamp", ascending=False)

In [54]:
# Verify the sort: with ascending=False the head should hold the most recent articles
df.head(n=5)

Unnamed: 0,body,classification,timestamp,title
158,"(Reuters) - Chipmaker Broadcom Inc (,) said on...",Technology News,2020-01-23 21:42:00,Broadcom to supply wireless components to Apple
155,BOSTON (Reuters) - Insys Therapeutics founder ...,Business News,2020-01-23 21:14:00,Insys founder Kapoor sentenced to 66 months in...
161,"(Reuters) - Intel Corp (,) on Thursday cemente...",Business News,2020-01-23 21:08:00,Intel sees 2020 revenue above estimates as chi...
145,GEORGETOWN (Reuters) - Guyana's government nex...,Business News,2020-01-23 19:46:00,Exclusive: Guyana opening search for oil firm ...
1,PARIS (Reuters) - French prosecutors investiga...,Business News,2020-01-23 18:42:00,French investigators to move ahead with Ghosn ...


In [55]:
# Verify the sort from the other end: the tail should hold the oldest articles
df.tail(n=5)

Unnamed: 0,body,classification,timestamp,title
806,HOUSTON (Reuters) - Oil prices reached the hig...,Business News,2019-12-19 02:34:00,"Oil reaches three-month highs, supported by lo..."
811,NEW YORK (Reuters) - Global equity markets ext...,Business News,2019-12-19 00:53:00,Stocks rally anew; euro bonds rise on Swedish ...
804,NEW YORK (Reuters) - The dollar was stalled on...,Business News,2019-12-19 00:48:00,"Markets sleepy, dollar on hold ahead of U.S. G..."
802,LUXEMBOURG (Reuters) - Agreements that let Fac...,Technology News,2019-12-18 23:17:00,Facebook EU user data transfer contracts are l...
807,"WASHINGTON (Reuters) - BMW AG (,) and Daimler ...",Business News,2019-12-18 19:56:00,"Daimler, BMW exiting North American car-sharin..."


In [69]:
# Look at the body text of the top row; after the newest-first sort this is
# the most recent article.
# The column is picked by name rather than by position, so the lookup does
# not depend on "body" being the first column.
example_article = df["body"].iloc[0]
example_article

'(Reuters) - Chipmaker Broadcom Inc (,) said on Thursday it has entered into two multi-year agreements with Apple Inc (,) for the supply of wireless components used in its products. ,The agreements, "2020 SOWs", are in addition to the existing ones for supplying radio frequency components and modules to the iPhone maker and together could generate as much as $15 billion in revenue for Broadcom. ,Shares of the chipmaker rose 2% in extended trading.             '

In [77]:
import operator

def count_word_frequency(sentence):
    """Count how often each whitespace-separated word occurs in a text.

    The text is split on whitespace only, so words are compared exactly as
    written: case and any attached punctuation are significant ("The",
    "the" and "the," are three different words).

    Args:
        sentence: The text to analyse (a string).

    Returns:
        A list of ``(word, count)`` tuples (not a dict), ordered from most
        to least frequent. Words with equal counts keep the order in which
        they first appear in the text. An empty or all-whitespace string
        gives an empty list.
    """
    # Imported locally so the function does not rely on an import made elsewhere.
    from collections import Counter

    # Counter does the tallying; most_common() orders by count, descending,
    # and keeps first-seen order among ties.
    return Counter(sentence.split()).most_common()

In [78]:
import string

def remove_punctuation(sentence):
    """Return ``sentence`` with every ``string.punctuation`` character deleted.

    Punctuation is removed outright rather than replaced by a space, so the
    text on either side of a mark is joined ("multi-year" becomes
    "multiyear"). All other characters, including whitespace, are kept.
    """
    # Three-argument maketrans: the third argument lists characters to delete.
    deletion_table = str.maketrans('', '', string.punctuation)
    stripped = sentence.translate(deletion_table)
    return stripped

# Try it on the sample article. Punctuation is deleted, not replaced by a space,
# so "multi-year" comes out as "multiyear" and "$15 billion" as "15 billion".
remove_punctuation(example_article)

'Reuters  Chipmaker Broadcom Inc  said on Thursday it has entered into two multiyear agreements with Apple Inc  for the supply of wireless components used in its products The agreements 2020 SOWs are in addition to the existing ones for supplying radio frequency components and modules to the iPhone maker and together could generate as much as 15 billion in revenue for Broadcom Shares of the chipmaker rose 2 in extended trading             '

In [83]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# All of the words that we will remove: NLTK's English stop-word list, collected into a set
stop_words = set(stopwords.words('english'))
# NOTE(review): echoing the whole set prints every stop word; a small sample would keep the output short
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r