# Data Pre-Processing

In [26]:
# Importing Libraries
import numpy as np
import pandas as pd

In [27]:
# Load the news-headlines CSV into a DataFrame and preview its first rows
DATASET_CSV = 'News Headlines Dataset.csv'
df = pd.read_csv(DATASET_CSV)

df.head()

Unnamed: 0,TITLE,CATEGORY
0,"Fed official says weak data caused by weather,...",Business
1,Fed's Charles Plosser sees high bar for change...,Business
2,US open: Stocks fall after Fed official hints ...,Business
3,"Fed risks falling 'behind the curve', Charles ...",Business
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,Business


In [28]:
# Distinct news categories, in order of first appearance
df['CATEGORY'].unique()

array(['Business', 'Science & Technology', 'Entertainment', 'Health'],
      dtype=object)

In [36]:
# Headlines Tokenization

# Importing Libraries
from nltk.tokenize import sent_tokenize, word_tokenize

# Lower-case every headline and split it into word/punctuation tokens;
# the result is one token list per row of df['TITLE'], in row order.
TOKENIZED_TITLES = [
    word_tokenize(headline.lower())
    for headline in df['TITLE']
]

In [37]:
# Preview: token lists of the first ten headlines, one per line
for tokens in TOKENIZED_TITLES[:10]:
    print(tokens)

['fed', 'official', 'says', 'weak', 'data', 'caused', 'by', 'weather', ',', 'should', 'not', 'slow', 'taper']
['fed', "'s", 'charles', 'plosser', 'sees', 'high', 'bar', 'for', 'change', 'in', 'pace', 'of', 'tapering']
['us', 'open', ':', 'stocks', 'fall', 'after', 'fed', 'official', 'hints', 'at', 'accelerated', 'tapering']
['fed', 'risks', 'falling', "'behind", 'the', 'curve', "'", ',', 'charles', 'plosser', 'says']
['fed', "'s", 'plosser', ':', 'nasty', 'weather', 'has', 'curbed', 'job', 'growth']
['plosser', ':', 'fed', 'may', 'have', 'to', 'accelerate', 'tapering', 'pace']
['fed', "'s", 'plosser', ':', 'taper', 'pace', 'may', 'be', 'too', 'slow']
['fed', "'s", 'plosser', 'expects', 'us', 'unemployment', 'to', 'fall', 'to', '6.2', '%', 'by', 'the', 'end', 'of', '2014']
['us', 'jobs', 'growth', 'last', 'month', 'hit', 'by', 'weather', ':', 'fed', 'president', 'charles', 'plosser']
['ecb', 'unlikely', 'to', 'end', 'sterilisation', 'of', 'smp', 'purchases', '-', 'traders']


In [38]:
# Pickling TOKENIZED_TITLES
# Persists the token lists to disk so later steps can reload them
# without re-running the tokenizer.

# Required Library
import pickle

file = "TOKENIZED_TITLES.pkl"
# The context manager closes the file even if pickle.dump raises,
# so the handle can never leak or leave a half-flushed file open.
with open(file, 'wb') as fileobj:
    pickle.dump(TOKENIZED_TITLES, fileobj)

In [18]:
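# Optional sanity check: reload the pickled token lists and preview the first five.
# Uncomment to run. Only unpickle files you created yourself.
#file = "TOKENIZED_TITLES.pkl"
#fileobj = open(file, 'rb')
#a = pickle.load(fileobj)
#fileobj.close()
#print(a[0:5])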

[['Fed', 'official', 'says', 'weak', 'data', 'caused', 'by', 'weather', ',', 'should', 'not', 'slow', 'taper'], ['Fed', "'s", 'Charles', 'Plosser', 'sees', 'high', 'bar', 'for', 'change', 'in', 'pace', 'of', 'tapering'], ['US', 'open', ':', 'Stocks', 'fall', 'after', 'Fed', 'official', 'hints', 'at', 'accelerated', 'tapering'], ['Fed', 'risks', 'falling', "'behind", 'the', 'curve', "'", ',', 'Charles', 'Plosser', 'says'], ['Fed', "'s", 'Plosser', ':', 'Nasty', 'Weather', 'Has', 'Curbed', 'Job', 'Growth']]


In [10]:
# Download 'stopwords' from nltk if using for the first time.
# This is the corpus that the stop-word removal step reads through
# nltk.corpus.stopwords; the recorded cell output shows the call returning True.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
# Removal of Stop Words & Punctuation
# Also removes the possessive token 's

# Required Libraries
from nltk.corpus import stopwords
import string

# Stop Words for English Language
stop_words = set(stopwords.words("english"))
print("Stop Words : ")
print(stop_words)

# Punctuations
punctuations = set(string.punctuation)
print("\nPunctuations : ")
print(punctuations)


def _is_punctuation(token):
    """Return True when every character of `token` is a punctuation mark.

    A plain `token in punctuations` membership test only matches
    one-character tokens, so multi-character punctuation tokens such as
    '...' or '--' would slip through the filter. Checking character by
    character removes them as well.
    """
    return all(char in punctuations for char in token)


# FILTERED_TITLES = titles without stop words, punctuation-only tokens and 's.
# One filtered token list is produced per input title, so row order and
# length still line up with TOKENIZED_TITLES.
FILTERED_TITLES = [
    [
        word
        for word in title
        if word not in stop_words
        and word != "'s"
        and not _is_punctuation(word)
    ]
    for title in TOKENIZED_TITLES
]

print("\nFiltered Titles : ")
print(FILTERED_TITLES[0:5])

Stop Words : 
{'some', 'those', 'here', 'isn', 'won', 'same', 'above', 'my', 'what', 'an', 'o', 'when', 'where', 'himself', 'its', 'but', 'then', 'had', "needn't", 'such', 'wasn', 'against', "wouldn't", "hasn't", 'whom', 'of', 'do', 'if', 'below', 'that', 'under', "shan't", 'so', 'her', 'because', 'having', 'each', 'these', 'down', 'herself', 'needn', 'few', 'no', 'between', 'hasn', 'it', 'very', 'at', 'can', 'are', 'a', 've', 'both', "she's", 'not', 'his', "didn't", 'me', "should've", 'in', 'how', 'own', 'doesn', 'myself', 'ain', 'their', 'll', 'ourselves', 'our', 'from', 'shouldn', "won't", "doesn't", 'is', 'theirs', 'now', 'was', 'until', 'which', 'has', 'they', 'and', 'nor', 'him', 'on', "wasn't", "you'll", 't', 'once', 'more', 'than', "weren't", 'i', 'be', 'by', 'were', 'd', 'didn', 'over', 'further', 'or', 'm', 's', 're', 'couldn', 'you', 'been', 'with', 'themselves', 'yours', 'this', "hadn't", 'aren', 'being', 'itself', 'weren', 'other', 'wouldn', 'doing', "aren't", 'am', 'just'

In [40]:
# Pickling FILTERED_TITLES
# Persists the stop-word/punctuation-filtered token lists to disk.

# Library already imported
# import pickle

file = "FILTERED_TITLES.pkl"
# The context manager closes the file even if pickle.dump raises,
# so the handle can never leak or leave a half-flushed file open.
with open(file, 'wb') as fileobj:
    pickle.dump(FILTERED_TITLES, fileobj)

In [45]:
# Stemming using Porter Stemmer

# Required Library
from nltk.stem import PorterStemmer

porter = PorterStemmer()

# Stem every token, then re-join each title into a single
# space-separated string (one string per filtered title).
STEMMED_TITLES_HEADLINES = [
    " ".join(porter.stem(token) for token in tokens)
    for tokens in FILTERED_TITLES
]

print("Stemmed Titles Headlines : ")
print(STEMMED_TITLES_HEADLINES[0:5])

Stemmed Titles Headlines : 
['fed offici say weak data caus weather slow taper', 'fed charl plosser see high bar chang pace taper', 'us open stock fall fed offici hint acceler taper', "fed risk fall 'behind curv charl plosser say", 'fed plosser nasti weather curb job growth']


In [46]:
# Pickling STEMMED_TITLES_HEADLINES
# Persists the stemmed, re-joined headline strings to disk.

# Library already imported
# import pickle

file = "STEMMED_TITLES_HEADLINES.pkl"
# The context manager closes the file even if pickle.dump raises,
# so the handle can never leak or leave a half-flushed file open.
with open(file, 'wb') as fileobj:
    pickle.dump(STEMMED_TITLES_HEADLINES, fileobj)