In [1]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any pandas or deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

In [2]:
# Directory that holds the input CSVs and receives the output file.
# The original machine-specific path stays as the default; set the DATA_DIR
# environment variable to run the notebook elsewhere without editing this cell.
os.chdir(os.environ.get("DATA_DIR", "C:/Users/levi0/Downloads"))

In [3]:
# Raw news headlines, read relative to the current working directory.
news_csv = "india-news-headlines.csv"
news = pd.read_csv(news_csv)

In [4]:
# Daily index quotes, read relative to the current working directory.
stock_csv = "BSESN.csv"
stock = pd.read_csv(stock_csv)

In [5]:
# Only "Close" is kept for forecasting; the other price/volume columns are dropped
# ("Date" is not dropped here because it is needed for the merge with the news data).
stock.drop(columns = ["Open", "Low", "High", "Adj Close", "Volume"], inplace = True)

In [6]:
# Lower-case every headline so the lower-case keyword pattern can match it.
lowercased_headlines = news['headline_text'].str.lower()
news['headline_text'] = lowercased_headlines

In [7]:
# Two groups of search terms, each written as a regex alternation:
#   * companies - names of companies listed on the Bombay Stock Exchange
#   * others    - other factors such as natural calamities, inflation, the budget, etc.
# `others` starts with "|" so the two strings can be concatenated directly.
companies = "|".join([
    "bajaj", "hdfc", "kotak", "tata", "reliance", "titan", "larsen",
    "mahindra", "nestle", "icici", "maruti", "hcl", "ultratech",
    "airtel", "ntpc", "sbi", "ongc",
])
others = "|" + "|".join([
    "earthquake", "housing", "business", "budget", "flood", "gold",
    "exchange rate", "inflation", "silver",
])

In [8]:
# Single alternation pattern covering both term groups.
keywords = "".join((companies, others))

In [9]:
# Display the combined pattern to check that the two groups were joined with a single "|".
keywords

'bajaj|hdfc|kotak|tata|reliance|titan|larsen|mahindra|nestle|icici|maruti|hcl|ultratech|airtel|ntpc|sbi|ongc|earthquake|housing|business|budget|flood|gold|exchange rate|inflation|silver'

In [10]:
# Flag each headline with 1 if it contains any keyword, else 0.
# On the full dataset only 74053 headlines turn out to be relevant for forecasting.
# regex=True is spelled out because `keywords` is an alternation pattern;
# na=False makes a missing headline count as "not relevant" instead of
# producing NaN, which the integer cast below cannot convert.
news['relevant'] = (
    news['headline_text']
    .str.contains(keywords, regex=True, na=False)
    .astype(int)
)
news['relevant'].value_counts()

0    3223119
1      74053
Name: relevant, dtype: int64

In [11]:
# Convert the YYYYMMDD publish dates into pandas datetimes.
parsed_publish_dates = pd.to_datetime(news['publish_date'], format='%Y%m%d')
news['publish_date'] = parsed_publish_dates

In [12]:
# Keep only the flagged headlines. The explicit copy makes `relevant_news`
# an independent frame, so the column assignments in later cells modify it
# reliably instead of writing into a slice of `news`.
relevant_news = news[news['relevant'] == 1].copy()

In [13]:
# Renumber the rows 0..n-1 and discard the index inherited from `news`.
relevant_news = relevant_news.reset_index(drop=True)

In [14]:
# Preview the first five relevant headlines.
relevant_news.head()

Unnamed: 0,publish_date,headline_category,headline_text,relevant
0,2001-01-03,unknown,bcc holds talks on budget,1
1,2001-01-04,unknown,pil urges to seek flood relief from un,1
2,2001-01-04,unknown,close battle for mahindra golfer; 2001,1
3,2001-01-07,unknown,tops to trousers; they're going for gold!,1
4,2001-01-07,unknown,cricket board chief means business,1


In [15]:
from nltk.corpus import stopwords
# English stop words held in a set, used for membership tests when filtering headlines.
stop = {word for word in stopwords.words("english")}

In [16]:
# Remove punctuation and digits (stop words are removed in the next cell).
# regex=True is required: without it, recent pandas versions treat the
# pattern as a literal string and nothing would be stripped.
# Raw strings keep the backslashes in the patterns intact.
relevant_news['headline_text'] = (
    relevant_news['headline_text']
    .str.replace(r'[^\w\s]', '', regex=True)   # everything except word characters and whitespace
    .str.replace(r'\d+', '', regex=True)       # runs of digits
)

In [17]:
# Drop stop words from every headline and re-join the remaining words with single spaces.
# The whole column is assigned in one step: the previous element-by-element
# `frame[col][i] = ...` form is chained indexing, which may write to a temporary copy.
relevant_news['headline_text'] = relevant_news['headline_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop)
)

In [18]:
# Preview the last five cleaned headlines.
relevant_news.tail()

Unnamed: 0,publish_date,headline_category,headline_text,relevant
74048,2020-06-30,business.india-business,tata steel hit rs crore loss q,1
74049,2020-06-30,business.india-business,hdfc bank looks raise rs crore,1
74050,2020-06-30,city.ahmedabad,gold prices rise due safe investments,1
74051,2020-06-30,city.goa,cooperative banks get govt approval offer gold...,1
74052,2020-06-30,city.nagpur,one tola gold kg silver create price ratio record,1


**Finding Verbs**

In [19]:
import spacy 
import textacy 
import en_core_web_sm

# Load the spaCy pipeline from the en_core_web_sm package; `nlp` is applied to each headline below.
nlp = en_core_web_sm.load()

In [20]:
# Parse the first headline as a trial run.
doc = nlp(relevant_news.loc[0, 'headline_text'])

In [21]:
# Token pattern that matches one token whose part-of-speech tag is VERB.
pattern = [dict(POS="VERB")]

In [22]:
# Trial run of the verb pattern on the first headline; the same call is repeated
# for every headline in the loop further down, where its result is iterated.
vrb = textacy.extract.matches(doc, patterns=pattern)

In [25]:
# Start with an empty verb for every headline.
relevant_news = relevant_news.assign(verb='')

In [26]:
def _last_verb(text):
    """Return the last token tagged VERB in `text`, or '' when there is none.

    Every match overwrites the previous one, so only the final verb of a
    headline is kept (this is what the original row-by-row loop stored).
    """
    last = ''
    for verb_phrase in textacy.extract.matches(nlp(text), patterns=pattern):
        last = str(verb_phrase)
    return last


# Assign the whole column at once: the previous `frame[col][i] = ...` form is
# chained indexing, which may write to a temporary copy instead of the frame.
relevant_news['verb'] = relevant_news['headline_text'].apply(_last_verb)

In [28]:
from collections import Counter
# Count every extracted verb across all headlines and show the 20 most frequent.
verb_counts = Counter(" ".join(relevant_news["verb"]).split())
verb_counts.most_common(20)

[('held', 851),
 ('says', 831),
 ('wins', 730),
 ('gets', 663),
 ('stolen', 639),
 ('hit', 574),
 ('seized', 573),
 ('flooded', 571),
 ('robbed', 489),
 ('get', 456),
 ('hits', 427),
 ('take', 415),
 ('set', 358),
 ('rises', 342),
 ('make', 299),
 ('buy', 287),
 ('arrested', 280),
 ('takes', 263),
 ('seeks', 254),
 ('snatched', 253)]

In [29]:
# The most frequently occurring verbs were extracted separately, and new valence scores are assigned
# to them (in the lexicon update that follows) specifically for scoring financial headlines.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()

In [30]:
# Custom valence scores for finance-related words: (word, score) pairs,
# positive for favourable terms and negative for unfavourable ones.
new_words = dict([
    ('wins', 3.0),
    ('gets', 2.0),
    ('rises', 2.5),
    ('rise', 2.5),
    ('loses', -3.0),
    ('win', 3.0),
    ('raise', 2.5),
    ('raises', 2.5),
    ('jumps', 2.0),
    ('celebrates', 3.0),
    ('floodaffected', -2.5),
    ('reduce', -1.5),
    ('gain', 2.5),
    ('hit', -2.5),
    ('falls', -3.0),
    ('approves', 2.0),
    ('passed', 2.0),
    ('fail', -2.0),
    ('lost', -2.0),
])

# Add the custom scores to the analyzer's lexicon, replacing any existing entry for the same word.
for word, valence in new_words.items():
    vader.lexicon[word] = valence

In [31]:
# Compound VADER score for each cleaned headline.
# The column is built in one step so it holds numbers from the start (no ''
# placeholder) and is not filled through chained `frame[col][i] = ...`
# indexing, which may write to a temporary copy.
relevant_news['polarity'] = relevant_news['headline_text'].apply(
    lambda text: vader.polarity_scores(text)['compound']
)

In [34]:
# Preview the last five headlines with their extracted verb and polarity.
relevant_news.tail(5)

Unnamed: 0,publish_date,headline_category,headline_text,relevant,verb,polarity
74048,2020-06-30,business.india-business,tata steel hit rs crore loss q,1,hit,-0.7003
74049,2020-06-30,business.india-business,hdfc bank looks raise rs crore,1,raise,0.5423
74050,2020-06-30,city.ahmedabad,gold prices rise due safe investments,1,rise,0.7506
74051,2020-06-30,city.goa,cooperative banks get govt approval offer gold...,1,offer,0.4767
74052,2020-06-30,city.nagpur,one tola gold kg silver create price ratio record,1,create,0.2732


In [35]:
# One value per publish date: the highest polarity among that day's headlines.
# sort=False keeps the dates in their order of first appearance.
headlines_by_day = relevant_news.groupby(['publish_date'], sort=False)
g = headlines_by_day['polarity'].max()

In [36]:
# Combine prices and sentiment: left-join the per-day polarity onto the stock rows
# so that every trading day is kept, even when it has no relevant headline.
stock['Date'] = pd.to_datetime(stock['Date'])
stock['publish_date'] = stock['Date']   # join key named like the news side
stock = stock.merge(g, how='left', on=['publish_date'])

In [37]:
# Preview the first five merged rows.
stock.head()

Unnamed: 0,Date,Close,publish_date,polarity
0,2001-01-03,4060.02002,2001-01-03,0.0
1,2001-01-04,4115.370117,2001-01-04,0.4767
2,2001-01-05,4183.72998,2001-01-05,
3,2001-01-08,4120.430176,2001-01-08,
4,2001-01-09,4125.310059,2001-01-09,


In [38]:
# Trading days without any relevant headline get a neutral polarity of 0.
# Assign the result back to the column: an in-place fillna on the selected
# column may act on a temporary copy and leave `stock` unchanged.
stock['polarity'] = stock['polarity'].fillna(0)

In [39]:
# Number of missing values left in each column.
stock.isnull().sum()

Date             0
Close           74
publish_date     0
polarity         0
dtype: int64

In [40]:
# Forward-fill the remaining gaps (the missing "Close" prices) with the last known value.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") form.
stock = stock.ffill()

In [42]:
# "Date" duplicates "publish_date" after the merge, so it is removed.
stock = stock.drop(columns=["Date"])

In [43]:
# Display the final frame (Close, publish_date, polarity) before it is written to disk.
stock

Unnamed: 0,Close,publish_date,polarity
0,4060.020020,2001-01-03,0.0000
1,4115.370117,2001-01-04,0.4767
2,4183.729980,2001-01-05,0.0000
3,4120.430176,2001-01-08,0.0000
4,4125.310059,2001-01-09,0.0000
...,...,...,...
4862,35430.429688,2020-06-23,0.6369
4863,34868.980469,2020-06-24,0.0000
4864,34842.101563,2020-06-25,0.5106
4865,35171.269531,2020-06-26,0.5423


In [44]:
# Write the final dataset to the working directory without the row index.
output_csv = "Preprocessed.csv"
stock.to_csv(output_csv, index=False)