# Build Sentiment Model Via SpaCy. 

In [1]:
# Python module. 
import re, os 
import pandas as pd
import spacy 
from sklearn.model_selection import train_test_split 

# Move up two levels to the project root, unless the kernel is already there. 
# Only the last path component is compared, so the check does not depend on the 
# platform's path separator and re-running this cell cannot climb above the root. 
if os.path.basename(os.getcwd()) != "MADS-M2-estimating-news-impact-on-financial-market": 
	os.chdir("../..") 

# For clearing safe warnings. Not important. 
from IPython.display import clear_output 

# Custom modules. 
from source.modules.manage_files import ManageFiles
from source.modules.processor_spacy import to_spacy_document 

# Custom configs. 
from source.config_py.config import DIR_MLSPACY, PARAM_SEED

# Preview: show the working directory after the optional chdir above. 
# It should be the project root, which the `source.*` imports are resolved against. 
print(os.getcwd()) 

/Users/lioneltay/Dropbox/Courses/michigan_mads/SIADS_694_695_milestone_2_Eric_Gilbert/submission/MADS-M2-estimating-news-impact-on-financial-market


## Configurations (general). 

In [2]:
# Pandas display limits: cap the rows/columns shown and widen text columns. 
pd.set_option("display.max_rows", 50) 
pd.set_option("display.max_columns", 50) 
pd.set_option("display.max_colwidth", 200) 

# SpaCy pipeline used while preparing the data. 
nlp = spacy.load("en_core_web_sm") 

# Project helper used below for reading and saving files. 
manage_files = ManageFiles() 

# Wipe whatever the setup above printed. Cosmetic only. 
clear_output()

## Load sentiment dataset. 

In [3]:
# Read the sentiment CSV, supplying the column names explicitly. 
# encoding="latin-1" is required: the file contains special characters that 
# otherwise raise an error. Those characters can be ignored; they do not 
# affect the analysis later. 
colnames = ["sentiment", "headline"] 
df_sentiment = manage_files.read_from_csv(
	filename="sentiment_news.csv", 
	names=colnames, 
	encoding="latin-1", 
) 

# Put the text column first and the label second. 
df_sentiment = df_sentiment.loc[:, ["headline", "sentiment"]] 

# Preview. 
df_sentiment

Read from (sentiment_news.csv)


Unnamed: 0,headline,sentiment
0,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",neutral
1,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .",neutral
2,The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers ...,negative
3,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profi...,positive
4,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net ...",positive
...,...,...
4841,LONDON MarketWatch -- Share prices ended lower in London Monday as a rebound in bank stocks failed to offset broader weakness for the FTSE 100 .,negative
4842,"Rinkuskiai 's beer sales fell by 6.5 per cent to 4.16 million litres , while Kauno Alus ' beer sales jumped by 6.9 per cent to 2.48 million litres .",neutral
4843,"Operating profit fell to EUR 35.4 mn from EUR 68.8 mn in 2007 , including vessel sales gain of EUR 12.3 mn .",negative
4844,"Net sales of the Paper segment decreased to EUR 221.6 mn in the second quarter of 2009 from EUR 241.1 mn in the second quarter of 2008 , while operating profit excluding non-recurring items rose t...",negative


## Reformat the data structure according to SpaCy's requirements. 

### Split into train and test sets. 

In [4]:
# Features stay a one-column DataFrame; labels are a Series. 
X = df_sentiment[["headline"]] 
y = df_sentiment["sentiment"] 

# Shuffled split, stratified on the label, with a fixed seed: 33% goes to the test set. 
X_train, X_test, y_train, y_test = train_test_split(
	X, 
	y, 
	test_size=0.33, 
	random_state=PARAM_SEED, 
	shuffle=True, 
	stratify=y, 
) 

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3246, 1) (1600, 1) (3246,) (1600,)


In [5]:
# Pair every headline with its label, then flatten each split into a list 
# of (headline, sentiment) records to convert into SpaCy format later. 
trainset = list(pd.concat([X_train, y_train], axis=1).to_records(index=False)) 
testset = list(pd.concat([X_test, y_test], axis=1).to_records(index=False)) 

# Preview. 
trainset[:2], testset[:2] 

([('No blind-spots coming from 1 vantage point allows investigators to see the whole story in high resolution with no unbroken video .', 'neutral'),
  ('International sales rose by 59.8 % to EUR 1,244.4 mn .', 'positive')],
 [("Markets had been expecting a poor performance , and the company 's stock was up 6 percent at  x20ac 23.89 US$ 33.84 in early afternoon trading in Helsinki .", 'positive'),
  ("The core of Solidium 's investment strategy is proper , value enhancing asset management of the current holdings .", 'positive')])

### Reformat the data and save the train and test sets in SpaCy format. 

In [6]:
# # Uncomment this part to re-save the dataset. 
# # Save the dataset in SpaCy format. 
# manage_files.save_to_spacy(data=trainset, filename="sentiment_trainset.spacy", nlp=nlp) 
# manage_files.save_to_spacy(data=testset, filename="sentiment_testset.spacy", nlp=nlp) 

## Train and evaluate the SpaCy model. 

In [7]:
# # Uncomment this part to re-train the model. 
# # May take less than 7 minutes for training. 
# !python -m spacy train source/config_spacy/config_tp.cfg \
# 	--verbose  \
# 	--output model/spacy_sentiment 

## Predict examples. 

In [8]:
# Hand-written example headlines used to try out the trained model. 
# NOTE: each triple-quoted literal keeps its leading/trailing newline and the 
# tab indentation of every line, so that whitespace is part of the text passed 
# to the model as-is. 
headlines = [
	'''
	Many participants noted that one or more 50 basis point increases in the 
	target range could be appropriate at future meetings, particularly if 
	inflation pressures remained elevated or intensified.
	''', 
	'''
	The yield curve inversion is spooking the markets, although its a recession 
	predictor, history shows it may not be time to sell.
	''', 
	'''
	Yields on the benchmark 10-year OFZ ruble treasury bonds spiked to 19.7% 
	in early pre-market trade, an all-time high, but had settled to around 14% 
	by the end of the trading session.
	''', 
	'''
	U.S. two-year Treasury yields climbed to their highest level since early 2019 
	on Monday, continuing to push higher on expectations that the Federal Reserve 
	will deliver bigger rate hikes in the months ahead to tame inflation.
	''', 
	'''
	Apple earnings: Huge iPhone 12 sales beat analyst expectations.
	''', 
	'''
	Australia largest airline temporarily lays off 2,500 employees. 
	''', 
]

# Load the trained sentiment pipeline from the `model-best` folder under DIR_MLSPACY. 
# This rebinds `nlp`, replacing the `en_core_web_sm` pipeline loaded earlier. 
nlp = spacy.load(f"{DIR_MLSPACY}/model-best") 

# Score each headline and print the highest-scoring category next to the raw scores. 
for i, headline in enumerate(headlines): 
	spcy_doc = nlp(headline) 
	scores = spcy_doc.cats 
	top_label = max(scores, key=scores.get) 
	print(i, "| Sentiment: ", top_label, "|", scores) 



0 | Sentiment:  neutral | {'positive': 0.14875653386116028, 'negative': 0.021231312304735184, 'neutral': 0.8300121426582336}
1 | Sentiment:  neutral | {'positive': 0.001358339563012123, 'negative': 0.0002256576990475878, 'neutral': 0.9984159469604492}
2 | Sentiment:  positive | {'positive': 0.9396674036979675, 'negative': 0.021503876894712448, 'neutral': 0.038828689604997635}
3 | Sentiment:  neutral | {'positive': 0.3172284662723541, 'negative': 0.013310641050338745, 'neutral': 0.6694609522819519}
4 | Sentiment:  neutral | {'positive': 0.04883705824613571, 'negative': 0.003382651833817363, 'neutral': 0.9477802515029907}
5 | Sentiment:  negative | {'positive': 0.00034739868715405464, 'negative': 0.9996166229248047, 'neutral': 3.601448770496063e-05}
