# Build Sentiment Model Via SpaCy. 

In [1]:
# Python module. 
import re, os 
import pandas as pd 
import spacy 
import shap 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix

# Change the current directory from (./notebook) to root directory. 
# The guard compares the last path component instead of regex-matching on "/", 
# so it also holds where the path separator is not "/" (e.g. Windows). With the 
# separator-specific regex the guard never matched there and the cell climbed 
# two levels even when it was already sitting in the project root. 
# This must run before the `source.*` imports below it in this cell. 
if os.path.basename(os.getcwd()) != "MADS-M2-estimating-news-impact-on-financial-market": 
	os.chdir("../..") 

# For clearing warnings that are safe to ignore. Not important. 
from IPython.display import clear_output 

# Custom modules. 
from source.modules.manage_files import ManageFiles
from source.modules.processor_estim import ExtractSentiment 
from source.modules.processor_spacy import (
	to_spacy_document, sentiment_predictor, token_wrapper 
)

# Custom configs. 
from source.config_py.config import DIR_MLSPACY, PARAM_SEED

# Preview: print the current working directory so it can be checked against 
# the expected project root after the directory change earlier in this cell. 
print(os.getcwd()) 

  from .autonotebook import tqdm as notebook_tqdm


/Users/lioneltay/Dropbox/Courses/michigan_mads/SIADS_694_695_milestone_2_Eric_Gilbert/submission/MADS-M2-estimating-news-impact-on-financial-market


## Configurations (general). 

In [2]:
# Pandas display limits: up to 50 rows, 50 columns and 200 characters per cell. 
pd.set_option("display.max_rows", 50) 
pd.set_option("display.max_columns", 50) 
pd.set_option("display.max_colwidth", 200) 

# Base SpaCy pipeline, kept under the conventional name (nlp). 
nlp = spacy.load("en_core_web_sm") 

# Project helper that the cells below use for reading and saving files. 
manage_files = ManageFiles() 

# Wipe anything the statements above printed. Cosmetic only. 
clear_output()

## Load sentiment dataset. 

In [3]:
# Load dataset. 
# The file is read with (encoding="latin-1"); per the original note it contains 
# special characters that otherwise raise an error, and ignoring them does not 
# affect the later analysis. 
# The file's columns are named (sentiment, headline) on read, then reordered so 
# that the headline comes first. 
colnames = ["sentiment", "headline"] 
df_sentiment = manage_files.read_from_csv(
	filename="sentiment_news.csv", 
	names=colnames, 
	encoding="latin-1", 
).loc[:, ["headline", "sentiment"]] 

# Preview. 
df_sentiment

Read from (sentiment_news.csv)


Unnamed: 0,headline,sentiment
0,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",neutral
1,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .",neutral
2,The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers ...,negative
3,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profi...,positive
4,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net ...",positive
...,...,...
4841,LONDON MarketWatch -- Share prices ended lower in London Monday as a rebound in bank stocks failed to offset broader weakness for the FTSE 100 .,negative
4842,"Rinkuskiai 's beer sales fell by 6.5 per cent to 4.16 million litres , while Kauno Alus ' beer sales jumped by 6.9 per cent to 2.48 million litres .",neutral
4843,"Operating profit fell to EUR 35.4 mn from EUR 68.8 mn in 2007 , including vessel sales gain of EUR 12.3 mn .",negative
4844,"Net sales of the Paper segment decreased to EUR 221.6 mn in the second quarter of 2009 from EUR 241.1 mn in the second quarter of 2008 , while operating profit excluding non-recurring items rose t...",negative


## Reformat the data structure according to SpaCy requirements. 

### Split into train, validation and test sets. 

In [4]:
# Features stay a one-column frame (headline); the target is the sentiment series. 
X = df_sentiment[["headline"]] 
y = df_sentiment["sentiment"] 

# Stage 1: hold out 20% of the rows, stratified on the sentiment label. 
X_train, X_valid, y_train, y_valid = train_test_split(
	X, y, 
	test_size=0.2, 
	random_state=PARAM_SEED, 
	shuffle=True, 
	stratify=y, 
) 

# Stage 2: split 10% of that hold-out off as the test set; the remainder is 
# kept as the validation set (the names X_valid / y_valid are rebound here). 
X_valid, X_test, y_valid, y_test = train_test_split(
	X_valid, y_valid, 
	test_size=0.1, 
	random_state=PARAM_SEED, 
	shuffle=True, 
	stratify=y_valid, 
) 

# Shapes of the three feature frames, then of the three label series. 
print(
	*(part.shape for part in (X_train, X_valid, X_test)), 
	"|", 
	*(part.shape for part in (y_train, y_valid, y_test)), 
) 

(3876, 1) (873, 1) (97, 1) | (3876,) (873,) (97,)


In [5]:
# Build lists of (headline, sentiment) records for the TRAIN and VALIDATION 
# sets, to be converted into SpaCy format later. 
trainset = list(pd.concat([X_train, y_train], axis=1).to_records(index=False)) 
validset = list(pd.concat([X_valid, y_valid], axis=1).to_records(index=False)) 

# Preview the first two records of each list. 
(trainset[:2], validset[:2]) 

([('The major breweries increased their domestic beer sales by 4.5 per cent last year , to 256.88 million litres from 245.92 million litres in 2004 .', 'positive'),
  ('CapMan , an asset manager , has EUR 3bn worth of assets under management in the Nordic region .', 'neutral')],
 [('Its market share widened to 48.51 percent from 48.31 percent a year earlier .', 'positive'),
  ('The GyPSii mobile social networking application is available in China with both Chinese and English language support .', 'neutral')])

### Reformat the data and save the train and validation sets in SpaCy format. 

In [6]:
# # Uncomment this part to re-save the dataset. 
# # Save the dataset in SpaCy format. 
# manage_files.save_to_spacy(data=trainset, filename="sentiment_trainset.spacy", nlp=nlp) 
# manage_files.save_to_spacy(data=validset, filename="sentiment_validset.spacy", nlp=nlp) 

## Train the SpaCy model & Model evaluation. 

In [7]:
# # Uncomment this part to re-train the model. 
# # May take less than 7 minutes for training. 
# !python -m spacy train source/config_spacy/config_tp.cfg \
# 	--verbose  \
# 	--output model/spacy_sentiment 

## Predict examples. 

In [8]:
# Hand-written example headlines used to smoke-test the trained sentiment model. 
# NOTE(review): each triple-quoted literal keeps its leading/trailing newline and 
# the tab indentation of the continuation lines, so the model receives that 
# whitespace as part of the text - confirm this is intended. 
headlines = [
	'''
	Many participants noted that one or more 50 basis point increases in the 
	target range could be appropriate at future meetings, particularly if 
	inflation pressures remained elevated or intensified.
	''', 
	'''
	The yield curve inversion is spooking the markets, although its a recession 
	predictor, history shows it may not be time to sell.
	''', 
	'''
	Yields on the benchmark 10-year OFZ ruble treasury bonds spiked to 19.7% 
	in early pre-market trade, an all-time high, but had settled to around 14% 
	by the end of the trading session.
	''', 
	'''
	U.S. two-year Treasury yields climbed to their highest level since early 2019 
	on Monday, continuing to push higher on expectations that the Federal Reserve 
	will deliver bigger rate hikes in the months ahead to tame inflation.
	''', 
	'''
	Apple earnings: Huge iPhone 12 sales beat analyst expectations.
	''', 
	'''
	Australia largest airline temporarily lays off 2,500 employees. 
	''', 
]

# Load the best checkpoint stored under the SpaCy model directory. 
mlpipe_spacy = spacy.load(f"{DIR_MLSPACY}/model-best") 

# Score every example headline and print: index, highest-scoring label, all scores. 
for i, headline in enumerate(headlines): 
	spcy_doc = mlpipe_spacy(headline) 
	# (cats) maps each label to its score; pick the label with the largest one. 
	top_label = max(spcy_doc.cats, key=spcy_doc.cats.get) 
	print(i, "| Sentiment: ", top_label, "|", spcy_doc.cats) 

0 | Sentiment:  neutral | {'positive': 8.496724149154034e-06, 'negative': 7.220386777362364e-08, 'neutral': 0.9999914169311523}
1 | Sentiment:  neutral | {'positive': 9.705704684392913e-08, 'negative': 4.645928584068315e-07, 'neutral': 0.9999994039535522}
2 | Sentiment:  neutral | {'positive': 0.1338992863893509, 'negative': 5.817384226247668e-05, 'neutral': 0.8660424947738647}
3 | Sentiment:  neutral | {'positive': 0.14268331229686737, 'negative': 0.2341422438621521, 'neutral': 0.6231744289398193}
4 | Sentiment:  neutral | {'positive': 2.5521835596009623e-06, 'negative': 5.80230334890075e-07, 'neutral': 0.9999969005584717}
5 | Sentiment:  negative | {'positive': 0.12900400161743164, 'negative': 0.8643807172775269, 'neutral': 0.0066153123043477535}


In [9]:
# Predict the sentiment of every validation headline with the trained pipeline. 
extractor_topic = ExtractSentiment(est_pipe=mlpipe_spacy, var_proc="headline") 
predicted = extractor_topic.fit_transform(X=X_valid) 

# Integer code for each class name. The insertion order of this dict is also 
# the row / column order of the confusion matrix built below. 
mapper_classes = {
	"positive": 0,
	"neutral" : 1,
	"negative": 2,
} 

# NOTE(review): the column assignments below align on the index, so (predicted) 
# is assumed to carry the same index as (X_valid) - confirm against ExtractSentiment. 
df_predicted = X_valid.copy() 
df_predicted["predicted"] = predicted["sentiment"].map(mapper_classes) 
df_predicted["sentiment"] = y_valid.map(mapper_classes) 

# Observe the prediction. 
# (labels) is pinned to the mapper codes so the matrix always has one row and one 
# column per class, in mapper order. Without it the matrix is built only from the 
# codes that occur in the data, and wrapping it with the three class names below 
# fails whenever a class is absent from both columns. 
conf_mat = confusion_matrix(
	df_predicted["sentiment"], 
	df_predicted["predicted"], 
	labels=list(mapper_classes.values()), 
) 
conf_mat = pd.DataFrame(conf_mat, index=list(mapper_classes), columns=list(mapper_classes)) 
conf_mat.index.name, conf_mat.columns.name = "true", "predicted" 

# Preview. 
conf_mat 

predicted,positive,neutral,negative
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
positive,155,80,11
neutral,49,457,12
negative,7,33,69


In [10]:
# Class labels as declared by the text-categorizer component of the pipeline. 
classes = list(mlpipe_spacy.get_pipe("textcat").labels) 

# Pick the headline to explain. Per the original note, only a single headline 
# can be evaluated at a time, hence the one-row slice of the first column. 
row = 0 
single_row_text = X_valid.iloc[row:row+1, 0].values 

# Build the Shap Explainer. Notes carried over from the original cell: 
# - (sentiment_predictor) is the "model" function, adapted to a transformers-like model. 
# - (text_masker) wraps (token_wrapper), a transformers-like tokenizer, for shap. 
# - (algorithm) is "permutation", the one used for transformers models. 
# - (output_names) is passed here but is not propagated by the permutation 
#   explainer, so it is assigned again on the result further down. 
# - (max_evals) is set high to reduce the chance of the explainer failing on 
#   texts with too many tokens. 
text_masker = shap.maskers.Text(token_wrapper) 
explainer = shap.Explainer(
	sentiment_predictor,
	masker=text_masker,
	algorithm="permutation",
	output_names=classes,
	max_evals=1500,
)

# Explain the selected headline, attach the class labels, and render the text plot. 
shap_values = explainer(single_row_text) 
shap_values.output_names = classes 
shap.plots.text(shap_values) 