# Chapter 5: spaCy

Install the `en_core_web_trf` model from spacy with `python -m spacy download en_core_web_trf`

Acknowledgements:\
Thanks [Python Tutorials for Digital Humanities](https://www.youtube.com/channel/UC5vr5PwcXiKX_-6NTteAlXw) for the [tutorial](https://www.youtube.com/watch?v=7PD48PFL9VQ)

In [1]:
import configparser
from getpass import getuser
config = configparser.ConfigParser()
config.read(f"../config/{getuser()}.ini")
if len(config.sections()) == 0:
    config.read("../config/default.ini")
DATA_PATH = config["Data"]["path"]

import os
import pandas as pd
df = pd.read_pickle(os.path.join(DATA_PATH, "df.pkl"))
print(df.shape)

(5920, 11)


In [2]:
TRAIN_SIZE = 0.55
VALID_SIZE = 0.2
TEST_SIZE = 0.25

CHAPTER_5_FILES_PATH = "./chapter-5-files"
from pathlib import Path
Path(CHAPTER_5_FILES_PATH).mkdir(parents=True, exist_ok=True)

from sklearn.model_selection import train_test_split
from sklearn.utils import resample

FEATURE_NAME = "Body_Text"

X = df[FEATURE_NAME]
y = df["Sentiment"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=50)
        
# Undersampling the training data
# train = pd.concat([X_train, y_train], axis=1)
# train_positive = train[train["Sentiment"] == 1]
# train_neutral = train[train["Sentiment"] == 0]
# train_negative = train[train["Sentiment"] == -1]
# train_positive_resampled = resample(train_positive, n_samples=len(train_negative), random_state=70)
# train_neutral_resampled = resample(train_neutral, n_samples=len(train_negative), random_state=70)
# train_resampled = pd.concat([train_positive_resampled, train_neutral_resampled, train_negative])

# Oversampling the training data
train = pd.concat([X_train, y_train], axis=1)
train_positive = train[train["Sentiment"] == 1]
train_neutral = train[train["Sentiment"] == 0]
train_negative = train[train["Sentiment"] == -1]
train_neutral_resampled = resample(train_neutral, n_samples=len(train_positive), random_state=70)
train_negative_resampled = resample(train_negative, n_samples=len(train_positive), random_state=70)
train_resampled = pd.concat([train_positive, train_neutral_resampled, train_negative_resampled])
print(train_resampled.Sentiment.value_counts())

X_train, X_valid, y_train, y_valid = train_test_split(train_resampled[FEATURE_NAME], train_resampled["Sentiment"], test_size=VALID_SIZE/(TRAIN_SIZE+VALID_SIZE), random_state=50)

df_train = pd.concat([X_train, y_train], axis=1)
df_valid = pd.concat([X_valid, y_valid], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

df_train.to_pickle(os.path.join(CHAPTER_5_FILES_PATH, "df_train.pkl"))
df_valid.to_pickle(os.path.join(CHAPTER_5_FILES_PATH, "df_valid.pkl"))
df_test.to_pickle(os.path.join(CHAPTER_5_FILES_PATH, "df_test.pkl"))

 0    2727
 1    2727
-1    2727
Name: Sentiment, dtype: int64


In [3]:
df_train = pd.read_pickle(os.path.join(CHAPTER_5_FILES_PATH, "df_train.pkl"))
print(df_train.shape)
df_valid = pd.read_pickle(os.path.join(CHAPTER_5_FILES_PATH, "df_valid.pkl"))
print(df_valid.shape)

def map_df_to_list_of_tuples(df):
    text_sentiment_tuples = []
    for i, r in df.iterrows():
        text_sentiment_tuples.append((r[FEATURE_NAME], r["Sentiment"]))
    return text_sentiment_tuples

text_sentiment_tuples_train = map_df_to_list_of_tuples(df_train)
text_sentiment_tuples_valid = map_df_to_list_of_tuples(df_valid)

(5999, 2)
(2182, 2)


In [4]:
import spacy
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_trf")

from tqdm import tqdm

def convert_tuples_to_docbin(tuples):
    docs = []
    for doc, label in tqdm(nlp.pipe(tuples, as_tuples=True), total=len(tuples)):
        if label == -1:
            doc.cats["Negative"] = 1
            doc.cats["Neutral"] = 0
            doc.cats["Positive"] = 0
        elif label == 0:
            doc.cats["Negative"] = 0
            doc.cats["Neutral"] = 1
            doc.cats["Positive"] = 0
        elif label == 1:
            doc.cats["Negative"] = 0
            doc.cats["Neutral"] = 0
            doc.cats["Positive"] = 1
        docs.append(doc)
    return spacy.tokens.DocBin(docs=docs)

docbin_train = convert_tuples_to_docbin(text_sentiment_tuples_train)
docbin_train.to_disk(os.path.join(CHAPTER_5_FILES_PATH, "train.spacy"))
docbin_valid = convert_tuples_to_docbin(text_sentiment_tuples_valid)
docbin_valid.to_disk(os.path.join(CHAPTER_5_FILES_PATH, "valid.spacy"))

100%|█████████████████████████████████████████████████████████████████████████████| 5999/5999 [00:58<00:00, 102.12it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2182/2182 [00:21<00:00, 103.79it/s]


`cd notebooks/chapter-5-files`

`python -m spacy init fill-config base_config.cfg config.cfg`

`python -m spacy train config.cfg --output ./output --gpu-id 0`

In [1]:
CHAPTER_5_FILES_PATH = "./chapter-5-files"
import spacy
spacy.prefer_gpu()
nlp = spacy.load(CHAPTER_5_FILES_PATH + "/output/model-best")

In [2]:
import os
import pandas as pd
df_test = pd.read_pickle(os.path.join(CHAPTER_5_FILES_PATH, "df_test.pkl"))

In [3]:
FEATURE_NAME = "Body_Text"

def get_predicted_category(r):
    predictions = nlp(r[FEATURE_NAME]).cats
    prediction = list(predictions.keys())[list(predictions.values()).index(max(predictions.values()))]
    return 1 if prediction == "Positive" else 0 if prediction == "Neutral" else -1

df_test["Predicted"] = df_test.apply(lambda r: get_predicted_category(r), axis=1)

In [4]:
from sklearn.metrics import accuracy_score
accuracy_score(df_test["Sentiment"].to_list(), df_test["Predicted"].to_list())

0.6513513513513514

In [5]:
df_test[df_test["Sentiment"] != df_test["Predicted"]].head()

Unnamed: 0,Body_Text,Sentiment,Predicted
5248,"Overnight HIBOR last posted at 0.03813%, accor...",0,1
4901,"XL2CSOPHSTECH closed at HK$9.085,up 4.9%.",-1,1
6673,ASCENTAGE-B announced that the Company plans f...,0,1
4483,Under the executive order issued by US Preside...,1,0
5592,"At midday close, HSI dropped 447 pts or 1.5% t...",-1,0


Still not much better than the baseline, maybe can try different model architectures after scraping more data