In [57]:
import transformers
import os
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType
from web_scraper import scrape
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
transformers.logging.set_verbosity_info()
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

In [58]:
# get sentence from the web scraper module
url ='https://www.gq.com/story/wrexham-fc-ryan-reynolds-rob-mcelhenney'
sentences = scrape(url)
print(sentences[:2])

['Ryan Reynolds, restless as a trapped cat, broke off his pacing and stooped to peer through plate glass balcony doors', 'He was in an owners’ lounge, high to one side of a soccer stadium in the city of Wrexham in Wales, a few miles west of the Welsh-English border']


In [59]:
# creating a function to handle neutral sentences
def classify_neutral(sentence, model, threshold=0.70):
    result = model(sentence)[0]
    if result['score'] < threshold:
        return 'neutral'
    return result['label'].lower()

In [60]:
# Automating the annotation of the sentence using hugging face
sentiment_model = pipeline('sentiment-analysis')
labeled_sentences = [(sentence, classify_neutral(sentence, sentiment_model)) for sentence in sentences]

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
loading configuration file config.json from cache at /home/imisioluwa/.cache/huggingface/hub/models--distilbert--distilbert-base-uncased-finetuned-sst-2-english/snapshots/714eb0fa89d2f80546fda750413ed43d93601a13/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_positi

In [61]:
# Splitting dataset
text, label = zip(*labeled_sentences)
x_train, x_test, y_train, y_test = train_test_split(text, label, test_size=0.2, random_state=42)

In [62]:
# Training a svm model
svm_model = make_pipeline(TfidfVectorizer(), SVC(kernel='rbf', gamma=1.8, C=1.8))
svm_model.fit(x_train, y_train)
accuracy = svm_model.score(x_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.68


In [63]:
# Naive Bayes model for A/B testing
nb_model = make_pipeline(TfidfVectorizer(), MultinomialNB())
nb_model.fit(x_train, y_train)
accuracy = nb_model.score(x_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.65


# A/B Testing
From the above description after testing the model on the same test dataset it is clear that the svm model outperforms the naive bayes model which is the baseline model
Dataset size = 200 sentences
Accuracy score for naive bayes = 0.65
Accuracy score for svm = 0.68
while a 3% increase might not be too significant this is due to the smaller dataset that was used to perform this test

In [64]:
# Using onnx for optimizing svm model for inferencing
initial_type = [("input", StringTensorType([None]))]
onnx_model = convert_sklearn(svm_model, initial_types=initial_type)

onnx_model_path = "svm_model.onnx"
with open(onnx_model_path, "wb") as f:
    f.write(onnx_model.SerializeToString())