In [None]:
# Standard library
import os

# TensorFlow reads these variables when it is first imported, so they must be
# set BEFORE the transformers imports below (which may import TensorFlow when
# it is installed). Setting them afterwards can leave them without effect.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"   # hide TensorFlow's C++ INFO/WARNING/ERROR logs
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"  # turn off oneDNN custom operations

# Third-party
import transformers
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

# Project scraper module
from web_scraper import scrape

# Show info-level messages from the transformers library (downloads, model loading).
transformers.logging.set_verbosity_info()

In [None]:
# Fetch the article's sentences through the web scraper module
url = 'https://www.gq.com/story/wrexham-fc-ryan-reynolds-rob-mcelhenney'
sentences = scrape(url)

# Fail fast with a clear message: an empty scrape (network error, page layout
# change) would otherwise only surface later as an opaque unpacking error
# when the dataset is split.
assert len(sentences) > 0, f"scrape() returned no sentences for {url}"

# Preview the first two sentences
print(sentences[:2])

In [None]:
# Helper that turns low-confidence predictions into a 'neutral' label
def classify_neutral(sentence, model, threshold=0.85):
    """Return 'neutral' for a low-confidence prediction, else the model's label.

    Args:
        sentence: Input handed unchanged to ``model``.
        model: Callable whose result is indexable; element 0 must be a mapping
            with ``'score'`` and ``'label'`` entries.
        threshold: Score below which the prediction is treated as neutral.

    Returns:
        ``'neutral'`` when the first prediction's score is below ``threshold``;
        otherwise that prediction's label converted to lower case.
    """
    top_prediction = model(sentence)[0]
    return (
        'neutral'
        if top_prediction['score'] < threshold
        else top_prediction['label'].lower()
    )

In [None]:
# Automatically annotate every scraped sentence with a Hugging Face sentiment model.
# The checkpoint is named explicitly so the generated labels do not depend on
# whichever default model the installed transformers version selects.
# NOTE(review): this is believed to be the checkpoint the unpinned
# 'sentiment-analysis' pipeline defaulted to — confirm against the
# "No model was supplied, defaulted to ..." message from earlier runs.
SENTIMENT_MODEL_NAME = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
sentiment_model = pipeline('sentiment-analysis', model=SENTIMENT_MODEL_NAME)

# Each entry is a (sentence, label) pair; low-confidence predictions become 'neutral'.
labeled_sentences = [(sentence, classify_neutral(sentence, sentiment_model)) for sentence in sentences]

In [None]:
# Count how many sentences were labelled 'neutral'.
# A generator expression keeps the loop variables local, so this cell no longer
# rebinds `_` (IPython's last-output shortcut) or leaks `label` into the
# notebook namespace.
count = sum(
    1
    for _sentence, sentence_label in labeled_sentences
    if sentence_label == 'neutral'
)
print(count)

In [None]:
# Separate the sentences from their labels, then hold out 20% of the pairs for evaluation
text, label = zip(*labeled_sentences)
x_train, x_test, y_train, y_test = train_test_split(
    text,
    label,
    random_state=42,  # fixed seed so the split is repeatable
    test_size=0.2,
)

In [None]:
# SVM classifier: TF-IDF features feeding an RBF-kernel support vector machine.
# fit() returns the fitted pipeline, so construction and training are chained.
svm_model = make_pipeline(
    TfidfVectorizer(),
    SVC(C=2.0, gamma=2.0, kernel='rbf'),
).fit(x_train, y_train)

# Mean accuracy on the held-out split
accuracy = svm_model.score(x_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")

In [None]:
# Baseline for the A/B comparison: TF-IDF features feeding multinomial Naive Bayes
nb_model = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB(),
)
nb_model.fit(x_train, y_train)

# Scored on the same held-out split as the SVM
accuracy = nb_model.score(x_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")

# A/B Testing
Both models were evaluated on the same held-out test set, and the SVM model outperforms the Naive Bayes model, which serves as the baseline.

- Dataset size = 200 sentences
- Accuracy score for Naive Bayes = 0.60
- Accuracy score for SVM = 0.65

While a 5-percentage-point improvement may not be significant, this is likely due to the small dataset used for this test: with a 20% hold-out, the test set contains only 40 sentences, so the gap corresponds to just two sentences.

In [None]:
# Export the fitted SVM pipeline to ONNX so it can be served for inference.
# The converter is told to expect a single string tensor named "input".
initial_type = [
    ("input", StringTensorType([None])),
]
onnx_model = convert_sklearn(svm_model, initial_types=initial_type)

# Write the serialized model next to the notebook
onnx_model_path = "svm_model.onnx"
with open(onnx_model_path, mode="wb") as f:
    f.write(onnx_model.SerializeToString())