In [1]:
import pandas as pd
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

2023-11-29 18:00:42.292790: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read in the data
# `names=` is supplied, so the first row is read as data; header=None only makes
# that explicit.
# Decode as Latin-1 rather than 'unicode_escape': the latter decodes the same
# bytes but additionally interprets backslash sequences, so a literal backslash
# inside a sentence would be corrupted or raise a decode error.
df = pd.read_csv(
    'all-data.csv',
    encoding='latin-1',
    header=None,
    names=['sentiment', 'text'],
)
df

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [3]:
# Show the size of the loaded frame as (rows, columns)
df.shape

(4846, 2)

In [4]:
# Pull the two columns out as plain Python lists:
# X holds the sentences, y holds the matching sentiment labels.
X = df['text'].tolist()
y = df['sentiment'].tolist()

In [5]:
# Load the FinBERT tokenizer and sequence-classification model.
# The checkpoint name is written once so both objects always come from the same source.
FINBERT_CHECKPOINT = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

In [6]:
from tqdm import tqdm

# Predict the sentiment of every text in X, one text per forward pass.
predictions = []       # predicted label name for each text
prediction_probs = []  # softmax probability of that predicted label

tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': 512}

for x in tqdm(X, desc="Processing", colour="green"):
    with torch.no_grad():  # inference only, no gradients are needed
        input_sequence = tokenizer(x, return_tensors="pt", **tokenizer_kwargs)
        logits = model(**input_sequence).logits
        # Softmax over the class dimension, then drop the leading batch axis.
        # torch is used instead of scipy.special so the cell does not rely on
        # the scipy submodule having been loaded implicitly by another package.
        probabilities = torch.softmax(logits, dim=-1).squeeze(0).tolist()
    # Look each label up by its class index so the label/probability pairing
    # does not depend on the iteration order of the id2label dict.
    scores = {
        model.config.id2label[class_id]: probability
        for class_id, probability in enumerate(probabilities)
    }
    sentimentFinbert = max(scores, key=scores.get)
    probabilityFinbert = scores[sentimentFinbert]
    predictions.append(sentimentFinbert)
    prediction_probs.append(probabilityFinbert)


Processing: 100%|[32m██████████[0m| 4846/4846 [04:25<00:00, 18.28it/s]


In [7]:
# Overall accuracy of the predicted labels against the labels from the CSV
from rich import print
accuracy = accuracy_score(y, predictions)
print(f"Accuracy: {accuracy:.3f}")

In [8]:
# Build the text report comparing y with the predictions, then print it
report = classification_report(y, predictions)
print(report)

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import pickle

# Define the FinbertClassifier class
class FinbertClassifier(BaseEstimator, TransformerMixin):
    """Scikit-learn style wrapper around a pretrained sequence classifier.

    Parameters
    ----------
    model_path : str, default="ProsusAI/finbert"
        Checkpoint name or path passed to ``from_pretrained`` for both the
        tokenizer and the model. Both are loaded eagerly in ``__init__``, so a
        pickled instance carries them along.
    """

    def __init__(self, model_path="ProsusAI/finbert"):
        # BaseEstimator.get_params() (used by clone() and the estimator repr)
        # reads every __init__ argument back from an attribute of the same
        # name, so the argument has to be stored.
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``; the wrapped model is already trained.

        Present so the object can be used as the final step of a Pipeline.
        """
        return self

    def predict(self, texts, batch_size=32):
        """Return the predicted class index for each text.

        Parameters
        ----------
        texts : str or iterable of str
            A single text or a collection of texts to classify.
        batch_size : int or None, default=32
            Number of texts tokenized and passed through the model at once.
            ``None`` (or a non-positive value) processes everything in a
            single batch.

        Returns
        -------
        numpy.ndarray
            One integer class index per text (argmax over the logits). These
            are indices, not label names; ``self.model.config.id2label`` is
            the mapping the model config provides for them.
        """
        # Slicing below needs a real list; a lone string is one text, not a
        # sequence of characters.
        texts = [texts] if isinstance(texts, str) else list(texts)
        if not texts:
            # Nothing to classify: return an empty integer array.
            return torch.empty(0, dtype=torch.long).numpy()
        if batch_size is None or batch_size <= 0:
            batch_size = len(texts)

        batch_predictions = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            with torch.no_grad():  # inference only
                logits = self.model(**inputs).logits
            batch_predictions.append(torch.argmax(logits, dim=-1))
        return torch.cat(batch_predictions).numpy()

# Create a pipeline
from pathlib import Path

finbert_pipeline = Pipeline([
    ('finbert', FinbertClassifier())
])

# Save the pipeline
# The pickle holds the whole pipeline object, including the tokenizer and model
# stored on the FinbertClassifier instance.
pipeline_path = Path("models/finbert_pipeline.pkl")
# Create the target directory first; open(..., "wb") fails if it is missing.
pipeline_path.parent.mkdir(parents=True, exist_ok=True)
with open(pipeline_path, "wb") as f:
    pickle.dump(finbert_pipeline, f)


In [10]:


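# Disabled example: reload the pickled pipeline and run it on the dataset.
# Only unpickle files you created yourself; pickle.load can execute arbitrary code.
# Note that this would overwrite `predictions` with integer class indices.
# with open("models/finbert_pipeline.pkl", "rb") as f:
#     finbert_pred_pipeline = pickle.load(f)

# predictions = finbert_pred_pipeline.predict(df['text'].to_list())
# predictions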
