In [18]:
import pandas as pd
import time
import re

from utils.system import *

from transformers import AutoTokenizer, AutoModel
from transformers import pipeline, BertTokenizer

In [11]:
def analyze_sentiment(text):
    # Tokenize and truncate the text
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
    
    # Decode to get a truncated string
    truncated_text = tokenizer.decode(inputs[0], skip_special_tokens=True)
    
    # Perform sentiment analysis
    result = sentiment_pipeline(truncated_text)[0]
    return result['label'], result['score']

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = BertTokenizer.from_pretrained(model_name)
sentiment_pipeline = pipeline('sentiment-analysis', model=model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [None]:
data = pd.read_parquet(get_data() / 'clean_data.parquet.brotli')

In [14]:
start_time = time.time()
data['sent_article'], data['conf_article'] = zip(*data['cleaned_article'].apply(analyze_sentiment))
total_time = time.time() - start_time
print(f"Total Time: {total_time}")

Total Time: 602.0481472015381


In [15]:
# Create Sentiment Score
data['sent_score'] = data.apply(
    lambda row: 1 if row['sent_article'] == 'POSITIVE' and row['conf_article'] > 0.75 else 
    (-1 if row['sent_article'] == 'NEGATIVE' and row['conf_article'] > 0.75 else 0), axis=1)

In [16]:
data[['sent_score']].to_parquet(get_data() / 'bert_sentiment.parquet.brotli', compression='brotli')