In [None]:
!pip install ipywidgets

In [1]:
from transformers import pipeline

pipe = pipeline("text-classification", model="ProsusAI/finbert")
prompt="I am going to my friends place"
result=pipe(prompt)

print("Sentiment: "+str(result))



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Sentiment: [{'label': 'neutral', 'score': 0.9137189388275146}]


In [23]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

def preprocess_tweet(text):
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'RT[\s]+', '', text)
    text = re.sub(r'#', '', text)
    return text

In [41]:
try:
    df = pd.read_csv('/projectnb/cs505ws/students/praneshj/final-project/data/train.csv', encoding='ISO-8859-1', usecols=['Sentiment', 'SentimentText'])
except UnicodeDecodeError as e:
    print(f"Error reading file: {e}")
df['SentimentText'] = df['SentimentText'].apply(preprocess_tweet)

In [55]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

Using device: cuda:1


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [44]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [45]:
len(train_df), len(test_df)

(79991, 19998)

In [46]:
class SentimentDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['SentimentText']
        label = self.data.iloc[idx]['Sentiment']
        inputs = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors="pt")
        return {
            'input_ids': inputs['input_ids'][0],
            'attention_mask': inputs['attention_mask'][0],
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [47]:
train_df

Unnamed: 0,Sentiment,SentimentText
58519,1,"if you can't sleep, I suggest watching the &q..."
38238,1,ii was also at school. =] haha was just remin...
3806,1,=- La La la. I love country music.
27925,1,gmornin my little madeleine cake! gotta love...
6006,0,cheapspeakers have everybody at rancho dancing...
...,...,...
6265,0,dontyouhate when you try to be sumbody everyth...
54886,1,Are you really???? Bonus! We can be new in to...
76820,0,s'not fair I'm going to do overtime after my...
860,0,Dinara lost again in Roland Garros. Why the S...


In [49]:
from torch.utils.data import Dataset, DataLoader

train_dataset = SentimentDataset(train_df)
test_dataset = SentimentDataset(test_df)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [50]:
train_dataset.__getitem__(0)

{'input_ids': tensor([  101,  2065,  2017,  2064,  1005,  1056,  3637,  1010,  1045,  6592,
          3666,  1996,  1004, 22035,  2102,  1025,  6513,  3325,  1004, 22035,
          2102,  1025,  1010,  2008,  2323,  2079,  2009,   999,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [53]:
from torch.optim import AdamW

model.train()
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 1

In [57]:
from tqdm import tqdm

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Average Training Loss: {avg_loss}")

# Evaluation Loop
model.eval()
total_eval_loss = 0

for batch in tqdm(test_dataloader, desc="Evaluating"):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_eval_loss += loss.item()

avg_test_loss = total_eval_loss / len(test_dataloader)
print(f"Average Test Loss: {avg_test_loss}")

# Save the model
model.save_pretrained('/projectnb/cs505ws/students/praneshj/final-project/model/finBert_fine_tuned')

Epoch 1/1 Training: 100%|██████████| 9999/9999 [37:41<00:00,  4.42it/s]


Average Training Loss: 2.9825707631583738


Evaluating: 100%|██████████| 2500/2500 [03:20<00:00, 12.46it/s]


Average Test Loss: 3.0921369310379028


In [64]:
new_lr = 0.0005
for param_group in optimizer.param_groups:
    param_group['lr'] = new_lr

In [76]:
import torch
from tqdm import tqdm

def evaluate_model(model, dataloader, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            _, predicted = torch.max(outputs.logits, -1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy

def train_model(model, train_dataloader, test_dataloader, optimizer, num_epochs=10, patience=50, model_path='/projectnb/cs505ws/students/praneshj/final-project/model/finBert_fine_tuned_epoch_3'):
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        train_iterator = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)

        for batch in train_iterator:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            model.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            train_iterator.set_postfix(Loss=loss.item())

        avg_train_loss = total_loss / len(train_dataloader)
        val_loss, val_acc = evaluate_model(model, test_dataloader, device)
        tqdm.write(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), model_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                tqdm.write("Early stopping")
                break

In [77]:
train_model(model, train_dataloader, test_dataloader, optimizer, num_epochs=1, patience=3, model_path="/projectnb/cs505ws/students/praneshj/final-project/model/finBert_fine_tuned_epoch_3")

Evaluating: 100%|██████████| 2500/2500 [03:21<00:00, 12.42batch/s]       


Epoch 1, Train Loss: 2.9804, Val Loss: 3.0921, Val Acc: 0.0233


In [81]:
model.save_pretrained('/projectnb/cs505ws/students/praneshj/final-project/model/finBert_fine_tuned_3')
tokenizer.save_pretrained("/projectnb/cs505ws/students/praneshj/final-project/model/finBert_fine_tuned_3")

('/projectnb/cs505ws/students/praneshj/final-project/model/finBert_fine_tuned_3/tokenizer_config.json',
 '/projectnb/cs505ws/students/praneshj/final-project/model/finBert_fine_tuned_3/special_tokens_map.json',
 '/projectnb/cs505ws/students/praneshj/final-project/model/finBert_fine_tuned_3/vocab.txt',
 '/projectnb/cs505ws/students/praneshj/final-project/model/finBert_fine_tuned_3/added_tokens.json',
 '/projectnb/cs505ws/students/praneshj/final-project/model/finBert_fine_tuned_3/tokenizer.json')

## Inference

In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

model_path = '/Users/praneshjayasundar/Documents/Gunner/Boston-University/Fall-2023/student/CS505/final-project/health-assistant/model/finBert-medical-v3'

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
prompt = "I didn't eat anything the entire day, urghh irritating"
result = pipe(prompt)

label = result[0]['label']
score = result[0]['score']

print("Sentiment: ",label, "\nScore: ",score)

OSError: We couldn't connect to 'https://huggingface.co/' to load this model and it looks like /Users/praneshjayasundar/Documents/Gunner/Boston-University/Fall-2023/Student/CS505/final-project/health-assistant/model/finBert_fine_tuned_v3 is not the path to a directory conaining a config.json file.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.