In [27]:
!pip install transformers datasets torch torchvision accelerate evaluate ipywidgets --quiet
!pip install --upgrade transformers accelerate --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [28]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import load_dataset, Dataset
import numpy as np
import evaluate
import transformers, accelerate
from accelerate import Accelerator

accelerator = Accelerator()

In [29]:
!pip cache purge
!df -h

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Files removed: 4
Filesystem      Size  Used Avail Use% Mounted on
overlay          37G   83M   37G   1% /
tmpfs            64M     0   64M   0% /dev
tmpfs           7.7G     0  7.7G   0% /sys/fs/cgroup
shm             4.0G   48K  4.0G   1% /dev/shm
/dev/nvme2n1     25G   21G  4.1G  84% /home/studio-lab-user
/dev/nvme0n1p1   50G   26G   25G  52% /mnt/sagemaker-nvme
devtmpfs        7.7G     0  7.7G   0% /dev/tty
tmpfs           7.7G   12K  7.7G   1% /proc/driver/nvidia
tmpfs           7.7G     0  7.7G   0% /proc/acpi
tmpfs           7.7G     0  7.7G   0% /sys/firmware


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [30]:
dataset = load_dataset('financial_phrasebank', 'sentences_50agree')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [31]:
# Inspect the dataset
print(dataset['train'].features)
print(dataset['train'][10])

{'sentence': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'neutral', 'positive'], id=None)}
{'sentence': "TeliaSonera TLSN said the offer is in line with its strategy to increase its ownership in core business holdings and would strengthen Eesti Telekom 's offering to its customers .", 'label': 2}


In [32]:
# Load the dataset into a DataFrame for preprocessing
df = pd.DataFrame(dataset['train'])
df

Unnamed: 0,sentence,label
0,"According to Gran , the company has no plans t...",1
1,Technopolis plans to develop in stages an area...,1
2,The international electronic industry company ...,0
3,With the new production plant the company woul...,2
4,According to the company 's updated strategy f...,2
...,...,...
4841,LONDON MarketWatch -- Share prices ended lower...,0
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...,1
4843,Operating profit fell to EUR 35.4 mn from EUR ...,0
4844,Net sales of the Paper segment decreased to EU...,0


In [33]:
# Check the unique values in the 'label' column
print("Unique labels in the dataset:", df['label'].unique())

Unique labels in the dataset: [1 0 2]


In [34]:
# Convert to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(df[['sentence', 'label']])
hf_dataset

Dataset({
    features: ['sentence', 'label'],
    num_rows: 4846
})

In [35]:
# OPTION 1
# Load FinBERT model and tokenizer

model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

  return self.fget.__get__(instance, owner)()


In [8]:
# OPTION 2
# Load the fine-tuned finBERT model from your DIRECTORY

model_dir = os.path.expanduser("~/LLM/finbert-finetuned/checkpoint-1455")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)


In [36]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [37]:
# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['sentence'], truncation=True)

In [38]:
# Tokenize the dataset
tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/4846 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4846
})

In [39]:
# Prepare for PyTorch DataLoader
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [40]:
# Split the dataset into train and test
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [46]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="~/LLM/finbert-finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='~/LLM/logs',
    logging_steps=10,
    save_steps=500,
)




In [42]:
# Define the evaluation function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Load metrics using the evaluate library
    accuracy_metric = evaluate.load("accuracy")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    f1_metric = evaluate.load("f1")

    # Compute the metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision = precision_metric.compute(predictions=predictions, references=labels, average='weighted')["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average='weighted')["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')["f1"]

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [48]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Include the compute_metrics function
)

In [50]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0003,1.56943,0.819588,0.824238,0.819588,0.820912
2,0.0412,1.372468,0.826804,0.826409,0.826804,0.826527
3,0.0,1.362451,0.838144,0.838546,0.838144,0.838024


TrainOutput(global_step=1455, training_loss=0.045306213430752675, metrics={'train_runtime': 172.8065, 'train_samples_per_second': 67.289, 'train_steps_per_second': 8.42, 'total_flos': 312101847191232.0, 'train_loss': 0.045306213430752675, 'epoch': 3.0})

In [51]:
# Evaluate the model on the test set
evaluation_results = trainer.evaluate()

In [52]:
# Print evaluation results
print("Evaluation Metrics:")
print(f"Accuracy: {evaluation_results['eval_accuracy']:.3f}")
print(f"Precision: {evaluation_results['eval_precision']:.3f}")
print(f"Recall: {evaluation_results['eval_recall']:.3f}")
print(f"F1 Score: {evaluation_results['eval_f1']:.3f}")

Evaluation Metrics:
Accuracy: 0.838
Precision: 0.839
Recall: 0.838
F1 Score: 0.838


In [105]:
def analyze_sentiment_with_finbert(user_input):
    # Assume that `finbert_model` and `tokenizer` are already defined and loaded
    inputs = tokenizer(user_input, return_tensors="pt", truncation=True, padding=True)
    
    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Get model outputs
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    
    # Apply softmax to get probabilities
    probabilities = torch.softmax(logits, dim=1)
    
    # Move tensor to CPU before converting to NumPy
    probabilities = probabilities.cpu().detach().numpy()
    
    # Get the sentiment with the highest probability
    sentiment_idx = np.argmax(probabilities, axis=1)[0]

    # Map the output to sentiment label
    LABELS = {0: 'positive', 1: 'neutral', 2: 'negative'}
    sentiment = LABELS[sentiment_idx]
    
    return sentiment

In [106]:
# Load gpt2-medium model and tokenizer for text generation

gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2-medium", cache_dir="/tmp")
gpt2_model = AutoModelForCausalLM.from_pretrained("gpt2-medium", cache_dir="/tmp")

In [107]:
# Example usage
import torch 
# Check if CUDA is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move model to the device
gpt2_model = gpt2_model.to(device)


In [108]:
def generate_advice_with_gpt2(user_question, sentiment):
    # Construct the input prompt for GPT-2 without repetition
    prompt = (
        f"You are a professional financial analyst.\n Based on the '{user_question}' and the market sentiment being '{sentiment}'\n "
        "Provide concise financial advice. Respond in exactly three sentences, focusing on potential risks, opportunities, and actionable steps : "
    )

    # Tokenize the prompt for GPT-2
    input_ids = gpt2_tokenizer(prompt, return_tensors="pt").input_ids

    # Move input_ids to the correct device
    input_ids = input_ids.to(device)

    # Create the attention mask and move it to the correct device
    attention_mask = torch.ones_like(input_ids).to(device)

    # Generate text using GPT-2 with controlled decoding parameters
    gpt2_output = gpt2_model.generate(
        input_ids,
        max_length=200,  # Limit the total number of tokens
        do_sample=True,
        temperature=0.3,  # Lower temperature for coherence
        top_p=0.85,  # Tighten nucleus sampling for relevance
        top_k=40,  # Keep diversity but reduce randomness
        attention_mask=attention_mask,  # Ensure attention mask is also on the right device
        pad_token_id=gpt2_tokenizer.eos_token_id,  # Ensure proper padding
        num_return_sequences=1,  # Only return one response
        repetition_penalty=2.0,  # Penalize repetitive outputs
        early_stopping=True  # Stop early if the response seems complete
    )

    # Decode the generated text
    advice_with_reasoning = gpt2_tokenizer.decode(gpt2_output[0], skip_special_tokens=True)

    return advice_with_reasoning

In [109]:
def get_financial_advice_with_sentiment(user_question):
    
    # Step 1: Analyze sentiment using FinBERT
    sentiment = analyze_sentiment_with_finbert(user_question)

    # Step 2: Generate advice with GPT-2 based on the sentiment
    advice_with_reasoning = generate_advice_with_gpt2(user_question, sentiment)

    # Return both sentiment and advice
    return {
        "user_question": user_question,
        "sentiment": sentiment,
        "advice_with_reasoning": advice_with_reasoning
    }

In [110]:
user_question = "Should I invest in real esatet in california now?, i am quite sure what to do, i have some money that is sufficient enough i think."
financial_advice = get_financial_advice_with_sentiment(user_question)

In [111]:
from IPython.core.display import display, HTML

# Format the output in HTML
html_output = f"""
<!DOCTYPE html>
<html>
<head>
    <style>
        body {{
            font-family: Arial, sans-serif;
            line-height: 1.6;
        }}
        h3 {{
            color: #333;
        }}
        p {{
            margin: 0;
            padding: 0 0 10px 0;
        }}
    </style>
</head>
<body>
    <h3>User Question</h3>
    <p>{financial_advice['user_question']}</p>

    <h3>Sentiment</h3>
    <p>{financial_advice['sentiment'].capitalize()}</p>

    <h3>Advice with Reasoning</h3>
    <p>{financial_advice['advice_with_reasoning']}</p>
</body>
</html>
"""

# Display the HTML formatted output
display(HTML(html_output))


  from IPython.core.display import display, HTML
