In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:

In [None]:

# Training data: BERT_TRAIED_DATA.csv, expected in the notebook's working directory.
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Step 1: Read the labelled queries (the QUERY and INTENT columns are used below).
data = pd.read_csv("BERT_TRAIED_DATA.csv")

# Step 2: Preprocessing -- tokenizer of the same checkpoint that is fine-tuned later.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Give every distinct intent an integer id, numbered in the order `unique()` yields them,
# and attach that id to each row as the classification target.
intent_labels = dict((intent, idx) for idx, intent in enumerate(data['INTENT'].unique()))
data['label'] = data['INTENT'].map(intent_labels)

# Tokenize queries
def tokenize_function(examples):
    """Tokenize the ``QUERY`` entries of a batch.

    Intended as the callback for ``dataset.map(..., batched=True)``.

    Args:
        examples: Mapping with a ``'QUERY'`` entry holding the text(s) to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        ``padding="max_length"`` and ``truncation=True``.
    """
    queries = examples['QUERY']
    encoded = tokenizer(queries, padding="max_length", truncation=True)
    return encoded

# Convert data into a Hugging Face dataset (only the text and its integer label are kept)
dataset = Dataset.from_pandas(data[['QUERY', 'label']])
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Step 3: Split into train and test sets using Hugging Face's datasets library.
# The shuffle is seeded so that the same rows land in the held-out split on every run;
# without a seed the evaluation numbers further down cannot be reproduced.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Step 4: Load BERT model
# The id <-> intent mapping is stored in the model config so that the saved checkpoint
# carries its own label names instead of relying on a mapping hard-coded elsewhere.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(intent_labels),
    id2label={idx: intent for intent, idx in intent_labels.items()},
    label2id=dict(intent_labels),
)

# Step 5: Training arguments
NUM_TRAIN_EPOCHS = 3
TRAIN_BATCH_SIZE = 8

# Optimizer steps for the whole run: ceil(rows / batch size) per epoch.
# Assumes a single device and no gradient accumulation -- adjust if that changes.
total_training_steps = -(-len(train_dataset) // TRAIN_BATCH_SIZE) * NUM_TRAIN_EPOCHS

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=8,
    # A fixed 500-step warm-up is longer than the entire run on a dataset of this size
    # (a few hundred rows -> well under 500 steps), so the learning rate would still be
    # ramping up when training stops. Warm up over the first 10% of the steps instead.
    warmup_steps=max(1, total_training_steps // 10),
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Step 6: Bundle the model, the hyper-parameters and both dataset splits into a Trainer.
# No metric function is supplied, so evaluation through this object reports the loss.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Map:   0%|          | 0/520 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 7: Fine-tuning
# Runs the training loop on the train split with the settings held by `trainer`.
trainer.train()

# Save the model
# Model and tokenizer files are written to the same directory so that later cells can
# reload both with from_pretrained('./fine_tuned_bert_model').
model.save_pretrained('./fine_tuned_bert_model')
tokenizer.save_pretrained('./fine_tuned_bert_model')






Step,Training Loss
10,1.4117
20,1.4627
30,1.3735
40,1.3842
50,1.303
60,1.2376
70,1.161
80,1.0159
90,0.9325
100,0.8018


('./fine_tuned_bert_model/tokenizer_config.json',
 './fine_tuned_bert_model/special_tokens_map.json',
 './fine_tuned_bert_model/vocab.txt',
 './fine_tuned_bert_model/added_tokens.json')

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
# Both are read back from the directory written by the training cell; the module-level
# names `model` and `tokenizer` are rebound to these reloaded objects.
model = BertForSequenceClassification.from_pretrained('./fine_tuned_bert_model')
tokenizer = BertTokenizer.from_pretrained('./fine_tuned_bert_model')

# Function to classify the intent of a new query
def classify_query(query):
    """Predict the intent class id for one query, or for each query in a batch.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        query: A single query string, or a list/tuple of query strings.

    Returns:
        An ``int`` class id when ``query`` is a string; otherwise a list of ``int``
        class ids, one per input query and in the same order.
    """
    inputs = tokenizer(query, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    # Reduce over the class dimension only. A bare argmax() flattens the
    # (batch, num_labels) logits and returns a meaningless index for batched input.
    predicted_class_ids = outputs.logits.argmax(dim=-1)

    if isinstance(query, str):
        return predicted_class_ids.item()
    return predicted_class_ids.tolist()

# Class id -> intent name. These names must be listed in the same order in which the
# ids were assigned at training time.
intent_labels = dict(enumerate([
    'price_inquiry',
    'stock_trend',
    'stock_comparison',
    'stock_news_sentiment',
]))

# Classify one example query and translate the predicted id back to its intent name
user_query = "What’s the latest value of Tesla’s stock?"
predicted_class_id = classify_query(user_query)
predicted_intent_label = intent_labels[predicted_class_id]

print(f"Predicted Intent: {predicted_intent_label}")


Predicted Intent: price_inquiry


In [None]:
# Evaluate the fine-tuned model on the test set
# `trainer` was created with eval_dataset=test_dataset and no metric function, so the
# returned dict holds the evaluation loss plus runtime/throughput figures, not accuracy.
results = trainer.evaluate()

# Print evaluation metrics
print(f"Evaluation results: {results}")


Evaluation results: {'eval_loss': 0.0914812907576561, 'eval_runtime': 2.9097, 'eval_samples_per_second': 35.742, 'eval_steps_per_second': 4.468, 'epoch': 3.0}


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
# NOTE(review): this repeats the reload done in the earlier inference cell; it looks
# redundant when the notebook is run top to bottom -- confirm whether it can be dropped.
model = BertForSequenceClassification.from_pretrained('./fine_tuned_bert_model')
tokenizer = BertTokenizer.from_pretrained('./fine_tuned_bert_model')

def predict_intent(query):
    """Return the intent name predicted for a single query string.

    Uses the module-level ``tokenizer``, ``model`` and ``intent_labels``;
    ``intent_labels`` is indexed with the predicted integer class id.

    Args:
        query: The user query to classify.

    Returns:
        The entry of ``intent_labels`` for the highest-scoring class.
    """
    # Tokenize the query; truncate so that an over-long query cannot exceed the
    # maximum sequence length the model accepts.
    inputs = tokenizer(query, return_tensors="pt", truncation=True)

    # Get predictions without tracking gradients (inference only), in line with the
    # other prediction helpers in this notebook.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits

    # Get predicted class ID (argmax over the class dimension of the single row)
    predicted_class_id = torch.argmax(logits, dim=1).item()

    # Convert class ID to intent label
    predicted_intent_label = intent_labels[predicted_class_id]

    return predicted_intent_label

# Test with a sample query
# predict_intent reads the module-level `intent_labels`, so a cell defining it must
# have been run before this one.
sample_query = "What is the current price of Tesla?"
predicted_intent = predict_intent(sample_query)
print(f"Predicted Intent: {predicted_intent}")


Predicted Intent: price_inquiry


In [None]:
# Write the model and tokenizer files to a Google Drive folder.
# NOTE(review): assumes Drive is already mounted at /content/drive -- no mount call is
# visible in this notebook; confirm before running on a fresh runtime.
model.save_pretrained('/content/drive/MyDrive/fine_tuned_bert_model')
tokenizer.save_pretrained('/content/drive/MyDrive/fine_tuned_bert_model')


('/content/drive/MyDrive/fine_tuned_bert_model/tokenizer_config.json',
 '/content/drive/MyDrive/fine_tuned_bert_model/special_tokens_map.json',
 '/content/drive/MyDrive/fine_tuned_bert_model/vocab.txt',
 '/content/drive/MyDrive/fine_tuned_bert_model/added_tokens.json')

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Directory on Google Drive that the previous cell saved the model and tokenizer into
model_path = "/content/drive/MyDrive/fine_tuned_bert_model"

# Load the model and tokenizer; this rebinds the module-level `model` and `tokenizer`
# that the prediction cells below use.
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)


In [None]:
# Ad-hoc check of a single query against the currently loaded model.
query = "How much did Tesla peak at?"
inputs = tokenizer(query, return_tensors="pt", truncation=True)
# Inference only: skip gradient tracking, as classify_query and evaluate_model do.
with torch.no_grad():
    outputs = model(**inputs)
predicted_class_id = torch.argmax(outputs.logits, dim=1).item()
predicted_intent_label = intent_labels[predicted_class_id]
print(f"Predicted Intent: {predicted_intent_label}")


Predicted Intent: price_inquiry


In [None]:
# Inspect the held-out split (prints its column names and row count).
print(test_dataset)


Dataset({
    features: ['QUERY', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 104
})


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Example evaluation function
def evaluate_model(test_dataset):
    """Classify every sample of ``test_dataset`` and print accuracy and a per-class report.

    Uses the module-level ``tokenizer``, ``model`` and ``intent_labels``.
    ``intent_labels`` may be a list of names ordered by class id, an ``{id: name}``
    dict, or a ``{name: id}`` dict; all three forms appear in this notebook.

    Args:
        test_dataset: Iterable of samples, each providing ``'QUERY'`` (text) and
            ``'label'`` (integer class id).

    Returns:
        None. The metrics are printed.
    """
    all_labels = []
    all_preds = []

    for sample in test_dataset:
        # Tokenize the input
        inputs = tokenizer(sample['QUERY'], return_tensors="pt", truncation=True, padding=True, max_length=128)

        # Get model predictions
        with torch.no_grad():  # Avoid calculating gradients during inference
            outputs = model(**inputs)

        # Predicted class for this sample, followed by its true label
        all_preds.append(torch.argmax(outputs.logits, dim=1).item())
        all_labels.append(sample['label'])

    # Normalise intent_labels to a list of names ordered by class id.
    names = intent_labels
    if isinstance(names, dict):
        if all(isinstance(key, int) for key in names):
            names = [names[class_id] for class_id in sorted(names)]   # {id: name}
        else:
            names = sorted(names, key=names.get)                      # {name: id}
    class_names = [str(name) for name in names]
    class_ids = list(range(len(class_names)))

    # Calculate accuracy and classification report.
    # Passing `labels` explicitly keeps the report aligned with `target_names` even when
    # a class happens to be missing from this split; without it the call raises because
    # the number of names no longer matches the number of observed classes.
    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, labels=class_ids, target_names=class_names)

    # Print evaluation metrics
    print(f"Accuracy: {accuracy}")
    print(f"Classification Report:\n{report}")

# Intent names indexed by numeric label: position i must name class id i, exactly as
# the ids were assigned at training time.
intent_labels = [
    'price_inquiry',
    'stock_trend',
    'stock_comparison',
    'stock_news_sentiment',
]

# Score the held-out split and print the accuracy and the per-class report
evaluate_model(test_dataset)


Accuracy: 0.9903846153846154
Classification Report:
                      precision    recall  f1-score   support

       price_inquiry       1.00      1.00      1.00        23
         stock_trend       0.96      1.00      0.98        27
    stock_comparison       1.00      0.97      0.99        34
stock_news_sentiment       1.00      1.00      1.00        20

            accuracy                           0.99       104
           macro avg       0.99      0.99      0.99       104
        weighted avg       0.99      0.99      0.99       104



In [None]:
pip install yfinance




In [None]:
import yfinance as yf

# Function to fetch stock data based on user intent
def _requested_price_column(user_query):
    """Map the wording of a query to the price column it asks about.

    Keywords are matched case-insensitively and only at the start of a word, so
    "Highest" and "LOW" are recognised while "below" or "follow" are not mistaken
    for "low".

    Returns:
        'Close', 'Open', 'High' or 'Low', or None when no keyword is present.
    """
    import re  # local import keeps this cell runnable on its own

    query = user_query.lower()
    # Checked in priority order: the first group with a keyword in the query wins.
    keyword_groups = (
        ("Close", ("current", "close")),
        ("Open", ("open",)),
        ("High", ("high",)),
        ("Low", ("low",)),
    )
    for column, keywords in keyword_groups:
        if any(re.search(r"\b" + re.escape(keyword), query) for keyword in keywords):
            return column
    return None


def handle_price_inquiry(stock_symbols, user_query):
    """Build a text answer with the requested price for each ticker symbol.

    NOTE: a later cell of this notebook defines ``handle_price_inquiry`` again; once
    that cell has run, its definition replaces this one.

    Args:
        stock_symbols: Iterable of ticker symbols, e.g. ``["TSLA", "AAPL"]``.
        user_query: The user's question. It selects the price type through the
            keywords current/close, open, high or low (case-insensitive, matched at
            the start of a word).

    Returns:
        One line per symbol (each ending in a newline): the price, a "couldn't
        retrieve data" notice when the lookup returns no rows, or a "couldn't
        understand" notice when no price keyword is found. If anything raises, a
        single "Error fetching stock data: ..." string is returned instead.
    """
    try:
        period = '1d'  # We'll stick to the '1d' period for real-time queries.
        response = ""

        # The requested price type depends only on the query, so resolve it once
        # instead of re-scanning the query for every symbol.
        column = _requested_price_column(user_query)

        for stock_symbol in stock_symbols:
            stock_data = yf.Ticker(stock_symbol).history(period=period)

            if stock_data.empty:
                response += f"Sorry, I couldn't retrieve data for {stock_symbol}.\n"
            elif column is None:
                response += "Sorry, I couldn't understand the specific price type you're asking for.\n"
            else:
                price = stock_data[column].values[-1]  # most recent row of the period
                symbol = stock_symbol.upper()
                if column == "Close":
                    response += f"The current price of {symbol} is {price:.2f}.\n"
                elif column == "Open":
                    response += f"The open price of {symbol} today is {price:.2f}.\n"
                elif column == "High":
                    response += f"The highest price of {symbol} today is {price:.2f}.\n"
                else:
                    response += f"The lowest price of {symbol} today is {price:.2f}.\n"

        return response
    except Exception as e:
        # Deliberate best effort: report the failure as text instead of raising.
        return f"Error fetching stock data: {str(e)}"


In [None]:
import yfinance as yf

# Function to detect stock symbols (this can be improved later to fetch from the query dynamically)
def extract_stock_symbols(user_query):
    """Find the ticker symbols of the companies mentioned in a query.

    A company is recognised by its name (optionally followed by a plural "s") or by
    its ticker symbol. Matching is case-insensitive and on whole words, so "Tesla's"
    and "tsla" match while "metadata" or "pineapple" do not.

    Args:
        user_query: The user's question.

    Returns:
        A list of ticker symbols in the order of the lookup table below, or ``None``
        when no known company is mentioned.
    """
    import re  # local import keeps this cell runnable on its own

    # Manually mapping for now
    manual_overrides = {
        "tesla": "TSLA",
        "apple": "AAPL",
        "microsoft": "MSFT",
        "amazon": "AMZN",
        "google": "GOOGL",
        "meta": "META"
    }

    query = user_query.lower()  # lower-case once instead of once per company
    stock_symbols = []

    for company_name, stock_symbol in manual_overrides.items():
        name = re.escape(company_name)
        ticker = re.escape(stock_symbol.lower())
        if re.search(rf"\b(?:{name}s?|{ticker})\b", query):
            stock_symbols.append(stock_symbol)

    # Callers test the result for truthiness; keep returning None for "nothing found".
    return stock_symbols or None

# Function to fetch stock data based on user query
def _requested_price_column(user_query):
    """Map the wording of a query to the price column it asks about.

    Keywords are matched case-insensitively and only at the start of a word, so
    "Highest" and "LOW" are recognised while "below" or "follow" are not mistaken
    for "low".

    Returns:
        'Close', 'Open', 'High' or 'Low', or None when no keyword is present.
    """
    import re  # local import keeps this cell runnable on its own

    query = user_query.lower()
    # Checked in priority order: the first group with a keyword in the query wins.
    keyword_groups = (
        ("Close", ("current", "close")),
        ("Open", ("open",)),
        ("High", ("high",)),
        ("Low", ("low",)),
    )
    for column, keywords in keyword_groups:
        if any(re.search(r"\b" + re.escape(keyword), query) for keyword in keywords):
            return column
    return None


def handle_price_inquiry(stock_symbols, user_query):
    """Build a text answer with the requested price for each ticker symbol.

    Args:
        stock_symbols: Iterable of ticker symbols, e.g. ``["TSLA", "AAPL"]``.
        user_query: The user's question. It selects the price type through the
            keywords current/close, open, high or low (case-insensitive, matched at
            the start of a word).

    Returns:
        One line per symbol (each ending in a newline): the price, a "couldn't
        retrieve data" notice when the lookup returns no rows, or a "couldn't
        understand" notice when no price keyword is found. If anything raises, a
        single "Error fetching stock data: ..." string is returned instead.
    """
    try:
        period = '1d'  # For real-time queries, we'll use '1d' period
        response = ""

        # The requested price type depends only on the query, so resolve it once
        # instead of re-scanning the query for every symbol.
        column = _requested_price_column(user_query)

        for stock_symbol in stock_symbols:
            stock_data = yf.Ticker(stock_symbol).history(period=period)

            if stock_data.empty:
                response += f"Sorry, I couldn't retrieve data for {stock_symbol}.\n"
            elif column is None:
                response += "Sorry, I couldn't understand the specific price type you're asking for.\n"
            else:
                price = stock_data[column].values[-1]  # most recent row of the period
                symbol = stock_symbol.upper()
                if column == "Close":
                    response += f"The current price of {symbol} is {price:.2f}.\n"
                elif column == "Open":
                    response += f"The open price of {symbol} today is {price:.2f}.\n"
                elif column == "High":
                    response += f"The highest price of {symbol} today is {price:.2f}.\n"
                else:
                    response += f"The lowest price of {symbol} today is {price:.2f}.\n"

        return response
    except Exception as e:
        # Deliberate best effort: report the failure as text instead of raising.
        return f"Error fetching stock data: {str(e)}"

# Main function to handle user query
def handle_user_query(user_query):
    """Answer a price question end to end.

    First the ticker symbols are extracted from the query; when at least one is
    found, the price answer for those symbols is returned, otherwise an apology.

    Args:
        user_query: The user's question.

    Returns:
        The response text.
    """
    stock_symbols = extract_stock_symbols(user_query)

    if stock_symbols:
        return handle_price_inquiry(stock_symbols, user_query)

    return "Sorry, I couldn't identify a valid company name or stock symbol in your query."

# Smoke-test the pipeline with two example queries, printing each answer in turn
for user_query in (
    "What is the current price of Tesla?",
    "What was the highest price of Apple today?",
):
    response = handle_user_query(user_query)
    print(response)


The current price of TSLA is 219.57.

The highest price of AAPL today is 237.49.



In [None]:
# One more example: a query containing the keyword "high".
user_query = "Tell me the high price of Tesla."
response = handle_user_query(user_query)
print(response)

The highest price of TSLA today is 224.26.

