## using bart

In [6]:
from datasets import Dataset
import json

# Load your dataset
dataset_path = 'DATAv3.json'

# Assuming the JSON file contains a list of dictionaries with 'question' and 'answer' keys
with open(dataset_path, 'r') as f:
    data = json.load(f)

# Convert data into conversational pairs for BlenderBot
conversations = [{'input': entry['question'], 'response': entry['answer']} for entry in data]

# Create a Hugging Face Dataset
dataset = Dataset.from_list(conversations)

# Display the first few entries to ensure correctness
print(dataset[0])

{'input': 'User: How does Bdcalling Academy keep its courses up to date? ', 'response': 'Bot: Bdcalling Academy regularly updates its courses to include the latest trends and technologies in the industry, ensuring that learners receive relevant and cutting-edge knowledge.'}


In [8]:
from transformers import BlenderbotTokenizer

# Load the BlenderBot 400M Distill tokenizer
model_name = 'facebook/blenderbot-400M-distill'
tokenizer = BlenderbotTokenizer.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    inputs = tokenizer(examples['input'], padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    responses = tokenizer(examples['response'], padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    inputs['labels'] = responses['input_ids']
    return inputs

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)




Map:   0%|          | 0/438 [00:00<?, ? examples/s]

In [10]:
from transformers import BlenderbotForConditionalGeneration, TrainingArguments, Trainer

# Load BlenderBot 400M Distill model
model = BlenderbotForConditionalGeneration.from_pretrained(model_name)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./blenderbot_400M_results',
    num_train_epochs=3,  # Adjust based on your dataset size and model performance
    per_device_train_batch_size=4,  # Adjust according to available GPU memory
    per_device_eval_batch_size=4,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=100,
    save_total_limit=2  # Limit the number of saved checkpoints
)
 
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

# Fine-tune the model
trainer.train()

Step,Training Loss
10,11.9774
20,7.1481
30,4.1646
40,2.9782
50,1.8823
60,1.1828
70,0.7984
80,0.6497
90,0.54
100,0.4784


Non-default generation parameters: {'max_length': 60, 'min_length': 20, 'num_beams': 10, 'length_penalty': 0.65, 'no_repeat_ngram_size': 3, 'encoder_no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 60, 'min_length': 20, 'num_beams': 10, 'length_penalty': 0.65, 'no_repeat_ngram_size': 3, 'encoder_no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 60, 'min_length': 20, 'num_beams': 10, 'length_penalty': 0.65, 'no_repeat_ngram_size': 3, 'encoder_no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 60, 'min_length': 20, 'num_beams': 10, 'length_penalty': 0.65, 'no_repeat_ngram_size': 3, 'encoder_no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


TrainOutput(global_step=330, training_loss=1.1557459419423883, metrics={'train_runtime': 3555.1365, 'train_samples_per_second': 0.37, 'train_steps_per_second': 0.093, 'total_flos': 357466503905280.0, 'train_loss': 1.1557459419423883, 'epoch': 3.0})

In [12]:
model.save_pretrained('./fine-tuned-blenderbot-400M_dataV3')
tokenizer.save_pretrained('./fine-tuned-blenderbot-400M_dataV3')


Non-default generation parameters: {'max_length': 60, 'min_length': 20, 'num_beams': 10, 'length_penalty': 0.65, 'no_repeat_ngram_size': 3, 'encoder_no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


('./fine-tuned-blenderbot-400M_dataV3\\tokenizer_config.json',
 './fine-tuned-blenderbot-400M_dataV3\\special_tokens_map.json',
 './fine-tuned-blenderbot-400M_dataV3\\vocab.json',
 './fine-tuned-blenderbot-400M_dataV3\\merges.txt',
 './fine-tuned-blenderbot-400M_dataV3\\added_tokens.json')

In [None]:


# Load the fine-tuned T5 model and tokenizer
model = BlenderbotForConditionalGeneration.from_pretrained('fine-tuned-blenderbot-400M_dataV3')
tokenizer = BlenderbotTokenizer.from_pretrained('fine-tuned-blenderbot-400M_dataV3')

# Start chat loop
print("Chatbot: Hello! You can ask me anything. Type 'exit' to quit.")
while True:
    # Get user input
    user_input = input("User: ")
    
    # Break the loop if the user wants to exit
    if user_input.lower() == 'exit':
        print("Chatbot: Goodbye!")
        break

    # Format the input for T5
    input_text = f"question: {user_input}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate the response
    output_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Print the chatbot's response
    print(f"Chatbot: {response}")

Chatbot: Hello! You can ask me anything. Type 'exit' to quit.


User:  how many sub companies of bdcalling present?


Chatbot:  bdCalling Academy has over 800 sub-company members who work together to contribute to the company's global success.


User:  brief about bdcalling academy


Chatbot:  Bot: bdCalling Academy offers a wide range of courses and expert-led lessons designed to help you acquire the skills and knowledge needed to succeed in your chosen profession.


User:  how many students are there


Chatbot:  Over the past decade, there have been more than 800 students enrolled in the U.S. School District.


User:  what kind of company bdcalling is?


Chatbot:  It's a digital agency specializing in digital marketing and web design, focusing on delivering high-quality digital products.


User:  how many subcompanies does the bdcompany belong


Chatbot:  bdcpany is a family of over 800 members who work together to serve clients in 47+ countries.


User:  founder


Chatbot:  Bot: Yes, I am a founder of a small business, starting from a small corner of his drawing room.


User:  ceo


Chatbot:  Bot: The CEO of Google is my role model. He offers a wide range of services, including web design, app development, and business support.


User:  ceo of bdcalling


Chatbot:  Bot: bdCalling is an American multinational digital marketing agency founded by Muhammad Monir Hossain in 2013.


User:  who is the founded of bdcalling


Chatbot:  The founding father was Muhammad Monir Hossain. He started the company in a small corner of his drawing room.


User:  how many projects have been done 


Chatbot:  Over the past decade, over 800 have been completed, always focusing on quality and innovation.


User:  total members of the company


Chatbot:  Bot: Yes, total members include: The company consists of over 800 members who work together to contribute to the company's global success.


In [18]:
len(data)

438