In [None]:
!pip install tensorflow===2.10.0
!pip install torch
!pip install keras===2.10.0
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install evaluate
!pip install nltk
!pip install rouge_score

Collecting datasets
  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Using cached datasets-3.1.0-py3-none-any.whl (480 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Question and Answering

In [None]:
import transformers

transformers.logging.set_verbosity_error()

### 01.03. Using a Qu-An Pipeline

In [None]:
from transformers import pipeline

# Suppress Transformers warnings
import transformers
transformers.logging.set_verbosity_error()

# Define a unique context about software development
context = """
Object-oriented programming (OOP) is a paradigm that organizes software design around objects,
which are instances of classes. Key principles of OOP include encapsulation, inheritance,
polymorphism, and abstraction. Encapsulation involves bundling data with methods that operate on
that data, while inheritance allows one class to derive properties and methods from another.
Polymorphism enables objects to be treated as instances of their parent class, and abstraction
hides complex implementation details from the user. Popular OOP languages include Python, Java,
and C++. Agile development, on the other hand, emphasizes iterative progress and collaboration.
Scrum and Kanban are two frameworks commonly used in Agile.
"""

# Initialize the question-answering pipeline
qa_pipeline = pipeline("question-answering", model="deepset/minilm-uncased-squad2")

# Define a list of custom questions for the context
questions = [
    "What is encapsulation in OOP?",
    "Which programming languages are popular in OOP?",
    "What are the key principles of OOP?",
    "What is Agile development?",
    "Name two frameworks used in Agile development.",
]

# Loop through questions, fetch answers, and display them
for question in questions:
    result = qa_pipeline(question=question, context=context)
    print(f"Question: {question}")
    print(f"Answer: {result['answer']}")
    print(f"Confidence: {result['score']:.2f}")
    print("-" * 50)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Question: What is encapsulation in OOP?
Answer: bundling data with methods that operate on 
that data
Confidence: 0.75
--------------------------------------------------
Question: Which programming languages are popular in OOP?
Answer: Python, Java, 
and C++
Confidence: 0.94
--------------------------------------------------
Question: What are the key principles of OOP?
Answer: encapsulation, inheritance, 
polymorphism, and abstraction
Confidence: 0.94
--------------------------------------------------
Question: What is Agile development?
Answer: emphasizes iterative progress and collaboration
Confidence: 0.61
--------------------------------------------------
Question: Name two frameworks used in Agile development.
Answer: Scrum and Kanban
Confidence: 0.99
--------------------------------------------------


In [None]:
# Additional question for the context
print("\nAnother question:")
additional_answer = qa_pipeline(
    question="What is the difference between manual and automated testing?",
    context=context
)
print(f"Question: What is the difference between manual and automated testing?")
print(f"Answer: {additional_answer['answer']}")
print(f"Confidence: {additional_answer['score']:.2f}")



Another question:
Question: What is the difference between manual and automated testing?
Answer: abstraction 
hides complex implementation details from the user
Confidence: 0.00


## 01.05 Evaluating Qu-An Performance

In [None]:
from evaluate import load

# Load the SQuAD v2 evaluation metric
squad_metric = load("squad_v2")

# Define the correct answer and a list of predicted answers
correct_answer = "Object-oriented programming"
predicted_answers = [
    "Object-oriented programming",
    "Procedural programming",
    "OOP stands for Object-oriented programming"
]

# Initialize cumulative results for predictions and references
cum_predictions = []
cum_references = []

# Iterate over predictions and compute metrics
for i, predicted_answer in enumerate(predicted_answers):
    # Prepare the prediction format
    predictions = [{'prediction_text': predicted_answer, 'id': str(i), 'no_answer_probability': 0.}]
    cum_predictions.append(predictions[0])

    # Prepare the reference format
    references = [{'answers': {'answer_start': [0], 'text': [correct_answer]}, 'id': str(i)}]
    cum_references.append(references[0])

    # Compute the SQuAD evaluation metrics for the current prediction
    results = squad_metric.compute(predictions=predictions, references=references)
    print(f"F1 Score: {results.get('f1'):.2f} | Prediction: '{predicted_answer}'")

# Compute cumulative results safely
cum_results = squad_metric.compute(predictions=cum_predictions, references=cum_references)
print("\nCumulative Results:")
em = cum_results.get('exact_match', 0)  # Default to 0 if None
f1 = cum_results.get('f1', 0)  # Default to 0 if None
print(f"Exact Match (EM): {em:.2f}")
print(f"F1 Score: {f1:.2f}")


F1 Score: 100.00 | Prediction: 'Object-oriented programming'
F1 Score: 50.00 | Prediction: 'Procedural programming'
F1 Score: 57.14 | Prediction: 'OOP stands for Object-oriented programming'

Cumulative Results:
Exact Match (EM): 0.00
F1 Score: 69.05


### 02.03. Summarization with Pipelines

In [None]:
# Define the verbose text about programming
verbose_text = """
Programming is the process of designing and building executable computer programs to accomplish specific tasks.
It involves writing code in various languages like Python, JavaScript, or C++, each suited for different types of projects.
Debugging is a crucial part of programming, helping developers identify and fix issues in their code.
Effective programming also requires understanding algorithms, data structures, and software design principles.
As technology evolves, programming remains at the core of innovation, driving advancements in artificial intelligence, robotics, and web development.
"""

# Remove newline characters to make it a single continuous string
verbose_text = verbose_text.replace("\n", "")


In [None]:
from transformers import pipeline

# Initialize the summarization pipeline
extractive_summarizer = pipeline("summarization",
                                 min_length=10,
                                 max_length=100)

# Perform extractive summarization on the verbose text
extractive_summary = extractive_summarizer(verbose_text)

# Print the summary text
print("Summary:")
print(extractive_summary[0].get("summary_text"))


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Summary:
 Programming is the process of designing and building executable computer programs to accomplish specific tasks . It involves writing code in various languages like Python, JavaScript, or C++ .


In [None]:
print("Checkpoint used: ", extractive_summarizer.model.config)

Checkpoint used:  BartConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LA

## 02.05 Evaluating with ROUGE

In [None]:
import evaluate

# Load the ROUGE evaluation metric
rouge_evaluator = evaluate.load("rouge")

# Define reference and predicted texts for evaluation
reference_text = ["Machine learning is a subset of artificial intelligence."]
predict_text = ["Machine learning is a part of artificial intelligence."]

# Compute ROUGE scores for the predicted and reference texts
eval_results = rouge_evaluator.compute(predictions=predict_text,
                                       references=reference_text)

# Print the evaluation results
print("ROUGE Evaluation Results:")
print(eval_results)


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE Evaluation Results:
{'rouge1': 0.875, 'rouge2': 0.7142857142857143, 'rougeL': 0.875, 'rougeLsum': 0.875}


In [None]:
# Evaluate completely different strings
reference_text = ["Deep learning models excel at image recognition tasks."]
predict_text = ["The weather forecast predicts heavy rainfall tomorrow."]

# Compute ROUGE scores for the mismatched strings
eval_results = rouge_evaluator.compute(predictions=predict_text,
                                       references=reference_text)

# Print the evaluation results
print("\nROUGE Evaluation Results for No Match:")
print(eval_results)



ROUGE Evaluation Results for No Match:
{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}


In [None]:

#Evaluate summary
eval_results=rouge_evaluator.compute(
    predictions=[extractive_summary[0].get("summary_text")],
    references=[verbose_text])

print("\nResults for Summary generated", eval_results)


Results for Summary generated {'rouge1': 0.5, 'rouge2': 0.4905660377358491, 'rougeL': 0.5, 'rougeLsum': 0.5}


## 03.02. Content Creation

In [None]:
from transformers import pipeline

# Initialize the text generation pipeline
text_generator = pipeline("text-generation",
                          model="gpt2")

# Set a seed for reproducibility
transformers.set_seed(42)

# Define input text for generation
input_text = "Artificial intelligence is transforming various industries,"

# Generate synthetic text
synthetic_text = text_generator(input_text,
                                num_return_sequences=3,
                                max_new_tokens=50)

# Print each generated sequence
for idx, text in enumerate(synthetic_text):
    print(f"Generated Text {idx + 1}:")
    print(text.get("generated_text"))
    print("-----------------")


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generated Text 1:
Artificial intelligence is transforming various industries, but what about our personal privacy?

I know for a fact that you're not, under those conditions, using your privacy to control your finances – your only choice are privacy controls and government regulations.

This is also true for personal data
-----------------
Generated Text 2:
Artificial intelligence is transforming various industries, driving a huge number of jobs away from the U.S. and threatening our democracy and national security.

We need technology to do some of the above. Now we need to show the world it's possible for us to build better tomorrow and
-----------------
Generated Text 3:
Artificial intelligence is transforming various industries, both technologically and morally, in ways that could help improve the lives of Americans." According to NIST researcher and author Mike De La Rosa, who led the 2014 National Science Foundation report, "Cognitive sophistication is the potential to save lives

## 03.04. Chatbot Conversation

In [None]:
from transformers import pipeline

# Initialize the text-generation pipeline
chat_pipeline = pipeline("text-generation", model="gpt2")

# Simulate a conversational input
input_text = "Hello! How are you doing today?"

# Generate the response
response = chat_pipeline(input_text, max_length=50, num_return_sequences=1)

# Print the response
print("Model Response:")
print(response[0]["generated_text"])


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model Response:
Hello! How are you doing today? I saw something new here! Please take a moment, and please let me know what you think!"

The three had no hesitation with their expression.

A small black dragon had appeared in front of


In [None]:
from transformers import pipeline

# Initialize the text2text-generation pipeline
chat_pipeline = pipeline("text2text-generation", model="facebook/blenderbot-400M-distill")

# Sample user inputs
user_inputs = [
    "Do you have any hobbies?",
    "I like to watch movies",
    "action movies"
]

# Context simulation for the conversation
context = ""

# Simulate exchanges
for i, user_input in enumerate(user_inputs):
    print(f"\nExchange {i + 1}: \n--------------------")
    # Add user input to context
    context += f"User: {user_input}\n"

    # Generate bot response
    bot_response = chat_pipeline(context, max_length=50)[0]["generated_text"]

    # Add bot response to context
    context += f"Bot: {bot_response}\n"

    # Print the exchange
    print(f" User Input: {user_input}")
    print(f" Bot Output: {bot_response}")

# Print the full conversation
print("\nAccessing All Responses: ")
print(context)


config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/730M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/127k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/62.9k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/310k [00:00<?, ?B/s]

Device set to use cpu



Exchange 1: 
--------------------
 User Input: Do you have any hobbies?
 Bot Output:  I like to play video games.  What about you?  What do you like to do?

Exchange 2: 
--------------------
 User Input: I like to watch movies
 Bot Output:  What is your favorite movie of all time?  Mine is The Godfather Part II.

Exchange 3: 
--------------------
 User Input: action movies
 Bot Output:  I like action movies as well.  Do you like any other genres of movies other than action?

Accessing All Responses: 
User: Do you have any hobbies?
Bot:  I like to play video games.  What about you?  What do you like to do?
User: I like to watch movies
Bot:  What is your favorite movie of all time?  Mine is The Godfather Part II.
User: action movies
Bot:  I like action movies as well.  Do you like any other genres of movies other than action?



## 03.06. Translating with Hugging Face

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the T5 model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Input text for translation
source_text = "Harmony is a music streaming service headquartered in Berlin and Tokyo."

# Translation to German
inputs_german = tokenizer(
    "translate English to German: " + source_text,
    return_tensors="pt"
)
outputs_german = model.generate(
    inputs_german["input_ids"],
    max_length=40
)
print("German Translation: ",
      tokenizer.decode(outputs_german[0], skip_special_tokens=True))

# Translation to French
inputs_french = tokenizer(
    "translate English to French: " + source_text,
    return_tensors="pt"
)
outputs_french = model.generate(
    inputs_french["input_ids"],
    max_length=40
)
print("French Translation: ",
      tokenizer.decode(outputs_french[0], skip_special_tokens=True))


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

German Translation:  Harmony ist ein Musik-Streaming-Service mit Sitz in Berlin und Tokio.
French Translation:  Harmony est un service de streaming de musique ayant son siège social à Berlin et à Tokyo.


## 04.02. Loading a Hugging Face Dataset

In [None]:
from datasets import load_dataset

# Define a pretrained model checkpoint and dataset name
model_name = "bert-base-uncased"
dataset_name = "imdb"  # IMDB sentiment analysis dataset

# Load the IMDB sentiment dataset from Hugging Face
imdb_dataset = load_dataset(dataset_name)

# Display dataset details and a few examples
print(imdb_dataset)
print("\nSample Entries from Test Dataset:")
print(imdb_dataset["test"][10:15])

# Display the sentiment labels used in the dataset
print("\nSentiment Labels Used:")
print(imdb_dataset["train"].features["label"].names)


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

Sample Entries from Test Dataset:

Sentiment Labels Used:
['neg', 'pos']


## 04.03. Encoding and pre-processing the dataset

In [None]:
# Encoding text using a tokenizer

from transformers import BertTokenizer

# Load a tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define a custom tokenize function
def tokenize(batch):
    return tokenizer(batch["text"],  # Adjusted to use "text" as a key
                     padding=True,
                     truncation=True)

# Tokenize the IMDB dataset
enc_imdb_dataset = imdb_dataset.map(
    tokenize,
    batched=True,
    batch_size=None
)

# Print a few tokenized samples from the training set
print("Tokenized Training Samples:")
print(enc_imdb_dataset["train"][0:5])


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
# Explore input IDs and Attention Mask for the IMDB dataset

sample = enc_imdb_dataset["train"][1]

# Display the original text
print("Text:", sample.get("text"))

# Display the tokenized input IDs
print("\nInput IDs:", sample.get("input_ids"))

# Display the attention mask
print("\nAttention Mask:", sample.get("attention_mask"))

# Explore token statistics
print("\nTotal tokens:", len(sample.get("input_ids")))
print("Non-zero tokens:", len([x for x in sample.get("input_ids") if x > 0]))
print("Attention mask = 1:", len([x for x in sample.get("attention_mask") if x == 1]))


NameError: name 'enc_imdb_dataset' is not defined

In [None]:
#Separate training and validation sets
training_dataset = enc_poem_sentiment["train"]
validation_dataset=enc_poem_sentiment["validation"]

print("\nColumn Names : ",training_dataset.column_names)
print("\nFeatures : ",training_dataset.features)

labels = training_dataset.features.get("label")
num_labels=len(labels.names)



Column Names :  ['id', 'verse_text', 'label', 'input_ids', 'attention_mask']

Features :  {'id': Value(dtype='int32', id=None), 'verse_text': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive', 'no_impact', 'mixed'], id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


## 04.04. Creating the Model Architecture

In [None]:
from transformers import TFAutoModelForSequenceClassification

#Load transformer checkpoint from huggingface
sentiment_model = (TFAutoModelForSequenceClassification
            .from_pretrained(model_name, num_labels=num_labels))

sentiment_model.get_config()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

{'vocab_size': 30522,
 'max_position_embeddings': 512,
 'sinusoidal_pos_embds': False,
 'n_layers': 6,
 'n_heads': 12,
 'dim': 768,
 'hidden_dim': 3072,
 'dropout': 0.1,
 'attention_dropout': 0.1,
 'activation': 'gelu',
 'initializer_range': 0.02,
 'qa_dropout': 0.1,
 'seq_classif_dropout': 0.2,
 'return_dict': True,
 'output_hidden_states': False,
 'output_attentions': False,
 'torchscript': False,
 'torch_dtype': None,
 'use_bfloat16': False,
 'tf_legacy_loss': False,
 'pruned_heads': {},
 'tie_word_embeddings': True,
 'chunk_size_feed_forward': 0,
 'is_encoder_decoder': False,
 'is_decoder': False,
 'cross_attention_hidden_size': None,
 'add_cross_attention': False,
 'tie_encoder_decoder': False,
 'max_length': 20,
 'min_length': 0,
 'do_sample': False,
 'early_stopping': False,
 'num_beams': 1,
 'num_beam_groups': 1,
 'diversity_penalty': 0.0,
 'temperature': 1.0,
 'top_k': 50,
 'top_p': 1.0,
 'typical_p': 1.0,
 'repetition_penalty': 1.0,
 'length_penalty': 1.0,
 'no_repeat_ngram_s

In [None]:
#Freeze the first layer if needed
sentiment_model.layers[0].trainable = True

#Add/remove layers if needed.
#sentiment_model.layers [append()/insert()/remove()]

print(sentiment_model.summary())



Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
 dropout_19 (Dropout)        multiple                  0 (unused)
                                                                 
Total params: 66,956,548
Trainable params: 66,956,548
Non-trainable params: 0
_________________________________________________________________
None
