# Machine Translation Using Pretrained Transformer

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Hub checkpoint identifiers, one per translation direction.
model_en_de_name = "Helsinki-NLP/opus-mt-en-de"  # English → German
model_en_da_name = "Helsinki-NLP/opus-mt-en-da"  # English → Danish

# For each direction the tokenizer is loaded first, then the model.
# NOTE: the `mode_*` spelling is kept on purpose; the translation cell refers to these names.
tokenizer_en_de, mode_en_de = (
    MarianTokenizer.from_pretrained(model_en_de_name),
    MarianMTModel.from_pretrained(model_en_de_name),
)
tokenizer_en_da, mode_en_da = (
    MarianTokenizer.from_pretrained(model_en_da_name),
    MarianMTModel.from_pretrained(model_en_da_name),
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/788k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/300M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
def translate(text, tokenizer, model, num_beams=5):
    """Translate `text` with a seq2seq `model` and its matching `tokenizer`.

    Args:
        text: A single string, or an iterable of strings to translate as one batch.
        tokenizer: Callable that encodes a list of strings into the keyword
            inputs accepted by `model.generate`, and that offers `batch_decode`.
        model: Object exposing `generate(**inputs, num_beams=...)`.
        num_beams: Beam-search width forwarded to `model.generate` (default 5).

    Returns:
        A string when `text` is a string; otherwise a list with one translation
        per input, in input order. An empty iterable yields an empty list.
    """
    is_single = isinstance(text, str)
    batch = [text] if is_single else list(text)
    if not batch:
        # Nothing to encode; avoid calling the tokenizer/model on an empty batch.
        return []

    # Tokenize; truncation keeps over-length inputs within the tokenizer's limit
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    # Generate translation with beam search
    translated_tokens = model.generate(**inputs, num_beams=num_beams)
    # Decode to readable text
    decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    # Keep every translation for batch input instead of silently dropping all but the first
    return decoded[0] if is_single else decoded

# Smoke test: translate one English sentence into German and Danish
example = "Transformers changed the field of natural language processing forever."
translation_de, translation_da = (
    translate(example, tok, mt_model)
    for tok, mt_model in (
        (tokenizer_en_de, mode_en_de),
        (tokenizer_en_da, mode_en_da),
    )
)

# Show the source sentence followed by each translation
for tag, line in (("EN:", example), ("DE:", translation_de), ("DA:", translation_da)):
    print(tag, line)

model.safetensors:   0%|          | 0.00/300M [00:00<?, ?B/s]

EN: Transformers changed the field of natural language processing forever.
DE: Transformers veränderten das Feld der natürlichen Sprachverarbeitung für immer.
DA: Transformere ændrede området for naturlig sprogbehandling for evigt.


In [None]:
# Se

# Sentiment Analysis

In [None]:
from transformers import pipeline

# Checkpoint loaded into the sentiment-analysis pipeline
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_model = pipeline(task="sentiment-analysis", model=model_name)


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Course-feedback sentences (Applied Machine Learning) to classify
sentences = [
    "I really love the Applied Machine Learning course, the lectures are amazing!",
    "The assignments are challenging but very rewarding.",
    "Sometimes the pace of the course is too fast and I feel a bit lost.",
    "I hate when the code doesn't work and I have no idea why."
]

# One batched call; results come back in the same order as the inputs
results = sentiment_model(sentences)

# Report each sentence next to its predicted label and score
for sentence, prediction in zip(sentences, results):
    label, score = prediction['label'], prediction['score']
    print(f"Sentence: {sentence}")
    print(f"Prediction: {label} (score: {score:.4f})\n")

Sentence: I really love the Applied Machine Learning course, the lectures are amazing!
Prediction: POSITIVE (score: 0.9999)

Sentence: The assignments are challenging but very rewarding.
Prediction: POSITIVE (score: 0.9999)

Sentence: Sometimes the pace of the course is too fast and I feel a bit lost.
Prediction: NEGATIVE (score: 0.9998)

Sentence: I hate when the code doesn't work and I have no idea why.
Prediction: NEGATIVE (score: 0.9996)



# Emotion Detection

In [None]:
# Load pretrained pipeline for emotion classification.
# `top_k=None` requests the score of every label; it is the replacement for the
# deprecated `return_all_scores=True` flag.
emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
)

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


In [None]:

sentences = [
    "I am so excited to attend the Applied Machine Learning lecture today!",
    "I'm frustrated because I can't solve the assignment problem.",
    "This course makes me feel confident about my future career.",
    "I am worried that I will fail the exam.",
]

# Classify each sentence and report its highest-scoring emotion
for text in sentences:
    print(f"\nSentence: {text}")
    # The first element holds the per-label scores for this single input
    label_scores = emotion_classifier(text)[0]
    # Pick the label with the largest score
    best = max(label_scores, key=lambda entry: entry['score'])
    print(f"Detected Emotion: {best['label']} (score: {best['score']:.4f})")



Sentence: I am so excited to attend the Applied Machine Learning lecture today!
Detected Emotion: joy (score: 0.9651)

Sentence: I'm frustrated because I can't solve the assignment problem.
Detected Emotion: anger (score: 0.9714)

Sentence: This course makes me feel confident about my future career.
Detected Emotion: joy (score: 0.9608)

Sentence: I am worried that I will fail the exam.
Detected Emotion: fear (score: 0.9885)


# Token Classification (Named Entity Recognition)

In [None]:
# Build the pretrained NER pipeline.
# aggregation_strategy="simple" groups sub-tokens into full entities.
ner_pipeline = pipeline(
    task="ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


In [None]:

sentences = [
    "Tariq Yousef teaches the Applied Machine Learning course at the University of Southern Denmark.",
    "He works mainly in Odense but also teaches in Kolding, Denmark.",
]

# Run NER on each sentence and list the entities it found
for text in sentences:
    entities = ner_pipeline(text)
    print(f"\nText: {text}")
    print("Entities:")
    for entity in entities:
        word, group, score = entity['word'], entity['entity_group'], entity['score']
        print(f"  - {word} ({group}), score={score:.3f}")


Text: Tariq Yousef teaches the Applied Machine Learning course at the University of Southern Denmark.
Entities:
  - Tariq Yousef (PER), score=0.999
  - Machine Learning (MISC), score=0.911
  - University of Southern Denmark (ORG), score=0.993

Text: He works mainly in Odense but also teaches in Kolding, Denmark.
Entities:
  - Odense (LOC), score=0.988
  - Kolding (LOC), score=0.994
  - Denmark (LOC), score=1.000


# Medical NER

In [None]:
# Build the medical NER pipeline.
# aggregation_strategy="simple" groups subwords into a single entity.
med_ner = pipeline(
    task="ner",
    model="Clinical-AI-Apollo/Medical-NER",
    aggregation_strategy="simple",
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/736M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Device set to use cpu


In [None]:

# Clinical example sentences
sentences = [
    "The patient was diagnosed with diabetes and prescribed insulin.",
    "He complained of severe chest pain and shortness of breath.",
    "MRI revealed a fracture in the left femur after the accident.",
    "Aspirin was administered to reduce the risk of heart attack."
]

# Extract entities sentence by sentence and print them in aligned columns
for text in sentences:
    entities = med_ner(text)
    print(f"\nText: {text}")
    print("Extracted Medical Entities:")
    for entity in entities:
        word, group, score = entity['word'], entity['entity_group'], entity['score']
        # The word is left-aligned in a 20-character column
        print(f"  - {word:<20}  {group} (score: {score:.3f})")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Text: The patient was diagnosed with diabetes and prescribed insulin.
Extracted Medical Entities:
  - diabetes              DISEASE_DISORDER (score: 0.309)
  - insulin               MEDICATION (score: 0.166)

Text: He complained of severe chest pain and shortness of breath.
Extracted Medical Entities:
  - severe                SEVERITY (score: 0.755)
  - chest                 BIOLOGICAL_STRUCTURE (score: 0.247)
  - pain                  SIGN_SYMPTOM (score: 0.611)
  - shortness of breath   SIGN_SYMPTOM (score: 0.878)

Text: MRI revealed a fracture in the left femur after the accident.
Extracted Medical Entities:
  - MRI                   DIAGNOSTIC_PROCEDURE (score: 0.221)
  - fracture              SIGN_SYMPTOM (score: 0.325)
  - left femur            BIOLOGICAL_STRUCTURE (score: 0.760)

Text: Aspirin was administered to reduce the risk of heart attack.
Extracted Medical Entities:
  - Aspirin               MEDICATION (score: 0.218)
  - heart attack          DISEASE_DISORDER (score: 0.

# Semantic Similarity

In [None]:

import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Tokenizer and encoder are loaded from the same checkpoint, tokenizer first.
tokenizer, model = (
    BertTokenizer.from_pretrained("bert-base-uncased"),
    BertModel.from_pretrained("bert-base-uncased"),
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Sentences to embed; the similarity printout compares the first one against each of the others.
sentences = [
    "The Applied Machine Learning course is very interesting.",
    "I really enjoy learning about neural networks and AI in this course.",
    "The weather in Denmark is cold during winter.",
    "Tariq Yousef teaches Machine Learning at the University of Southern Denmark."
]

def bert_embedding(text):
    """Return the CLS-token embedding of `text` as a NumPy array.

    Encodes `text` with the module-level `tokenizer`, runs the module-level
    `model` on it, and takes position 0 of `last_hidden_state` for every
    sequence in the batch.

    Args:
        text: Input passed straight to `tokenizer` (padding and truncation on).

    Returns:
        NumPy array holding `last_hidden_state[:, 0, :]`, one row per sequence.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip autograd bookkeeping instead of building a graph
    # that is immediately thrown away.
    with torch.no_grad():
        outputs = model(**inputs)
    # CLS token embedding
    return outputs.last_hidden_state[:, 0, :].detach().numpy()

# Embed every sentence once, keeping the results in sentence order
embeddings = list(map(bert_embedding, sentences))

def similarity(i, j):
    """Cosine similarity between the stored embeddings at indices `i` and `j`."""
    first, second = embeddings[i], embeddings[j]
    # cosine_similarity returns a matrix; its first entry is the single pairwise score
    score_matrix = cosine_similarity(first, second)
    return score_matrix[0][0]

print("\n--- BERT CLS Similarity ---")
# Compare the first sentence against each of the remaining three
for label, other in (("1 vs 2:", 1), ("1 vs 3:", 2), ("1 vs 4:", 3)):
    print(label, similarity(0, other))


--- BERT CLS Similarity ---
1 vs 2: 0.9223983
1 vs 3: 0.7740307
1 vs 4: 0.79514575
