### Language translation and language classification with the high-level `pipeline` API of Hugging Face Transformers, using several models from the Hugging Face Hub.

In [1]:
# Load pipeline from transformers
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reference text (R) in English to be translated into the other languages.
# NOTE: the triple-quoted literal keeps the line breaks and the leading spaces of the
# continuation lines, so that whitespace is part of the text passed to the translators.
# NOTE(review): an earlier comment claimed every word becomes one token/vector; the split is
# decided by each model's own tokenizer (presumably subword-based) — confirm before relying on it.
R = '''My name is Wolfgang and I live in Berlin. I am 65 years old and retired from services.
            I love traveling all over the world and collect souvenirs, that are specific to that place. 
            I give gifts to my colleagues, family, and friends after every trip.'''

# English to Arabic  https://huggingface.co/Helsinki-NLP/opus-mt-en-ar

In [4]:
# Build a translation pipeline (the high-level API) for the English -> Arabic checkpoint,
# run the reference text through it, and print the raw result.
model_checkpoint = "Helsinki-NLP/opus-mt-en-ar"
translator = pipeline(task="translation", model=model_checkpoint)
T = translator(R)
print(T)



[{'translation_text': 'اسمي وولفغانغ وأنا أعيش في برلين، عمري 65 عاماً ومتقاعد من الخدمات، أحب السفر في جميع أنحاء العالم وجمع الهدايا التذكارية، التي هي خاصة بذلك المكان، وأعطي الهدايا لزملائي وعائلتي وأصدقائي بعد كل رحلة.'}]


In [7]:
# To access a model's methods and attributes, load the model and its tokenizer directly
# through the Auto* classes instead of going through the pipeline.
from transformers import AutoModel, AutoTokenizer
model = AutoModel.from_pretrained(model_checkpoint)
# The tokenizer has to be loaded with AutoTokenizer; loading it with AutoModel
# only creates a second copy of the model (and repeats the weight-loading warning).
Tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Display the attribute/method names of both objects as a tuple of two lists.
dir(model),dir(Tokenizer)

Some weights of the model checkpoint at Helsinki-NLP/opus-mt-en-zh were not used when initializing MarianModel: ['final_logits_bias']
- This IS expected if you are initializing MarianModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MarianModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at Helsinki-NLP/opus-mt-en-zh were not used when initializing MarianModel: ['final_logits_bias']
- This IS expected if you are initializing MarianModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializin

(['T_destination',
  '__annotations__',
  '__call__',
  '__class__',
  '__delattr__',
  '__dict__',
  '__dir__',
  '__doc__',
  '__eq__',
  '__format__',
  '__ge__',
  '__getattr__',
  '__getattribute__',
  '__gt__',
  '__hash__',
  '__init__',
  '__init_subclass__',
  '__le__',
  '__lt__',
  '__module__',
  '__ne__',
  '__new__',
  '__reduce__',
  '__reduce_ex__',
  '__repr__',
  '__setattr__',
  '__setstate__',
  '__sizeof__',
  '__str__',
  '__subclasshook__',
  '__weakref__',
  '_apply',
  '_auto_class',
  '_backward_compatibility_gradient_checkpointing',
  '_backward_hooks',
  '_backward_pre_hooks',
  '_buffers',
  '_call_impl',
  '_convert_head_mask_to_5d',
  '_create_repo',
  '_expand_inputs_for_generation',
  '_extract_past_from_model_output',
  '_forward_hooks',
  '_forward_hooks_with_kwargs',
  '_forward_pre_hooks',
  '_forward_pre_hooks_with_kwargs',
  '_from_config',
  '_get_backward_hooks',
  '_get_backward_pre_hooks',
  '_get_decoder_start_token_id',
  '_get_files_timesta

In [9]:
# Inspect the loaded model's configuration: the number of position embeddings
# (i.e. the sequence length) and the hidden size, shown together as a tuple.
(model.config.max_position_embeddings, model.config.hidden_size)

(512, 512)

# English to Chinese (Mandarin) https://huggingface.co/Helsinki-NLP/opus-mt-en-zh

In [5]:
# Same steps as for the other target language, with the English -> Chinese checkpoint:
# build the translation pipeline, translate the reference text, print the raw result.
model_checkpoint = "Helsinki-NLP/opus-mt-en-zh"
translator = pipeline(task="translation", model=model_checkpoint)
T = translator(R)
print(T)

[{'translation_text': '我的名字是沃尔夫冈,我住在柏林。我65岁,退休了,我热爱环游世界各地,收集那里特有的纪念品。每次旅行后,我都会给我的同事、家人和朋友送礼物。'}]


## Language Classifier

### The following code, from https://huggingface.co/qanastek/51-languages-classifier, identifies the language of a text. The model covers 51 languages, including Arabic, Russian, Mandarin, Polish, and Latvian.

In [14]:
# Load the language-identification checkpoint and wrap the model and its tokenizer
# in a text-classification pipeline that later cells call as `classifier`.
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TextClassificationPipeline,
)

model_name = 'qanastek/51-languages-classifier'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
classifier = TextClassificationPipeline(tokenizer=tokenizer, model=model)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [15]:
# Classify an Arabic sample sentence and print the classifier's result.
arabic_text = "أحب السفر إلى أماكن مختلفة وجمع الهدايا التذكارية. بعد كل رحلة ، أشارك ميورياتي مع العائلة والأصدقاء."
res = classifier(arabic_text)
print(res)

In [17]:
# Classify a Chinese sample sentence and print the classifier's result.
chinese_text = "我喜歡到不同的地方旅行並收集紀念品。每次旅行後，我都會與家人和朋友分享我的回憶。"
res = classifier(chinese_text)
print(res)

[{'label': 'zh-TW', 'score': 0.9998824596405029}]


In [18]:
# Classify another sample sentence and print the classifier's result.
# NOTE(review): this sentence looks like Estonian, while the recorded output labels it fi-FI;
# presumably Estonian is not among the model's labels — verify against the model card.
sample_text = "Mulle meeldib reisida erinevatesse kohtadesse ja koguda suveniire. Pärast iga reisi jagan oma mälestusi pere ja sõpradega."
res = classifier(sample_text)
print(res)

[{'label': 'fi-FI', 'score': 0.9999626874923706}]
