#### steps
 - Language Detection
 - Translation

# Settings

In [1]:
# pip install langdetect
# pip install google_trans_new

import pandas as pd
import langdetect as ld
from langdetect import detect, detect_langs, DetectorFactory

import google_trans_new
from google_trans_new import google_translator

# Display configuration: remove pandas' row/column truncation and widen the
# console so the language tables further down are shown in full.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = 1000
# pd.options.display.max_colwidth = 300

# Pin langdetect's seed so repeated detections of the same text agree
# (NOTE(review): per the langdetect README, detection is otherwise non-deterministic).
DetectorFactory.seed = 0

# Common
 - package name: google_trans_new
 - class name: google_translator (create an instance with google_translator())

In [2]:
# Build a single translator client; the detection and translation cells below reuse it.
translator = google_translator()
translator

<google_trans_new.google_trans_new.google_translator at 0x7fbbad30ab20>

# Language Detection

#### 1) langdetect

In [3]:
# Swahili sample: detect() returns one language code (recorded output: 'sw').
sentence = "Tanzania ni nchi inayoongoza kwa utalii barani afrika"
detect(sentence)

'sw'

In [4]:
# Same sentence through detect_langs(), which lists candidate languages with a
# probability each (recorded output: a single 'sw' entry).
sentence = "Tanzania ni nchi inayoongoza kwa utalii barani afrika"
detect_langs(sentence)

[sw:0.9999971210408876]

In [5]:
# Korean sample; the trailing tuple puts both langdetect results in one cell output.
sentence = "안녕하세요, 좋은 아침입니다."
detect(sentence), detect_langs(sentence)

('ko', [ko:0.9999999918622879])

#### 2) google_trans_new.google_translator

In [6]:
# Detect with the google_trans_new client instead of langdetect; the recorded
# output is a [code, name] pair rather than a bare language code.
sentence = "안녕하세요, 좋은 아침입니다."
translator.detect(sentence)

['ko', 'korean']

# Translation

In [7]:
# Thai -> English: only the target language is given (lang_tgt); no source is passed.
# NOTE: the recorded output ends with a trailing space, so strip() before comparing.
translate_text = translator.translate('สวัสดีจีน',lang_tgt='en')
translate_text

'Hello China '

In [8]:
# With pronounce=True the result is a list instead of a plain string; the
# recorded output is ['hello ', 'annyeonghaseyo', None].
# NOTE(review): presumably [translation, source pronunciation, target pronunciation] — confirm.
translate_text = translator.translate('안녕하세요',lang_tgt='en', pronounce=True)
print(translate_text)

['hello ', 'annyeonghaseyo', None]


In [9]:
# Korean -> Japanese, then Japanese -> English; no source language is passed.
translate_text = translator.translate('안녕하세요', lang_tgt='ja')
print(translate_text)

translate_text = translator.translate('こんにちは', lang_tgt='en')
print(translate_text)

# Same kind of call with the source language spelled out as 'auto'.
print(translator.translate('안녕하세요! 좋은 아침입니다!', lang_src='auto', lang_tgt='en'))

# Bare last expression: shown as the cell's Out value instead of being printed.
translator.translate('Cerco un centro di gravità permanente', lang_src='auto', lang_tgt='en')

こんにちは 
hello 
hello! good morning! 


'I am looking for a permanent center of gravity '

In [10]:
# One row per language code, with the language name in a 'Language' column.
lang_df = pd.DataFrame.from_dict(
    google_trans_new.LANGUAGES,
    orient='index',
    columns=['Language'],
)
# Transposed for display so the codes run across the columns.
lang_df.transpose()

Unnamed: 0,af,sq,am,en,ko,yo,zu
Language,afrikaans,albanian,amharic,english,korean,yoruba,zulu


In [11]:
# Mapping of language code -> language name exposed by google_trans_new.
google_trans_new.LANGUAGES

{'af': 'afrikaans',
 'sq': 'albanian',
 'am': 'amharic',
 'en': 'english',
 'ko': 'korean',
 'yo': 'yoruba',
 'zu': 'zulu'}

# Languages

{'af': 'afrikaans',
 'sq': 'albanian',
 'am': 'amharic',
 'ar': 'arabic',
 'hy': 'armenian',
 'az': 'azerbaijani',
 'eu': 'basque',
 'be': 'belarusian',
 'bn': 'bengali',
 'bs': 'bosnian',
 'bg': 'bulgarian',
 'ca': 'catalan',
 'ceb': 'cebuano',
 'ny': 'chichewa',
 'zh-cn': 'chinese (simplified)',
 'zh-tw': 'chinese (traditional)',
 'co': 'corsican',
 'hr': 'croatian',
 'cs': 'czech',
 'da': 'danish',
 'nl': 'dutch',
 'en': 'english',
 'eo': 'esperanto',
 'et': 'estonian',
 'tl': 'filipino',
 'fi': 'finnish',
 'fr': 'french',
 'fy': 'frisian',
 'gl': 'galician',
 'ka': 'georgian',
 'de': 'german',
 'el': 'greek',
 'gu': 'gujarati',
 'ht': 'haitian creole',
 'ha': 'hausa',
 'haw': 'hawaiian',
 'iw': 'hebrew',
 'he': 'hebrew',
 'hi': 'hindi',
 'hmn': 'hmong',
 'hu': 'hungarian',
 'is': 'icelandic',
 'ig': 'igbo',
 'id': 'indonesian',
 'ga': 'irish',
 'it': 'italian',
 'ja': 'japanese',
 'jw': 'javanese',
 'kn': 'kannada',
 'kk': 'kazakh',
 'km': 'khmer',
 'ko': 'korean',
 'ku': 'kurdish (kurmanji)',
 'ky': 'kyrgyz',
 'lo': 'lao',
 'la': 'latin',
 'lv': 'latvian',
 'lt': 'lithuanian',
 'lb': 'luxembourgish',
 'mk': 'macedonian',
 'mg': 'malagasy',
 'ms': 'malay',
 'ml': 'malayalam',
 'mt': 'maltese',
 'mi': 'maori',
 'mr': 'marathi',
 'mn': 'mongolian',
 'my': 'myanmar (burmese)',
 'ne': 'nepali',
 'no': 'norwegian',
 'or': 'odia',
 'ps': 'pashto',
 'fa': 'persian',
 'pl': 'polish',
 'pt': 'portuguese',
 'pa': 'punjabi',
 'ro': 'romanian',
 'ru': 'russian',
 'sm': 'samoan',
 'gd': 'scots gaelic',
 'sr': 'serbian',
 'st': 'sesotho',
 'sn': 'shona',
 'sd': 'sindhi',
 'si': 'sinhala',
 'sk': 'slovak',
 'sl': 'slovenian',
 'so': 'somali',
 'es': 'spanish',
 'su': 'sundanese',
 'sw': 'swahili',
 'sv': 'swedish',
 'tg': 'tajik',
 'ta': 'tamil',
 'te': 'telugu',
 'th': 'thai',
 'tr': 'turkish',
 'tk': 'turkmen',
 'uk': 'ukrainian',
 'ur': 'urdu',
 'ug': 'uyghur',
 'uz': 'uzbek',
 'vi': 'vietnamese',
 'cy': 'welsh',
 'xh': 'xhosa',
 'yi': 'yiddish',
 'yo': 'yoruba',
 'zu': 'zulu'}

# Sample Chunk

In [11]:
# Self-contained sample: repeats the import and builds its own translator, so it
# does not depend on the `translator` object created in an earlier cell.
from google_trans_new import google_translator
text_original = 'Encantada de conocerte!'
print('* original text:\t', text_original)
# Only the target language ('en') is specified; no source language is passed.
text_translated = google_translator().translate(text_original, lang_tgt='en')
print('* translated text:\t', text_translated)

* original text:	 Encantada de conocerte!
* translated text:	 Nice to meet you! 


# References
 - https://towardsdatascience.com/how-to-detect-and-translate-languages-for-nlp-project-dfd52af0c3b5
 - https://github.com/lushan88a/google_trans_new/issues/36

# Appendix

#### past
 - googletrans package >> Translator class

In [12]:
import googletrans
from googletrans import Translator

# Client from the older googletrans package, used by the remaining cells.
translator_past = Translator()

# NOTE: this rebinds `lang_df`, replacing the table built from google_trans_new.LANGUAGES.
lang_df = pd.DataFrame.from_dict(
    googletrans.LANGUAGES,
    orient='index',
    columns=['Language'],
)

# Show the table's size plus its first and last three rows.
display(lang_df.shape)
display(lang_df.head(3))
display(lang_df.tail(3))

# # find the code for english, french, italian, spanish and greek
# lang_df[lang_df.Language.isin(['english', 'french', 'italian', 'spanish', 'greek'])]

(107, 1)

Unnamed: 0,Language
af,afrikaans
sq,albanian
am,amharic


Unnamed: 0,Language
yi,yiddish
yo,yoruba
zu,zulu


In [13]:
# Legacy googletrans detection: read the language code from the result's .lang attribute.
translator_past.detect('hello').lang

'en'

In [14]:
# English -> Korean with the legacy client; .text is the translated string.
translator_past.translate('hello', dest='ko').text

'안녕하세요'

In [15]:
# English -> Japanese with the legacy client; .text is the translated string.
translator_past.translate('hello', dest='ja').text

'こんにちは'

In [16]:
# Fully keyword form with an explicit source language; echoes the result object itself.
translator_past.translate(text='hello', src='en', dest='ja')

<googletrans.models.Translated at 0x7fbbb1734b20>

In [17]:
# Same call as above, reading the romanized pronunciation from the result.
translator_past.translate(text='hello', src='en', dest='ja').pronunciation

"Kon'nichiwa"

In [18]:
# Printing the result object shows its fields (src, dest, text, pronunciation, ...).
print(translator_past.translate('hello', dest='ja'))

Translated(src=en, dest=ja, text=こんにちは, pronunciation=Kon'nichiwa, extra_data="{'confiden...")
