# Translating Non-English Text Descriptions

In [2]:
import pandas as pd
import numpy as np
from google_trans_new import google_translator
from langid.langid import LanguageIdentifier, model

## Load Data

In [32]:
data = pd.read_csv('../data/podcasts.csv')

In [33]:
data.head()

Unnamed: 0,uuid,title,image,description,language,categories,website,author,itunes_id
0,8d62d3880db2425b890b986e58aca393,"Ecommerce Conversations, by Practical Ecommerce",http://is4.mzstatic.com/image/thumb/Music6/v4/...,Listen in as the Practical Ecommerce editorial...,English,Technology,http://www.practicalecommerce.com,Practical Ecommerce,874457373
1,cbbefd691915468c90f87ab2f00473f9,Eat Sleep Code Podcast,http://is4.mzstatic.com/image/thumb/Music71/v4...,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,http://developer.telerik.com/,Telerik,1015556393
2,73626ad1edb74dbb8112cd159bda86cf,SoundtrackAlley,http://is5.mzstatic.com/image/thumb/Music71/v4...,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,https://soundtrackalley.podbean.com,Randy Andrews,1158188937
3,0f50631ebad24cedb2fee80950f37a1a,The Tech M&A Podcast,http://is1.mzstatic.com/image/thumb/Music71/v4...,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,http://www.corumgroup.com,Timothy Goddard,538160025
4,69580e7b419045839ca07af06cf0d653,"The Tech Informist - For fans of Apple, Google...",http://is4.mzstatic.com/image/thumb/Music62/v4...,The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,http://techinformist.com,The Tech Informist,916080498


In [4]:
data.shape

(121175, 9)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121175 entries, 0 to 121174
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   uuid         121175 non-null  object
 1   title        121173 non-null  object
 2   image        121175 non-null  object
 3   description  119832 non-null  object
 4   language     121175 non-null  object
 5   categories   121175 non-null  object
 6   website      120005 non-null  object
 7   author       118678 non-null  object
 8   itunes_id    121175 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 8.3+ MB


## Clean Data

In [6]:
# Remove null values.
data.dropna(axis=0, inplace=True)
data.shape

(116374, 9)

## Translate Data

In [17]:
# Create translator function.
translator = google_translator()
def translate(x):
    try:
        return translator.translate(x, lang_tgt='en')
    except:
        return x

### Full Dataset (for eda)

In [None]:
# Split into English and Non-English dataframes to optimize timing for translations.
english_df = data[data['language']=='English']
non_english_df = data[data['language']!='English']

In [None]:
# Translates descriptions.
non_english_df['english'] = non_english_df['description'].apply(lambda x: translate(x))

In [None]:
# Remove the rows that could not be translated.
non_english_df = non_english_df[non_english_df['description'] != non_english_df['english']]

In [None]:
# Replace description column with english translation.
non_english_df['description'] = non_english_df['english']
non_english_df.drop('english', axis=1, inplace=True)

In [None]:
# Concatenate the split dataframes after translations.
data = pd.concat([english_df, non_english_df])

In [None]:
# Check for null values.
data.info()

In [None]:
# Save full translated podcasts to csv file.
data = data[['uuid', 'title', 'description', 'language', 'categories', 'author']]
data.to_csv('../data/translated_podcasts.csv')

### Sampled Dataset (for recommender)

In [3]:
data = pd.read_csv('../data/translated_podcasts.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300669 entries, 0 to 300668
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Unnamed: 0   300668 non-null  object
 1   uuid         300667 non-null  object
 2   title        300667 non-null  object
 3   description  116354 non-null  object
 4   language     116344 non-null  object
 5   categories   116339 non-null  object
 6   author       116339 non-null  object
dtypes: object(7)
memory usage: 16.1+ MB


In [5]:
data.dropna(axis=0, inplace=True)

In [6]:
sample = data.sample(50000, random_state=1234)

In [7]:
sample.shape

(50000, 7)

In [8]:
# Check for null values.
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 270496 to 28048
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   50000 non-null  object
 1   uuid         50000 non-null  object
 2   title        50000 non-null  object
 3   description  50000 non-null  object
 4   language     50000 non-null  object
 5   categories   50000 non-null  object
 6   author       50000 non-null  object
dtypes: object(7)
memory usage: 3.1+ MB


In [9]:
sample.drop('Unnamed: 0', axis=1, inplace=True)

In [10]:
# Create pred_lang function to detect language of descriptions.
def pred_lang(df, column):
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    lang_lst = []
    for text in df[column].values:
        lang, score = identifier.classify(text)
        lang_lst.append(lang)
    return lang_lst

In [11]:
lang_lst = pred_lang(sample, 'description')

In [12]:
# Add lang_lst to sample dataframe.
sample['pred_lang'] = lang_lst

In [13]:
sample.sample(10)

Unnamed: 0,uuid,title,description,language,categories,author,pred_lang
293557,5cca597570a242459aa2bf942461d37f,欧洲华语播客 Podcast - Eurasian Matters,Interview with Huina *Pin Lao Yangtou *Appreci...,Chinese,Health | Philosophy | Society & Culture | Educ...,Huina,en
274687,98e9ad89d62546da81cf4304cce5f366,Match of the Year Podcast,We look at the evolution of professional wrest...,English,Society & Culture | Sports & Recreation | Prof...,Christopher J Garcia,en
268113,035b5d1864c740c1bd10e675be2fdc80,"First United Methodist Church of Kearney, Miss...",Weekly teaching from First United Methodist Ch...,English,Religion & Spirituality | Christianity,First United Methodist Church,en
294677,ca2868fc71c849469af050a02a6e9b14,Vortex CaosCast,We opened a Vortex and broke with consensual r...,Portuguese,Spirituality | Religion & Spirituality | Socie...,Vortex CaosCast,en
138163,39e5be7abe9642c5aa3d9e1e4a91db62,No Meat Athlete Radio,"Vegan and vegetarian nutrition, running and tr...",English,Outdoor | Alternative Health | Fitness & Nutri...,Matt Frazier,en
285742,93e214a350d945cca9de323c21c3e358,夜听,Night listening,Chinese,Arts | Spirituality | Literature | Religion & ...,精选FM,es
277615,dd0095354e2d4a968d82fcc6ae1e11c1,On the Frontlines of Multiple Sclerosis,Welcome to On the Frontlines of Multiple Scler...,English,Medicine | Science & Medicine | Health,ReachMD,en
284214,433860e17e124a5a931073f54ea218d9,"Catalyzr | Inspiration, Motivation und Coaching",Catalyzr is about leading a more than just you...,German,Self-Help | Health,Tim Chaborski | Inspiriert durch Christian Bis...,en
39098,46b9d7e1a0844289b41837d2032cdec7,The Chicken Social,"Each week hosts Obes and Cha, along with the m...",English,Comedy | Sports & Recreation | Philosophy | Ne...,The Chicken Social,en
16248,054470dbe8c74f6cb2b8847ad228889d,Step Up To Level Up Podcast,Podcast by Justin Feldman,English,Health,Justin Feldman,en


In [14]:
# Drop language column.
sample.drop('language', axis=1, inplace=True)

In [15]:
# Split into English and Non-English dataframes.
non_english_df = sample[sample['pred_lang']!='en']
english_df = sample[sample['pred_lang']=='en']

In [18]:
# Translate Non-English descriptions.
non_english_df['translation'] = non_english_df['description'].apply(lambda x: translate(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [19]:
# Sanity check.
non_english_df.sample(10)

Unnamed: 0,uuid,title,description,categories,author,pred_lang,translation
50356,4e588fc49e614b129dfe719298b86dae,DevTalk,DevTalk: podcast programistyczny Macieja Anise...,Technology | Software How-To,Maciej Aniserowicz,pl,Devthalk: Podcast programming Maciej Aniserowi...
2805,a9c78d3009d347458eb28cda54b06400,24syv Dokumentar,24syv Dokumentar er resultatet af ugens samled...,News & Politics,Radio24syv,da,24 SYV Documentary is the result of the overal...
293184,46b38f3bdff94bdabf41c3054513b4a1,Le journal des bonnes nouvelles FB Armorique,The good news journal FB Armorique,Comedy,Radio France,fr,The good news journal FB Armorique
53822,693f9d2ffc42450e9ad2ba4506bfdeb1,Rev. Adam R. Quine,"religion,presbyterian,sermon,church",Other | Spirituality | Religion & Spirituality...,Janis Klockenga,da,"religion,presbyterian,sermon,church"
14518,35dbdae349df483db47245ff1fd7bf5d,Millionærklubben,"Millionærklubbens består af to værter, Pernill...",Investing | Business,Radio24syv,da,"The millionaire club consists of two hosts, Pe..."
140447,791bc07a274540af926069033fe911d4,마을미디어뻔,중랑구 공동체라디오 마을미디어뻔입니다,Local | Government & Organizations,마을미디어뻔,ko,Jungnang-gu Co Cylio Village Media
23048,2933451c1bf945cfbada0cc79972fda7,YTN 지식카페,그 무언가를 찾는 여정에 올드 미디어가 작은 도움이 될 수 있기를 바랍니다. 이렇게...,News & Politics,"YTN,와이티엔FM94.5 (ytnfm@ytnradio.kr)",ko,I hope that the old media can be a small help ...
23025,6e4e2f6281e24d42b04dc69b433e6c76,The Eloquent Savage Podcast,Some description,Professional | Sports & Recreation,CoachSteve,fr,Some description
18046,6c447fc698984df799aa9cf7c3cd4919,O'Quinn Baptist Church,O'Quinn Baptist Church,Religion & Spirituality | Christianity,Gerald A Greenlee Jr,ga,O'Quinn Baptist Church
136832,8fd7a79d9395474b9e25d50c5f9f4e13,Terça Livre: Detona News,Acesse nosso website - www.tercalivre.com.br,News & Politics | TV & Film,Terça Livre,it,Access our website - www.tercalivre.com.br


In [20]:
# Remove podcasts where translation did not work.
non_english_df = non_english_df[non_english_df['description']==non_english_df['translation']]

In [21]:
# Replace description column.
non_english_df['description'] = non_english_df['translation']
non_english_df.drop('translation', axis=1, inplace=True)

In [22]:
# Put the dataframes back together.
sample = pd.concat([english_df, non_english_df])

In [23]:
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47679 entries, 270496 to 286203
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   uuid         47679 non-null  object
 1   title        47679 non-null  object
 2   description  47679 non-null  object
 3   categories   47679 non-null  object
 4   author       47679 non-null  object
 5   pred_lang    47679 non-null  object
dtypes: object(6)
memory usage: 2.5+ MB


In [24]:
# Save sampled translated podcasts to csv file.
sample = sample[['uuid', 'title', 'description', 'pred_lang', 'categories', 'author']]
sample.to_csv('../data/translated_podcast_samples.csv')