# Translating Non-English Text Descriptions

In [41]:
import pandas as pd
import numpy as np
from google_trans_new import google_translator
from langid.langid import LanguageIdentifier, model

## Load Data

In [32]:
data = pd.read_csv('../data/podcasts.csv')

In [33]:
data.head()

Unnamed: 0,uuid,title,image,description,language,categories,website,author,itunes_id
0,8d62d3880db2425b890b986e58aca393,"Ecommerce Conversations, by Practical Ecommerce",http://is4.mzstatic.com/image/thumb/Music6/v4/...,Listen in as the Practical Ecommerce editorial...,English,Technology,http://www.practicalecommerce.com,Practical Ecommerce,874457373
1,cbbefd691915468c90f87ab2f00473f9,Eat Sleep Code Podcast,http://is4.mzstatic.com/image/thumb/Music71/v4...,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,http://developer.telerik.com/,Telerik,1015556393
2,73626ad1edb74dbb8112cd159bda86cf,SoundtrackAlley,http://is5.mzstatic.com/image/thumb/Music71/v4...,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,https://soundtrackalley.podbean.com,Randy Andrews,1158188937
3,0f50631ebad24cedb2fee80950f37a1a,The Tech M&A Podcast,http://is1.mzstatic.com/image/thumb/Music71/v4...,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,http://www.corumgroup.com,Timothy Goddard,538160025
4,69580e7b419045839ca07af06cf0d653,"The Tech Informist - For fans of Apple, Google...",http://is4.mzstatic.com/image/thumb/Music62/v4...,The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,http://techinformist.com,The Tech Informist,916080498


In [4]:
data.shape

(121175, 9)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121175 entries, 0 to 121174
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   uuid         121175 non-null  object
 1   title        121173 non-null  object
 2   image        121175 non-null  object
 3   description  119832 non-null  object
 4   language     121175 non-null  object
 5   categories   121175 non-null  object
 6   website      120005 non-null  object
 7   author       118678 non-null  object
 8   itunes_id    121175 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 8.3+ MB


## Clean Data

In [6]:
# Remove null values.
data.dropna(axis=0, inplace=True)
data.shape

(116374, 9)

## Translate Data

In [78]:
# Create translator function.
translator = google_translator()
def translate(x):
    try:
        return translator.translate(x, lang_tgt='en')
    except:
        return x

### Full Dataset (for eda)

In [None]:
# Split into English and Non-English dataframes to optimize timing for translations.
english_df = data[data['language']=='English']
non_english_df = data[data['language']!='English']

In [None]:
# Translates descriptions.
non_english_df['english'] = non_english_df['description'].apply(lambda x: translate(x))

In [None]:
# Remove the rows that could not be translated.
non_english_df = non_english_df[non_english_df['description'] != non_english_df['english']]

In [None]:
# Replace description column with english translation.
non_english_df['description'] = non_english_df['english']
non_english_df.drop('english', axis=1, inplace=True)

In [None]:
# Concatenate the split dataframes after translations.
data = pd.concat([english_df, non_english_df])

In [None]:
# Check for null values.
data.info()

In [None]:
# Save full translated podcasts to csv file.
data = data[['uuid', 'title', 'description', 'language', 'categories', 'author']]
data.to_csv('../data/translated_podcasts.csv')

### Sampled Dataset (for recommender)

In [55]:
data = pd.read_csv('../data/translated_podcasts.csv')

In [56]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300669 entries, 0 to 300668
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Unnamed: 0   300668 non-null  object
 1   uuid         300667 non-null  object
 2   title        300667 non-null  object
 3   description  116354 non-null  object
 4   language     116344 non-null  object
 5   categories   116339 non-null  object
 6   author       116339 non-null  object
dtypes: object(7)
memory usage: 16.1+ MB


In [57]:
data.dropna(axis=0, inplace=True)

In [58]:
sample = data.sample(20000, random_state=1234)

In [59]:
sample.shape

(20000, 7)

In [60]:
# Check for null values.
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 270496 to 138409
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   20000 non-null  object
 1   uuid         20000 non-null  object
 2   title        20000 non-null  object
 3   description  20000 non-null  object
 4   language     20000 non-null  object
 5   categories   20000 non-null  object
 6   author       20000 non-null  object
dtypes: object(7)
memory usage: 1.2+ MB


In [61]:
sample.drop('Unnamed: 0', axis=1, inplace=True)

In [65]:
# Create pred_lang function to detect language of descriptions.
def pred_lang(df, column):
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    lang_lst = []
    for text in df[column].values:
        lang, score = identifier.classify(text)
        lang_lst.append(lang)
    return lang_lst

In [66]:
lang_lst = pred_lang(sample, 'description')

In [68]:
# Add lang_lst to sample dataframe.
sample['pred_lang'] = lang_lst

In [74]:
sample.sample(10)

Unnamed: 0,uuid,title,description,language,categories,author,pred_lang
9994,2a7d921d7e0741ecbc9822aea6401929,Just Torts the Podcast,"Law school is tricky, believe me, you need all...",English,Education,Just Law,en
270,1b676049b2dd40d680d973faa726000f,Podcasts - Bloom,Podcasts,English,Religion & Spirituality | Christianity,"Bloom Church (St. Paul, MN)",en
3921,e06edba460e64a90ac0145f2b83b091b,Jesters Academy,"Join your ""instructors"" of Jesters Academy, Du...",English,Comedy,Jesters Academy,en
52646,8824098835e84edfb70482885a4246fe,The Neil Garfield Show,Controversial host and world renown financial ...,English,Business | Investing,The Neil Garfield Show,en
25199,609799572dea4a709b220564bd91d8ae,Football Garbage Time: The Podcast!,Enjoy NFL football news and nonsense? Enjoy ou...,English,Professional | Sports & Recreation,Football Garbage Time The Podcast,en
63242,c487c08868354a3281a2a0969394ed3e,Unfiltered Single Dad,"As a single dad for the last 15 years, I wante...",English,Society & Culture,Unfiltered Single Dad / Anchor,en
292634,529b86387b2046e48c2dc9e07aa44987,Sanfte Weltherrren,"Again and again at some point - often called ""...",German,Comedy | Philosophy | Spirituality | Society &...,Mykx,en
283989,6e610f9c67994cc0865e8ffcd9f29e07,Guildnews Podcast,"In our podcast on MMO Guild Wars 2, Sputti, Du...",German,Video Games | Games & Hobbies | Podcasting | T...,Alexander Leitsch,en
27648,6ba9994b0d634c90b00245d9cc910960,aarontra radio 📻,Trying to be more creative and not just be a c...,English,Comedy,Aaron Trahan / Anchor,en
270290,f093df064dcf4a0697383bef43a95aa5,The Penumbra Podcast,"Depending on who you ask, the Penumbra is eith...",English,Arts | Performing Arts | Comedy,Sophie Kaner and Kevin Vibert,en


In [75]:
# Drop language column.
sample.drop('language', axis=1, inplace=True)

In [76]:
# Split into English and Non-English dataframes.
non_english_df = sample[sample['pred_lang']!='en']
english_df = sample[sample['pred_lang']=='en']

In [79]:
# Translate Non-English descriptions.
non_english_df['translation'] = non_english_df['description'].apply(lambda x: translate(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [85]:
# Sanity check.
non_english_df.sample(10)

Unnamed: 0,uuid,title,description,categories,author,pred_lang,translation
276158,bc51e1d167d84f018aebcc623b8fb904,ByznysPark,"ByznysPark přináší rozhovory s lidmi, které ba...",Business | Management & Marketing,Michal Andera,cs,ByznysPark brings interviews with people who e...
15298,e301fa81424948cfb9fc4df0dda97c8d,On Air with Niilo,"Teacher, blogger, YouTuber, podcaster, EdTech ...",Education,"BeSmartOnAir, DViSonAir",cs,"Teacher, blogger, YouTuber, podcaster, EdTech ..."
293843,63ddbfbce31f47cc99df0e427a6abbd8,Rádio Mais FM,Rádio Mais FM - 87.9 - Anápolis - Goiás,News & Politics,Rádio Mais FM,ga,Rádio Mais FM - 87.9 - Anápolis - Goiás
11431,0b9f1d497630469182b802139c963444,Khoresht-e Tech - خورشت تکنولوژی,Khoresht-e Tech is a weekly podcast in Persian...,Tech News | Gadgets | Technology,SBS Persian,ur,Khoresht-e Tech is a weekly podcast in Persian...
50334,2679ed07a2ba4b8fbbbc37792c76d4b9,Semilla OC,Somos una comunidad multigeneracional apasiona...,Christianity | Religion & Spirituality,Hector Hermosillo Podcast,es,We are a multigenerational community passionat...
142803,3ce7d39298984c1d8d58d3df4ee2a431,Slacker & Steve,Slacker & Steve,Technology | Society & Culture | Comedy,Slacker & Steve,sl,Slacker & Steve
20754,e21efed3b04a4ce2abc3bed674901278,Swiss Cove Christian Church,Swiss Cove Christian Church,Religion & Spirituality | Christianity,Swiss Cove,de,Swiss Cove Christian Church
29378,5b8b889b232c4d3898f66b934d601189,Jorge De Los Ríos,Podcast en español en donde comparto mis lecci...,Society & Culture,Jorge De Los Ríos / Anchor,es,Podcast in Spanish where I share my life lesso...
262989,962f35be3b8b438d9a55f8c84a0d09bc,Premier League Podden,Välkommen till Premier League Podden! \r\n\r\n...,Sports & Recreation | Professional,Fansens podcast,sv,Welcome to the Premier League Pod! Like our p...
264371,69facd2f96374442bdc5a33d5733c2c4,Liga dos 32 (NFL),Falando de NFL como você nunca ouviu. Trazemos...,Sports & Recreation | Professional,Liga dos 32,pt,Speaking of NFL like you've never heard. We br...


In [86]:
# Remove podcasts where translation did not work.
non_english_df = non_english_df[non_english_df['description']==non_english_df['translation']]

In [87]:
# Replace description column.
non_english_df['description'] = non_english_df['translation']
non_english_df.drop('translation', axis=1, inplace=True)

In [88]:
# Put the dataframes back together.
sample = pd.concat([english_df, non_english_df])

In [89]:
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19126 entries, 270496 to 281091
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   uuid         19126 non-null  object
 1   title        19126 non-null  object
 2   description  19126 non-null  object
 3   categories   19126 non-null  object
 4   author       19126 non-null  object
 5   pred_lang    19126 non-null  object
dtypes: object(6)
memory usage: 1.0+ MB


In [92]:
# Save sampled translated podcasts to csv file.
sample = sample[['uuid', 'title', 'description', 'pred_lang', 'categories', 'author']]
sample.to_csv('../data/translated_podcast_samples.csv')