# Import libraries

In [81]:
#Navigate local files
import os
import glob

#Web scraping
import requests
from bs4 import BeautifulSoup

#Output df to contain file directories and transcriptions
import numpy as np
import pandas as pd

#Transcription
from transcribe_audio import transcribe_audio

import IPython.display as ipd  #Display music in jupyter notebook

from googletrans import Translator  #Translate English transcripts into Chinese

import re

# Customized functions

In [104]:
def judge_create_directory(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    Parameters
    ----------
    directory : str
        Path of the folder to ensure. An empty string (which is what
        ``os.path.dirname`` returns for a bare file name) is ignored, since
        there is nothing to create in that case.
    """
    if not directory:
        #os.makedirs('') raises FileNotFoundError, so bail out early
        return
    #exist_ok=True avoids the race between a separate existence check and
    #the creation, and makes repeated calls harmless
    os.makedirs(directory, exist_ok=True)


def download_audio(download_url, save_directory, timeout=30):
    """Download ``download_url`` and write its bytes to ``save_directory``.

    Parameters
    ----------
    download_url : str
        URL of the audio file to fetch.
    save_directory : str
        Path of the file to write (despite the name, this is the file's own
        path). Its parent folder is created when it is missing.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook.

    Returns
    -------
    bool
        True when the file was written; False when the request failed or
        the server did not answer with HTTP 200.
    """
    try:
        audio = requests.get(download_url, timeout=timeout)
    except requests.RequestException as error:
        #Report and carry on so one bad link does not abort a long scrape
        print('Failed to download {}: {}'.format(download_url, error))
        return False

    if audio.status_code != 200:
        #Previously this case was skipped silently
        print('Failed to download {}: status code {}'.format(
            download_url, audio.status_code))
        return False

    folder_directory = os.path.dirname(save_directory)  #The folder that the
    # file will be downloaded to
    if folder_directory:
        judge_create_directory(folder_directory)

    with open(save_directory, 'wb') as file:
        file.write(audio.content)
    return True


def Eng_to_Chn(eng_str):
    """Return ``eng_str`` translated from English into simplified Chinese.

    A new ``Translator`` is created on every call; the ``text`` attribute of
    the translation result is returned.
    """
    translation = Translator().translate(eng_str, src='en', dest='zh-cn')
    return translation.text



# Scrape website
https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922
* These command voices are well suited to voice-model training: the website
already provides the transcripts, and each clip is around 10 seconds long,
which is a good length for training.

## 副官通用语音
https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E9%80%9A%E7%94%A8%E8%AF%AD%E9%9F%B3
* These voices are spoken in a very calm tone, so they are very good for
voice training. I also like the calm tone here, which makes them sound a bit
robotic or mechanical.
* Yuri's voice here is slightly different from the other sections on the
website, so be careful.

In [12]:
url = 'https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E9%80%9A%E7%94%A8%E8%AF%AD%E9%9F%B3'

#Fetch the page. The timeout keeps the cell from hanging forever if the
#site stops responding.
request = requests.get(url, timeout=30)

print(request.status_code)

200


In [13]:
#Parse the downloaded page. Fail loudly on a bad response so the cells below
#never silently reuse a `soup` left over from an earlier request.
if request.status_code != 200:
    raise RuntimeError('Request for {} failed with status code {}'.format(
        request.url, request.status_code))
soup = BeautifulSoup(request.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="zh-Hans-CN">
 <head>
  <meta charset="utf-8"/>
  <title>
   Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÈÄöÁî®ËØ≠Èü≥ - ËØ≠Èü≥Áª¥Âü∫
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"zh","wgMonthNames":["","1Êúà","2Êúà","3Êúà","4Êúà","5Êúà","6Êúà","7Êúà","8Êúà","9Êúà","10Êúà","11Êúà","12Êúà"],"wgRequestId":"2ee2f8073a5994c14b366a18","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÈÄöÁî®ËØ≠Èü≥","wgTitle":"Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÈÄöÁî®ËØ≠Èü≥","wgCurRevisionId":72594,"wgRevisionId":72594,"wgArticleId":22304,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"zh-cn","wgPageContentModel":"wikitext","wgRelevantPageName":"Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÈÄöÁî®ËØ≠Èü≥","wgRelev

In [20]:
#Collect every <div class="mw-parser-output">; all the tables sit inside it
tables_tab = soup.find_all('div', class_='mw-parser-output')
#Only one such div is expected (len(tables_tab) should be 1); preview it
print(tables_tab[0].prettify())

<div class="mw-parser-output">
 <div style="font-size: small; margin-bottom: 1.5em;">
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂèäÊàòÂΩπËØ≠Èü≥">
   &lt;
  </a>
  ËøîÂõû‰∏äÁ∫ßÔºö
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂèäÊàòÂΩπËØ≠Èü≥">
   Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂèäÊàòÂΩπËØ≠Èü≥
  </a>
 </div>
 <p>
  ‰ª•‰∏ã‰∏∫ÈÄöÁî®ÂâØÂÆòËØ≠Èü≥ÔºåÁî±‰∏âÈòµËê•ÂâØÂÆòÂøµÂêå‰∏ÄÂè•ËØùÔºàÂ∞ΩÁÆ°Êúâ‰∫õÊó†Ê≥ïËß¶ÂèëÔºåÊØîÂ¶ÇÂ∞§ÈáåÈòµËê•ÁöÑÊàòÂΩπËØ≠Èü≥ÔºåÂõ†‰∏∫Â∞§ÈáåÊ≤°ÊúâÊàòÂΩπÔºâ„ÄÇÂèØÂú®Âçï‰∫∫Ê®°Âºè‰∏ãÁöÑÊàòÂΩπÊàñÈÅ≠ÈÅáÊàò‰∏≠Âê¨Âà∞„ÄÇ
 </p>
 <div aria-labelledby="mw-toc-heading" class="toc" id="toc" role="navigation">
  <input class="toctogglecheckbox" id="toctogglecheckbox" role="button" style="display:none" type="checkbox"/>
  <div class="toctitle" dir="ltr" lang="zh-Hans-CN">
   <h2 id="mw-toc-heading">

In [25]:
#Gather every <table> inside the content div; len(tables) is the number of
#tables on the page
tables = tables_tab[0].find_all(name='table')
#Preview the first table to check its structure
print(tables[0].prettify())

<table>
 <tbody>
  <tr>
   <th rowspan="2">
    <div class="downloadable-audio template-render-cloak">
     <div class="audio-player">
      <a class="internal" href="/w/images/0/04/RA2_ceva001.mp3" title="RA2 ceva001.mp3">
       Media:RA2_ceva001.mp3
      </a>
     </div>
     <div class="downloadable-audio-dropdown">
      <div class="download-button">
       <div class="download-icon">
       </div>
       <div class="download-link">
        <a class="internal" href="/w/images/0/04/RA2_ceva001.mp3" title="RA2 ceva001.mp3">
         Media:RA2_ceva001.mp3
        </a>
       </div>
      </div>
     </div>
    </div>
   </th>
   <th rowspan="2">
    <div class="downloadable-audio template-render-cloak">
     <div class="audio-player">
      <a class="internal" href="/w/images/b/b3/RA2_csof001.mp3" title="RA2 csof001.mp3">
       Media:RA2_csof001.mp3
      </a>
     </div>
     <div class="downloadable-audio-dropdown">
      <div class="download-button">
       <div class="download-

In [86]:
#Empty frame that the scraping loop fills in: three audio file paths
#followed by the English and Chinese transcripts
general_df = pd.DataFrame(columns=[
    'EvaLee_direct',
    'Zofia_direct',
    'Yuri_direct',
    'transcript_Eng',
    'transcript_Chn',
])
general_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,transcript_Eng,transcript_Chn


In [87]:
#Iterate through each table: download its audio files and append one row
#(audio paths in page order, then the transcripts) to general_df
for table in tables:
    df_row_list = []

    #Find audio urls. Search for <a class="internal"> rather than 'audio'
    #tags, since those should be generated by JS and the returned html
    #doesn't have them.
    audios = table.find_all('a', {'class': 'internal'})

    #Each link appears twice per audio (player + download button), so drop
    #the duplicates while preserving the original order. dict.fromkeys does
    #this in a single pass.
    audios = list(dict.fromkeys(audios))

    audios_direct = []
    for audio in audios:
        internal_url = audio['href']
        download_link = 'https://voicewiki.cn' + internal_url  #The URL you can
        # use to download the audio file

        file_name = internal_url.split('/')[-1]  #The original file name,
        # e.g. RA2_ceva001.mp3 (Allies Lieutenant Eva Lee), RA2_csof001.mp3
        # (Soviets Lieutenant Zofia), RA2_cyur001.mp3 (Yuri)

        if 'ev' in file_name:
            folder_directory = './audio_RA2/EvaLee/General/'
        elif 'so' in file_name:
            folder_directory = './audio_RA2/Zofia/General/'
        elif 'yu' in file_name:
            folder_directory = './audio_RA2/Yuri/General/'
        else:
            #Unknown speaker: skip the file. Falling through would either
            #raise NameError (folder_directory never set) or save the file
            #into the folder left over from the previous audio.
            print('Something is wrong with file name {}'.format(file_name))
            continue

        file_directory = folder_directory + file_name
        audios_direct.append(file_directory)

        download_audio(download_link, file_directory)
    df_row_list += audios_direct

    #Find transcripts: the <td> cells hold the English text and the Chinese
    #text
    transcripts = table.find_all('td')
    cleaned_transcripts = [transcript.text.strip()
                           for transcript in transcripts]
    df_row_list += cleaned_transcripts

    if len(df_row_list) == len(general_df.columns):
        general_df.loc[len(general_df)] = df_row_list  #Append the row
    else:
        #Row does not fit the header (e.g. a skipped audio); report it for
        #manual handling instead of aborting the whole loop
        print(df_row_list)

general_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/General/RA2_ceva001.mp3,./audio_RA2/Zofia/General/RA2_csof001.mp3,./audio_RA2/Yuri/General/RA2_cyur001.mp3,"Warning, Nuclear Silo detected.",Ë≠¶ÂëäÔºå‰æ¶ÊµãÂà∞Ê†∏ÂºπÂèëÂ∞Ñ‰∫ï
1,./audio_RA2/EvaLee/General/RA2_ceva002.mp3,./audio_RA2/Zofia/General/RA2_csof002.mp3,./audio_RA2/Yuri/General/RA2_cyur002.mp3,"Warning, Nuclear Missile launched.",Ë≠¶ÂëäÔºåÊ†∏ÂºπÂ∑≤ÂèëÂ∞Ñ
2,./audio_RA2/EvaLee/General/RA2_ceva003.mp3,./audio_RA2/Zofia/General/RA2_csof003.mp3,./audio_RA2/Yuri/General/RA2_cyur003.mp3,Nuclear Missile ready.,Ê†∏ÂºπÂ∑≤ÂáÜÂ§áÂ∞±Áª™
3,./audio_RA2/EvaLee/General/RA2_ceva004.mp3,./audio_RA2/Zofia/General/RA2_csof004.mp3,./audio_RA2/Yuri/General/RA2_cyur004.mp3,"Warning, Iron Curtain detected.",Ë≠¶ÂëäÔºå‰æ¶ÊµãÂà∞ÈìÅÂπïË£ÖÁΩÆ
4,./audio_RA2/EvaLee/General/RA2_ceva005.mp3,./audio_RA2/Zofia/General/RA2_csof005.mp3,./audio_RA2/Yuri/General/RA2_cyur005.mp3,"Warning, Iron Curtain activated.",Ë≠¶ÂëäÔºåÈìÅÂπïË£ÖÁΩÆÂ∑≤ÊøÄÊ¥ª
...,...,...,...,...,...
129,./audio_RA2/EvaLee/General/RA2_ceva142.mp3,./audio_RA2/Zofia/General/RA2_csof142.mp3,./audio_RA2/Yuri/General/RA2_cyur142.mp3,Spy Plane on route.,‰æ¶ÂØüÊú∫Âú®Ë∑Ø‰∏ä
130,./audio_RA2/EvaLee/General/RA2_ceva143.mp3,./audio_RA2/Zofia/General/RA2_csof143.mp3,./audio_RA2/Yuri/General/RA2_cyur143.mp3,Battle control offline.,‰ΩúÊàòÊéßÂà∂Á¶ªÁ∫ø
131,./audio_RA2/EvaLee/General/RA2_ceva150.mp3,./audio_RA2/Zofia/General/RA2_csof150.mp3,./audio_RA2/Yuri/General/RA2_cyur150.mp3,Paratroopers ready.,‰ºûÂÖµÂ∞±Áª™
132,./audio_RA2/EvaLee/General/RA2_ceva151.mp3,./audio_RA2/Zofia/General/RA2_csof151.mp3,./audio_RA2/Yuri/General/RA2_cyur151.mp3,Weather control device temporarily unavailable.,Â§©Ê∞îÊéßÂà∂Êú∫ÊöÇÊó∂Â§±Êïà


In [90]:
#Save the df as csv
os.makedirs('./audio_RA2/transcripts', exist_ok=True)  #to_csv does not
# create missing folders, and the download step only creates audio folders
general_df.to_csv('./audio_RA2/transcripts/general.csv',
                  index=False,
                  encoding='utf-8-sig'  #Without this option, the Chinese
                  # column will be unreadable
                  )

## 盟军战役语音
https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E7%9B%9F%E5%86%9B%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3
* Some voices don't belong to Eva Lee, Zofia, or Yuri, so you need to
 save them into the "Others" folder
 * This page doesn't have any Zofia voice
  * Some of the "战前过场" voices are spoken by Eva Lee, but they were put in the
   Others folder since their file names don't follow the naming rule. Some of
   those voices aren't Eva's, so if you want, you can manually move the ones
   spoken by Eva to the EvaLee folder
     *  This was solved in the final clean-up

In [35]:
url = 'https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E7%9B%9F%E5%86%9B%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3'

#Fetch the page. The timeout keeps the cell from hanging forever if the
#site stops responding.
request = requests.get(url, timeout=30)

print(request.status_code)

200


In [36]:
#Parse the downloaded page. Fail loudly on a bad response so the cells below
#never silently reuse the `soup` parsed for a previous page.
if request.status_code != 200:
    raise RuntimeError('Request for {} failed with status code {}'.format(
        request.url, request.status_code))
soup = BeautifulSoup(request.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="zh-Hans-CN">
 <head>
  <meta charset="utf-8"/>
  <title>
   Á∫¢Ëâ≤Ë≠¶Êàí2/ÁõüÂÜõÊàòÂΩπËØ≠Èü≥ - ËØ≠Èü≥Áª¥Âü∫
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"zh","wgMonthNames":["","1Êúà","2Êúà","3Êúà","4Êúà","5Êúà","6Êúà","7Êúà","8Êúà","9Êúà","10Êúà","11Êúà","12Êúà"],"wgRequestId":"508557e4fbe31ff2827824d3","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Á∫¢Ëâ≤Ë≠¶Êàí2/ÁõüÂÜõÊàòÂΩπËØ≠Èü≥","wgTitle":"Á∫¢Ëâ≤Ë≠¶Êàí2/ÁõüÂÜõÊàòÂΩπËØ≠Èü≥","wgCurRevisionId":120022,"wgRevisionId":120022,"wgArticleId":22307,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"zh-cn","wgPageContentModel":"wikitext","wgRelevantPageName":"Á∫¢Ëâ≤Ë≠¶Êàí2/ÁõüÂÜõÊàòÂΩπËØ≠Èü≥","wgRel

In [37]:
#Collect every <div class="mw-parser-output">; all the tables sit inside it
tables_tab = soup.find_all('div', class_='mw-parser-output')
#Only one such div is expected (len(tables_tab) should be 1); preview it
print(tables_tab[0].prettify())

<div class="mw-parser-output">
 <div style="font-size: small; margin-bottom: 1.5em;">
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂèäÊàòÂΩπËØ≠Èü≥">
   &lt;
  </a>
  ËøîÂõû‰∏äÁ∫ßÔºö
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂèäÊàòÂΩπËØ≠Èü≥">
   Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂèäÊàòÂΩπËØ≠Èü≥
  </a>
 </div>
 <p>
  langmd.mixÈáåÂè™Êúâ‚ÄúÊ≤°ÊúâÂØπÂ∫îËßÜÈ¢ëÂè™ÊúâÂ£∞Èü≥‚ÄùÁöÑËØ≠Èü≥„ÄÇËøáÂú∫Âä®ÁîªÁöÑËØ≠Èü≥‰∏çÂ≠òÂú®ËøôÈáå„ÄÇ
 </p>
 <div aria-labelledby="mw-toc-heading" class="toc" id="toc" role="navigation">
  <input class="toctogglecheckbox" id="toctogglecheckbox" role="button" style="display:none" type="checkbox"/>
  <div class="toctitle" dir="ltr" lang="zh-Hans-CN">
   <h2 id="mw-toc-heading">
    ÁõÆÂΩï
   </h2>
   <span class="toctogglespan">
    <label class="toctogglelabel" for="toctogglecheckb

In [38]:
#Gather every <table> inside the content div; len(tables) is the number of
#tables on the page
tables = tables_tab[0].find_all(name='table')
#Preview the first table to check its structure
print(tables[0].prettify())

<table>
 <tbody>
  <tr>
   <th rowspan="2">
    <div class="downloadable-audio template-render-cloak">
     <div class="audio-player">
      <a class="internal" href="/w/images/4/4a/RA2_cevau06.mp3" title="RA2 cevau06.mp3">
       Media:RA2_cevau06.mp3
      </a>
     </div>
     <div class="downloadable-audio-dropdown">
      <div class="download-button">
       <div class="download-icon">
       </div>
       <div class="download-link">
        <a class="internal" href="/w/images/4/4a/RA2_cevau06.mp3" title="RA2 cevau06.mp3">
         Media:RA2_cevau06.mp3
        </a>
       </div>
      </div>
     </div>
    </div>
   </th>
   <td>
    The V3 is a powerful long-range artillery weapon. Beware Commander, it can level your base in minutes without you ever getting a chance to even return fire.
   </td>
  </tr>
  <tr>
   <td>
    <a href="/wiki/V3%E7%81%AB%E7%AE%AD%E5%8F%91%E5%B0%84%E8%BD%A6%EF%BC%88%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922%EF%BC%89" title="V3ÁÅ´ÁÆ≠ÂèëÂ∞ÑËΩ¶ÔºàÁ∫¢Ëâ≤Ë≠¶Êàí

In [39]:
#Empty frame that the scraping loop fills in: one audio-path column per
#speaker group, followed by the English and Chinese transcripts
allied_war_df = pd.DataFrame(columns=[
    'EvaLee_direct',
    'Yuri_direct',
    'Other_direct',
    'transcript_Eng',
    'transcript_Chn',
])
allied_war_df

Unnamed: 0,EvaLee_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn


In [40]:
#Iterate through each table: download its audio files and append one row to
#allied_war_df
for table in tables:
    df_row_list = []

    #Find audio urls. Search for <a class="internal"> rather than 'audio'
    #tags, since those should be generated by JS and the returned html
    #doesn't have them.
    audios = table.find_all('a', {'class': 'internal'})

    #Each link appears twice per audio (player + download button), so drop
    #the duplicates while preserving the original order. dict.fromkeys does
    #this in a single pass.
    audios = list(dict.fromkeys(audios))

    audios_direct = []
    for audio in audios:
        internal_url = audio['href']
        download_link = 'https://voicewiki.cn' + internal_url  #The URL you can
        # use to download the audio file

        file_name = internal_url.split('/')[-1]  #The original file name,
        # which is also used to name the downloaded file

        #Choose the speaker folder and the matching column in one place.
        #Column order: [EvaLee_direct, Yuri_direct, Other_direct]
        if 'ev' in file_name:
            folder_directory = './audio_RA2/EvaLee/AlliedWar/'
            column_position = 0
        elif 'yu' in file_name:
            folder_directory = './audio_RA2/Yuri/AlliedWar/'
            column_position = 1
        else:
            folder_directory = './audio_RA2/Others/AlliedWar/'
            column_position = 2

        file_directory = folder_directory + file_name

        #Pad the other two speaker columns with nan so the row matches the
        #header of the df
        audio_columns = [np.nan, np.nan, np.nan]
        audio_columns[column_position] = file_directory
        audios_direct += audio_columns

        download_audio(download_link, file_directory)
    df_row_list += audios_direct

    #Find transcripts: the <td> cells hold the English text and the Chinese
    #text
    transcripts = table.find_all('td')
    cleaned_transcripts = [transcript.text.strip()
                           for transcript in transcripts]
    df_row_list += cleaned_transcripts

    if len(df_row_list) == len(allied_war_df.columns):
        allied_war_df.loc[len(allied_war_df)] = df_row_list
    else:
        #Row does not fit the header (e.g. several audios in one table);
        #print it for manual handling. An explicit length check replaces the
        #old bare `except:`, which also hid unrelated errors.
        print(df_row_list)

allied_war_df

Unnamed: 0,EvaLee_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,The V3 is a powerful long-range artillery weap...,V3ÊòØÂ®ÅÂäõÂº∫Â§ßÁöÑÈïøÁ®ãÁÅ´ÁÆ≠„ÄÇÂΩìÂøÉÔºåÊåáÊå•ÂÆòÔºåV3ËÉΩÂú®Âá†ÂàÜÈíüÂÜÖÂ∞ÜÊÇ®ÁöÑÂü∫Âú∞Â§∑‰∏∫Âπ≥Âú∞ÔºåÊÇ®ÁîöËá≥ËøûËøòÂáªÁöÑÊú∫...
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,"The most powerful Soviet Tank ever built, the ...",Â§©ÂêØÂù¶ÂÖãÊòØËãèËÅîÂè≤‰∏äÊúÄÂº∫ÁöÑÂù¶ÂÖãÔºåÊú¨Ë∫´Â∞±ÂÖ∑Â§á‰∏ÄÂè™Â∞èÂûãÂÜõÈòüÁöÑÁÅ´ÂäõÔºåË£ÖÂ§áÂØπÁ©∫ÂèäÂØπÂú∞ÁöÑÊ≠¶Ë£Ö
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,"The most powerful air unit ever created, the K...",Âü∫Ê¥õÂ§´Á©∫ËâáÊòØÊúÄÂº∫ÁöÑÁ©∫‰∏≠Ê≠¶Âô®ÔºåËÉΩÊäïÊé∑Êó†ÈôêÁöÑÈìÅÂà∂ÁÇ∏ÂºπÔºåÊîªÂáªÊïå‰∫∫
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,"The ultimate in Soviet long-range bombardment,...",Êó†ÁïèÁ∫ßÊàòËà∞ÊòØËãèËÅîÁ©∂ÊûÅÁöÑÈïøÁ®ãÁÇÆËΩ∞Ê≠¶Âô®ÔºåËøúË∑ùÊîªÂáªÂäõÊó†‰ª•‰º¶ÊØîÔºå‰ΩÜÂæàÂÆπÊòìÂú®ËøëË∑ùÁ¶ªË¢´ÂáªÊ≤â„ÄÇÊó†ÁïèÁ∫ßÊàòËà∞ÊâÄ...
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,The Soviets have taken Tesla Technology to the...,ËãèËÅîÁ£ÅËÉΩÁßëÊäÄÁöÑÊúÄÂêéÈò∂ÊÆµÔºåË∫´Á©øÁ£ÅËÉΩÂä®ÂäõË£ÖÁöÑÂ£´ÂÖµ„ÄÇËøô‰∫õÂ£´ÂÖµÊúÄÊìÖÈïøÂØπ‰ªòÂù¶ÂÖãÔºå‰πüËÉΩÁî®‰ªñ‰ª¨È¢ùÂ§ñÁöÑËÉΩÊ∫êÂä†...
...,...,...,...,...,...
189,./audio_RA2/EvaLee/AlliedWar/RA2_xa7ev03.mp3,,,"We don't have much time, Commander. Build up y...",Êàë‰ª¨Ê≤°ÊúâÂ§öÂ∞ëÊó∂Èó¥‰∫ÜÔºåÊåáÊå•ÂÆò„ÄÇÂú®Â∞§ÈáåÂç†È¢Ü‰Ω†ÁöÑ‰ΩçÁΩÆ‰πãÂâçÔºåËøÖÈÄüÂª∫Á´ãËµ∑‰Ω†Âú®ÂçóÊûÅÊ¥≤ÁöÑÂü∫Âú∞
190,./audio_RA2/EvaLee/AlliedWar/RA2_xa7ev04.mp3,,,"Good job, Commander. We're search for a suitab...",ÂÅöÂæóÂ•ΩÔºåÊåáÊå•ÂÆò„ÄÇÊàë‰ª¨Ê≠£Âú®ÂØªÊâæ‰∏Ä‰∏™ÂêàÈÄÇÁöÑÁùÄÈôÜÂå∫Â∞Ü‰Ω†ÁöÑMCV‰º†ÈÄÅËøáÂéª„ÄÇÂè™Ë¶ÅÂÜçÊãñ‰ΩèÂ∞§ÈáåÁöÑÈÉ®ÈòüÂ∞±ÂèØ‰ª•‰∫Ü
191,./audio_RA2/EvaLee/AlliedWar/RA2_xa7ev05.mp3,,,"Sir, if you can combine Soviet technology with...",ÈïøÂÆòÔºåÂ¶ÇÊûú‰Ω†ËÉΩÊääËãèËÅîÁöÑÊäÄÊúØÂíåÊàë‰ª¨ÁöÑÊäÄÊúØÁªìÂêàËµ∑Êù•Ôºå‰Ω†Â∫îËØ•ËÉΩÂæóÂà∞‰∏Ä‰∫õÊúâË∂£ÁöÑÁªìÊûú„ÄÇÂÆÉÂèØ‰ª•Áªô‰Ω†Â∏¶Êù•‰Ω†...
192,./audio_RA2/EvaLee/AlliedWar/RA2_xa7ev06.mp3,,,"Sir, our Soviet base on Tierra del Fuego has c...",ÈïøÂÆòÔºåÊàë‰ª¨Âú®ÁÅ´Âú∞Â≤õÁöÑËãèÁª¥ÂüÉÂü∫Âú∞ÂèóÂà∞‰∫ÜÊîªÂáª„ÄÇ‰øùÊåÅÂü∫Âú∞ÂÆåÂ•ΩÔºåËøôÊ†∑Êàë‰ª¨Â∞±ËÉΩÁªßÁª≠‰øùÁïôÂπ∂‰ΩøÁî®ËãèËÅîÁöÑÁßëÊäÄ


In [42]:
#Save the df as csv
os.makedirs('./audio_RA2/transcripts', exist_ok=True)  #to_csv does not
# create missing folders, and the download step only creates audio folders
allied_war_df.to_csv('./audio_RA2/transcripts/allied_war.csv',
                     index=False,
                     encoding='utf-8-sig'  #Without this option, the Chinese
                     # column will be unreadable
                     )

## 苏联战役语音
https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E8%8B%8F%E8%81%94%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3
* Some voices don't belong to Eva Lee, Zofia, or Yuri, so you need to
 save them into the "Others" folder
 * This page doesn't have any Eva Lee voice
  * Some of the “战前过场” voices are spoken by Eva Lee, but they were put in the Others folder since their file names don't follow the naming rule. Some of those voices aren't Eva's, so if you want, you can manually move the ones spoken by Eva to the EvaLee folder
    * This was solved in the final clean-up
* Some of the voices don't have a transcript on the website, so you need
    to transcribe them yourself

In [43]:
url = 'https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E8%8B%8F%E8%81%94%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3'

#Fetch the page. The timeout keeps the cell from hanging forever if the
#site stops responding.
request = requests.get(url, timeout=30)

print(request.status_code)

200


In [44]:
#Parse the downloaded page. Fail loudly on a bad response so the cells below
#never silently reuse the `soup` parsed for a previous page.
if request.status_code != 200:
    raise RuntimeError('Request for {} failed with status code {}'.format(
        request.url, request.status_code))
soup = BeautifulSoup(request.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="zh-Hans-CN">
 <head>
  <meta charset="utf-8"/>
  <title>
   Á∫¢Ëâ≤Ë≠¶Êàí2/ËãèËÅîÊàòÂΩπËØ≠Èü≥ - ËØ≠Èü≥Áª¥Âü∫
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"zh","wgMonthNames":["","1Êúà","2Êúà","3Êúà","4Êúà","5Êúà","6Êúà","7Êúà","8Êúà","9Êúà","10Êúà","11Êúà","12Êúà"],"wgRequestId":"d74da4ce4ac76d65b890e331","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Á∫¢Ëâ≤Ë≠¶Êàí2/ËãèËÅîÊàòÂΩπËØ≠Èü≥","wgTitle":"Á∫¢Ëâ≤Ë≠¶Êàí2/ËãèËÅîÊàòÂΩπËØ≠Èü≥","wgCurRevisionId":120922,"wgRevisionId":120922,"wgArticleId":22308,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["ÈúÄË¶ÅË°•ÂÖÖÂÜÖÂÆπ"],"wgPageContentLanguage":"zh-cn","wgPageContentModel":"wikitext","wgRelevantPageName":"Á∫¢Ëâ≤Ë≠¶Êàí2/ËãèËÅî

In [45]:
#Collect every <div class="mw-parser-output">; all the tables sit inside it
tables_tab = soup.find_all('div', class_='mw-parser-output')
#Only one such div is expected (len(tables_tab) should be 1); preview it
print(tables_tab[0].prettify())

<div class="mw-parser-output">
 <div style="font-size: small; margin-bottom: 1.5em;">
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂèäÊàòÂΩπËØ≠Èü≥">
   &lt;
  </a>
  ËøîÂõû‰∏äÁ∫ßÔºö
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂèäÊàòÂΩπËØ≠Èü≥">
   Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂèäÊàòÂΩπËØ≠Èü≥
  </a>
 </div>
 <div class="infoBox">
  <div class="infoBoxContent">
   <div class="infoBoxTitle">
    <div class="infoBoxIcon" style="color:#e69100;">
     <i aria-hidden="true" class="fa fa-info-circle faa-pulse animated" style="animation: pulse 2s cubic-bezier(0.18, 0.89, 0.32, 1.28) infinite;">
     </i>
    </div>
    <div class="infoBoxText">
     <b>
      Ê≠§È°µÈù¢ÂÜÖÂÆπÈúÄË¶ÅË°•ÂÖÖ
     </b>
     <br/>
     Ê≠§È°µÈù¢ÂÜÖÂÆπÂ∞ö‰∏çÂÆåÊï¥ÔºåËØ≠Èü≥Áª¥Âü∫Ê¨¢ËøéÊÇ®
     <font class="text-chromatic">
     

In [46]:
#Gather every <table> inside the content div; len(tables) is the number of
#tables on the page
tables = tables_tab[0].find_all(name='table')
#Preview the first table to check its structure
print(tables[0].prettify())

<table>
 <tbody>
  <tr>
   <th rowspan="2">
    <div class="downloadable-audio template-render-cloak">
     <div class="audio-player">
      <a class="internal" href="/w/images/f/fa/RA2_csofu04.mp3" title="RA2 csofu04.mp3">
       Media:RA2_csofu04.mp3
      </a>
     </div>
     <div class="downloadable-audio-dropdown">
      <div class="download-button">
       <div class="download-icon">
       </div>
       <div class="download-link">
        <a class="internal" href="/w/images/f/fa/RA2_csofu04.mp3" title="RA2 csofu04.mp3">
         Media:RA2_csofu04.mp3
        </a>
       </div>
      </div>
     </div>
    </div>
   </th>
   <td>
    Soviet technology at its finest. Terror Drones can enter and destroy enemy vehicles from the inside.
   </td>
  </tr>
  <tr>
   <td>
    ËãèËÅîÁßëÊäÄÁöÑÁªìÊô∂„ÄÇ
    <a href="/wiki/%E6%81%90%E6%80%96%E6%9C%BA%E5%99%A8%E4%BA%BA%EF%BC%88%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922%EF%BC%89" title="ÊÅêÊÄñÊú∫Âô®‰∫∫ÔºàÁ∫¢Ëâ≤Ë≠¶Êàí2Ôºâ">
     ÊÅêÊÄñÊú∫Âô®‰∫∫
   

In [55]:
#Empty frame that the scraping loop fills in: one audio-path column per
#speaker group, followed by the English and Chinese transcripts
soviet_war_df = pd.DataFrame(columns=[
    'Zofia_direct',
    'Yuri_direct',
    'Other_direct',
    'transcript_Eng',
    'transcript_Chn',
])
soviet_war_df

Unnamed: 0,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn


In [56]:
#Iterate through each table: download its audio files and append one row to
#soviet_war_df
for table in tables:
    df_row_list = []

    #Find audio urls. Search for <a class="internal"> rather than 'audio'
    #tags, since those should be generated by JS and the returned html
    #doesn't have them.
    audios = table.find_all('a', {'class': 'internal'})

    #Each link appears twice per audio (player + download button), so drop
    #the duplicates while preserving the original order. dict.fromkeys does
    #this in a single pass.
    audios = list(dict.fromkeys(audios))

    audios_direct = []
    for audio in audios:
        internal_url = audio['href']
        download_link = 'https://voicewiki.cn' + internal_url  #The URL you can
        # use to download the audio file

        file_name = internal_url.split('/')[-1]  #The original file name,
        # which is also used to name the downloaded file

        #Choose the speaker folder and the matching column in one place.
        #Column order: [Zofia_direct, Yuri_direct, Other_direct]
        if 'so' in file_name:
            folder_directory = './audio_RA2/Zofia/SovietWar/'
            column_position = 0
        elif 'yu' in file_name:
            folder_directory = './audio_RA2/Yuri/SovietWar/'
            column_position = 1
        else:
            folder_directory = './audio_RA2/Others/SovietWar/'
            column_position = 2

        file_directory = folder_directory + file_name

        #Pad the other two speaker columns with nan so the row matches the
        #header of the df
        audio_columns = [np.nan, np.nan, np.nan]
        audio_columns[column_position] = file_directory
        audios_direct += audio_columns

        download_audio(download_link, file_directory)
    df_row_list += audios_direct

    #Find transcripts: the <td> cells hold the English text and the Chinese
    #text
    transcripts = table.find_all('td')
    cleaned_transcripts = [transcript.text.strip()
                           for transcript in transcripts]
    df_row_list += cleaned_transcripts

    if len(df_row_list) == len(soviet_war_df.columns):
        soviet_war_df.loc[len(soviet_war_df)] = df_row_list
    else:
        #Row does not fit the header (e.g. several audios in one table);
        #print it for manual handling. An explicit length check replaces the
        #old bare `except:`, which also hid unrelated errors.
        print(df_row_list)

soviet_war_df

Unnamed: 0,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/Zofia/SovietWar/RA2_csofu04.mp3,,,Soviet technology at its finest. Terror Drones...,ËãèËÅîÁßëÊäÄÁöÑÁªìÊô∂„ÄÇÊÅêÊÄñÊú∫Âô®‰∫∫ËÉΩÊΩúÂÖ•Êïå‰∫∫ËΩ¶ËæÜÂÜÖÈÉ®ÔºåÂπ∂Áî±ÂÜÖÈÉ®Â∞Ü‰πãÊëßÊØÅ
1,./audio_RA2/Zofia/SovietWar/RA2_csofu06.mp3,,,The V3 Launcher has extreme range capabilities...,V3ÁÅ´ÁÆ≠ÂèëÂ∞ÑËΩ¶ÊòØÂçÅÂàÜÈïøÁ®ãÁöÑÊ≠¶Âô®ÔºåÂÖ∑ÊúâÊó†ÊØî‰º¶ÊØîÁöÑÊΩúÂú®Á†¥ÂùèÂäõ„ÄÇËÇØÂÆöËÉΩÈÄ†ÊàêÊµ©Âä´Ëà¨ÁöÑ‰º§ÂÆ≥Ôºå‰ΩÜ‰∏çË¶ÅËÆ©ÂÖ∂...
2,./audio_RA2/Zofia/SovietWar/RA2_csofu07.mp3,,,The appropriately named Apocalypse Tank is war...,ËΩ¶Â¶ÇÂÖ∂ÂêçÁöÑÂ§©ÂêØÂù¶ÂÖãÔºåÊòØÂÆåÁæéÁªìÂêàÁöÑÊàò‰∫âÊ≠¶Âô®„ÄÇÂ§©ÂêØÂù¶ÂÖãËÉΩÊîªÂáª‰ªª‰ΩïÂú∞Èù¢ÊàñÁ©∫‰∏≠ÁõÆÊ†áÔºå‰πüÊòØËÉΩÂú®ÊàòÂú∫‰∏äÂ≠ò...
3,./audio_RA2/Zofia/SovietWar/RA2_csofu08.mp3,,,"They may be slow, but the Kirov Airships can t...",ËôΩÁÑ∂Âü∫Ê¥õÂ§´Á©∫ËâáÁöÑÈÄüÂ∫¶‰∏çÂø´Ôºå‰ΩÜËÉΩÈÄ†ÊàêÂ§ßÈáèÁöÑÁ†¥ÂùèÔºåÂπ∂‰∏î‰∏çÂÅúÁöÑÊîªÂáª
4,./audio_RA2/Zofia/SovietWar/RA2_csofu09.mp3,,,The Sea Scorpion is a fast-attack sea raider. ...,Êµ∑Ë†çÊòØÂø´ÈÄüÊîªÂáªÊµ∑‰∏äÁ™ÅË¢≠Ëà∞„ÄÇËôΩÁÑ∂‰∏çËÉΩ‰∏∫ÊÇ®Ëµ¢ÂæóÊàò‰∫âÔºå‰ΩÜÊ≤°ÊúâÂÆÉÊÇ®‰πüËÇØÂÆöËµ¢‰∏ç‰∫Ü„ÄÇËøôÊòØÊÇ®Âú®Ê∞¥Èù¢‰∏äÁöÑÂîØ‰∏Ä...
...,...,...,...,...,...
234,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,,
235,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,,
236,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,,
237,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,,


In [57]:
#Save the df as csv
os.makedirs('./audio_RA2/transcripts', exist_ok=True)  #to_csv does not
# create missing folders, and the download step only creates audio folders
soviet_war_df.to_csv('./audio_RA2/transcripts/soviet_war.csv',
                     index=False,
                     encoding='utf-8-sig'  #Without this option, the Chinese
                     # column will be unreadable
                     )

## 副官其他语音
https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%85%B6%E4%BB%96%E8%AF%AD%E9%9F%B3
* On the lower part, you can find some long readings of Eva Lee, Zofia, and
Yuri

In [58]:
url = 'https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%85%B6%E4%BB%96%E8%AF%AD%E9%9F%B3'

#Fetch the page. The timeout keeps the cell from hanging forever if the
#site stops responding.
request = requests.get(url, timeout=30)

print(request.status_code)

200


In [59]:
#Parse the downloaded page. Fail loudly on a bad response so the cells below
#never silently reuse the `soup` parsed for a previous page.
if request.status_code != 200:
    raise RuntimeError('Request for {} failed with status code {}'.format(
        request.url, request.status_code))
soup = BeautifulSoup(request.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="zh-Hans-CN">
 <head>
  <meta charset="utf-8"/>
  <title>
   Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂÖ∂‰ªñËØ≠Èü≥ - ËØ≠Èü≥Áª¥Âü∫
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"zh","wgMonthNames":["","1Êúà","2Êúà","3Êúà","4Êúà","5Êúà","6Êúà","7Êúà","8Êúà","9Êúà","10Êúà","11Êúà","12Êúà"],"wgRequestId":"48e435a4edbb82299c646c25","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂÖ∂‰ªñËØ≠Èü≥","wgTitle":"Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂÖ∂‰ªñËØ≠Èü≥","wgCurRevisionId":69114,"wgRevisionId":69114,"wgArticleId":23105,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"zh-cn","wgPageContentModel":"wikitext","wgRelevantPageName":"Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂÖ∂‰ªñËØ≠Èü≥","wgRelev

In [60]:
#Collect every <div class="mw-parser-output">; all the tables sit inside it
tables_tab = soup.find_all('div', class_='mw-parser-output')
#Only one such div is expected (len(tables_tab) should be 1); preview it
print(tables_tab[0].prettify())

<div class="mw-parser-output">
 <div style="font-size: small; margin-bottom: 1.5em;">
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂèäÊàòÂΩπËØ≠Èü≥">
   &lt;
  </a>
  ËøîÂõû‰∏äÁ∫ßÔºö
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂèäÊàòÂΩπËØ≠Èü≥">
   Á∫¢Ëâ≤Ë≠¶Êàí2/ÂâØÂÆòÂèäÊàòÂΩπËØ≠Èü≥
  </a>
 </div>
 <div aria-labelledby="mw-toc-heading" class="toc" id="toc" role="navigation">
  <input class="toctogglecheckbox" id="toctogglecheckbox" role="button" style="display:none" type="checkbox"/>
  <div class="toctitle" dir="ltr" lang="zh-Hans-CN">
   <h2 id="mw-toc-heading">
    ÁõÆÂΩï
   </h2>
   <span class="toctogglespan">
    <label class="toctogglelabel" for="toctogglecheckbox">
    </label>
   </span>
  </div>
  <ul>
   <li class="toclevel-1 tocsection-1">
    <a href="#YR.E7.89.88.E6.9C.AC.

In [61]:
#Obtain all the tables inside the content div
tables = tables_tab[0].find_all('table')
# print(len(tables)) #This should be the total table number on that page
print(tables[0].prettify())  #Inspect the first table to learn its layout

<table>
 <tbody>
  <tr>
   <th rowspan="2">
    <div class="downloadable-audio template-render-cloak">
     <div class="audio-player">
      <a class="internal" href="/w/images/8/8d/RA2_macev02.mp3" title="RA2 macev02.mp3">
       Media:RA2_macev02.mp3
      </a>
     </div>
     <div class="downloadable-audio-dropdown">
      <div class="download-button">
       <div class="download-icon">
       </div>
       <div class="download-link">
        <a class="internal" href="/w/images/8/8d/RA2_macev02.mp3" title="RA2 macev02.mp3">
         Media:RA2_macev02.mp3
        </a>
       </div>
      </div>
     </div>
    </div>
   </th>
   <td>
    Teleporting troops arriving in 5, 4, 3, 2, 1.
   </td>
  </tr>
  <tr>
   <td>
    ‰º†ÈÄÅÈÉ®ÈòüÂ∞ÜÂú®5Ôºå4Ôºå3Ôºå2Ôºå1ÁßíÂêéÂà∞Ëææ
   </td>
  </tr>
 </tbody>
</table>



In [74]:
#Empty df that will collect one row per voice line: four columns for the
# local audio path (one per speaker group) followed by the English and the
# Chinese transcript
other_voice_columns = ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
                       'Other_direct', 'transcript_Eng', 'transcript_Chn']
other_voice_df = pd.DataFrame(columns=other_voice_columns)
other_voice_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn


In [75]:
#Iterate through each table. One table describes one voice line: the audio
# link(s) in the header cell plus the English and the Chinese transcript.

#(marker searched in the file name, df column, download folder). The rules
# are tried in this order and the first match wins. File names look like
# RA2_macev02.mp3 (Eva Lee), RA2_xc0so01.mp3 (Zofia), RA2_xc0in01.mp3 (Yuri).
speaker_rules = [
    ('ev', 'EvaLee_direct', './audio_RA2/EvaLee/OtherVoices/'),
    ('so', 'Zofia_direct', './audio_RA2/Zofia/OtherVoices/'),
    ('in', 'Yuri_direct', './audio_RA2/Yuri/OtherVoices/'),  #Somehow it's
    # not "yu" anymore
]
#Used when no marker matches
other_rule = ('Other_direct', './audio_RA2/Others/OtherVoices/')
direct_columns = ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
                  'Other_direct']

for table in tables:
    #Find audio urls
    anchors = table.find_all('a', {'class': 'internal'})  #Don't use 'audio'
    # here for find_all since those should be generated by JS and the
    # returned html doesn't have that

    #Every link shows up twice in the html (player + download button), so
    # remove the duplicated hrefs while preserving the original order
    internal_urls = list(dict.fromkeys(anchor['href'] for anchor in anchors))

    #One slot per speaker column, NaN unless this table has an audio for it.
    # Filling slots (instead of appending 4 cells per audio) keeps the row at
    # 4 path cells even when a table holds several audios.
    directs = dict.fromkeys(direct_columns, np.nan)
    for internal_url in internal_urls:
        download_link = 'https://voicewiki.cn' + internal_url  #The URL you can
        # use to download the audio file

        file_name = internal_url.split('/')[-1]  #The name of the original
        # file, reused as the name of the downloaded file

        column, folder_directory = other_rule
        for marker, rule_column, rule_folder in speaker_rules:
            if marker in file_name:
                column, folder_directory = rule_column, rule_folder
                break

        file_directory = folder_directory + file_name

        if isinstance(directs[column], str):
            #Two audios of the same table map to the same column; make the
            # replacement visible instead of losing a path silently
            print('{} replaces {} in column {}'.format(
                file_directory, directs[column], column))
        directs[column] = file_directory

        download_audio(download_link, file_directory)

    #Find transcripts: the <td> cells are expected to hold 2 strings, 1 as
    # English and the other as Chinese
    cleaned_transcripts = [cell.text.strip() for cell in table.find_all('td')]

    df_row_list = [directs[col] for col in direct_columns]
    df_row_list += cleaned_transcripts

    try:
        other_voice_df.loc[len(other_voice_df)] = df_row_list
    except ValueError:
        #pandas raises ValueError when the row length does not match the
        # columns (e.g. the table did not have exactly 2 transcripts). Print
        # the offending row and carry on with the next table.
        print(df_row_list)  #This will be the df_row_list containing the error

other_voice_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/OtherVoices/RA2_macev02.mp3,,,,"Teleporting troops arriving in 5, 4, 3, 2, 1.",‰º†ÈÄÅÈÉ®ÈòüÂ∞ÜÂú®5Ôºå4Ôºå3Ôºå2Ôºå1ÁßíÂêéÂà∞Ëææ
1,,,,./audio_RA2/Others/OtherVoices/RA2_ms9pr03.mp3,They're breaking in! Do something!,‰ªñ‰ª¨ÈóØËøõÊù•‰∫ÜÔºÅÂø´ÂÅöÁÇπ‰ªÄ‰πàÔºÅ
2,,,,./audio_RA2/Others/OtherVoices/RA2_ms9pr04.mp3,You don't want the Vice President running the ...,‰Ω†Âπ∂‰∏çÂ∏åÊúõÂâØÊÄªÁªüÁÆ°ÁêÜÂõΩÂÆ∂Ôºå‰∏çÊòØÂêóÔºü
3,,,,./audio_RA2/Others/OtherVoices/RA2_ms9pr05.mp3,Er...,Âó∑...
4,./audio_RA2/EvaLee/OtherVoices/RA2_xc0ev01.mp3,,,,Defend America against the Soviet threat by de...,ÈÄöËøáÊëßÊØÅ‰ªñ‰ª¨Âú®Âè§Â∑¥ÁöÑÂü∫Âú∞Êù•‰øùÊä§ÁæéÂõΩÂÖçÂèóËãèËÅîÁöÑÂ®ÅËÉÅÔºå‰ªéÂ∞§ÈáåÁöÑÂÜõÈòü‰∏≠ÊãØÊïëÊ¥õÊùâÁü∂Âπ∂Â∞Ü‰ªñËµ∂Âõû‰øÑÁΩóÊñØÔºå...
5,,,./audio_RA2/Yuri/OtherVoices/RA2_xc0in01.mp3,,Support Yuri's request for world domination by...,ÈÄöËøáÂç†È¢ÜÁæéÂõΩÈ¶ñÈÉΩÊù•ÂÆåÊàêÂ∞§ÈáåÁöÑ‰∏ñÁïåÁªüÊ≤ªË¶ÅÊ±ÇÔºåÁÑ∂ÂêéÊåáÊå•Â∞§ÈáåÂú®ÂçóÁæéÊ¥≤ÁöÑÂÜõÈòüÊëßÊØÅËãèËÅîÂú®Â∞§Âç°Âù¶ÂçäÂ≤õÁöÑÁßò...
6,,./audio_RA2/Zofia/OtherVoices/RA2_xc0so01.mp3,,,Ensure the success of the Soviet invasion by c...,ÈÄöËøáÁ≤âÁ¢éË•øÈõÖÂõæÁöÑÁõüÂÜõÂäõÈáèÁ°Æ‰øùËãèËÅîÂÖ•‰æµÁöÑÊàêÂäüÔºåÂ§∫Ëµ∞‰ªñÁöÑÁü≥Ê≤πÂÇ®Â§áÂπ∂ÊëßÊØÅ‰ªñÂú®ÂüÉÂèäÁöÑÁßòÂØÜÂü∫Âú∞Êù•ÊÉ©ÁΩöÂèõ...
7,./audio_RA2/EvaLee/OtherVoices/RA2_xc1ev01.mp3,,,,The Soviet bases in Cuba must be eliminated. O...,ÂøÖÈ°ªÊ∂àÁÅ≠ËãèËÅîÂú®Âè§Â∑¥ÁöÑÂü∫Âú∞Ôºå‰∏ÄÂêçÊåáÊå•ÂÆòÂ∞ÜÈ©ªÊâéÂú®ÂìàÁì¶ÈÇ£ÈôÑËøëÔºåÁ¨¨‰∫åÂêçÊåáÊå•ÂÆòÂ∞ÜË¢´ÂÆâÁΩÆÂú®Èù†ËøëÁõüÂÜõÁßòÂØÜÁ©∫...
8,,,./audio_RA2/Yuri/OtherVoices/RA2_xc1in01.mp3,,The Allies have constructed defenses around th...,ÁõüÂÜõÂú®‰ªñ‰ª¨ÂÆùË¥µÁöÑÁôΩÂÆ´Âë®Âõ¥ÈÄ†Êª°‰∫ÜÈò≤Âæ°Â∑•‰∫ãÔºå‰ªñ‰ª¨ÂÉèÂèóÊÉäÁöÑÂ≠©Â≠ê‰∏ÄÊ†∑Ë∫≤Âú®ÁßëÊäÄËÉåÂêéÔºåÊ≤°Êúâ‰ªª‰ΩïÂäûÊ≥ïÂØπÊäóÊàë...
9,,./audio_RA2/Zofia/OtherVoices/RA2_xc1so01.mp3,,,The Allied have a stronghold in the city of Se...,ÁõüÂÜõÂú®Ë•øÈõÖÂõæÊã•ÊúâÊçÆÁÇπÔºå‰ªñ‰ª¨Âú®Â§™Á©∫ÈíàÂ°îÈôÑËøëÂª∫Á´ã‰∫Ü‰∏Ä‰∏™Êã•ÊúâÈáçÁÇÆÁöÑÂü∫Âú∞ÔºåÁ¨¨‰∫å‰∏™Âü∫Âú∞‰Ωç‰∫éÂ∏Ç‰∏≠ÂøÉÔºåÈò≤Âæ°...


In [76]:
#Save the df as csv. NOTE(review): to_csv does not create missing folders,
# so ./audio_RA2/transcripts/ presumably already exists at this point - confirm.
other_voice_df.to_csv('./audio_RA2/transcripts/other_voice.csv',
                      index=False,
                      encoding='utf-8-sig'  #Without this option, the Chinese
                      # column will be unreadable (presumably when the csv
                      # is opened in a spreadsheet program - confirm)
                      )

# Transcript df clean up
* Some voices in soviet_war.csv don't have transcripts. Transcribe them
manually before proceeding.
* For this section, I manually iterated through each audio file that has no
transcript (all from SovietWar): I asked Google to transcribe it first, played
the audio, and then determined the final transcript with some references from
videos on Bilibili (mainly this person's: https://space.bilibili.com/404470374).
    * This should be the best result I can get.

In [143]:
csv_direct = 'audio_RA2/transcripts'

#Csv files that later cells of this notebook write into csv_direct. They
# must not be read back here, otherwise re-running this cell once they exist
# would mix derived rows into total_df and change the row counts below.
derived_csv_names = {'no_transcript_audios.csv', 'all.csv'}

#sorted() makes the read order (and so the row and column order of total_df)
# independent of the order in which the operating system lists the files
csv_files = sorted(
    csv_file for csv_file in glob.glob(os.path.join(csv_direct, "*.csv"))
    if os.path.basename(csv_file) not in derived_csv_names)

#Read every file first and concatenate once, instead of growing total_df
# inside the loop
temp_dfs = []
total_len = 0
for csv_file in csv_files:
    temp_df = pd.read_csv(csv_file)
    print('File {} has {} rows'.format(csv_file, len(temp_df)))

    total_len += len(temp_df)
    temp_dfs.append(temp_df)

#pd.concat raises on an empty list, so fall back to an empty df when no csv
# file was found
total_df = pd.concat(temp_dfs, axis=0) if temp_dfs else pd.DataFrame()

print('Total df should have {} rows'.format(total_len))
total_df

File audio_RA2/transcripts\allied_war.csv has 194 rows
File audio_RA2/transcripts\general.csv has 134 rows
File audio_RA2/transcripts\other_voice.csv has 18 rows
File audio_RA2/transcripts\soviet_war.csv has 239 rows
Total df should have 585 rows


Unnamed: 0,EvaLee_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn,Zofia_direct
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,The V3 is a powerful long-range artillery weap...,V3ÊòØÂ®ÅÂäõÂº∫Â§ßÁöÑÈïøÁ®ãÁÅ´ÁÆ≠„ÄÇÂΩìÂøÉÔºåÊåáÊå•ÂÆòÔºåV3ËÉΩÂú®Âá†ÂàÜÈíüÂÜÖÂ∞ÜÊÇ®ÁöÑÂü∫Âú∞Â§∑‰∏∫Âπ≥Âú∞ÔºåÊÇ®ÁîöËá≥ËøûËøòÂáªÁöÑÊú∫...,
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,"The most powerful Soviet Tank ever built, the ...",Â§©ÂêØÂù¶ÂÖãÊòØËãèËÅîÂè≤‰∏äÊúÄÂº∫ÁöÑÂù¶ÂÖãÔºåÊú¨Ë∫´Â∞±ÂÖ∑Â§á‰∏ÄÂè™Â∞èÂûãÂÜõÈòüÁöÑÁÅ´ÂäõÔºåË£ÖÂ§áÂØπÁ©∫ÂèäÂØπÂú∞ÁöÑÊ≠¶Ë£Ö,
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,"The most powerful air unit ever created, the K...",Âü∫Ê¥õÂ§´Á©∫ËâáÊòØÊúÄÂº∫ÁöÑÁ©∫‰∏≠Ê≠¶Âô®ÔºåËÉΩÊäïÊé∑Êó†ÈôêÁöÑÈìÅÂà∂ÁÇ∏ÂºπÔºåÊîªÂáªÊïå‰∫∫,
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,"The ultimate in Soviet long-range bombardment,...",Êó†ÁïèÁ∫ßÊàòËà∞ÊòØËãèËÅîÁ©∂ÊûÅÁöÑÈïøÁ®ãÁÇÆËΩ∞Ê≠¶Âô®ÔºåËøúË∑ùÊîªÂáªÂäõÊó†‰ª•‰º¶ÊØîÔºå‰ΩÜÂæàÂÆπÊòìÂú®ËøëË∑ùÁ¶ªË¢´ÂáªÊ≤â„ÄÇÊó†ÁïèÁ∫ßÊàòËà∞ÊâÄ...,
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,The Soviets have taken Tesla Technology to the...,ËãèËÅîÁ£ÅËÉΩÁßëÊäÄÁöÑÊúÄÂêéÈò∂ÊÆµÔºåË∫´Á©øÁ£ÅËÉΩÂä®ÂäõË£ÖÁöÑÂ£´ÂÖµ„ÄÇËøô‰∫õÂ£´ÂÖµÊúÄÊìÖÈïøÂØπ‰ªòÂù¶ÂÖãÔºå‰πüËÉΩÁî®‰ªñ‰ª¨È¢ùÂ§ñÁöÑËÉΩÊ∫êÂä†...,
...,...,...,...,...,...,...
234,,,,,,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3
235,,,,,,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3
236,,,,,,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3
237,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,,,


## Reorder columns

In [144]:
#Check the column order produced by the concatenation before reordering it
total_df.columns

Index(['EvaLee_direct', 'Yuri_direct', 'Other_direct', 'transcript_Eng',
       'transcript_Chn', 'Zofia_direct'],
      dtype='object')

In [145]:
#Put the four audio-path columns first (Eva Lee, Zofia, Yuri, others),
# followed by the English and the Chinese transcript
ordered_columns = ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
                   'Other_direct', 'transcript_Eng', 'transcript_Chn']
total_df = total_df.reindex(columns=ordered_columns)
total_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,,The V3 is a powerful long-range artillery weap...,V3ÊòØÂ®ÅÂäõÂº∫Â§ßÁöÑÈïøÁ®ãÁÅ´ÁÆ≠„ÄÇÂΩìÂøÉÔºåÊåáÊå•ÂÆòÔºåV3ËÉΩÂú®Âá†ÂàÜÈíüÂÜÖÂ∞ÜÊÇ®ÁöÑÂü∫Âú∞Â§∑‰∏∫Âπ≥Âú∞ÔºåÊÇ®ÁîöËá≥ËøûËøòÂáªÁöÑÊú∫...
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,,"The most powerful Soviet Tank ever built, the ...",Â§©ÂêØÂù¶ÂÖãÊòØËãèËÅîÂè≤‰∏äÊúÄÂº∫ÁöÑÂù¶ÂÖãÔºåÊú¨Ë∫´Â∞±ÂÖ∑Â§á‰∏ÄÂè™Â∞èÂûãÂÜõÈòüÁöÑÁÅ´ÂäõÔºåË£ÖÂ§áÂØπÁ©∫ÂèäÂØπÂú∞ÁöÑÊ≠¶Ë£Ö
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,,"The most powerful air unit ever created, the K...",Âü∫Ê¥õÂ§´Á©∫ËâáÊòØÊúÄÂº∫ÁöÑÁ©∫‰∏≠Ê≠¶Âô®ÔºåËÉΩÊäïÊé∑Êó†ÈôêÁöÑÈìÅÂà∂ÁÇ∏ÂºπÔºåÊîªÂáªÊïå‰∫∫
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,,"The ultimate in Soviet long-range bombardment,...",Êó†ÁïèÁ∫ßÊàòËà∞ÊòØËãèËÅîÁ©∂ÊûÅÁöÑÈïøÁ®ãÁÇÆËΩ∞Ê≠¶Âô®ÔºåËøúË∑ùÊîªÂáªÂäõÊó†‰ª•‰º¶ÊØîÔºå‰ΩÜÂæàÂÆπÊòìÂú®ËøëË∑ùÁ¶ªË¢´ÂáªÊ≤â„ÄÇÊó†ÁïèÁ∫ßÊàòËà∞ÊâÄ...
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,,The Soviets have taken Tesla Technology to the...,ËãèËÅîÁ£ÅËÉΩÁßëÊäÄÁöÑÊúÄÂêéÈò∂ÊÆµÔºåË∫´Á©øÁ£ÅËÉΩÂä®ÂäõË£ÖÁöÑÂ£´ÂÖµ„ÄÇËøô‰∫õÂ£´ÂÖµÊúÄÊìÖÈïøÂØπ‰ªòÂù¶ÂÖãÔºå‰πüËÉΩÁî®‰ªñ‰ª¨È¢ùÂ§ñÁöÑËÉΩÊ∫êÂä†...
...,...,...,...,...,...,...
234,,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,,
235,,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,,
236,,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,,
237,,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,,


## Clean up duplication rows

In [146]:
#Remove rows that are identical in every column (the first occurrence is
# kept) and report the row count before and after
rows_before = len(total_df)
print('Before cleaning, the df has {} rows'.format(rows_before))
total_df = total_df.drop_duplicates(keep='first')
rows_after = len(total_df)
print('After cleaning, the df has {} rows'.format(rows_after))

Before cleaning, the df has 585 rows
After cleaning, the df has 585 rows


## Transcribe voices that lack transcriptions
For this section, I manually iterated through each audio file that has no
transcript (all from SovietWar): I asked Google to transcribe it first, played
the audio, and then determined the final transcript with some references from
videos on Bilibili (mainly this person's: https://space.bilibili.com/404470374).
 * This should be the best result I can get.

In [147]:
#Keep only the voices whose English transcript is missing and renumber them
# from 0 so that they can be picked by position later
missing_transcript_mask = total_df['transcript_Eng'].isnull()
no_transcription = total_df[missing_transcript_mask].reset_index(drop=True)

no_transcription

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,,./audio_RA2/Zofia/SovietWar/RA2_csofu39.mp3,,,,
1,,./audio_RA2/Zofia/SovietWar/RA2_csofu40.mp3,,,,
2,,./audio_RA2/Zofia/SovietWar/RA2_csofu41.mp3,,,,
3,,./audio_RA2/Zofia/SovietWar/RA2_csofu42.mp3,,,,
4,,./audio_RA2/Zofia/SovietWar/RA2_csofu43.mp3,,,,
...,...,...,...,...,...,...
92,,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,,
93,,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,,
94,,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,,
95,,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,,


In [148]:
#Keep the columns that hold at least one non-NaN value
wanted_col_list = [
    column_name for column_name in no_transcription.columns
    if no_transcription[column_name].notna().any()
]

#The transcript columns are appended unconditionally: they are still needed
# to receive the manual transcripts
wanted_col_list.extend(['transcript_Eng', 'transcript_Chn'])
wanted_col_list

['Zofia_direct',
 'Yuri_direct',
 'Other_direct',
 'transcript_Eng',
 'transcript_Chn']

In [None]:
#Narrow no_transcription down to the wanted columns, in the order given by
# wanted_col_list
no_transcription = no_transcription.filter(items=wanted_col_list, axis=1)
no_transcription

### Select an audio

In [863]:
#Position (0-based) in no_transcription of the audio to work on. Change this
# number to move on to another audio.
audio_position = 96
audio_dict = no_transcription.iloc[audio_position].to_dict()

#Reset both results first so that values left over from a previously
# selected audio can never be reused by the cells below
non_nan_key = ''
audio_file_direct = None
for key, value in audio_dict.items():
    #The file path is expected to be the only string in the row; the other
    # cells are NaN. If several strings exist, the last one wins.
    if isinstance(value, str):
        audio_file_direct = value
        non_nan_key = key

if audio_file_direct is None:
    raise ValueError('Row {} of no_transcription holds no audio file '
                     'path'.format(audio_position))

print('{}: {}'.format(non_nan_key, audio_file_direct))

Yuri_direct: ./audio_RA2/Yuri/SovietWar/RA2_xs7yu02.mp3


### Determine a transcript

In [864]:
#Ask the speech-to-text helper for candidate transcripts of the selected audio
transcribe_audio(audio_file_direct)

{'alternative': [{'transcript': 'I have grown weary of waiting for you to make up your mind wherever you make your base you will be unable to defend it from my forces or those of my Sophia and Allied lb',
   'confidence': 0.84470391},
  {'transcript': 'I have grown weary of waiting for you to make up your mind wherever you make your base you will be unable to defend it from my forces or those of my Sophia and Allied pounds'},
  {'transcript': 'I have grown weary of waiting for you to make up your mind wherever you make your base you will be unable to defend it from my forces or those of my Sofia and Allied lb'},
  {'transcript': 'I have grown weary of waiting for you to make up your mind wherever you make your base you will be unable to defend it from my forces or those of my services and Allied lb'},
  {'transcript': 'I have grown weary of waiting for you to make up your mind forever you make your base you will be unable to defend it from my forces or those of my Sophia and Allied lb'

### Listen to the audio

In [858]:
#Embed an audio player for the selected file so it can be checked by ear
ipd.Audio(audio_file_direct)

In [859]:
#Final wording, decided after comparing the candidate transcripts with the
# audio itself
decided_transcript = (
    "I have grown weary of waiting for you to make up your mind. "
    "Wherever you make your base, you will be unable to defend it from my "
    "forces or those of my Soviet and Allied bombs."
)

#One-row df holding the audio path (under its speaker column) and the
# decided English transcript. Wrapping the record in a list avoids
# "ValueError: If using all scalar values, you must pass an index".
new_entry = pd.DataFrame([{non_nan_key: audio_file_direct,
                           'transcript_Eng': decided_transcript}])
new_entry

Unnamed: 0,Yuri_direct,transcript_Eng
0,./audio_RA2/Yuri/SovietWar/RA2_xs7yu02.mp3,I have grown weary of waiting for you to make ...


In [860]:
#Load the csv that accumulates the manually decided transcripts
no_transcription_save_direct = './audio_RA2/transcripts/no_transcript_audios.csv'

mother_df = pd.read_csv(no_transcription_save_direct)
mother_df

Unnamed: 0,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/Zofia/SovietWar/RA2_csofu39.mp3,,,Yuri's Boomer submarine combines the stuff of ...,
1,./audio_RA2/Zofia/SovietWar/RA2_csofu40.mp3,,,"Commander, this unit is the strongest Yuri has...",
2,./audio_RA2/Zofia/SovietWar/RA2_csofu41.mp3,,,Unfortunately the chaos drone was a promising ...,
3,./audio_RA2/Zofia/SovietWar/RA2_csofu42.mp3,,,Yuri is running his war mostly on stolen Sovie...,
4,./audio_RA2/Zofia/SovietWar/RA2_csofu43.mp3,,,"Beware of these floating discs, Comrade Genera...",
...,...,...,...,...,...
91,./audio_RA2/Zofia/SovietWar/RA2_xs7so01.mp3,,,"Commander, be advised that Yuri is combining h...",
92,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,"Look, Comrade General, at back door! Perhaps i...",
93,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,"Sir, if Yuri's genetic mutator is not taken, h...",
94,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,"Commander, this group of freedom fighters is w...",


In [861]:
#Append new_entry to the csv unless the audio has already been recorded.
# Every path column has to be looked at before deciding, since the audio may
# be stored under any of them.
direct_cols = [col for col in ['Zofia_direct', 'Yuri_direct', 'Other_direct']
               if col in mother_df.columns]
already_recorded = any(audio_file_direct in mother_df[col].to_list()
                       for col in direct_cols)

if already_recorded:
    print('The audio file {} is already in the mother df. The program is'
          ' terminating now to avoid overwriting.'.format
          (audio_file_direct))
else:
    #ignore_index=True gives the appended row the next running index instead
    # of repeating index 0 from new_entry
    new_mother_df = pd.concat([mother_df, new_entry], ignore_index=True)
    new_mother_df.to_csv(no_transcription_save_direct, index=False)
    display(new_mother_df)

Unnamed: 0,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/Zofia/SovietWar/RA2_csofu39.mp3,,,Yuri's Boomer submarine combines the stuff of ...,
1,./audio_RA2/Zofia/SovietWar/RA2_csofu40.mp3,,,"Commander, this unit is the strongest Yuri has...",
2,./audio_RA2/Zofia/SovietWar/RA2_csofu41.mp3,,,Unfortunately the chaos drone was a promising ...,
3,./audio_RA2/Zofia/SovietWar/RA2_csofu42.mp3,,,Yuri is running his war mostly on stolen Sovie...,
4,./audio_RA2/Zofia/SovietWar/RA2_csofu43.mp3,,,"Beware of these floating discs, Comrade Genera...",
...,...,...,...,...,...
92,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,"Look, Comrade General, at back door! Perhaps i...",
93,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,"Sir, if Yuri's genetic mutator is not taken, h...",
94,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,"Commander, this group of freedom fighters is w...",
95,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,"A good thought, Commander. I think my puppets ...",


## Final clean up

### Translate English transcripts into Chinese for audios missing transcripts previously

In [24]:
#Reload the manually transcribed audios; their Chinese transcripts get
# filled in by the next cell
no_transcription_save_direct = './audio_RA2/transcripts/no_transcript_audios.csv'

no_transcription_audio_df = pd.read_csv(no_transcription_save_direct)
no_transcription_audio_df

Unnamed: 0,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/Zofia/SovietWar/RA2_csofu39.mp3,,,Yuri's Boomer submarine combines the stuff of ...,
1,./audio_RA2/Zofia/SovietWar/RA2_csofu40.mp3,,,"Commander, this unit is the strongest Yuri has...",
2,./audio_RA2/Zofia/SovietWar/RA2_csofu41.mp3,,,Unfortunately the chaos drone was a promising ...,
3,./audio_RA2/Zofia/SovietWar/RA2_csofu42.mp3,,,Yuri is running his war mostly on stolen Sovie...,
4,./audio_RA2/Zofia/SovietWar/RA2_csofu43.mp3,,,"Beware of these floating discs, Comrade Genera...",
...,...,...,...,...,...
92,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,"Look, Comrade General, at back door! Perhaps i...",
93,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,"Sir, if Yuri's genetic mutator is not taken, h...",
94,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,"Commander, this group of freedom fighters is w...",
95,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,"A good thought, Commander. I think my puppets ...",


In [31]:
#Translate only when the whole 'transcript_Chn' column is still empty, so
# that an existing translation is never overwritten
if no_transcription_audio_df['transcript_Chn'].isnull().all():
    no_transcription_audio_df['transcript_Chn'] = no_transcription_audio_df[
        'transcript_Eng'].apply(Eng_to_Chn)
    #A bare expression inside an if block is not rendered by Jupyter, so the
    # df is shown explicitly with display()
    display(no_transcription_audio_df)
else:
    print("The column 'transcript_Chn' contains at least 1 non-NaN value. "
          "Check it up to avoid overwriting.")

Unnamed: 0,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/Zofia/SovietWar/RA2_csofu39.mp3,,,Yuri's Boomer submarine combines the stuff of ...,Â∞§ÈáåÁöÑ Boomer ÊΩúËâáÁªìÂêà‰∫ÜÊΩúËâáÁöÑÁâπÊÄßÂíåÊàë‰ª¨Ëá™Â∑±ÁöÑÊó†ÁïèËà∞ÁöÑÂºπÈÅìËÉΩÂäõ„ÄÇÊàë‰ª¨ÁöÑÈ±øÈ±ºÊòØÂîØ‰∏ÄÂèØ‰ª•...
1,./audio_RA2/Zofia/SovietWar/RA2_csofu40.mp3,,,"Commander, this unit is the strongest Yuri has...",ÊåáÊå•ÂÆòÔºåËøôÊîØÈÉ®ÈòüÊòØÂ∞§ÈáåÊã•ÊúâÁöÑÊúÄÂº∫ÈÉ®Èòü„ÄÇ‰ªñÊòØÈáéËõÆ‰∫∫Ôºå‰ªñÂèØ‰ª•ÊØ´‰∏çË¥πÂäõÂú∞Á†∏Á¢é‰∏úË•ø
2,./audio_RA2/Zofia/SovietWar/RA2_csofu41.mp3,,,Unfortunately the chaos drone was a promising ...,‰∏çÂπ∏ÁöÑÊòØÔºåÊ∑∑Ê≤åÊó†‰∫∫Êú∫ÊòØÂ∞§ÈáåÂÅ∑Ëµ∞ÁöÑÂæàÊúâÂâçÈÄîÁöÑËãèËÅîËÆæËÆ°„ÄÇÂÆÉ‰ºö‰∫ßÁîü‰∏ÄÂõ¢Ëá¥ÂπªÊ∞î‰ΩìÔºå‰ΩøÊâÄÊúâÂèóÂΩ±ÂìçÁöÑ‰∫∫Âú®...
3,./audio_RA2/Zofia/SovietWar/RA2_csofu42.mp3,,,Yuri is running his war mostly on stolen Sovie...,Â∞§ÈáåÁöÑÊàò‰∫â‰∏ªË¶Å‰æùÈù†ËãèËÅîÂèõÈÄÉËÄÖÁª¥Êä§ÁöÑË¢´ÁõóËãèËÅîÊäÄÊúØ„ÄÇ‰ªñÁöÑÂ∑•Á®ãÂ∏àÂèØËÉΩÊõæÁªè‰∏∫Êàë‰ª¨Â∑•‰ΩúËøáÔºå‰ΩÜ‰ªñ‰ª¨Áé∞Âú®Âä†...
4,./audio_RA2/Zofia/SovietWar/RA2_csofu43.mp3,,,"Beware of these floating discs, Comrade Genera...",ÂΩìÂøÉËøô‰∫õÊµÆÁõòÔºåÂ∞ÜÂÜõÂêåÂøó„ÄÇÂ∞§ÈáåÂ∞ÜÂÆÉ‰ª¨ËÆæËÆ°ÊàêËÉΩÂ§ü‰ªéÊàë‰ª¨ÁöÑÁâπÊñØÊãâÂíåÊ†∏ÂèçÂ∫îÂ†Ü‰ª•ÂèäÊàë‰ª¨Ëá™Â∑±ÁöÑÈò≤Âæ°Âª∫Á≠ë‰∏≠...
...,...,...,...,...,...
92,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,"Look, Comrade General, at back door! Perhaps i...",ÁúãÔºåÂ∞ÜÂÜõÂêåÂøóÔºåÂú®ÂêéÈó®ÔºÅ‰πüËÆ∏Â¶ÇÊûú‰Ω†Ê¥æ‰∏Ä‰∫õÂ∑•Á®ãÂ∏àËøáÂéªÂ§∫ÂèñÁÅµËÉΩ‰ø°Ê†áÔºå‰Ω†ÂèØËÉΩ‰ºö‰ªéÂ∞§ÈáåÊâã‰∏≠Â§∫ÂèñËãèËÅîÂü∫Âú∞...
93,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,"Sir, if Yuri's genetic mutator is not taken, h...",ÈïøÂÆòÔºåÂ¶ÇÊûúÂ∞§ÈáåÁöÑÂü∫Âõ†Á™ÅÂèòÂô®‰∏çË¢´ÊãøËµ∞Ôºå‰ªñ‰ºöÊää‰Ω†ÁöÑÊ≠•ÂÖµÂèòÊàê‰ªñÊéßÂà∂‰∏ãÁöÑÈáéÂÖΩ„ÄÇ
94,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,"Commander, this group of freedom fighters is w...",ÊåáÊå•ÂÆòÔºåËøôÁæ§Ëá™Áî±ÊàòÂ£´ÊÑøÊÑèÂä†ÂÖ•Êàë‰ª¨Ôºå‰∏éÂ∞§ÈáåËøõË°åÊúÄÂêéÁöÑÂÜ≥Êàò„ÄÇÂÆÉ‰ª¨Â∫îËØ•Ë¢´ËØÅÊòéÊòØÊúÄÊúâÁî®ÁöÑ„ÄÇ
95,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,"A good thought, Commander. I think my puppets ...",Â•Ω‰∏ªÊÑèÔºåÊåáÊå•ÂÆò„ÄÇÊàëËÆ§‰∏∫ÊàëÁöÑÂÇÄÂÑ°‰ª¨Â∫îËØ•Êïà‰ªø‰Ω†ÔºåÊâìÈÄ†Â±û‰∫é‰ªñ‰ª¨Ëá™Â∑±ÁöÑË∂ÖÁ∫ßÊ≠¶Âô®„ÄÇÂóØÂóØÂóØÂóØ„ÄÇ


In [32]:
#Write the df, now including the Chinese translations, back to
# no_transcript_audios.csv
no_transcription_audio_df.to_csv(
    './audio_RA2/transcripts/no_transcript_audios.csv',
    index=False,
    encoding='utf-8-sig'  #Otherwise the 'transcript_Chn' column is not
    # readable
)

### Put in the missing transcripts

In [33]:
csv_direct = 'audio_RA2/transcripts'

#all.csv is written into csv_direct by a later cell. It must not be read
# back here, otherwise re-running this cell once it exists would add every
# row a second time and break the row-count checks below.
derived_csv_names = {'all.csv'}

#sorted() makes the read order independent of the order in which the
# operating system lists the files. The order matters further down:
# no_transcript_audios.csv sorts before soviet_war.csv, so the transcribed
# copy of an audio comes before the copy without a transcript.
csv_files = sorted(
    csv_file for csv_file in glob.glob(os.path.join(csv_direct, "*.csv"))
    if os.path.basename(csv_file) not in derived_csv_names)

#Read every file first and concatenate once, instead of growing all_df
# inside the loop
temp_dfs = []
all_len = 0
for csv_file in csv_files:
    temp_df = pd.read_csv(csv_file)
    print('File {} has {} rows'.format(csv_file, len(temp_df)))

    all_len += len(temp_df)
    temp_dfs.append(temp_df)

#pd.concat raises on an empty list, so fall back to an empty df when no csv
# file was found
all_df = pd.concat(temp_dfs, axis=0) if temp_dfs else pd.DataFrame()

print('Total df should have {} rows'.format(all_len))
all_df

File audio_RA2/transcripts\allied_war.csv has 194 rows
File audio_RA2/transcripts\general.csv has 134 rows
File audio_RA2/transcripts\no_transcript_audios.csv has 97 rows
File audio_RA2/transcripts\other_voice.csv has 18 rows
File audio_RA2/transcripts\soviet_war.csv has 239 rows
Total df should have 682 rows


Unnamed: 0,EvaLee_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn,Zofia_direct
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,The V3 is a powerful long-range artillery weap...,V3ÊòØÂ®ÅÂäõÂº∫Â§ßÁöÑÈïøÁ®ãÁÅ´ÁÆ≠„ÄÇÂΩìÂøÉÔºåÊåáÊå•ÂÆòÔºåV3ËÉΩÂú®Âá†ÂàÜÈíüÂÜÖÂ∞ÜÊÇ®ÁöÑÂü∫Âú∞Â§∑‰∏∫Âπ≥Âú∞ÔºåÊÇ®ÁîöËá≥ËøûËøòÂáªÁöÑÊú∫...,
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,"The most powerful Soviet Tank ever built, the ...",Â§©ÂêØÂù¶ÂÖãÊòØËãèËÅîÂè≤‰∏äÊúÄÂº∫ÁöÑÂù¶ÂÖãÔºåÊú¨Ë∫´Â∞±ÂÖ∑Â§á‰∏ÄÂè™Â∞èÂûãÂÜõÈòüÁöÑÁÅ´ÂäõÔºåË£ÖÂ§áÂØπÁ©∫ÂèäÂØπÂú∞ÁöÑÊ≠¶Ë£Ö,
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,"The most powerful air unit ever created, the K...",Âü∫Ê¥õÂ§´Á©∫ËâáÊòØÊúÄÂº∫ÁöÑÁ©∫‰∏≠Ê≠¶Âô®ÔºåËÉΩÊäïÊé∑Êó†ÈôêÁöÑÈìÅÂà∂ÁÇ∏ÂºπÔºåÊîªÂáªÊïå‰∫∫,
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,"The ultimate in Soviet long-range bombardment,...",Êó†ÁïèÁ∫ßÊàòËà∞ÊòØËãèËÅîÁ©∂ÊûÅÁöÑÈïøÁ®ãÁÇÆËΩ∞Ê≠¶Âô®ÔºåËøúË∑ùÊîªÂáªÂäõÊó†‰ª•‰º¶ÊØîÔºå‰ΩÜÂæàÂÆπÊòìÂú®ËøëË∑ùÁ¶ªË¢´ÂáªÊ≤â„ÄÇÊó†ÁïèÁ∫ßÊàòËà∞ÊâÄ...,
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,The Soviets have taken Tesla Technology to the...,ËãèËÅîÁ£ÅËÉΩÁßëÊäÄÁöÑÊúÄÂêéÈò∂ÊÆµÔºåË∫´Á©øÁ£ÅËÉΩÂä®ÂäõË£ÖÁöÑÂ£´ÂÖµ„ÄÇËøô‰∫õÂ£´ÂÖµÊúÄÊìÖÈïøÂØπ‰ªòÂù¶ÂÖãÔºå‰πüËÉΩÁî®‰ªñ‰ª¨È¢ùÂ§ñÁöÑËÉΩÊ∫êÂä†...,
...,...,...,...,...,...,...
234,,,,,,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3
235,,,,,,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3
236,,,,,,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3
237,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,,,


In [40]:
#Put the four audio-path columns first (Eva Lee, Zofia, Yuri, others),
# followed by the English and the Chinese transcript
ordered_columns = ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
                   'Other_direct', 'transcript_Eng', 'transcript_Chn']
all_df = all_df.reindex(columns=ordered_columns)
all_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,,The V3 is a powerful long-range artillery weap...,V3ÊòØÂ®ÅÂäõÂº∫Â§ßÁöÑÈïøÁ®ãÁÅ´ÁÆ≠„ÄÇÂΩìÂøÉÔºåÊåáÊå•ÂÆòÔºåV3ËÉΩÂú®Âá†ÂàÜÈíüÂÜÖÂ∞ÜÊÇ®ÁöÑÂü∫Âú∞Â§∑‰∏∫Âπ≥Âú∞ÔºåÊÇ®ÁîöËá≥ËøûËøòÂáªÁöÑÊú∫...
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,,"The most powerful Soviet Tank ever built, the ...",Â§©ÂêØÂù¶ÂÖãÊòØËãèËÅîÂè≤‰∏äÊúÄÂº∫ÁöÑÂù¶ÂÖãÔºåÊú¨Ë∫´Â∞±ÂÖ∑Â§á‰∏ÄÂè™Â∞èÂûãÂÜõÈòüÁöÑÁÅ´ÂäõÔºåË£ÖÂ§áÂØπÁ©∫ÂèäÂØπÂú∞ÁöÑÊ≠¶Ë£Ö
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,,"The most powerful air unit ever created, the K...",Âü∫Ê¥õÂ§´Á©∫ËâáÊòØÊúÄÂº∫ÁöÑÁ©∫‰∏≠Ê≠¶Âô®ÔºåËÉΩÊäïÊé∑Êó†ÈôêÁöÑÈìÅÂà∂ÁÇ∏ÂºπÔºåÊîªÂáªÊïå‰∫∫
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,,"The ultimate in Soviet long-range bombardment,...",Êó†ÁïèÁ∫ßÊàòËà∞ÊòØËãèËÅîÁ©∂ÊûÅÁöÑÈïøÁ®ãÁÇÆËΩ∞Ê≠¶Âô®ÔºåËøúË∑ùÊîªÂáªÂäõÊó†‰ª•‰º¶ÊØîÔºå‰ΩÜÂæàÂÆπÊòìÂú®ËøëË∑ùÁ¶ªË¢´ÂáªÊ≤â„ÄÇÊó†ÁïèÁ∫ßÊàòËà∞ÊâÄ...
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,,The Soviets have taken Tesla Technology to the...,ËãèËÅîÁ£ÅËÉΩÁßëÊäÄÁöÑÊúÄÂêéÈò∂ÊÆµÔºåË∫´Á©øÁ£ÅËÉΩÂä®ÂäõË£ÖÁöÑÂ£´ÂÖµ„ÄÇËøô‰∫õÂ£´ÂÖµÊúÄÊìÖÈïøÂØπ‰ªòÂù¶ÂÖãÔºå‰πüËÉΩÁî®‰ªñ‰ª¨È¢ùÂ§ñÁöÑËÉΩÊ∫êÂä†...
...,...,...,...,...,...,...
234,,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,,
235,,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,,
236,,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,,
237,,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,,


In [41]:
#Sanity check: the number of rows in all_df that still lack an English
# transcript must equal the number of manually transcribed audios
missing_transcript_num = len(all_df[all_df['transcript_Eng'].isnull()])
manual_transcript_num = len(no_transcription_audio_df)

if missing_transcript_num == manual_transcript_num:
    print('All things match up. You can proceed to the next step to clean up.')
else:
    print('Something is wrong. The all_df should have the same number of '
          'rows that miss a transcript with the number of rows in '
          'no_transcript_audios.csv. Now all_df has {} rows without '
          'transcripts but no_transcript_audios.csv has {} rows'.format(
              missing_transcript_num, manual_transcript_num))

All things match up. You can proceed to the next step to clean up.


In [52]:
#Sanity check: counting rows that repeat an earlier row in the 4 audio-path
# columns must give the number of manually transcribed audios
direct_columns = ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
                  'Other_direct']
unique_audio_row_num = len(all_df.drop_duplicates(subset=direct_columns))
non_transcript_col_duplicate_row_num = len(all_df) - unique_audio_row_num
manual_transcript_num = len(no_transcription_audio_df)

if non_transcript_col_duplicate_row_num == manual_transcript_num:
    print('All things match up. You can now drop the duplicate rows defined '
          'by the 4 non-transcript columns.')
else:
    print('Something is wrong. The number of duplicate rows defined by the 4'
          ' non-transcript columns in all_df should be the same as the '
          'number of rows in no_transcript_audios.csv. Now all_df has '
          '{} duplicate rows defined by the 4 non-transcript columns but '
          'no_transcript_audios.csv has {} rows'.format(
              non_transcript_col_duplicate_row_num, manual_transcript_num))

All things match up. You can now drop the duplicate rows defined by the 4 non-transcript columns.


In [53]:
#Keep a single row per audio. drop_duplicates keeps the FIRST row of every
# group, so the transcribed copy survives only if it comes before the copy
# without a transcript - presumably guaranteed by the order in which the csv
# files were read; verify after changing that order.
direct_columns = ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
                  'Other_direct']
rows_before = len(all_df)
print('Before dropping the duplicate rows defined by the 4 non-transcript '
      'columns, all_df has {} rows.'.format(rows_before))
all_df = all_df.drop_duplicates(subset=direct_columns, keep='first')

manual_transcript_num = len(no_transcription_audio_df)
print('no_transcript_audios.csv has {} rows.'.format(manual_transcript_num))
print('After dropping the duplicate rows defined by the 4 non-transcript '
      'columns, all_df has {} rows.'.format(len(all_df)))



Before dropping the duplicate rows defined by the 4 non-transcript columns, all_df has 682 rows.
no_transcript_audios.csv has 97 rows.
After dropping the duplicate rows defined by the 4 non-transcript columns, all_df has 585 rows.


In [56]:
#Save the combined df (one row per audio) as all.csv.
# NOTE(review): all.csv lands in the folder that the "*.csv" glob cells read
# from, so those cells may pick it up on a re-run unless they exclude it.
all_df.to_csv(
    './audio_RA2/transcripts/all.csv',
    index=False,
    encoding='utf-8-sig'  #Otherwise the 'transcript_Chn' column is not
    # readable
)
#In this all.csv, in the transcript_Chn column, a transcript that is missing
# a trailing Chinese period (other special punctuation marks aside) was
# manually processed by me

### Relocate those 战前过场 (pre-game) audios to the correct folder
All those 战前过场 audios were put in the "Other_direct" column since their file
names did not meet the batch filtering rules I defined before. Some of
those voices are spoken by Eva Lee or Zofia, so this step aims to move them
to the right speaker.

In [57]:
#Reload all.csv from disk; the rows get a fresh running index (0, 1, 2, ...)
all_df = pd.read_csv('./audio_RA2/transcripts/all.csv')
all_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,,The V3 is a powerful long-range artillery weap...,V3ÊòØÂ®ÅÂäõÂº∫Â§ßÁöÑÈïøÁ®ãÁÅ´ÁÆ≠„ÄÇÂΩìÂøÉÔºåÊåáÊå•ÂÆòÔºåV3ËÉΩÂú®Âá†ÂàÜÈíüÂÜÖÂ∞ÜÊÇ®ÁöÑÂü∫Âú∞Â§∑‰∏∫Âπ≥Âú∞ÔºåÊÇ®ÁîöËá≥ËøûËøòÂáªÁöÑÊú∫...
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,,"The most powerful Soviet Tank ever built, the ...",Â§©ÂêØÂù¶ÂÖãÊòØËãèËÅîÂè≤‰∏äÊúÄÂº∫ÁöÑÂù¶ÂÖãÔºåÊú¨Ë∫´Â∞±ÂÖ∑Â§á‰∏ÄÂè™Â∞èÂûãÂÜõÈòüÁöÑÁÅ´ÂäõÔºåË£ÖÂ§áÂØπÁ©∫ÂèäÂØπÂú∞ÁöÑÊ≠¶Ë£Ö
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,,"The most powerful air unit ever created, the K...",Âü∫Ê¥õÂ§´Á©∫ËâáÊòØÊúÄÂº∫ÁöÑÁ©∫‰∏≠Ê≠¶Âô®ÔºåËÉΩÊäïÊé∑Êó†ÈôêÁöÑÈìÅÂà∂ÁÇ∏ÂºπÔºåÊîªÂáªÊïå‰∫∫
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,,"The ultimate in Soviet long-range bombardment,...",Êó†ÁïèÁ∫ßÊàòËà∞ÊòØËãèËÅîÁ©∂ÊûÅÁöÑÈïøÁ®ãÁÇÆËΩ∞Ê≠¶Âô®ÔºåËøúË∑ùÊîªÂáªÂäõÊó†‰ª•‰º¶ÊØîÔºå‰ΩÜÂæàÂÆπÊòìÂú®ËøëË∑ùÁ¶ªË¢´ÂáªÊ≤â„ÄÇÊó†ÁïèÁ∫ßÊàòËà∞ÊâÄ...
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,,The Soviets have taken Tesla Technology to the...,ËãèËÅîÁ£ÅËÉΩÁßëÊäÄÁöÑÊúÄÂêéÈò∂ÊÆµÔºåË∫´Á©øÁ£ÅËÉΩÂä®ÂäõË£ÖÁöÑÂ£´ÂÖµ„ÄÇËøô‰∫õÂ£´ÂÖµÊúÄÊìÖÈïøÂØπ‰ªòÂù¶ÂÖãÔºå‰πüËÉΩÁî®‰ªñ‰ª¨È¢ùÂ§ñÁöÑËÉΩÊ∫êÂä†...
...,...,...,...,...,...,...
580,,./audio_RA2/Zofia/SovietWar/RA2_xs4so02.mp3,,,"Sir, if the enemy base near the airport is sti...",ÊåáÊå•ÂÆòÔºåÂ¶ÇÊûúÊú∫Âú∫ÈôÑËøëÁöÑÊïå‰∫∫Âü∫Âú∞ËøòÂú®ÔºåÁΩóÊõºËØ∫Â§´ÊÄªÁêÜÁöÑÁîüÂëΩÂ∞±ÊúâÂç±Èô©„ÄÇ‰Ω†‰∏çËÉΩÂÜçÂ∏¶‰ªñÈù†Ëøë‰∫Ü„ÄÇ
581,,./audio_RA2/Zofia/SovietWar/RA2_xs4so03.mp3,,,We must get Premier Romanov to the safety of t...,Êàë‰ª¨ÂøÖÈ°ªÊääÁΩóÊõºËØ∫Â§´ÊÄªÁêÜÈÄÅÂà∞Êú∫Âú∫ÁöÑÂÆâÂÖ®Âú∞Â∏¶„ÄÇ
582,,./audio_RA2/Zofia/SovietWar/RA2_xs4so04.mp3,,,We believe that Premier Romanov is hiding out ...,Êàë‰ª¨Áõ∏‰ø°ÁΩóÊõºËØ∫Â§´ÊÄªÁêÜÊ≠£Ë∫≤Âú®ÂüéÂ∏ÇÁöÑ‰∏úÈÉ®Âú∞Âå∫„ÄÇÊàë‰ª¨ÂøÖÈ°ªÂú®Â∞§Èáå‰πãÂâçÊâæÂà∞‰ªñ„ÄÇ
583,,./audio_RA2/Zofia/SovietWar/RA2_xs4so05.mp3,,,"Excellent, comrade general. The airport's air ...",ÂæàÂ•ΩÔºåÂ∞ÜÂÜõÂêåÂøó„ÄÇÊú∫Âú∫ÁöÑÈò≤Á©∫ËÆæÊñΩÂ∑≤ÁªèË¢´Ëß£Èô§‰∫Ü„ÄÇÊàë‰ª¨Áé∞Âú®ÂèØ‰ª•ÂÆâÂÖ®ÁñèÊï£ÁΩóÊõºËØ∫Â§´ÊÄªÁêÜ‰∫Ü„ÄÇ


In [79]:
#Rows whose audio path is stored in the "Other_direct" column
has_other_audio = all_df['Other_direct'].notnull()
non_nan_all_df = all_df[has_other_audio]
non_nan_all_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
29,,,,./audio_RA2/Others/AlliedWar/RA2_ma2ta01.mp3,All right! This beacon's neutralized! Send in ...,Â•ΩÊûÅ‰∫ÜÔºÅÂøÉÁÅµ‰ø°Ê†áÂ§±Êïà‰∫ÜÔºÅÊääË£ÖÁî≤ËΩ¶ÂºÄËøõÊù•Âêß
36,,,,./audio_RA2/Others/AlliedWar/RA2_ma5i101.mp3,"Thanks for rescuing us, Sir! Our plane was sho...",ÊÑüË∞¢ÊÇ®Ëê•ÊïëÊàë‰ª¨ÔºåÈïøÂÆòÔºÅÊàë‰ª¨ÁöÑÈ£ûÊú∫Ë¢´ËãèËÅîÈò≤Á©∫ÁÇÆÂáªÂù†„ÄÇËØ¥‰∏çÂÆöËøôÈôÑËøëËøòÊúâÊõ¥Â§öÁöÑÁîüËøòËÄÖ
37,,,,./audio_RA2/Others/AlliedWar/RA2_ma5i201.mp3,"Glad you found us, Sir! We're freezing out here!",ÂæàÈ´òÂÖ¥ÊÇ®ÊâæÂà∞Êàë‰ª¨ÔºåÈïøÂÆòÔºÅÊàë‰ª¨Âú®ËøôÂ•ΩÂÜ∑ÔºÅ
38,,,,./audio_RA2/Others/AlliedWar/RA2_ma5i301.mp3,"Good to see you, Sir! What are our orders?",ÂæàÈ´òÂÖ¥ËßÅÂà∞ÊÇ®ÔºåÈïøÂÆòÔºÅÊàë‰ª¨ÁöÑÂëΩ‰ª§Ôºü
39,,,,./audio_RA2/Others/AlliedWar/RA2_ma5i401.mp3,"Be on the lookout for spies, Comrade. It seems...",ÂêåÂøóÔºå‰ªîÁªÜÁõØÁùÄÔºå‰∏çË¶ÅËÆ©Èó¥Ë∞çÊ∑∑ËøõÊù•„ÄÇ‰ªäÊôö‰ºº‰πéÊúâÁÇπÂ§™ÂÆâÈùô‰∫Ü
...,...,...,...,...,...,...
574,,,,./audio_RA2/Others/SovietWar/RA2_xs4ro01.mp3,Err...Yuri is calling me.,ÂëÉÂïä...Â∞§ÈáåÊ≠£Âú®ÂëºÂî§ÊàëÔºàÁΩóÊõºËØ∫Â§´Ë¢´ÂøÉÊéßÊó∂Ëß¶ÂèëÔºâ
575,,,,./audio_RA2/Others/SovietWar/RA2_xs4ro02.mp3,These are plans for Iron Curtain. Use on our D...,Ëøô‰∫õÊòØÈìÅÂπïË£ÖÁΩÆÁöÑÂª∫ÈÄ†ËÆ°Âàí„ÄÇÂ∞ÜÈìÅÂπïÊñΩÂä†Âú®Êàë‰ª¨ÁöÑËá™ÁàÜÂç°ËΩ¶‰∏äÊù•Âà∂ÈÄ†ÁªàÊûÅÁ†¥ÂùèÊÄßÊ≠¶Âô®„ÄÇ
576,,,,./audio_RA2/Others/SovietWar/RA2_xs4ro03.mp3,"I am in Yuri's base. Help me, comrade general.",ÊàëÁé∞Âú®Âú®Â∞§ÈáåÁöÑÂü∫Âú∞Èáå„ÄÇÊïëÊïëÊàëÔºåÂ∞ÜÂÜõÂêåÂøó„ÄÇÔºàÁΩóÊõºËØ∫Â§´Ë¢´ÂøÉÊéßÂπ∂‰∏îÂ∏¶ÂõûÂ∞§ÈáåÂü∫Âú∞ÂÜÖÂêéËß¶ÂèëÔºâ
577,,,,./audio_RA2/Others/SovietWar/RA2_xs4ro04.mp3,Ah...I am myself once again.,Âïä...ÊàëÁªà‰∫éÂÜçÊ¨°ÂÅöÂõûËá™Â∑±‰∫Ü„ÄÇ


In [103]:
def find_pre_game_voices(input_str):
    """Tell whether a file path looks like a pre-game voice file.

    A path counts as a match when the pattern "RA2_..._p.." occurs anywhere
    in it, i.e. "RA2_", any three characters, "_p", then any two characters
    (for example "RA2_a01_p01").

    Parameters
    ----------
    input_str : str
        File path to test. Non-string values, such as the NaN that pandas
        stores for an empty cell, are accepted and never match.

    Returns
    -------
    bool
        True if the pattern is found in input_str, otherwise False.
    """
    #re.search raises TypeError on non-strings, so answer for them up front.
    # This lets the function be applied to a column that contains NaN.
    if not isinstance(input_str, str):
        return False
    return re.search(r"RA2_..._p..", input_str) is not None

#Boolean mask that is True where the "Other_direct" path matches the re
# pattern checked by find_pre_game_voices
pre_game_mask = non_nan_all_df['Other_direct'].apply(find_pre_game_voices)

#reset_index() keeps the previous row labels in a new "index" column, so each
# row can still be traced back to the original row in the all.csv
pre_game_voices_df = non_nan_all_df[pre_game_mask].reset_index()
pre_game_voices_df

Unnamed: 0,index,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,129,,,,./audio_RA2/Others/AlliedWar/RA2_a01_p01.mp3,The Golden Gate Bridge entrance was destroyed ...,ÈáëÈó®Â§ßÊ°•ÂÖ•Âè£Âú®Êàò‰∫â‰∏≠Ë¢´ËãèËÅîÊëßÊØÅ„ÄÇ‰øùÊä§Êó∂Èó¥Êú∫Âô®ÔºåÁõ¥Âà∞ÊÇ®ÂèØ‰ª•Âç†È¢ÜË∂≥Â§üÁöÑÂèëÁîµÂéÇ‰ª•‰ΩøÂÖ∂ÊÅ¢Â§ç‰æõÁîµ„ÄÇÂ∞èÂøÉ...
1,130,,,,./audio_RA2/Others/AlliedWar/RA2_a02_p01.mp3,You should establish your own base before atte...,Âú®Â∞ùËØïÂÆö‰ΩçÂ∞§ÈáåÁöÑÈÉ®Èòü‰πãÂâçÔºå‰Ω†Â∫îËØ•Âª∫Á´ãËá™Â∑±ÁöÑÂü∫Âú∞
2,131,,,,./audio_RA2/Others/AlliedWar/RA2_a03_p01.mp3,Taking out those nukes is priority one! We hav...,Èô§ÊéâÈÇ£‰∫õÊ†∏Ê≠¶Âô®ÊòØÁ¨¨‰∏ÄË¶ÅÂä°ÔºÅÊàë‰ª¨ÊúâËá™Â∑±ÁöÑË∂ÖÁ∫ßÊ≠¶Âô®ÔºöÁà±Âõ†ÊñØÂù¶ÁöÑÂ§©Ê∞îÊéßÂà∂Êú∫‰æøÊòØÂÖ∂‰∏≠‰πã‰∏Ä„ÄÇÂØπÁùÄÈÇ£‰∏™ËÆæ...
3,132,,,,./audio_RA2/Others/AlliedWar/RA2_a04_p01.mp3,You'll have to construct your own base. Specia...,‰Ω†ÂøÖÈ°ªÂª∫Á´ãËá™Â∑±ÁöÑÂü∫Âú∞„ÄÇÁâπÂ∑•Ë∞≠ÈõÖÂ∞ÜÂçèÂä©ÊÇ®Ëß£ÊïëÊïôÊéà„ÄÇ‰ªñË¢´ÂÖ≥ÊäºÂú®Â§ßÈáëÂ≠óÂ°îÈôÑËøëÁöÑ‰∏Ä‰∏™ÁßòÂØÜÂü∫Âú∞„ÄÇ‰∏úËæπÊúâ...
4,133,,,,./audio_RA2/Others/AlliedWar/RA2_a05_p01.mp3,We'll launch our offensive near the Opera Hous...,Êàë‰ª¨Â∞ÜÂú®Ê≠åÂâßÈô¢ÈôÑËøëÂèëÂä®ËøõÊîª„ÄÇÂ∞§ÈáåÁöÑÂÆûÈ™åÂÆ§Â∞±Âú®ËøôÈáåÔºåÊÉÖÊä•ËøòËØ¥‰ªñÂú®ËØ•Âú∞Âå∫ÁöÑÊüê‰∏™Âú∞ÊñπÊã•Êúâ‰∏ÄÊîØÊΩúËâáËà∞Èòü
5,134,,,,./audio_RA2/Others/AlliedWar/RA2_a05_p02.mp3,This horribly disfigured soldier is called a B...,ËøôÁßçÂèØÊÄïÁöÑÊØÅÂÆπÂ£´ÂÖµË¢´Áß∞‰∏∫ÁãÇÂÖΩ‰∫∫Ôºå‰ªñ‰ª¨ÁöÑÂäõÈáèË∂≥‰ª•Êää‰∏Ä‰∏™Ê≠£Â∏∏Â§ßÂ∞èÁöÑÂù¶ÂÖãÊéÄÁøª„ÄÇÂ∞èÂøÉ‰ªñ‰ª¨ÔºåÊåáÊå•ÂÆò
6,135,,,,./audio_RA2/Others/AlliedWar/RA2_a06_p01.mp3,We'll setup our defenses by the Parliament bui...,Êàë‰ª¨Â∞ÜÂú®ËÆÆ‰ºöÂ§ßÊ•ºÊóÅËÆæÁΩÆÈò≤Âæ°„ÄÇÊàëÁöÑÈí±ËØ¥Â∞§Èáå‰ºö‰ªé‰∏úÊñπÂèëÂä®ËøõÊîªÔºå‰ªñÁîöËá≥ÂèØËÉΩ‰ºöÂÅ∑ÂÅ∑ÊΩúÂÖ•Ê≥∞Êô§Â£´Ê≤≥„ÄÇÊàë‰ª¨...
7,136,,,,./audio_RA2/Others/AlliedWar/RA2_a06_p04.mp3,We've tracked down the location of Yuri's Lond...,Êàë‰ª¨Â∑≤ÁªèÊâæÂà∞‰∫ÜÂ∞§Èáå‰º¶Êï¶Âü∫Âú∞ÁöÑ‰ΩçÁΩÆ„ÄÇËãèËÅîÂÜõÈòüÂ∞ÜÈöèÊó∂Âä†ÂÖ•Êàë‰ª¨ÔºåÂà©Áî®‰ªñ‰ª¨ÔºÅÊääÈÇ£‰∏™Âü∫Âú∞ÁÉßÊàêÁÅ∞ÁÉ¨ÔºÅ
8,137,,,,./audio_RA2/Others/AlliedWar/RA2_a07_p01.mp3,You'll have to repair this abandoned Soviet ba...,‰Ω†ÂøÖÈ°ª‰øÆÂ§çËøô‰∏™Â∫üÂºÉÁöÑËãèËÅîÂü∫Âú∞„ÄÇÂçóÊûÅÊµ∑Â≤∏Á∫øÂë®Âõ¥ÁöÑÊÇ¨Â¥ñ‰ΩøÊµ∑ÂÜõËøõÊîªÊàê‰∏∫‰∏çÂèØËÉΩ„ÄÇ‰∏ÄÊó¶ÊÇ®ÂêØÂä®Âπ∂ËøêË°åÈõ∑Ëææ...
9,383,,,,./audio_RA2/Others/SovietWar/RA2_s01_p01.mp3,Yuri's forces are seizing power generating pla...,Â∞§ÈáåÁöÑÈÉ®ÈòüÊ≠£Âú®Â§∫ÂèñÂèëÁîµÂéÇÔºå‰ª•ÊøÄÊ¥ªÊÅ∂È≠îÂ≤õ‰∏äÁöÑÁ≤æÁ•ûÊîØÈÖçËÄÖ„ÄÇÊàë‰ª¨ÁöÑÈÉ®ÈòüÂú®ËøôÈáå„ÄÇË¶ÅÊÉ≥ÊàêÂäüÂ§∫ÂèñÁæéÂõΩÊó∂Èó¥...


In [138]:
#Speaker folder that replaces "Others" in the path of the clip being
# relabelled; the relabelling loop only recognises 'EvaLee' and 'Zofia'
speaker = 'Zofia'

#Collects one change record (a dict) per clip that has to be relabelled
change_list = list()

In [165]:
#I re-ran this cell by hand with a different row position each time, going
# through pre_game_voices_df to determine who speaks each voice
row_position = 16
audio_dict = pre_game_voices_df.iloc[row_position].to_dict()
audio_direct = audio_dict['Other_direct']  #Path of the clip to listen to
ipd.Audio(audio_direct)

In [162]:
#Describe the relabelling of the clip currently held in audio_dict: its row
# label in all_df, where the file is now, and the same path with the "Others"
# folder swapped for the chosen speaker
old_path = audio_dict['Other_direct']
change_dict = {
    'index': audio_dict['index'],
    'original_direct': old_path,
    'new_direct': old_path.replace('Others', speaker),
}
change_dict

{'index': 390,
 'original_direct': './audio_RA2/Others/SovietWar/RA2_s07_p01.mp3',
 'new_direct': './audio_RA2/Zofia/SovietWar/RA2_s07_p01.mp3'}

In [163]:
#Record the change for the current clip. Any earlier record with the same
# 'index' is dropped first, so re-running this cell (or relabelling the same
# clip with another speaker) never leaves two records for one file; a
# duplicate would make the later file move fail on the second attempt.
change_list[:] = [
    recorded for recorded in change_list
    if recorded['index'] != change_dict['index']
]
change_list.append(change_dict)
change_list

[{'index': 129,
  'original_direct': './audio_RA2/Others/AlliedWar/RA2_a01_p01.mp3',
  'new_direct': './audio_RA2/EvaLee/AlliedWar/RA2_a01_p01.mp3'},
 {'index': 130,
  'original_direct': './audio_RA2/Others/AlliedWar/RA2_a02_p01.mp3',
  'new_direct': './audio_RA2/EvaLee/AlliedWar/RA2_a02_p01.mp3'},
 {'index': 132,
  'original_direct': './audio_RA2/Others/AlliedWar/RA2_a04_p01.mp3',
  'new_direct': './audio_RA2/EvaLee/AlliedWar/RA2_a04_p01.mp3'},
 {'index': 133,
  'original_direct': './audio_RA2/Others/AlliedWar/RA2_a05_p01.mp3',
  'new_direct': './audio_RA2/EvaLee/AlliedWar/RA2_a05_p01.mp3'},
 {'index': 134,
  'original_direct': './audio_RA2/Others/AlliedWar/RA2_a05_p02.mp3',
  'new_direct': './audio_RA2/EvaLee/AlliedWar/RA2_a05_p02.mp3'},
 {'index': 137,
  'original_direct': './audio_RA2/Others/AlliedWar/RA2_a07_p01.mp3',
  'new_direct': './audio_RA2/EvaLee/AlliedWar/RA2_a07_p01.mp3'},
 {'index': 383,
  'original_direct': './audio_RA2/Others/SovietWar/RA2_s01_p01.mp3',
  'new_direct':

In [189]:
#Apply every recorded change: move the audio file into the speaker's folder
# and move its path from "Other_direct" to that speaker's column in all_df.

#Speaker folder name -> all_df column holding that speaker's paths. The
# names are looked for in new_direct in this order.
speaker_columns = {'EvaLee': 'EvaLee_direct', 'Zofia': 'Zofia_direct'}

for change_dict in change_list:
    index = change_dict['index']
    original_direct = change_dict['original_direct']
    new_direct = change_dict['new_direct']

    target_column = next(
        (column for name, column in speaker_columns.items()
         if name in new_direct),
        None
    )
    if target_column is None:
        print('Neither EvaLee or Zofia is in the new_direct. Something is '
              'wrong.')
        #Do not move a file that all_df cannot be updated for, otherwise
        # the path kept in "Other_direct" would no longer exist on disk
        continue

    #Change the file directory in file system. This is done first, so that
    # a failed move raises before all_df has been touched for this row.
    new_folder = os.path.dirname(new_direct)
    if new_folder:
        os.makedirs(new_folder, exist_ok=True)
    os.rename(original_direct, new_direct)
    print('File has been moved to new_direct {}'.format(new_direct))

    #Change the file directory in all_df
    all_df.at[index, target_column] = new_direct
    all_df.at[index, 'Other_direct'] = np.nan

File has been moved to new_direct ./audio_RA2/EvaLee/AlliedWar/RA2_a01_p01.mp3
File has been moved to new_direct ./audio_RA2/EvaLee/AlliedWar/RA2_a02_p01.mp3
File has been moved to new_direct ./audio_RA2/EvaLee/AlliedWar/RA2_a04_p01.mp3
File has been moved to new_direct ./audio_RA2/EvaLee/AlliedWar/RA2_a05_p01.mp3
File has been moved to new_direct ./audio_RA2/EvaLee/AlliedWar/RA2_a05_p02.mp3
File has been moved to new_direct ./audio_RA2/EvaLee/AlliedWar/RA2_a07_p01.mp3
File has been moved to new_direct ./audio_RA2/Zofia/SovietWar/RA2_s01_p01.mp3
File has been moved to new_direct ./audio_RA2/Zofia/SovietWar/RA2_s01_p03.mp3
File has been moved to new_direct ./audio_RA2/Zofia/SovietWar/RA2_s02_p01.mp3
File has been moved to new_direct ./audio_RA2/Zofia/SovietWar/RA2_s03_p01.mp3
File has been moved to new_direct ./audio_RA2/Zofia/SovietWar/RA2_s04_p01.mp3
File has been moved to new_direct ./audio_RA2/Zofia/SovietWar/RA2_s05_p01.mp3
File has been moved to new_direct ./audio_RA2/Zofia/Soviet

In [195]:
#Final check on all_df before saving: rebuild both filtered views from the
# updated all_df to see which pre-game voices are still in "Other_direct"
still_other = all_df['Other_direct'].notna()
non_nan_all_df = all_df[still_other]

pre_game_mask = non_nan_all_df['Other_direct'].apply(find_pre_game_voices)
pre_game_voices_df = non_nan_all_df[pre_game_mask].reset_index()
pre_game_voices_df

Unnamed: 0,index,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,131,,,,./audio_RA2/Others/AlliedWar/RA2_a03_p01.mp3,Taking out those nukes is priority one! We hav...,Èô§ÊéâÈÇ£‰∫õÊ†∏Ê≠¶Âô®ÊòØÁ¨¨‰∏ÄË¶ÅÂä°ÔºÅÊàë‰ª¨ÊúâËá™Â∑±ÁöÑË∂ÖÁ∫ßÊ≠¶Âô®ÔºöÁà±Âõ†ÊñØÂù¶ÁöÑÂ§©Ê∞îÊéßÂà∂Êú∫‰æøÊòØÂÖ∂‰∏≠‰πã‰∏Ä„ÄÇÂØπÁùÄÈÇ£‰∏™ËÆæ...
1,135,,,,./audio_RA2/Others/AlliedWar/RA2_a06_p01.mp3,We'll setup our defenses by the Parliament bui...,Êàë‰ª¨Â∞ÜÂú®ËÆÆ‰ºöÂ§ßÊ•ºÊóÅËÆæÁΩÆÈò≤Âæ°„ÄÇÊàëÁöÑÈí±ËØ¥Â∞§Èáå‰ºö‰ªé‰∏úÊñπÂèëÂä®ËøõÊîªÔºå‰ªñÁîöËá≥ÂèØËÉΩ‰ºöÂÅ∑ÂÅ∑ÊΩúÂÖ•Ê≥∞Êô§Â£´Ê≤≥„ÄÇÊàë‰ª¨...
2,136,,,,./audio_RA2/Others/AlliedWar/RA2_a06_p04.mp3,We've tracked down the location of Yuri's Lond...,Êàë‰ª¨Â∑≤ÁªèÊâæÂà∞‰∫ÜÂ∞§Èáå‰º¶Êï¶Âü∫Âú∞ÁöÑ‰ΩçÁΩÆ„ÄÇËãèËÅîÂÜõÈòüÂ∞ÜÈöèÊó∂Âä†ÂÖ•Êàë‰ª¨ÔºåÂà©Áî®‰ªñ‰ª¨ÔºÅÊääÈÇ£‰∏™Âü∫Âú∞ÁÉßÊàêÁÅ∞ÁÉ¨ÔºÅ


In [198]:
#I re-ran this cell by hand with a different row position each time, to
# listen to all the remaining pre-game cutscene audio files in Other_direct
# and ensure they're not spoken by Eva Lee or Zofia
row_position = 2
audio_dict = pre_game_voices_df.iloc[row_position].to_dict()
audio_direct = audio_dict['Other_direct']  #Path of the clip to listen to
ipd.Audio(audio_direct)

In [199]:
#I also checked manually in the EvaLee and Zofia folders to ensure those
# 战前过场 (pre-game cutscene) audios are really spoken by them

### Save the final all_df

In [200]:
#Show the final all_df as a last sanity check before saving. The relabelling
# above only rewrites existing cells through all_df.at, so the total length
# of this df should not change
all_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,,The V3 is a powerful long-range artillery weap...,V3ÊòØÂ®ÅÂäõÂº∫Â§ßÁöÑÈïøÁ®ãÁÅ´ÁÆ≠„ÄÇÂΩìÂøÉÔºåÊåáÊå•ÂÆòÔºåV3ËÉΩÂú®Âá†ÂàÜÈíüÂÜÖÂ∞ÜÊÇ®ÁöÑÂü∫Âú∞Â§∑‰∏∫Âπ≥Âú∞ÔºåÊÇ®ÁîöËá≥ËøûËøòÂáªÁöÑÊú∫...
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,,"The most powerful Soviet Tank ever built, the ...",Â§©ÂêØÂù¶ÂÖãÊòØËãèËÅîÂè≤‰∏äÊúÄÂº∫ÁöÑÂù¶ÂÖãÔºåÊú¨Ë∫´Â∞±ÂÖ∑Â§á‰∏ÄÂè™Â∞èÂûãÂÜõÈòüÁöÑÁÅ´ÂäõÔºåË£ÖÂ§áÂØπÁ©∫ÂèäÂØπÂú∞ÁöÑÊ≠¶Ë£Ö
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,,"The most powerful air unit ever created, the K...",Âü∫Ê¥õÂ§´Á©∫ËâáÊòØÊúÄÂº∫ÁöÑÁ©∫‰∏≠Ê≠¶Âô®ÔºåËÉΩÊäïÊé∑Êó†ÈôêÁöÑÈìÅÂà∂ÁÇ∏ÂºπÔºåÊîªÂáªÊïå‰∫∫
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,,"The ultimate in Soviet long-range bombardment,...",Êó†ÁïèÁ∫ßÊàòËà∞ÊòØËãèËÅîÁ©∂ÊûÅÁöÑÈïøÁ®ãÁÇÆËΩ∞Ê≠¶Âô®ÔºåËøúË∑ùÊîªÂáªÂäõÊó†‰ª•‰º¶ÊØîÔºå‰ΩÜÂæàÂÆπÊòìÂú®ËøëË∑ùÁ¶ªË¢´ÂáªÊ≤â„ÄÇÊó†ÁïèÁ∫ßÊàòËà∞ÊâÄ...
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,,The Soviets have taken Tesla Technology to the...,ËãèËÅîÁ£ÅËÉΩÁßëÊäÄÁöÑÊúÄÂêéÈò∂ÊÆµÔºåË∫´Á©øÁ£ÅËÉΩÂä®ÂäõË£ÖÁöÑÂ£´ÂÖµ„ÄÇËøô‰∫õÂ£´ÂÖµÊúÄÊìÖÈïøÂØπ‰ªòÂù¶ÂÖãÔºå‰πüËÉΩÁî®‰ªñ‰ª¨È¢ùÂ§ñÁöÑËÉΩÊ∫êÂä†...
...,...,...,...,...,...,...
580,,./audio_RA2/Zofia/SovietWar/RA2_xs4so02.mp3,,,"Sir, if the enemy base near the airport is sti...",ÊåáÊå•ÂÆòÔºåÂ¶ÇÊûúÊú∫Âú∫ÈôÑËøëÁöÑÊïå‰∫∫Âü∫Âú∞ËøòÂú®ÔºåÁΩóÊõºËØ∫Â§´ÊÄªÁêÜÁöÑÁîüÂëΩÂ∞±ÊúâÂç±Èô©„ÄÇ‰Ω†‰∏çËÉΩÂÜçÂ∏¶‰ªñÈù†Ëøë‰∫Ü„ÄÇ
581,,./audio_RA2/Zofia/SovietWar/RA2_xs4so03.mp3,,,We must get Premier Romanov to the safety of t...,Êàë‰ª¨ÂøÖÈ°ªÊääÁΩóÊõºËØ∫Â§´ÊÄªÁêÜÈÄÅÂà∞Êú∫Âú∫ÁöÑÂÆâÂÖ®Âú∞Â∏¶„ÄÇ
582,,./audio_RA2/Zofia/SovietWar/RA2_xs4so04.mp3,,,We believe that Premier Romanov is hiding out ...,Êàë‰ª¨Áõ∏‰ø°ÁΩóÊõºËØ∫Â§´ÊÄªÁêÜÊ≠£Ë∫≤Âú®ÂüéÂ∏ÇÁöÑ‰∏úÈÉ®Âú∞Âå∫„ÄÇÊàë‰ª¨ÂøÖÈ°ªÂú®Â∞§Èáå‰πãÂâçÊâæÂà∞‰ªñ„ÄÇ
583,,./audio_RA2/Zofia/SovietWar/RA2_xs4so05.mp3,,,"Excellent, comrade general. The airport's air ...",ÂæàÂ•ΩÔºåÂ∞ÜÂÜõÂêåÂøó„ÄÇÊú∫Âú∫ÁöÑÈò≤Á©∫ËÆæÊñΩÂ∑≤ÁªèË¢´Ëß£Èô§‰∫Ü„ÄÇÊàë‰ª¨Áé∞Âú®ÂèØ‰ª•ÂÆâÂÖ®ÁñèÊï£ÁΩóÊõºËØ∫Â§´ÊÄªÁêÜ‰∫Ü„ÄÇ


In [201]:
#Write the final all_df to disk without its index. The 'utf-8-sig' encoding
# is needed, otherwise the 'transcript_Chn' column is not readable
csv_path = './audio_RA2/transcripts/all.csv'
all_df.to_csv(csv_path, encoding='utf-8-sig', index=False)