# Import libraries

In [81]:
#Navigate local files
import os
import glob

#Web scraping
import requests
from bs4 import BeautifulSoup

#Output df to contain file directories and transcriptions
import numpy as np
import pandas as pd

#Transcription
from transcribe_audio import transcribe_audio

import IPython.display as ipd  #Display music in jupyter notebook

from googletrans import Translator  #Translate English transcripts into Chinese

import re

# Customized functions

In [104]:
def judge_create_directory(directory):
    """Create ``directory`` (and any missing parent folders) if it is absent.

    Args:
        directory: Path of the folder to create. An empty string is ignored,
            because ``os.path.dirname`` returns '' for a bare file name and
            ``os.makedirs('')`` would raise.

    Returns:
        None

    Raises:
        FileExistsError: If the path already exists but is not a folder.
    """
    if not directory:
        return
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(directory, exist_ok=True)


def download_audio(download_url, save_directory, timeout=30):
    """Download the audio file at ``download_url`` and save it locally.

    Args:
        download_url: Full URL of the audio file.
        save_directory: Path of the file to write (file name included). Its
            parent folder is created when missing.
        timeout: Seconds to wait for the server before requests gives up, so
            a stalled connection cannot hang the notebook forever.

    Returns:
        True when the file was written, False when the server did not answer
        with status code 200 (nothing is written in that case).
    """
    audio = requests.get(download_url, timeout=timeout)

    if audio.status_code != 200:
        # Report the failure instead of silently skipping the file
        print('Failed to download {} (status code {})'.format(
            download_url, audio.status_code))
        return False

    # save_directory is the file's path; folder_directory is the folder the
    # file will be downloaded to ('' when save_directory is a bare file name)
    folder_directory = os.path.dirname(save_directory)
    if folder_directory:
        judge_create_directory(folder_directory)

    with open(save_directory, 'wb') as file:
        file.write(audio.content)
    return True


def Eng_to_Chn(eng_str):
    """Return ``eng_str`` translated from English into Simplified Chinese.

    A new googletrans ``Translator`` is created on every call.
    """
    translation = Translator().translate(eng_str, src='en', dest='zh-cn')
    return translation.text



# Scrape website
https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922
* These command voices are well suited to voice model training: the
website already provides the transcripts, and the clips are all around
10 seconds long, which is a good length for training.

## 副官通用语音
https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E9%80%9A%E7%94%A8%E8%AF%AD%E9%9F%B3
* These voices are spoken in a very calm tone, so they are very good for
voice training. I also like the calm tone here, which makes them sound a bit
robotic or machine-like.
* Yuri's voice here is slightly different from his voice in other sections
of the website, so be careful.

In [12]:
# Fetch the general adjutant voices page. The timeout keeps "Run All" from
# hanging forever if the site stops responding.
url = 'https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E9%80%9A%E7%94%A8%E8%AF%AD%E9%9F%B3'

request = requests.get(url, timeout=30)

print(request.status_code)

200


In [13]:
#Obtain all the contents from that request
if request.status_code != 200:
    # Stop here: otherwise `soup` would silently keep whatever page was
    # parsed before and the following cells would scrape the wrong content
    raise RuntimeError('Request to {} failed with status code {}'.format(
        request.url, request.status_code))

soup = BeautifulSoup(request.content, 'html.parser')
print(soup.prettify()[:2000])  # Preview only; the full page is very long

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="zh-Hans-CN">
 <head>
  <meta charset="utf-8"/>
  <title>
   红色警戒2/副官通用语音 - 语音维基
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"zh","wgMonthNames":["","1月","2月","3月","4月","5月","6月","7月","8月","9月","10月","11月","12月"],"wgRequestId":"2ee2f8073a5994c14b366a18","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"红色警戒2/副官通用语音","wgTitle":"红色警戒2/副官通用语音","wgCurRevisionId":72594,"wgRevisionId":72594,"wgArticleId":22304,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"zh-cn","wgPageContentModel":"wikitext","wgRelevantPageName":"红色警戒2/副官通用语音","wgRelevantArticleId":22304,"wgIsProbablyEditable":!1,"wgRelevantPageIsProbablyEditable":!1,"wgRestrictionEdit":["sysop"

In [20]:
#Obtain the tab that contains all the tables
tables_tab = soup.find_all('div', {'class': 'mw-parser-output'})
# Only the first match is used below, so fail with a clear message when the
# page has no such container (instead of a bare IndexError)
assert tables_tab, 'No <div class="mw-parser-output"> found on the page'
print(tables_tab[0].prettify()[:2000])  # Preview only

<div class="mw-parser-output">
 <div style="font-size: small; margin-bottom: 1.5em;">
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="红色警戒2/副官及战役语音">
   &lt;
  </a>
  返回上级：
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="红色警戒2/副官及战役语音">
   红色警戒2/副官及战役语音
  </a>
 </div>
 <p>
  以下为通用副官语音，由三阵营副官念同一句话（尽管有些无法触发，比如尤里阵营的战役语音，因为尤里没有战役）。可在单人模式下的战役或遭遇战中听到。
 </p>
 <div aria-labelledby="mw-toc-heading" class="toc" id="toc" role="navigation">
  <input class="toctogglecheckbox" id="toctogglecheckbox" role="button" style="display:none" type="checkbox"/>
  <div class="toctitle" dir="ltr" lang="zh-Hans-CN">
   <h2 id="mw-toc-heading">
    目录
   </h2>
   <span class="toctogglespan">
    <label class="toctogglelabel" for="toctogglecheckbox">
    </label>
   </span>
  </div>
  <ul>
   <li class="toclevel-1 tocsection-1">
    <a href="#.E5.8E.9F.E7.89.

In [25]:
#Obtain all the tables (one table per voice line)
tables = tables_tab[0].find_all('table')
# len(tables) is the total number of tables on the page; it must not be 0
assert tables, 'No <table> found inside the content container'
print(tables[0].prettify())

<table>
 <tbody>
  <tr>
   <th rowspan="2">
    <div class="downloadable-audio template-render-cloak">
     <div class="audio-player">
      <a class="internal" href="/w/images/0/04/RA2_ceva001.mp3" title="RA2 ceva001.mp3">
       Media:RA2_ceva001.mp3
      </a>
     </div>
     <div class="downloadable-audio-dropdown">
      <div class="download-button">
       <div class="download-icon">
       </div>
       <div class="download-link">
        <a class="internal" href="/w/images/0/04/RA2_ceva001.mp3" title="RA2 ceva001.mp3">
         Media:RA2_ceva001.mp3
        </a>
       </div>
      </div>
     </div>
    </div>
   </th>
   <th rowspan="2">
    <div class="downloadable-audio template-render-cloak">
     <div class="audio-player">
      <a class="internal" href="/w/images/b/b3/RA2_csof001.mp3" title="RA2 csof001.mp3">
       Media:RA2_csof001.mp3
      </a>
     </div>
     <div class="downloadable-audio-dropdown">
      <div class="download-button">
       <div class="download-

In [86]:
#Empty frame: one audio-path column per speaker, then the two transcripts
general_df = pd.DataFrame(
    columns=(
        ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct']  # saved file paths
        + ['transcript_Eng', 'transcript_Chn']  # text taken from the tables
    )
)
general_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,transcript_Eng,transcript_Chn


In [87]:
#Iterate through each table and build one row (audio paths + transcripts)
general_columns = ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
                   'transcript_Eng', 'transcript_Chn']

# (marker in the file name, df column, download folder), tested in this order.
# File names look like RA2_ceva001.mp3 (Allies Lieutenant Eva Lee),
# RA2_csof001.mp3 (Soviets Lieutenant Zofia) and RA2_cyur001.mp3 (Yuri)
general_speakers = [
    ('ev', 'EvaLee_direct', './audio_RA2/EvaLee/General/'),
    ('so', 'Zofia_direct', './audio_RA2/Zofia/General/'),
    ('yu', 'Yuri_direct', './audio_RA2/Yuri/General/'),
]

general_rows = []
for table in tables:
    # Start from an all-NaN row keyed by column, so each path lands in its
    # speaker's column regardless of the order of the audios in the table
    row = dict.fromkeys(general_columns, np.nan)

    #Find audio urls. Don't search for 'audio' tags here since those should
    # be generated by JS and the returned html doesn't have them
    audio_links = table.find_all('a', {'class': 'internal'})

    # Every audio is linked twice in a table (player + download button), so
    # drop the repeated hrefs while preserving the original order
    internal_urls = list(dict.fromkeys(link['href'] for link in audio_links))

    for internal_url in internal_urls:
        #The URL you can use to download the audio file
        download_link = 'https://voicewiki.cn' + internal_url

        #The original file name, reused to name the downloaded file
        file_name = internal_url.split('/')[-1]

        speaker = next(
            (entry for entry in general_speakers if entry[0] in file_name),
            None)
        if speaker is None:
            # Skip the file: there is no folder to put it in, and carrying on
            # would reuse the previous file's folder (or hit a NameError)
            print('Something is wrong with file name {}'.format(file_name))
            continue
        _, column, folder_directory = speaker

        file_directory = folder_directory + file_name
        row[column] = file_directory

        download_audio(download_link, file_directory)

    #Find transcripts: 2 strings, 1 as English and the other as Chinese
    transcripts = [cell.text.strip() for cell in table.find_all('td')]
    if len(transcripts) != 2:
        raise ValueError(
            'Expected 2 transcripts (English, Chinese) but found {}: {}'
            .format(len(transcripts), transcripts))
    row['transcript_Eng'], row['transcript_Chn'] = transcripts

    general_rows.append(row)

# Build the frame in one go, so re-running this cell rebuilds general_df
# instead of appending the same rows again
general_df = pd.DataFrame(general_rows, columns=general_columns)
general_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/General/RA2_ceva001.mp3,./audio_RA2/Zofia/General/RA2_csof001.mp3,./audio_RA2/Yuri/General/RA2_cyur001.mp3,"Warning, Nuclear Silo detected.",警告，侦测到核弹发射井
1,./audio_RA2/EvaLee/General/RA2_ceva002.mp3,./audio_RA2/Zofia/General/RA2_csof002.mp3,./audio_RA2/Yuri/General/RA2_cyur002.mp3,"Warning, Nuclear Missile launched.",警告，核弹已发射
2,./audio_RA2/EvaLee/General/RA2_ceva003.mp3,./audio_RA2/Zofia/General/RA2_csof003.mp3,./audio_RA2/Yuri/General/RA2_cyur003.mp3,Nuclear Missile ready.,核弹已准备就绪
3,./audio_RA2/EvaLee/General/RA2_ceva004.mp3,./audio_RA2/Zofia/General/RA2_csof004.mp3,./audio_RA2/Yuri/General/RA2_cyur004.mp3,"Warning, Iron Curtain detected.",警告，侦测到铁幕装置
4,./audio_RA2/EvaLee/General/RA2_ceva005.mp3,./audio_RA2/Zofia/General/RA2_csof005.mp3,./audio_RA2/Yuri/General/RA2_cyur005.mp3,"Warning, Iron Curtain activated.",警告，铁幕装置已激活
...,...,...,...,...,...
129,./audio_RA2/EvaLee/General/RA2_ceva142.mp3,./audio_RA2/Zofia/General/RA2_csof142.mp3,./audio_RA2/Yuri/General/RA2_cyur142.mp3,Spy Plane on route.,侦察机在路上
130,./audio_RA2/EvaLee/General/RA2_ceva143.mp3,./audio_RA2/Zofia/General/RA2_csof143.mp3,./audio_RA2/Yuri/General/RA2_cyur143.mp3,Battle control offline.,作战控制离线
131,./audio_RA2/EvaLee/General/RA2_ceva150.mp3,./audio_RA2/Zofia/General/RA2_csof150.mp3,./audio_RA2/Yuri/General/RA2_cyur150.mp3,Paratroopers ready.,伞兵就绪
132,./audio_RA2/EvaLee/General/RA2_ceva151.mp3,./audio_RA2/Zofia/General/RA2_csof151.mp3,./audio_RA2/Yuri/General/RA2_cyur151.mp3,Weather control device temporarily unavailable.,天气控制机暂时失效


In [90]:
#Save the df as csv
# to_csv does not create missing folders, so make sure the target exists
os.makedirs('./audio_RA2/transcripts', exist_ok=True)
general_df.to_csv('./audio_RA2/transcripts/general.csv',
                  index=False,
                  encoding='utf-8-sig'  #Without this option, the Chinese
                  # column will be unreadable
                  )

## 盟军战役语音
https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E7%9B%9F%E5%86%9B%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3
* Some voices don't belong to Eva Lee, Zofia, or Yuri, so you need to
 save them into the "Others" folder
 * This page doesn't have any Zofia voice
  * Some of the "战前过场" voices are spoken by Eva Lee but they were put in the
   Others folder since their file names don't follow the naming rule. Also,
   some of those voices aren't Eva's, so if you want, you can manually move
   the ones spoken by Eva to the EvaLee folder
     *  This was solved in the final clean-up

In [35]:
# Fetch the Allied campaign voices page. The timeout keeps "Run All" from
# hanging forever if the site stops responding.
url = 'https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E7%9B%9F%E5%86%9B%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3'

request = requests.get(url, timeout=30)

print(request.status_code)

200


In [36]:
#Obtain all the contents from that request
if request.status_code != 200:
    # Stop here: otherwise `soup` would silently keep the previously parsed
    # page and the following cells would scrape the wrong content
    raise RuntimeError('Request to {} failed with status code {}'.format(
        request.url, request.status_code))

soup = BeautifulSoup(request.content, 'html.parser')
print(soup.prettify()[:2000])  # Preview only; the full page is very long

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="zh-Hans-CN">
 <head>
  <meta charset="utf-8"/>
  <title>
   红色警戒2/盟军战役语音 - 语音维基
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"zh","wgMonthNames":["","1月","2月","3月","4月","5月","6月","7月","8月","9月","10月","11月","12月"],"wgRequestId":"508557e4fbe31ff2827824d3","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"红色警戒2/盟军战役语音","wgTitle":"红色警戒2/盟军战役语音","wgCurRevisionId":120022,"wgRevisionId":120022,"wgArticleId":22307,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"zh-cn","wgPageContentModel":"wikitext","wgRelevantPageName":"红色警戒2/盟军战役语音","wgRelevantArticleId":22307,"wgIsProbablyEditable":!1,"wgRelevantPageIsProbablyEditable":!1,"wgRestrictionEdit":[],"wg

In [37]:
#Obtain the tab that contains all the tables
tables_tab = soup.find_all('div', {'class': 'mw-parser-output'})
# Only the first match is used below, so fail with a clear message when the
# page has no such container (instead of a bare IndexError)
assert tables_tab, 'No <div class="mw-parser-output"> found on the page'
print(tables_tab[0].prettify()[:2000])  # Preview only

<div class="mw-parser-output">
 <div style="font-size: small; margin-bottom: 1.5em;">
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="红色警戒2/副官及战役语音">
   &lt;
  </a>
  返回上级：
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="红色警戒2/副官及战役语音">
   红色警戒2/副官及战役语音
  </a>
 </div>
 <p>
  langmd.mix里只有“没有对应视频只有声音”的语音。过场动画的语音不存在这里。
 </p>
 <div aria-labelledby="mw-toc-heading" class="toc" id="toc" role="navigation">
  <input class="toctogglecheckbox" id="toctogglecheckbox" role="button" style="display:none" type="checkbox"/>
  <div class="toctitle" dir="ltr" lang="zh-Hans-CN">
   <h2 id="mw-toc-heading">
    目录
   </h2>
   <span class="toctogglespan">
    <label class="toctogglelabel" for="toctogglecheckbox">
    </label>
   </span>
  </div>
  <ul>
   <li class="toclevel-1 tocsection-1">
    <a href="#.E5.8E.9F.E7.89.88">
     <span class="tocnum

In [38]:
#Obtain all the tables (one table per voice line)
tables = tables_tab[0].find_all('table')
# len(tables) is the total number of tables on the page; it must not be 0
assert tables, 'No <table> found inside the content container'
print(tables[0].prettify())

<table>
 <tbody>
  <tr>
   <th rowspan="2">
    <div class="downloadable-audio template-render-cloak">
     <div class="audio-player">
      <a class="internal" href="/w/images/4/4a/RA2_cevau06.mp3" title="RA2 cevau06.mp3">
       Media:RA2_cevau06.mp3
      </a>
     </div>
     <div class="downloadable-audio-dropdown">
      <div class="download-button">
       <div class="download-icon">
       </div>
       <div class="download-link">
        <a class="internal" href="/w/images/4/4a/RA2_cevau06.mp3" title="RA2 cevau06.mp3">
         Media:RA2_cevau06.mp3
        </a>
       </div>
      </div>
     </div>
    </div>
   </th>
   <td>
    The V3 is a powerful long-range artillery weapon. Beware Commander, it can level your base in minutes without you ever getting a chance to even return fire.
   </td>
  </tr>
  <tr>
   <td>
    <a href="/wiki/V3%E7%81%AB%E7%AE%AD%E5%8F%91%E5%B0%84%E8%BD%A6%EF%BC%88%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922%EF%BC%89" title="V3火箭发射车（红色警戒2）">
     V3
    </a

In [39]:
#Empty frame: one audio-path column per speaker, then the two transcripts
allied_war_df = pd.DataFrame(
    columns=(
        ['EvaLee_direct', 'Yuri_direct', 'Other_direct']  # saved file paths
        + ['transcript_Eng', 'transcript_Chn']  # text taken from the tables
    )
)
allied_war_df

Unnamed: 0,EvaLee_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn


In [40]:
#Iterate through each table and build one row (audio path + transcripts)
allied_war_columns = ['EvaLee_direct', 'Yuri_direct', 'Other_direct',
                      'transcript_Eng', 'transcript_Chn']

# (marker in the file name, df column, download folder), tested in this order
allied_war_speakers = [
    ('ev', 'EvaLee_direct', './audio_RA2/EvaLee/AlliedWar/'),
    ('yu', 'Yuri_direct', './audio_RA2/Yuri/AlliedWar/'),
]
# Column and folder for files whose name matches none of the markers above
allied_war_fallback = ('Other_direct', './audio_RA2/Others/AlliedWar/')

allied_war_rows = []
for table in tables:
    # All-NaN row keyed by column; only the matching speaker column is filled
    row = dict.fromkeys(allied_war_columns, np.nan)

    #Find audio urls. Don't search for 'audio' tags here since those should
    # be generated by JS and the returned html doesn't have them
    audio_links = table.find_all('a', {'class': 'internal'})

    # Every audio is linked twice in a table (player + download button), so
    # drop the repeated hrefs while preserving the original order
    internal_urls = list(dict.fromkeys(link['href'] for link in audio_links))

    file_directories = []
    for internal_url in internal_urls:
        #The URL you can use to download the audio file
        download_link = 'https://voicewiki.cn' + internal_url

        #The original file name, reused to name the downloaded file
        file_name = internal_url.split('/')[-1]

        column, folder_directory = next(
            ((col, folder) for marker, col, folder in allied_war_speakers
             if marker in file_name),
            allied_war_fallback)

        file_directory = folder_directory + file_name
        file_directories.append(file_directory)
        row[column] = file_directory

        download_audio(download_link, file_directory)

    #Find transcripts: 2 strings, 1 as English and the other as Chinese
    transcripts = [cell.text.strip() for cell in table.find_all('td')]

    # A row only fits exactly one audio plus an English and a Chinese
    # transcript. Report any other table explicitly (its audio files are
    # still downloaded) instead of hiding every error behind a bare except
    if len(file_directories) != 1 or len(transcripts) != 2:
        print('Skipped table that does not fit the row layout: {}'.format(
            file_directories + transcripts))
        continue
    row['transcript_Eng'], row['transcript_Chn'] = transcripts

    allied_war_rows.append(row)

# Build the frame in one go, so re-running this cell rebuilds allied_war_df
# instead of appending the same rows again
allied_war_df = pd.DataFrame(allied_war_rows, columns=allied_war_columns)
allied_war_df

Unnamed: 0,EvaLee_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,The V3 is a powerful long-range artillery weap...,V3是威力强大的长程火箭。当心，指挥官，V3能在几分钟内将您的基地夷为平地，您甚至连还击的机...
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,"The most powerful Soviet Tank ever built, the ...",天启坦克是苏联史上最强的坦克，本身就具备一只小型军队的火力，装备对空及对地的武装
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,"The most powerful air unit ever created, the K...",基洛夫空艇是最强的空中武器，能投掷无限的铁制炸弹，攻击敌人
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,"The ultimate in Soviet long-range bombardment,...",无畏级战舰是苏联究极的长程炮轰武器，远距攻击力无以伦比，但很容易在近距离被击沉。无畏级战舰所...
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,The Soviets have taken Tesla Technology to the...,苏联磁能科技的最后阶段，身穿磁能动力装的士兵。这些士兵最擅长对付坦克，也能用他们额外的能源加...
...,...,...,...,...,...
189,./audio_RA2/EvaLee/AlliedWar/RA2_xa7ev03.mp3,,,"We don't have much time, Commander. Build up y...",我们没有多少时间了，指挥官。在尤里占领你的位置之前，迅速建立起你在南极洲的基地
190,./audio_RA2/EvaLee/AlliedWar/RA2_xa7ev04.mp3,,,"Good job, Commander. We're search for a suitab...",做得好，指挥官。我们正在寻找一个合适的着陆区将你的MCV传送过去。只要再拖住尤里的部队就可以了
191,./audio_RA2/EvaLee/AlliedWar/RA2_xa7ev05.mp3,,,"Sir, if you can combine Soviet technology with...",长官，如果你能把苏联的技术和我们的技术结合起来，你应该能得到一些有趣的结果。它可以给你带来你...
192,./audio_RA2/EvaLee/AlliedWar/RA2_xa7ev06.mp3,,,"Sir, our Soviet base on Tierra del Fuego has c...",长官，我们在火地岛的苏维埃基地受到了攻击。保持基地完好，这样我们就能继续保留并使用苏联的科技


In [42]:
#Save the df as csv
# to_csv does not create missing folders, so make sure the target exists
os.makedirs('./audio_RA2/transcripts', exist_ok=True)
allied_war_df.to_csv('./audio_RA2/transcripts/allied_war.csv',
                     index=False,
                     encoding='utf-8-sig'  #Without this option, the Chinese
                     # column will be unreadable
                     )

## 苏联战役语音
https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E8%8B%8F%E8%81%94%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3
* Some voices don't belong to Eva Lee, Zofia, or Yuri, so you need to
 save them into the "Others" folder
 * This page doesn’t have any Eva Lee voice
  * Some of the “战前过场” voices are spoken by Eva Lee but they were put in the Others folder since their file names don’t follow the naming rule. Also, some of those voices aren’t Eva’s, so if you want, you can manually move the ones spoken by Eva to the EvaLee folder
    * This was solved in the final clean-up
* Some of the voices don't have a transcript on the website, so you need
    to transcribe them yourself

In [43]:
# Fetch the Soviet campaign voices page. The timeout keeps "Run All" from
# hanging forever if the site stops responding.
url = 'https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E8%8B%8F%E8%81%94%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3'

request = requests.get(url, timeout=30)

print(request.status_code)

200


In [44]:
#Obtain all the contents from that request
if request.status_code != 200:
    # Stop here: otherwise `soup` would silently keep the previously parsed
    # page and the following cells would scrape the wrong content
    raise RuntimeError('Request to {} failed with status code {}'.format(
        request.url, request.status_code))

soup = BeautifulSoup(request.content, 'html.parser')
print(soup.prettify()[:2000])  # Preview only; the full page is very long

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="zh-Hans-CN">
 <head>
  <meta charset="utf-8"/>
  <title>
   红色警戒2/苏联战役语音 - 语音维基
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"zh","wgMonthNames":["","1月","2月","3月","4月","5月","6月","7月","8月","9月","10月","11月","12月"],"wgRequestId":"d74da4ce4ac76d65b890e331","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"红色警戒2/苏联战役语音","wgTitle":"红色警戒2/苏联战役语音","wgCurRevisionId":120922,"wgRevisionId":120922,"wgArticleId":22308,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["需要补充内容"],"wgPageContentLanguage":"zh-cn","wgPageContentModel":"wikitext","wgRelevantPageName":"红色警戒2/苏联战役语音","wgRelevantArticleId":22308,"wgIsProbablyEditable":!1,"wgRelevantPageIsProbablyEditable":!1,"wgRestrictionEdit

In [45]:
#Obtain the tab that contains all the tables
tables_tab = soup.find_all('div', {'class': 'mw-parser-output'})
# Only the first match is used below, so fail with a clear message when the
# page has no such container (instead of a bare IndexError)
assert tables_tab, 'No <div class="mw-parser-output"> found on the page'
print(tables_tab[0].prettify()[:2000])  # Preview only

<div class="mw-parser-output">
 <div style="font-size: small; margin-bottom: 1.5em;">
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="红色警戒2/副官及战役语音">
   &lt;
  </a>
  返回上级：
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="红色警戒2/副官及战役语音">
   红色警戒2/副官及战役语音
  </a>
 </div>
 <div class="infoBox">
  <div class="infoBoxContent">
   <div class="infoBoxTitle">
    <div class="infoBoxIcon" style="color:#e69100;">
     <i aria-hidden="true" class="fa fa-info-circle faa-pulse animated" style="animation: pulse 2s cubic-bezier(0.18, 0.89, 0.32, 1.28) infinite;">
     </i>
    </div>
    <div class="infoBoxText">
     <b>
      此页面内容需要补充
     </b>
     <br/>
     此页面内容尚不完整，语音维基欢迎您
     <font class="text-chromatic">
      <span class="plainlinks">
       <a class="external text" href="https://voicewiki.cn/w/index.php?title=%E7%BA%A2%E8%89%B2%E8%AD

In [46]:
#Obtain all the tables (one table per voice line)
tables = tables_tab[0].find_all('table')
# len(tables) is the total number of tables on the page; it must not be 0
assert tables, 'No <table> found inside the content container'
print(tables[0].prettify())

<table>
 <tbody>
  <tr>
   <th rowspan="2">
    <div class="downloadable-audio template-render-cloak">
     <div class="audio-player">
      <a class="internal" href="/w/images/f/fa/RA2_csofu04.mp3" title="RA2 csofu04.mp3">
       Media:RA2_csofu04.mp3
      </a>
     </div>
     <div class="downloadable-audio-dropdown">
      <div class="download-button">
       <div class="download-icon">
       </div>
       <div class="download-link">
        <a class="internal" href="/w/images/f/fa/RA2_csofu04.mp3" title="RA2 csofu04.mp3">
         Media:RA2_csofu04.mp3
        </a>
       </div>
      </div>
     </div>
    </div>
   </th>
   <td>
    Soviet technology at its finest. Terror Drones can enter and destroy enemy vehicles from the inside.
   </td>
  </tr>
  <tr>
   <td>
    苏联科技的结晶。
    <a href="/wiki/%E6%81%90%E6%80%96%E6%9C%BA%E5%99%A8%E4%BA%BA%EF%BC%88%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922%EF%BC%89" title="恐怖机器人（红色警戒2）">
     恐怖机器人
    </a>
    能潜入敌人车辆内部，并由内部将之摧毁
   </td>
  </tr>
 <

In [55]:
#Empty frame: one audio-path column per speaker, then the two transcripts
soviet_war_df = pd.DataFrame(
    columns=(
        ['Zofia_direct', 'Yuri_direct', 'Other_direct']  # saved file paths
        + ['transcript_Eng', 'transcript_Chn']  # text taken from the tables
    )
)
soviet_war_df

Unnamed: 0,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn


In [56]:
#Iterate through each table and build one row (audio path + transcripts)
soviet_war_columns = ['Zofia_direct', 'Yuri_direct', 'Other_direct',
                      'transcript_Eng', 'transcript_Chn']

# (marker in the file name, df column, download folder), tested in this order
soviet_war_speakers = [
    ('so', 'Zofia_direct', './audio_RA2/Zofia/SovietWar/'),
    ('yu', 'Yuri_direct', './audio_RA2/Yuri/SovietWar/'),
]
# Column and folder for files whose name matches none of the markers above
soviet_war_fallback = ('Other_direct', './audio_RA2/Others/SovietWar/')

soviet_war_rows = []
for table in tables:
    # All-NaN row keyed by column; only the matching speaker column is filled
    row = dict.fromkeys(soviet_war_columns, np.nan)

    #Find audio urls. Don't search for 'audio' tags here since those should
    # be generated by JS and the returned html doesn't have them
    audio_links = table.find_all('a', {'class': 'internal'})

    # Every audio is linked twice in a table (player + download button), so
    # drop the repeated hrefs while preserving the original order
    internal_urls = list(dict.fromkeys(link['href'] for link in audio_links))

    file_directories = []
    for internal_url in internal_urls:
        #The URL you can use to download the audio file
        download_link = 'https://voicewiki.cn' + internal_url

        #The original file name, reused to name the downloaded file
        file_name = internal_url.split('/')[-1]

        column, folder_directory = next(
            ((col, folder) for marker, col, folder in soviet_war_speakers
             if marker in file_name),
            soviet_war_fallback)

        file_directory = folder_directory + file_name
        file_directories.append(file_directory)
        row[column] = file_directory

        download_audio(download_link, file_directory)

    #Find transcripts: 2 strings, 1 as English and the other as Chinese
    # (both may be empty when the website has no transcript for the voice)
    transcripts = [cell.text.strip() for cell in table.find_all('td')]

    # A row only fits exactly one audio plus an English and a Chinese
    # transcript. Report any other table explicitly (its audio files are
    # still downloaded) instead of hiding every error behind a bare except
    if len(file_directories) != 1 or len(transcripts) != 2:
        print('Skipped table that does not fit the row layout: {}'.format(
            file_directories + transcripts))
        continue
    row['transcript_Eng'], row['transcript_Chn'] = transcripts

    soviet_war_rows.append(row)

# Build the frame in one go, so re-running this cell rebuilds soviet_war_df
# instead of appending the same rows again
soviet_war_df = pd.DataFrame(soviet_war_rows, columns=soviet_war_columns)
soviet_war_df

Unnamed: 0,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/Zofia/SovietWar/RA2_csofu04.mp3,,,Soviet technology at its finest. Terror Drones...,苏联科技的结晶。恐怖机器人能潜入敌人车辆内部，并由内部将之摧毁
1,./audio_RA2/Zofia/SovietWar/RA2_csofu06.mp3,,,The V3 Launcher has extreme range capabilities...,V3火箭发射车是十分长程的武器，具有无比伦比的潜在破坏力。肯定能造成浩劫般的伤害，但不要让其...
2,./audio_RA2/Zofia/SovietWar/RA2_csofu07.mp3,,,The appropriately named Apocalypse Tank is war...,车如其名的天启坦克，是完美结合的战争武器。天启坦克能攻击任何地面或空中目标，也是能在战场上存...
3,./audio_RA2/Zofia/SovietWar/RA2_csofu08.mp3,,,"They may be slow, but the Kirov Airships can t...",虽然基洛夫空艇的速度不快，但能造成大量的破坏，并且不停的攻击
4,./audio_RA2/Zofia/SovietWar/RA2_csofu09.mp3,,,The Sea Scorpion is a fast-attack sea raider. ...,海蠍是快速攻击海上突袭舰。虽然不能为您赢得战争，但没有它您也肯定赢不了。这是您在水面上的唯一...
...,...,...,...,...,...
234,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,,
235,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,,
236,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,,
237,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,,


In [57]:
#Save the df as csv
# to_csv does not create missing folders, so make sure the target exists
os.makedirs('./audio_RA2/transcripts', exist_ok=True)
soviet_war_df.to_csv('./audio_RA2/transcripts/soviet_war.csv',
                     index=False,
                     encoding='utf-8-sig'  #Without this option, the Chinese
                     # column will be unreadable
                     )

## 副官其他语音
https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%85%B6%E4%BB%96%E8%AF%AD%E9%9F%B3
* In the lower part of the page, you can find some long readings by Eva Lee,
Zofia, and Yuri

In [58]:
# Fetch the other adjutant voices page. The timeout keeps "Run All" from
# hanging forever if the site stops responding.
url = 'https://voicewiki.cn/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%85%B6%E4%BB%96%E8%AF%AD%E9%9F%B3'

request = requests.get(url, timeout=30)

print(request.status_code)

200


In [59]:
#Obtain all the contents from that request
if request.status_code != 200:
    # Stop here: otherwise `soup` would silently keep the previously parsed
    # page and the following cells would scrape the wrong content
    raise RuntimeError('Request to {} failed with status code {}'.format(
        request.url, request.status_code))

soup = BeautifulSoup(request.content, 'html.parser')
print(soup.prettify()[:2000])  # Preview only; the full page is very long

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="zh-Hans-CN">
 <head>
  <meta charset="utf-8"/>
  <title>
   红色警戒2/副官其他语音 - 语音维基
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"zh","wgMonthNames":["","1月","2月","3月","4月","5月","6月","7月","8月","9月","10月","11月","12月"],"wgRequestId":"48e435a4edbb82299c646c25","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"红色警戒2/副官其他语音","wgTitle":"红色警戒2/副官其他语音","wgCurRevisionId":69114,"wgRevisionId":69114,"wgArticleId":23105,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"zh-cn","wgPageContentModel":"wikitext","wgRelevantPageName":"红色警戒2/副官其他语音","wgRelevantArticleId":23105,"wgIsProbablyEditable":!1,"wgRelevantPageIsProbablyEditable":!1,"wgRestrictionEdit":[],"wgRe

In [60]:
#Obtain the tab that contains all the tables
tables_tab = soup.find_all('div', {'class': 'mw-parser-output'})
# Only the first match is used below, so fail with a clear message when the
# page has no such container (instead of a bare IndexError)
assert tables_tab, 'No <div class="mw-parser-output"> found on the page'
print(tables_tab[0].prettify()[:2000])  # Preview only

<div class="mw-parser-output">
 <div style="font-size: small; margin-bottom: 1.5em;">
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="红色警戒2/副官及战役语音">
   &lt;
  </a>
  返回上级：
  <a href="/wiki/%E7%BA%A2%E8%89%B2%E8%AD%A6%E6%88%922/%E5%89%AF%E5%AE%98%E5%8F%8A%E6%88%98%E5%BD%B9%E8%AF%AD%E9%9F%B3" title="红色警戒2/副官及战役语音">
   红色警戒2/副官及战役语音
  </a>
 </div>
 <div aria-labelledby="mw-toc-heading" class="toc" id="toc" role="navigation">
  <input class="toctogglecheckbox" id="toctogglecheckbox" role="button" style="display:none" type="checkbox"/>
  <div class="toctitle" dir="ltr" lang="zh-Hans-CN">
   <h2 id="mw-toc-heading">
    目录
   </h2>
   <span class="toctogglespan">
    <label class="toctogglelabel" for="toctogglecheckbox">
    </label>
   </span>
  </div>
  <ul>
   <li class="toclevel-1 tocsection-1">
    <a href="#YR.E7.89.88.E6.9C.AC.E7.95.99.E5.AD.98RA2.E7.9A.84.E8.AF.AD.E9.9F.B3">
     <span class="tocnumber">


In [61]:
#Obtain all the tables (one table per voice line)
tables = tables_tab[0].find_all('table')
# len(tables) is the total number of tables on the page; it must not be 0
assert tables, 'No <table> found inside the content container'
print(tables[0].prettify())

<table>
 <tbody>
  <tr>
   <th rowspan="2">
    <div class="downloadable-audio template-render-cloak">
     <div class="audio-player">
      <a class="internal" href="/w/images/8/8d/RA2_macev02.mp3" title="RA2 macev02.mp3">
       Media:RA2_macev02.mp3
      </a>
     </div>
     <div class="downloadable-audio-dropdown">
      <div class="download-button">
       <div class="download-icon">
       </div>
       <div class="download-link">
        <a class="internal" href="/w/images/8/8d/RA2_macev02.mp3" title="RA2 macev02.mp3">
         Media:RA2_macev02.mp3
        </a>
       </div>
      </div>
     </div>
    </div>
   </th>
   <td>
    Teleporting troops arriving in 5, 4, 3, 2, 1.
   </td>
  </tr>
  <tr>
   <td>
    传送部队将在5，4，3，2，1秒后到达
   </td>
  </tr>
 </tbody>
</table>



In [74]:
#Empty frame: one audio-path column per speaker, then the two transcripts
other_voice_df = pd.DataFrame(
    columns=(
        ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
         'Other_direct']  # saved file paths
        + ['transcript_Eng', 'transcript_Chn']  # text taken from the tables
    )
)
other_voice_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn


In [75]:
#Turn every scraped table into one row of other_voice_df and download its
# audio file(s) on the way

#Column order of the audio-path part of a row; must match the df header
direct_columns = ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
                  'Other_direct']

#(marker searched in the file name, speaker folder, df column). The rules are
# checked in this order and the first match wins; a file matching none of
# them goes to 'Others'. File names look like RA2_macev02.mp3 (Eva Lee),
# RA2_xc0so01.mp3 (Zofia), RA2_xc0in01.mp3 (Yuri)
speaker_rules = [
    ('ev', 'EvaLee', 'EvaLee_direct'),
    ('so', 'Zofia', 'Zofia_direct'),
    ('in', 'Yuri', 'Yuri_direct'),  #Somehow it's not "yu" anymore
]

for table in tables:
    #Find audio urls. Don't use 'audio' here for find_all since those should
    # be generated by JS and the returned html doesn't have that
    audios = table.find_all('a', {'class': 'internal'})

    #Every link appears twice in the html (player + download button).
    # dict.fromkeys removes the repeats while preserving the original order
    audios = list(dict.fromkeys(audios))

    audios_direct = []
    for audio in audios:
        internal_url = audio['href']
        #The URL you can use to download the audio file
        download_link = 'https://voicewiki.cn' + internal_url

        #The original file name, reused as the name of the downloaded file
        file_name = internal_url.split('/')[-1]

        speaker, speaker_col = 'Others', 'Other_direct'
        for marker, rule_speaker, rule_col in speaker_rules:
            if marker in file_name:
                speaker, speaker_col = rule_speaker, rule_col
                break

        folder_directory = './audio_RA2/{}/OtherVoices/'.format(speaker)
        file_directory = folder_directory + file_name

        #One cell per speaker column: the path goes under the matching
        # speaker, NaN everywhere else, so the row matches the df header
        audios_direct += [file_directory if col == speaker_col else np.nan
                          for col in direct_columns]

        download_audio(download_link, file_directory)

    #Find transcripts: the <td> cells hold the English and the Chinese text
    cleaned_transcripts = [transcript.text.strip()
                           for transcript in table.find_all('td')]

    df_row_list = audios_direct + cleaned_transcripts

    try:
        other_voice_df.loc[len(other_voice_df)] = df_row_list
    except ValueError:
        #pandas raises ValueError when the row length does not match the
        # header (e.g. a table with several audios). Show the offending row
        # and carry on; any other error is no longer silently swallowed
        print(df_row_list)

other_voice_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/OtherVoices/RA2_macev02.mp3,,,,"Teleporting troops arriving in 5, 4, 3, 2, 1.",传送部队将在5，4，3，2，1秒后到达
1,,,,./audio_RA2/Others/OtherVoices/RA2_ms9pr03.mp3,They're breaking in! Do something!,他们闯进来了！快做点什么！
2,,,,./audio_RA2/Others/OtherVoices/RA2_ms9pr04.mp3,You don't want the Vice President running the ...,你并不希望副总统管理国家，不是吗？
3,,,,./audio_RA2/Others/OtherVoices/RA2_ms9pr05.mp3,Er...,嗷...
4,./audio_RA2/EvaLee/OtherVoices/RA2_xc0ev01.mp3,,,,Defend America against the Soviet threat by de...,通过摧毁他们在古巴的基地来保护美国免受苏联的威胁，从尤里的军队中拯救洛杉矶并将他赶回俄罗斯，...
5,,,./audio_RA2/Yuri/OtherVoices/RA2_xc0in01.mp3,,Support Yuri's request for world domination by...,通过占领美国首都来完成尤里的世界统治要求，然后指挥尤里在南美洲的军队摧毁苏联在尤卡坦半岛的秘...
6,,./audio_RA2/Zofia/OtherVoices/RA2_xc0so01.mp3,,,Ensure the success of the Soviet invasion by c...,通过粉碎西雅图的盟军力量确保苏联入侵的成功，夺走他的石油储备并摧毁他在埃及的秘密基地来惩罚叛...
7,./audio_RA2/EvaLee/OtherVoices/RA2_xc1ev01.mp3,,,,The Soviet bases in Cuba must be eliminated. O...,必须消灭苏联在古巴的基地，一名指挥官将驻扎在哈瓦那附近，第二名指挥官将被安置在靠近盟军秘密空...
8,,,./audio_RA2/Yuri/OtherVoices/RA2_xc1in01.mp3,,The Allies have constructed defenses around th...,盟军在他们宝贵的白宫周围造满了防御工事，他们像受惊的孩子一样躲在科技背后，没有任何办法对抗我...
9,,./audio_RA2/Zofia/OtherVoices/RA2_xc1so01.mp3,,,The Allied have a stronghold in the city of Se...,盟军在西雅图拥有据点，他们在太空针塔附近建立了一个拥有重炮的基地，第二个基地位于市中心，防御...


In [76]:
#Persist the scraped table. encoding='utf-8-sig' is required here: without
# this option, the Chinese column will be unreadable
other_voice_csv = './audio_RA2/transcripts/other_voice.csv'
other_voice_df.to_csv(other_voice_csv, index=False, encoding='utf-8-sig')

# Transcript df clean up
* Some voices in soviet_war.csv don't have transcripts. Transcribe them
manually before proceeding.
* For this section, I manually iterated through each audio file that has no
transcript (all from SovietWar): I asked Google to transcribe it first, played
 the audio, and then determined the final transcript with some references from
 videos on Bilibili (mainly this person's: https://space.bilibili.com/404470374).
    * This should be the best result I can get.

In [143]:
csv_direct = 'audio_RA2/transcripts'

#Files written later by this notebook are saved into the same folder. Skip
# them so that re-running this cell still loads only the scraped tables
derived_csv_names = {'no_transcript_audios.csv', 'all.csv'}

#sorted() makes the load order (and therefore the resulting column order)
# independent of the order the file system returns the files in
csv_files = sorted(
    csv_file for csv_file in glob.glob(os.path.join(csv_direct, "*.csv"))
    if os.path.basename(csv_file) not in derived_csv_names)

loaded_dfs = []
total_len = 0
for csv_file in csv_files:
    temp_df = pd.read_csv(csv_file)
    print('File {} has {} rows'.format(csv_file, len(temp_df)))

    total_len += len(temp_df)
    loaded_dfs.append(temp_df)

#Concatenate once instead of re-copying the growing frame on every iteration.
# An empty folder yields an empty frame, as before
total_df = pd.concat(loaded_dfs, axis=0) if loaded_dfs else pd.DataFrame()

print('Total df should have {} rows'.format(total_len))
total_df

File audio_RA2/transcripts\allied_war.csv has 194 rows
File audio_RA2/transcripts\general.csv has 134 rows
File audio_RA2/transcripts\other_voice.csv has 18 rows
File audio_RA2/transcripts\soviet_war.csv has 239 rows
Total df should have 585 rows


Unnamed: 0,EvaLee_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn,Zofia_direct
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,The V3 is a powerful long-range artillery weap...,V3是威力强大的长程火箭。当心，指挥官，V3能在几分钟内将您的基地夷为平地，您甚至连还击的机...,
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,"The most powerful Soviet Tank ever built, the ...",天启坦克是苏联史上最强的坦克，本身就具备一只小型军队的火力，装备对空及对地的武装,
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,"The most powerful air unit ever created, the K...",基洛夫空艇是最强的空中武器，能投掷无限的铁制炸弹，攻击敌人,
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,"The ultimate in Soviet long-range bombardment,...",无畏级战舰是苏联究极的长程炮轰武器，远距攻击力无以伦比，但很容易在近距离被击沉。无畏级战舰所...,
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,The Soviets have taken Tesla Technology to the...,苏联磁能科技的最后阶段，身穿磁能动力装的士兵。这些士兵最擅长对付坦克，也能用他们额外的能源加...,
...,...,...,...,...,...,...
234,,,,,,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3
235,,,,,,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3
236,,,,,,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3
237,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,,,


## Reorder columns

In [144]:
#Show the current column order of the concatenated frame before reordering it
total_df.columns

Index(['EvaLee_direct', 'Yuri_direct', 'Other_direct', 'transcript_Eng',
       'transcript_Chn', 'Zofia_direct'],
      dtype='object')

In [145]:
#Put the four audio-path columns first and the two transcript columns last
ordered_columns = ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
                   'Other_direct', 'transcript_Eng', 'transcript_Chn']
total_df = total_df.reindex(columns=ordered_columns)
total_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,,The V3 is a powerful long-range artillery weap...,V3是威力强大的长程火箭。当心，指挥官，V3能在几分钟内将您的基地夷为平地，您甚至连还击的机...
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,,"The most powerful Soviet Tank ever built, the ...",天启坦克是苏联史上最强的坦克，本身就具备一只小型军队的火力，装备对空及对地的武装
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,,"The most powerful air unit ever created, the K...",基洛夫空艇是最强的空中武器，能投掷无限的铁制炸弹，攻击敌人
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,,"The ultimate in Soviet long-range bombardment,...",无畏级战舰是苏联究极的长程炮轰武器，远距攻击力无以伦比，但很容易在近距离被击沉。无畏级战舰所...
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,,The Soviets have taken Tesla Technology to the...,苏联磁能科技的最后阶段，身穿磁能动力装的士兵。这些士兵最擅长对付坦克，也能用他们额外的能源加...
...,...,...,...,...,...,...
234,,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,,
235,,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,,
236,,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,,
237,,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,,


## Clean up duplication rows

In [146]:
#Report the row count on both sides of removing fully identical rows
rows_before_cleaning = len(total_df)
print('Before cleaning, the df has {} rows'.format(rows_before_cleaning))

total_df = total_df.drop_duplicates()

rows_after_cleaning = len(total_df)
print('After cleaning, the df has {} rows'.format(rows_after_cleaning))

Before cleaning, the df has 585 rows
After cleaning, the df has 585 rows


## Transcribe voices that lack transcriptions
For this section, I manually iterated through each audio file that has no
transcript (all from SovietWar): I asked Google to transcribe it first, played
 the audio, and then determined the final transcript with some references from
 videos on Bilibili (mainly this person's:
 https://space.bilibili.com/404470374).
 * This should be the best I can get.

In [147]:
#Keep only the voices whose English transcript is missing and renumber the
# rows from 0
missing_eng_mask = total_df['transcript_Eng'].isna()
no_transcription = total_df[missing_eng_mask].reset_index(drop=True)

no_transcription

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,,./audio_RA2/Zofia/SovietWar/RA2_csofu39.mp3,,,,
1,,./audio_RA2/Zofia/SovietWar/RA2_csofu40.mp3,,,,
2,,./audio_RA2/Zofia/SovietWar/RA2_csofu41.mp3,,,,
3,,./audio_RA2/Zofia/SovietWar/RA2_csofu42.mp3,,,,
4,,./audio_RA2/Zofia/SovietWar/RA2_csofu43.mp3,,,,
...,...,...,...,...,...,...
92,,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,,
93,,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,,
94,,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,,
95,,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,,


In [148]:
#Names of the columns that hold at least one non-NaN value
wanted_col_list = [col for col in no_transcription.columns
                   if no_transcription[col].notna().any()]

#The two transcript columns are appended explicitly: they are still needed
# as (empty) slots to be filled in, whatever the test above decided
wanted_col_list += ['transcript_Eng', 'transcript_Chn']
wanted_col_list

['Zofia_direct',
 'Yuri_direct',
 'Other_direct',
 'transcript_Eng',
 'transcript_Chn']

In [None]:
#Keep only the wanted columns of no_transcription, in the listed order
no_transcription = no_transcription.filter(items=wanted_col_list,
                                           axis='columns')
no_transcription

### Select an audio

In [863]:
#Pick the row to work on; change the position to cycle through the audios
audio_dict = no_transcription.iloc[96].to_dict()

#Only the '*_direct' columns can hold an audio path, so a transcript string
# can never be mistaken for one. NaN cells are floats and are skipped by the
# isinstance check
direct_items = [(key, value) for key, value in audio_dict.items()
                if key.endswith('_direct') and isinstance(value, str)]

#Fail loudly instead of silently reusing audio_file_direct from a previous
# run of this cell when the selected row has no audio path
if not direct_items:
    raise ValueError('Row has no audio file path: {}'.format(audio_dict))

#A row is expected to hold a single path; if there were several, the last
# one is used, as before
non_nan_key, audio_file_direct = direct_items[-1]

print('{}: {}'.format(non_nan_key, audio_file_direct))

Yuri_direct: ./audio_RA2/Yuri/SovietWar/RA2_xs7yu02.mp3


### Determine a transcript

In [864]:
#Ask the speech-to-text helper for candidate transcripts of the selected
# audio; they are only a starting point for the manually decided transcript
transcribe_audio(audio_file_direct)

{'alternative': [{'transcript': 'I have grown weary of waiting for you to make up your mind wherever you make your base you will be unable to defend it from my forces or those of my Sophia and Allied lb',
   'confidence': 0.84470391},
  {'transcript': 'I have grown weary of waiting for you to make up your mind wherever you make your base you will be unable to defend it from my forces or those of my Sophia and Allied pounds'},
  {'transcript': 'I have grown weary of waiting for you to make up your mind wherever you make your base you will be unable to defend it from my forces or those of my Sofia and Allied lb'},
  {'transcript': 'I have grown weary of waiting for you to make up your mind wherever you make your base you will be unable to defend it from my forces or those of my services and Allied lb'},
  {'transcript': 'I have grown weary of waiting for you to make up your mind forever you make your base you will be unable to defend it from my forces or those of my Sophia and Allied lb'

### Listen to the audio

In [858]:
#Embed a player for the selected audio so the candidates can be checked by ear
ipd.Audio(audio_file_direct)

In [859]:
#Final transcript, decided after comparing the machine candidates with the
# audio
decided_transcript = (
    "I have grown weary of waiting for you to make up your mind. "
    "Wherever you make your base, you will be unable to defend it "
    "from my forces or those of my Soviet and Allied bombs."
)

#Each value is wrapped in a list on purpose: with scalars only, pandas raises
# "ValueError: If using all scalar values, you must pass an index"
entry_data = {non_nan_key: [audio_file_direct]}
entry_data['transcript_Eng'] = [decided_transcript]
new_entry = pd.DataFrame(entry_data)
new_entry

Unnamed: 0,Yuri_direct,transcript_Eng
0,./audio_RA2/Yuri/SovietWar/RA2_xs7yu02.mp3,I have grown weary of waiting for you to make ...


In [860]:
#CSV that accumulates the manually decided transcripts
no_transcription_save_direct = './audio_RA2/transcripts/no_transcript_audios.csv'

#Load everything saved so far. NOTE(review): this read needs the file to
# exist already - confirm how the very first entry was created
mother_df = pd.read_csv(no_transcription_save_direct)
mother_df

Unnamed: 0,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/Zofia/SovietWar/RA2_csofu39.mp3,,,Yuri's Boomer submarine combines the stuff of ...,
1,./audio_RA2/Zofia/SovietWar/RA2_csofu40.mp3,,,"Commander, this unit is the strongest Yuri has...",
2,./audio_RA2/Zofia/SovietWar/RA2_csofu41.mp3,,,Unfortunately the chaos drone was a promising ...,
3,./audio_RA2/Zofia/SovietWar/RA2_csofu42.mp3,,,Yuri is running his war mostly on stolen Sovie...,
4,./audio_RA2/Zofia/SovietWar/RA2_csofu43.mp3,,,"Beware of these floating discs, Comrade Genera...",
...,...,...,...,...,...
91,./audio_RA2/Zofia/SovietWar/RA2_xs7so01.mp3,,,"Commander, be advised that Yuri is combining h...",
92,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,"Look, Comrade General, at back door! Perhaps i...",
93,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,"Sir, if Yuri's genetic mutator is not taken, h...",
94,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,"Commander, this group of freedom fighters is w...",


In [861]:
#Append new_entry to the saved transcripts unless this audio was saved before.
#Every path column has to be inspected before deciding: stopping after the
# first column would let audios stored under Yuri_direct / Other_direct be
# appended a second time
direct_cols = ['Zofia_direct', 'Yuri_direct', 'Other_direct']
already_saved = any(audio_file_direct in mother_df[col].to_list()
                    for col in direct_cols)

if already_saved:
    print('The audio file {} is already in the mother df. The program is'
          ' terminating now to avoid overwriting.'.format
          (audio_file_direct))
else:
    #reset_index returns a new frame, so its result must be kept;
    # drop=True renumbers the rows without adding an 'index' column
    new_mother_df = pd.concat([mother_df, new_entry]).reset_index(drop=True)
    new_mother_df.to_csv(no_transcription_save_direct, index=False)
    display(new_mother_df)

Unnamed: 0,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/Zofia/SovietWar/RA2_csofu39.mp3,,,Yuri's Boomer submarine combines the stuff of ...,
1,./audio_RA2/Zofia/SovietWar/RA2_csofu40.mp3,,,"Commander, this unit is the strongest Yuri has...",
2,./audio_RA2/Zofia/SovietWar/RA2_csofu41.mp3,,,Unfortunately the chaos drone was a promising ...,
3,./audio_RA2/Zofia/SovietWar/RA2_csofu42.mp3,,,Yuri is running his war mostly on stolen Sovie...,
4,./audio_RA2/Zofia/SovietWar/RA2_csofu43.mp3,,,"Beware of these floating discs, Comrade Genera...",
...,...,...,...,...,...
92,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,"Look, Comrade General, at back door! Perhaps i...",
93,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,"Sir, if Yuri's genetic mutator is not taken, h...",
94,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,"Commander, this group of freedom fighters is w...",
95,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,"A good thought, Commander. I think my puppets ...",


## Final clean up

### Translate English transcripts into Chinese for audios missing transcripts previously

In [24]:
#CSV holding the manually decided English transcripts
no_transcription_save_direct = './audio_RA2/transcripts/no_transcript_audios.csv'

#Load it so the missing Chinese transcripts can be filled in
no_transcription_audio_df = pd.read_csv(no_transcription_save_direct)
no_transcription_audio_df

Unnamed: 0,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/Zofia/SovietWar/RA2_csofu39.mp3,,,Yuri's Boomer submarine combines the stuff of ...,
1,./audio_RA2/Zofia/SovietWar/RA2_csofu40.mp3,,,"Commander, this unit is the strongest Yuri has...",
2,./audio_RA2/Zofia/SovietWar/RA2_csofu41.mp3,,,Unfortunately the chaos drone was a promising ...,
3,./audio_RA2/Zofia/SovietWar/RA2_csofu42.mp3,,,Yuri is running his war mostly on stolen Sovie...,
4,./audio_RA2/Zofia/SovietWar/RA2_csofu43.mp3,,,"Beware of these floating discs, Comrade Genera...",
...,...,...,...,...,...
92,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,"Look, Comrade General, at back door! Perhaps i...",
93,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,"Sir, if Yuri's genetic mutator is not taken, h...",
94,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,"Commander, this group of freedom fighters is w...",
95,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,"A good thought, Commander. I think my puppets ...",


In [31]:
#Translate only while the Chinese column is still completely empty, so that
# translations that already exist are never overwritten
chn_all_missing = no_transcription_audio_df['transcript_Chn'].isna().all()

if not chn_all_missing:
    print("The column 'transcript_Chn' contains at least 1 non-NaN value. "
          "Check it up to avoid overwriting.")
else:
    #One Eng_to_Chn call per English transcript
    no_transcription_audio_df['transcript_Chn'] = (
        no_transcription_audio_df['transcript_Eng'].apply(Eng_to_Chn))
    no_transcription_audio_df

Unnamed: 0,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/Zofia/SovietWar/RA2_csofu39.mp3,,,Yuri's Boomer submarine combines the stuff of ...,尤里的 Boomer 潜艇结合了潜艇的特性和我们自己的无畏舰的弹道能力。我们的鱿鱼是唯一可以...
1,./audio_RA2/Zofia/SovietWar/RA2_csofu40.mp3,,,"Commander, this unit is the strongest Yuri has...",指挥官，这支部队是尤里拥有的最强部队。他是野蛮人，他可以毫不费力地砸碎东西
2,./audio_RA2/Zofia/SovietWar/RA2_csofu41.mp3,,,Unfortunately the chaos drone was a promising ...,不幸的是，混沌无人机是尤里偷走的很有前途的苏联设计。它会产生一团致幻气体，使所有受影响的人在...
3,./audio_RA2/Zofia/SovietWar/RA2_csofu42.mp3,,,Yuri is running his war mostly on stolen Sovie...,尤里的战争主要依靠苏联叛逃者维护的被盗苏联技术。他的工程师可能曾经为我们工作过，但他们现在加...
4,./audio_RA2/Zofia/SovietWar/RA2_csofu43.mp3,,,"Beware of these floating discs, Comrade Genera...",当心这些浮盘，将军同志。尤里将它们设计成能够从我们的特斯拉和核反应堆以及我们自己的防御建筑中...
...,...,...,...,...,...
92,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,"Look, Comrade General, at back door! Perhaps i...",看，将军同志，在后门！也许如果你派一些工程师过去夺取灵能信标，你可能会从尤里手中夺取苏联基地...
93,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,"Sir, if Yuri's genetic mutator is not taken, h...",长官，如果尤里的基因突变器不被拿走，他会把你的步兵变成他控制下的野兽。
94,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,"Commander, this group of freedom fighters is w...",指挥官，这群自由战士愿意加入我们，与尤里进行最后的决战。它们应该被证明是最有用的。
95,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,"A good thought, Commander. I think my puppets ...",好主意，指挥官。我认为我的傀儡们应该效仿你，打造属于他们自己的超级武器。嗯嗯嗯嗯。


In [32]:
#Write the translated table back to the same csv. encoding='utf-8-sig' is
# needed here, otherwise the 'transcript_Chn' column is not readable
translated_csv = './audio_RA2/transcripts/no_transcript_audios.csv'
no_transcription_audio_df.to_csv(translated_csv, index=False,
                                 encoding='utf-8-sig')

### Put in the missing transcripts

In [33]:
csv_direct = 'audio_RA2/transcripts'

#all.csv is the result of this section and is saved into this same folder;
# skip it so that re-running the cell does not merge the result into itself.
#sorted() fixes the load order: no_transcript_audios.csv is read before
# soviet_war.csv, which matters for the keep-first drop_duplicates below
csv_files = sorted(
    csv_file for csv_file in glob.glob(os.path.join(csv_direct, "*.csv"))
    if os.path.basename(csv_file) != 'all.csv')

loaded_dfs = []
all_len = 0
for csv_file in csv_files:
    temp_df = pd.read_csv(csv_file)
    print('File {} has {} rows'.format(csv_file, len(temp_df)))

    all_len += len(temp_df)
    loaded_dfs.append(temp_df)

#Concatenate once instead of re-copying the growing frame on every iteration.
# An empty folder yields an empty frame, as before
all_df = pd.concat(loaded_dfs, axis=0) if loaded_dfs else pd.DataFrame()

print('Total df should have {} rows'.format(all_len))
all_df

File audio_RA2/transcripts\allied_war.csv has 194 rows
File audio_RA2/transcripts\general.csv has 134 rows
File audio_RA2/transcripts\no_transcript_audios.csv has 97 rows
File audio_RA2/transcripts\other_voice.csv has 18 rows
File audio_RA2/transcripts\soviet_war.csv has 239 rows
Total df should have 682 rows


Unnamed: 0,EvaLee_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn,Zofia_direct
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,The V3 is a powerful long-range artillery weap...,V3是威力强大的长程火箭。当心，指挥官，V3能在几分钟内将您的基地夷为平地，您甚至连还击的机...,
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,"The most powerful Soviet Tank ever built, the ...",天启坦克是苏联史上最强的坦克，本身就具备一只小型军队的火力，装备对空及对地的武装,
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,"The most powerful air unit ever created, the K...",基洛夫空艇是最强的空中武器，能投掷无限的铁制炸弹，攻击敌人,
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,"The ultimate in Soviet long-range bombardment,...",无畏级战舰是苏联究极的长程炮轰武器，远距攻击力无以伦比，但很容易在近距离被击沉。无畏级战舰所...,
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,The Soviets have taken Tesla Technology to the...,苏联磁能科技的最后阶段，身穿磁能动力装的士兵。这些士兵最擅长对付坦克，也能用他们额外的能源加...,
...,...,...,...,...,...,...
234,,,,,,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3
235,,,,,,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3
236,,,,,,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3
237,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,,,


In [40]:
#Put the four audio-path columns first and the two transcript columns last
all_df_column_order = ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
                       'Other_direct', 'transcript_Eng', 'transcript_Chn']
all_df = all_df.reindex(columns=all_df_column_order)
all_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,,The V3 is a powerful long-range artillery weap...,V3是威力强大的长程火箭。当心，指挥官，V3能在几分钟内将您的基地夷为平地，您甚至连还击的机...
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,,"The most powerful Soviet Tank ever built, the ...",天启坦克是苏联史上最强的坦克，本身就具备一只小型军队的火力，装备对空及对地的武装
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,,"The most powerful air unit ever created, the K...",基洛夫空艇是最强的空中武器，能投掷无限的铁制炸弹，攻击敌人
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,,"The ultimate in Soviet long-range bombardment,...",无畏级战舰是苏联究极的长程炮轰武器，远距攻击力无以伦比，但很容易在近距离被击沉。无畏级战舰所...
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,,The Soviets have taken Tesla Technology to the...,苏联磁能科技的最后阶段，身穿磁能动力装的士兵。这些士兵最擅长对付坦克，也能用他们额外的能源加...
...,...,...,...,...,...,...
234,,./audio_RA2/Zofia/SovietWar/RA2_xs7so02.mp3,,,,
235,,./audio_RA2/Zofia/SovietWar/RA2_xs7so03.mp3,,,,
236,,./audio_RA2/Zofia/SovietWar/RA2_xs7so04.mp3,,,,
237,,,./audio_RA2/Yuri/SovietWar/RA2_xs7yu01.mp3,,,


In [41]:
#Sanity check: the number of rows in all_df that still lack an English
# transcript must equal the number of manually transcribed audios
rows_missing_transcript = len(all_df[all_df['transcript_Eng'].isna()])
rows_manually_transcribed = len(no_transcription_audio_df)

if rows_missing_transcript == rows_manually_transcribed:
    print('All things match up. You can proceed to the next step to clean up.')
else:
    print('Something is wrong. The all_df should have the same number of '
          'rows that miss a transcript with the number of rows in '
          'no_transcript_audios.csv. Now all_df has {} rows without '
          'transcripts but no_transcript_audios.csv has {} rows'
          .format(rows_missing_transcript, rows_manually_transcribed))

All things match up. You can proceed to the next step to clean up.


In [52]:
#Sanity check: rows sharing the same four audio-path values are the audios
# that were transcribed manually, so their count must equal the number of
# rows in no_transcript_audios.csv
audio_path_columns = ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
                      'Other_direct']
rows_without_duplicates = len(all_df.drop_duplicates(subset=audio_path_columns))
non_transcript_col_duplicate_row_num = len(all_df) - rows_without_duplicates
expected_duplicate_row_num = len(no_transcription_audio_df)

if non_transcript_col_duplicate_row_num == expected_duplicate_row_num:
    print('All things match up. You can now drop the duplicate rows defined '
          'by the 4 non-transcript columns.')
else:
    print('Something is wrong. The number of duplicate rows defined by the 4'
          ' non-transcript columns in all_df should be the same as the '
          'number of rows in no_transcript_audios.csv. Now all_df has '
          '{} duplicate rows defined by the 4 non-transcript columns but '
          'no_transcript_audios.csv has {} rows'
          .format(non_transcript_col_duplicate_row_num,
                  expected_duplicate_row_num))

All things match up. You can now drop the duplicate rows defined by the 4 non-transcript columns.


In [53]:
dedup_columns = ['EvaLee_direct', 'Zofia_direct', 'Yuri_direct',
                 'Other_direct']

rows_before_drop = len(all_df)
print('Before dropping the duplicate rows defined by the 4 non-transcript '
      'columns, all_df has {} rows.'.format(rows_before_drop))

#keep='first' (the pandas default) keeps the earliest row of each duplicate
# group. NOTE(review): the transcribed copy survives only if
# no_transcript_audios.csv was concatenated before soviet_war.csv - confirm
# the load order whenever files are added or renamed
all_df = all_df.drop_duplicates(subset=dedup_columns, keep='first')

print('no_transcript_audios.csv has {} rows.'.format(
    len(no_transcription_audio_df)))
print('After dropping the duplicate rows defined by the 4 non-transcript '
      'columns, all_df has {} rows.'.format(len(all_df)))


Before dropping the duplicate rows defined by the 4 non-transcript columns, all_df has 682 rows.
no_transcript_audios.csv has 97 rows.
After dropping the duplicate rows defined by the 4 non-transcript columns, all_df has 585 rows.


In [56]:
#Save the merged table. encoding='utf-8-sig' is needed here, otherwise the
# 'transcript_Chn' column is not readable
all_csv = './audio_RA2/transcripts/all.csv'
all_df.to_csv(all_csv, index=False, encoding='utf-8-sig')
#In this all.csv, in the transcript_Chn column, as long as a transcript is
# missing a trailing Chinese period punctuation (excluding other special
# punctuations), then that transcript was manually processed by me

### Relocate those 战前过场 audios to correct folder
All those 战前过场 audios are put in the "Other_direct" column since their file
names did not meet the batch filtering rules as I defined before. Some of
those voices are spoken by Eva Lee or Zofia so this step aims to move them
to the right speaker.

In [57]:
#Reload the merged table from disk; read_csv gives it a fresh 0..n-1 row
# index, which the relocation steps below use to address single rows
all_df = pd.read_csv('./audio_RA2/transcripts/all.csv')
all_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,,The V3 is a powerful long-range artillery weap...,V3是威力强大的长程火箭。当心，指挥官，V3能在几分钟内将您的基地夷为平地，您甚至连还击的机...
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,,"The most powerful Soviet Tank ever built, the ...",天启坦克是苏联史上最强的坦克，本身就具备一只小型军队的火力，装备对空及对地的武装
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,,"The most powerful air unit ever created, the K...",基洛夫空艇是最强的空中武器，能投掷无限的铁制炸弹，攻击敌人
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,,"The ultimate in Soviet long-range bombardment,...",无畏级战舰是苏联究极的长程炮轰武器，远距攻击力无以伦比，但很容易在近距离被击沉。无畏级战舰所...
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,,The Soviets have taken Tesla Technology to the...,苏联磁能科技的最后阶段，身穿磁能动力装的士兵。这些士兵最擅长对付坦克，也能用他们额外的能源加...
...,...,...,...,...,...,...
580,,./audio_RA2/Zofia/SovietWar/RA2_xs4so02.mp3,,,"Sir, if the enemy base near the airport is sti...",指挥官，如果机场附近的敌人基地还在，罗曼诺夫总理的生命就有危险。你不能再带他靠近了。
581,,./audio_RA2/Zofia/SovietWar/RA2_xs4so03.mp3,,,We must get Premier Romanov to the safety of t...,我们必须把罗曼诺夫总理送到机场的安全地带。
582,,./audio_RA2/Zofia/SovietWar/RA2_xs4so04.mp3,,,We believe that Premier Romanov is hiding out ...,我们相信罗曼诺夫总理正躲在城市的东部地区。我们必须在尤里之前找到他。
583,,./audio_RA2/Zofia/SovietWar/RA2_xs4so05.mp3,,,"Excellent, comrade general. The airport's air ...",很好，将军同志。机场的防空设施已经被解除了。我们现在可以安全疏散罗曼诺夫总理了。


In [79]:
#Rows whose audio is currently filed under "Other_direct"
has_other_direct = all_df['Other_direct'].notna()
non_nan_all_df = all_df[has_other_direct]
non_nan_all_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
29,,,,./audio_RA2/Others/AlliedWar/RA2_ma2ta01.mp3,All right! This beacon's neutralized! Send in ...,好极了！心灵信标失效了！把装甲车开进来吧
36,,,,./audio_RA2/Others/AlliedWar/RA2_ma5i101.mp3,"Thanks for rescuing us, Sir! Our plane was sho...",感谢您营救我们，长官！我们的飞机被苏联防空炮击坠。说不定这附近还有更多的生还者
37,,,,./audio_RA2/Others/AlliedWar/RA2_ma5i201.mp3,"Glad you found us, Sir! We're freezing out here!",很高兴您找到我们，长官！我们在这好冷！
38,,,,./audio_RA2/Others/AlliedWar/RA2_ma5i301.mp3,"Good to see you, Sir! What are our orders?",很高兴见到您，长官！我们的命令？
39,,,,./audio_RA2/Others/AlliedWar/RA2_ma5i401.mp3,"Be on the lookout for spies, Comrade. It seems...",同志，仔细盯着，不要让间谍混进来。今晚似乎有点太安静了
...,...,...,...,...,...,...
574,,,,./audio_RA2/Others/SovietWar/RA2_xs4ro01.mp3,Err...Yuri is calling me.,呃啊...尤里正在呼唤我（罗曼诺夫被心控时触发）
575,,,,./audio_RA2/Others/SovietWar/RA2_xs4ro02.mp3,These are plans for Iron Curtain. Use on our D...,这些是铁幕装置的建造计划。将铁幕施加在我们的自爆卡车上来制造终极破坏性武器。
576,,,,./audio_RA2/Others/SovietWar/RA2_xs4ro03.mp3,"I am in Yuri's base. Help me, comrade general.",我现在在尤里的基地里。救救我，将军同志。（罗曼诺夫被心控并且带回尤里基地内后触发）
577,,,,./audio_RA2/Others/SovietWar/RA2_xs4ro04.mp3,Ah...I am myself once again.,啊...我终于再次做回自己了。


In [103]:
def find_pre_game_voices(input_str):
    """Tell whether input_str looks like a pre-game (战前过场) audio path.

    A match is "RA2_", any three characters, "_p", then any two characters,
    found anywhere in input_str (e.g. "RA2_a01_p01").

    Returns True when the pattern is found, False otherwise.
    """
    return re.search(r"RA2_..._p..", input_str) is not None

#Use the re pattern to filter the strings in the "Other_direct" column
is_pre_game_voice = non_nan_all_df['Other_direct'].apply(find_pre_game_voices)

#reset_index() without drop keeps the old row labels in an "index" column,
# so the matching row of all.csv can still be addressed and changed later
pre_game_voices_df = non_nan_all_df[is_pre_game_voice].reset_index()
pre_game_voices_df

Unnamed: 0,index,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,129,,,,./audio_RA2/Others/AlliedWar/RA2_a01_p01.mp3,The Golden Gate Bridge entrance was destroyed ...,金门大桥入口在战争中被苏联摧毁。保护时间机器，直到您可以占领足够的发电厂以使其恢复供电。小心...
1,130,,,,./audio_RA2/Others/AlliedWar/RA2_a02_p01.mp3,You should establish your own base before atte...,在尝试定位尤里的部队之前，你应该建立自己的基地
2,131,,,,./audio_RA2/Others/AlliedWar/RA2_a03_p01.mp3,Taking out those nukes is priority one! We hav...,除掉那些核武器是第一要务！我们有自己的超级武器：爱因斯坦的天气控制机便是其中之一。对着那个设...
3,132,,,,./audio_RA2/Others/AlliedWar/RA2_a04_p01.mp3,You'll have to construct your own base. Specia...,你必须建立自己的基地。特工谭雅将协助您解救教授。他被关押在大金字塔附近的一个秘密基地。东边有...
4,133,,,,./audio_RA2/Others/AlliedWar/RA2_a05_p01.mp3,We'll launch our offensive near the Opera Hous...,我们将在歌剧院附近发动进攻。尤里的实验室就在这里，情报还说他在该地区的某个地方拥有一支潜艇舰队
5,134,,,,./audio_RA2/Others/AlliedWar/RA2_a05_p02.mp3,This horribly disfigured soldier is called a B...,这种可怕的毁容士兵被称为狂兽人，他们的力量足以把一个正常大小的坦克掀翻。小心他们，指挥官
6,135,,,,./audio_RA2/Others/AlliedWar/RA2_a06_p01.mp3,We'll setup our defenses by the Parliament bui...,我们将在议会大楼旁设置防御。我的钱说尤里会从东方发动进攻，他甚至可能会偷偷潜入泰晤士河。我们...
7,136,,,,./audio_RA2/Others/AlliedWar/RA2_a06_p04.mp3,We've tracked down the location of Yuri's Lond...,我们已经找到了尤里伦敦基地的位置。苏联军队将随时加入我们，利用他们！把那个基地烧成灰烬！
8,137,,,,./audio_RA2/Others/AlliedWar/RA2_a07_p01.mp3,You'll have to repair this abandoned Soviet ba...,你必须修复这个废弃的苏联基地。南极海岸线周围的悬崖使海军进攻成为不可能。一旦您启动并运行雷达...
9,383,,,,./audio_RA2/Others/SovietWar/RA2_s01_p01.mp3,Yuri's forces are seizing power generating pla...,尤里的部队正在夺取发电厂，以激活恶魔岛上的精神支配者。我们的部队在这里。要想成功夺取美国时间...


In [138]:
#Queue of pending relocations; note that re-running this cell empties it
change_list = []
#Speaker folder that replaces 'Others' in the audio path when a change_dict
# is built; set it to the speaker heard in the audio being reviewed
speaker = 'Zofia'

In [165]:
#I cycled this cell with iteration through pre_game_voices_df to determine
# who speaks that voice
#Change the position passed to iloc to pick the next audio
audio_dict = pre_game_voices_df.iloc[16].to_dict()
audio_direct = audio_dict['Other_direct']
#Embed a player so the speaker can be identified by ear
ipd.Audio(audio_direct)

In [162]:
#Describe one relocation: the row label in all_df, the current path, and the
# path with the 'Others' folder swapped for the chosen speaker's folder
current_other_direct = audio_dict['Other_direct']
change_dict = {
    'index': audio_dict['index'],
    'original_direct': current_other_direct,
    'new_direct': current_other_direct.replace('Others', speaker),
}
change_dict

{'index': 390,
 'original_direct': './audio_RA2/Others/SovietWar/RA2_s07_p01.mp3',
 'new_direct': './audio_RA2/Zofia/SovietWar/RA2_s07_p01.mp3'}

In [163]:
#Queue the relocation only once: this cell is cycled by hand, and a second
# identical entry would make the later os.rename fail because the file has
# already been moved by the first one
if change_dict not in change_list:
    change_list.append(change_dict)
change_list

[{'index': 129,
  'original_direct': './audio_RA2/Others/AlliedWar/RA2_a01_p01.mp3',
  'new_direct': './audio_RA2/EvaLee/AlliedWar/RA2_a01_p01.mp3'},
 {'index': 130,
  'original_direct': './audio_RA2/Others/AlliedWar/RA2_a02_p01.mp3',
  'new_direct': './audio_RA2/EvaLee/AlliedWar/RA2_a02_p01.mp3'},
 {'index': 132,
  'original_direct': './audio_RA2/Others/AlliedWar/RA2_a04_p01.mp3',
  'new_direct': './audio_RA2/EvaLee/AlliedWar/RA2_a04_p01.mp3'},
 {'index': 133,
  'original_direct': './audio_RA2/Others/AlliedWar/RA2_a05_p01.mp3',
  'new_direct': './audio_RA2/EvaLee/AlliedWar/RA2_a05_p01.mp3'},
 {'index': 134,
  'original_direct': './audio_RA2/Others/AlliedWar/RA2_a05_p02.mp3',
  'new_direct': './audio_RA2/EvaLee/AlliedWar/RA2_a05_p02.mp3'},
 {'index': 137,
  'original_direct': './audio_RA2/Others/AlliedWar/RA2_a07_p01.mp3',
  'new_direct': './audio_RA2/EvaLee/AlliedWar/RA2_a07_p01.mp3'},
 {'index': 383,
  'original_direct': './audio_RA2/Others/SovietWar/RA2_s01_p01.mp3',
  'new_direct':

In [189]:
#Speaker folder name found in new_direct -> column that receives the path
# (checked in this order)
speaker_columns = {'EvaLee': 'EvaLee_direct', 'Zofia': 'Zofia_direct'}

for change_dict in change_list:
    index = change_dict['index']
    original_direct = change_dict['original_direct']
    new_direct = change_dict['new_direct']

    target_col = next((col for name, col in speaker_columns.items()
                       if name in new_direct), None)
    if target_col is None:
        print('Neither EvaLee or Zofia is in the new_direct. Something is '
              'wrong.')
        #Leave both the file and all_df untouched: moving the file without
        # updating the table would leave a path that no longer exists
        continue

    #Change the file directory in file system first, so all_df is updated
    # only for files that were really moved
    target_folder = os.path.dirname(new_direct)
    if target_folder:
        os.makedirs(target_folder, exist_ok=True)  #os.rename needs the folder
    os.rename(original_direct, new_direct)
    print('File has been moved to new_direct {}'.format(new_direct))

    #Change the file directory in all_df
    all_df.at[index, target_col] = new_direct
    all_df.at[index, 'Other_direct'] = np.nan

File has been moved to new_direct ./audio_RA2/EvaLee/AlliedWar/RA2_a01_p01.mp3
File has been moved to new_direct ./audio_RA2/EvaLee/AlliedWar/RA2_a02_p01.mp3
File has been moved to new_direct ./audio_RA2/EvaLee/AlliedWar/RA2_a04_p01.mp3
File has been moved to new_direct ./audio_RA2/EvaLee/AlliedWar/RA2_a05_p01.mp3
File has been moved to new_direct ./audio_RA2/EvaLee/AlliedWar/RA2_a05_p02.mp3
File has been moved to new_direct ./audio_RA2/EvaLee/AlliedWar/RA2_a07_p01.mp3
File has been moved to new_direct ./audio_RA2/Zofia/SovietWar/RA2_s01_p01.mp3
File has been moved to new_direct ./audio_RA2/Zofia/SovietWar/RA2_s01_p03.mp3
File has been moved to new_direct ./audio_RA2/Zofia/SovietWar/RA2_s02_p01.mp3
File has been moved to new_direct ./audio_RA2/Zofia/SovietWar/RA2_s03_p01.mp3
File has been moved to new_direct ./audio_RA2/Zofia/SovietWar/RA2_s04_p01.mp3
File has been moved to new_direct ./audio_RA2/Zofia/SovietWar/RA2_s05_p01.mp3
File has been moved to new_direct ./audio_RA2/Zofia/Soviet

In [195]:
#Final check on all_df before saving: among the rows that still have a path in
# Other_direct, keep the ones selected by applying find_pre_game_voices to
# that path, then display them
has_other_direct = all_df['Other_direct'].notnull()
non_nan_all_df = all_df.loc[has_other_direct]
is_pre_game_voice = non_nan_all_df['Other_direct'].apply(find_pre_game_voices)
#reset_index() keeps the original all_df row labels as an 'index' column
pre_game_voices_df = non_nan_all_df[is_pre_game_voice].reset_index()
pre_game_voices_df

Unnamed: 0,index,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,131,,,,./audio_RA2/Others/AlliedWar/RA2_a03_p01.mp3,Taking out those nukes is priority one! We hav...,除掉那些核武器是第一要务！我们有自己的超级武器：爱因斯坦的天气控制机便是其中之一。对着那个设...
1,135,,,,./audio_RA2/Others/AlliedWar/RA2_a06_p01.mp3,We'll setup our defenses by the Parliament bui...,我们将在议会大楼旁设置防御。我的钱说尤里会从东方发动进攻，他甚至可能会偷偷潜入泰晤士河。我们...
2,136,,,,./audio_RA2/Others/AlliedWar/RA2_a06_p04.mp3,We've tracked down the location of Yuri's Lond...,我们已经找到了尤里伦敦基地的位置。苏联军队将随时加入我们，利用他们！把那个基地烧成灰烬！


In [198]:
#Manual listening check: this cell was re-run once per row of
# pre_game_voices_df (changing row_position by hand each time) to make sure
# none of the remaining pre-game cutscene (战前过场) audio files left in
# Other_direct is spoken by Eva Lee or Zofia
row_position = 2
audio_dict = pre_game_voices_df.iloc[row_position].to_dict()
audio_direct = audio_dict['Other_direct']
ipd.Audio(audio_direct)

In [199]:
#I also checked manually in EvaLee and Zofia folders to ensure those 战前过场
# audios are really spoken by them

### Save the final all_df

In [200]:
#Display all_df one last time before it is saved. The total length of this df
# should not change, because the steps above only move paths between the
# *_direct columns of existing rows
all_df

Unnamed: 0,EvaLee_direct,Zofia_direct,Yuri_direct,Other_direct,transcript_Eng,transcript_Chn
0,./audio_RA2/EvaLee/AlliedWar/RA2_cevau06.mp3,,,,The V3 is a powerful long-range artillery weap...,V3是威力强大的长程火箭。当心，指挥官，V3能在几分钟内将您的基地夷为平地，您甚至连还击的机...
1,./audio_RA2/EvaLee/AlliedWar/RA2_cevau07.mp3,,,,"The most powerful Soviet Tank ever built, the ...",天启坦克是苏联史上最强的坦克，本身就具备一只小型军队的火力，装备对空及对地的武装
2,./audio_RA2/EvaLee/AlliedWar/RA2_cevau08.mp3,,,,"The most powerful air unit ever created, the K...",基洛夫空艇是最强的空中武器，能投掷无限的铁制炸弹，攻击敌人
3,./audio_RA2/EvaLee/AlliedWar/RA2_cevau13.mp3,,,,"The ultimate in Soviet long-range bombardment,...",无畏级战舰是苏联究极的长程炮轰武器，远距攻击力无以伦比，但很容易在近距离被击沉。无畏级战舰所...
4,./audio_RA2/EvaLee/AlliedWar/RA2_cevau15.mp3,,,,The Soviets have taken Tesla Technology to the...,苏联磁能科技的最后阶段，身穿磁能动力装的士兵。这些士兵最擅长对付坦克，也能用他们额外的能源加...
...,...,...,...,...,...,...
580,,./audio_RA2/Zofia/SovietWar/RA2_xs4so02.mp3,,,"Sir, if the enemy base near the airport is sti...",指挥官，如果机场附近的敌人基地还在，罗曼诺夫总理的生命就有危险。你不能再带他靠近了。
581,,./audio_RA2/Zofia/SovietWar/RA2_xs4so03.mp3,,,We must get Premier Romanov to the safety of t...,我们必须把罗曼诺夫总理送到机场的安全地带。
582,,./audio_RA2/Zofia/SovietWar/RA2_xs4so04.mp3,,,We believe that Premier Romanov is hiding out ...,我们相信罗曼诺夫总理正躲在城市的东部地区。我们必须在尤里之前找到他。
583,,./audio_RA2/Zofia/SovietWar/RA2_xs4so05.mp3,,,"Excellent, comrade general. The airport's air ...",很好，将军同志。机场的防空设施已经被解除了。我们现在可以安全疏散罗曼诺夫总理了。


In [201]:
#Write the final all_df to disk. index=False leaves the row index out of the
# file; 'utf-8-sig' is used because otherwise the 'transcript_Chn' column is
# not readable
all_csv_direct = './audio_RA2/transcripts/all.csv'
all_df.to_csv(all_csv_direct, index=False, encoding='utf-8-sig')