In [36]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd

In [43]:
# Crawl the script site chapter by chapter, collecting every <tr> element
# together with the page it came from.
# NOTE: `re` in this notebook is the `requests` alias, not the stdlib regex module.

# site that contains the game's Japanese script, the official Ted Woolsey
# translation, and the unofficial kwhazit translation
base_url = 'https://kwhazit.ucoz.net/trans/ff6/'

# link to the next page to scrape - also functions as unofficial chapter names
chapter_page = '01intro.html'

# (chapter_page, <tr> tag) pairs; one "line" of the script may sometimes
# contain multiple lines of text
rows = []

# pages already downloaded, so a cyclic "next" link cannot loop forever
visited_pages = set()

# the crawl ends when a page's "next" link points back to the index
while chapter_page != 'index.html':

    if chapter_page in visited_pages:
        raise RuntimeError(f"'next' links loop back to already scraped page {chapter_page!r}")
    visited_pages.add(chapter_page)

    # download the page; the timeout keeps a stalled connection from hanging
    # the cell, and raise_for_status stops error pages from being parsed as chapters
    page = re.get(base_url + chapter_page, timeout=30)
    page.raise_for_status()

    # parse the page
    soup = BeautifulSoup(page.content, 'html.parser')

    # get all possible table rows that may contain script text
    chapter_rows = soup.find_all('tr')

    # add each row of text, along with its chapter id, to the list
    rows.extend((chapter_page, row) for row in chapter_rows)

    # locate the link to the next chapter's page; find() returns None when
    # the element is absent, so check before dereferencing
    next_div = soup.find('div', class_='next')
    next_link = next_div.find('a') if next_div is not None else None
    if next_link is None or not next_link.get('href'):
        raise RuntimeError(f"no 'next' link found on page {chapter_page!r}")

    # prepare to go to the next chapter's page
    chapter_page = next_link['href']

In [62]:
# Split the scraped table rows into three parallel text columns, one per
# version of the script, and remember which chapter each line came from.
jp, tw, kw = [], [], []  # japanese text / ted woolsey's translation / kwhazit's translation
chapters = []            # chapter id for each kept row

for chapter_file, table_row in rows:
    cells = table_row.find_all('td')

    # script lines are the rows with exactly 4 <td> cells; anything else is
    # descriptive text and is skipped
    if len(cells) == 4:
        _, jp_cell, tw_cell, kw_cell = cells

        jp.append(jp_cell.text)
        tw.append(tw_cell.text)
        kw.append(kw_cell.text)

        # chapter id = page name minus its last 5 characters (the '.html' in '01intro.html')
        chapters.append(chapter_file[:-5])

# assemble everything into a DataFrame indexed by chapter id
df = pd.DataFrame({'jp': jp, 'tw': tw, 'kw': kw}, index=chapters)

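The text should be considered raw, as it still contains unwanted control characters such as `\r\n` and `\t` (visible in the preview below) and needs to be cleaned before further use.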

In [64]:
# show the assembled DataFrame (jp / tw / kw columns, indexed by chapter id)
df

Unnamed: 0,jp,tw,kw
01intro,魔大戦\r\n すべてを焼きつくした、その戦いが\r\n 終わった時、世界から\r\n「...,The Great Magic War:\r\n\tWhen that conflict t...,"Long ago, the War of the Magi\r\n reduced th..."
01intro,そして１０００年…\r\n 鉄、火薬、蒸気機関\r\n 人々は機械の力を使い、世界を...,"Then 1000 years passed...\r\n\tIron, gunpowder...","1000 years have passed... Iron,\r\n gunpowder..."
01intro,今またここに、伝説となった\r\n 「魔法」の力を復活させ\r\n その強大な武力によっ...,"In this time when the power of ""magic""\r\n\tha...",But there are some who\r\n would enslave the ...
01intro,\r\n\t人はまた\r\n そのあやまちを\r\n くり返そうとしているのか…,\r\n\tAre people about to\r\n\trepeat their mi...,Can it be that those\r\n in power are on the ...
01intro,「あの都市か？\r\n\t「魔大戦で氷づけになった１０００年前\r\n\t　の幻獣か…,: That's the city?\r\n\t: A Genjuu trapped in ...,VICKS: There's the town...\r\n\tWEDGE: Hard to...
...,...,...,...
51end,まだ……がんばれる？,Can still... keep going?,Just a little longer...
51end,{ティナ}は！？,{Tina}!?,Where's {TERRA}!?
51end,{ｾﾘｽ}「{ティナ}！,{Celes}: {Tina}!,{CELES}: {TERRA}!
51end,{ﾃｨﾅ}「ありがとう\r\n\t　{セッツァー}……,"{Tina}: Thank you, {Setzer}...","{TERRA}: Thank you, \r\n\t{SETZER}!"


In [19]:
# Save the script to disk. The chapter ids live only in the DataFrame index,
# so write the index as a named 'chapter' column instead of discarding it.
df.to_csv('scripts.csv', index=True, index_label='chapter')

In [18]:
from laserembeddings import Laser

# create the LASER encoder (constructed with no arguments)
laser = Laser()

# embed every line of the Ted Woolsey translation column
# NOTE(review): the language code passed here is 'eng'; confirm that
# laserembeddings accepts this form rather than a two-letter code.
woolsey_lines = df['tw']
embeddings = laser.embed_sentences(woolsey_lines, lang='eng')