In [106]:
from bs4 import BeautifulSoup as bsoup
import requests
import re

# Transcript pages on the Transcripts Fandom wiki for the two films
# (Infinity War and Endgame) that the cells below download.
inf_url = "https://transcripts.fandom.com/wiki/Avengers:_Infinity_War"
end_url = "https://transcripts.fandom.com/wiki/Avengers:_Endgame"

# Gathering Data from Website

In [107]:
"""Given URL from transcripts fandom site, return the script as a string"""
def get_script(url):
    out_site = requests.get(url)
    out_soup = bsoup(out_site.content, 'html.parser')
    out_script = out_soup.find(id = 'mw-content-text').text
    return out_script

In [108]:
# Download both transcripts; each call performs one HTTP request.
inf_script = get_script(inf_url)
end_script = get_script(end_url)

# Cleaning the script to only get dialogue

In [109]:
"""Given a string of the script, remove actions and other misc text that isn't dialogue
returns a string of only the dialogue"""
def clean_script_text(script):
    #Removes all the [action] in the script
    actless_text = re.sub('(\[.+\])', '', script)
    return actless_text

In [110]:
# Regular expressions for dialogue lines and bracketed actions. They are not
# referenced by the other visible cells. Written as raw strings so the regex
# escapes (\s, \[) are not treated as invalid Python string escapes; the
# pattern text itself is unchanged.
# NOTE(review): actions_pattern is greedy and matches from the first "[" to
# the last "]" on a line -- confirm that is intended before reusing it.
dialogue_pattern = r"(\s*[a-zA-Z\s]+)+[:](\s*[a-zA-Z\s]+[,.?!]+)+"
actions_pattern = r'(\[.+\])'

In [112]:
# Remove the bracketed [action] text from each downloaded transcript.
inf_dialogue_text = clean_script_text(inf_script)
end_dialogue_text = clean_script_text(end_script)


In [175]:
"""Given a multi language dialogue line as a string, find and return only English text as string"""
def get_english_text(line):
    eng_pattern = "(\(English[:])(.+)(\))"
    out = re.findall(eng_pattern, line)
    return (out[0][1])

In [193]:
"""Given a list of lines with multiple colons, return the correctly formatted line as as tring"""
def get_mult_col_text(line):
    patched_up = ":".join(line[1:])
    
    #check if line is in different language
    if "(English:" in patched_up:
        translate = get_english_text(patched_up)
        return translate.strip("(").strip(")")
    else:
        return patched_up.strip("(").strip(")")
    

In [224]:
def check_if_dialogue(line):
    """Tell whether a colon-split transcript line carries spoken text.

    Args:
        line: List of the pieces of one transcript line after splitting on
            ":"; index 1 is the text that followed the first colon.

    Returns:
        True when a second piece exists and contains something other than
        whitespace, False otherwise.
    """
    # Whitespace-only text (e.g. a heading written as "Cast: ") is not
    # dialogue, and a list without a second piece cannot be dialogue either.
    return len(line) > 1 and line[1].strip() != ""

# Writing the dialogue out to pipe-delimited txt files

In [261]:
def write_out_to_csv(dialogue_text, file):
    """Write the dialogue lines of a cleaned transcript as pipe-delimited rows.

    A "character|line" header row is written first. Each newline-separated
    line of the text is then split on ":". Lines without a colon are
    skipped; lines with one colon are written when check_if_dialogue accepts
    them; lines with several colons have their dialogue rebuilt by
    get_mult_col_text.

    Args:
        dialogue_text: Transcript text as a single string.
        file: Writable text file object. It is written to but not closed
            here.
    """
    file.write("character|line\n")
    for raw_line in dialogue_text.split("\n"):
        line = raw_line.strip('\t').split(":")

        if len(line) == 2:
            # Exactly one colon: character name and dialogue.
            if check_if_dialogue(line):
                file.write("{}|{}\n".format(line[0], line[1]))
        elif len(line) > 2:
            # The dialogue itself contains a colon. get_mult_col_text skips
            # line[0] on its own, so it must receive the whole list; handing
            # it line[1:] dropped the first segment of the dialogue.
            file.write("{}|{}\n".format(line[0], get_mult_col_text(line)))
    


In [262]:
# Write each film's dialogue to its own pipe-delimited text file.
# The with-blocks close the files even if writing raises, and the explicit
# UTF-8 encoding keeps non-ASCII transcript characters from failing under a
# platform default encoding.
with open("infinity_war_script.txt", "w", encoding="utf-8") as inf_file:
    write_out_to_csv(inf_dialogue_text, inf_file)

with open("endgame_script.txt", "w", encoding="utf-8") as end_file:
    write_out_to_csv(end_dialogue_text, end_file)