In [2]:
import requests
import string
import time
from bs4 import BeautifulSoup
import re
import pprint 
pp = pprint.PrettyPrinter(indent=4)



def get_page(url):
    response = requests.get(url)
    page = response.text
    return page

'''
Parse film list
'''

def get_film_list(url):
    page = get_page(url)
    films_list = {}
    
    list_soup=BeautifulSoup(page,"html5lib")
    tables=list_soup.find_all("table")
    
    tables=list_soup.find_all("table")
    rows=[row for row in tables[2].find_all('tr')]
    rows.pop(0) #remove header
    for i,r in enumerate(rows):
        items = r.find_all('td')
        link=items[-1].find('a')['href']
        title= items[-1].find('a').text
        films_list[title] = "%s%s"%(base_url,link)
    return films_list

'''
Parse film details
'''
def get_film_details(url,title):
    page = get_page(url)
    film = {}
    film['title'] = title

    page_soup = BeautifulSoup(page,"html5lib")
    cast_table = get_table_values(page_soup,'The Players')
    sub_genre_table = get_table_values(page_soup,'Genres')
    
    tables = page_soup.find_all("table")
    parse_image(film,page_soup)
    parse_details(film,tables[5])
    parse_finances2(film,page_soup)
    if cast_table is not None:
        parse_cast(film,cast_table,page_soup)
    if sub_genre_table is not None:
        parse_sub_genre(film,sub_genre_table)
    return film
    
def clean_string(raw_string):
    printable = set(string.printable)
    clean=[s for s in raw_string if s in printable]
    return ''.join(clean)

'''Parse poster image'''
def parse_image(film,page_soup):
    poster_img = page_soup.find_all("img")[6]['src']
    film['image'] = poster_img
    
'''Parse details table'''
def parse_details(film,table):
    rows=[row for row in table.find_all('tr')]
    rows.pop(0) # remove header
    prop_map = {"Distributor":"distributor", "Release Date": "release_date","Genre":"genre","Runtime":"duration","MPAA Rating":"rating","Production Budget":"budget"}
    for row in rows:
        # each row will have a few cells: (td~ cell tag)
        items=row.find_all('td')
        [parse_details_row(item,film,prop_map) for item in items]

'''Parse dtails table rows'''
def parse_details_row(row,film,prop_map):
    data = row.text.split(':')
    text = data[1]
    prop = prop_map[data[0]]
    film[prop] = text
    
'''Parse finance table'''
def parse_finances(film,table):
    rows=[row for row in table.find_all('tr')]
    prop_map = {0:"gross_domestic",1:"gross_foreign"}
    for i,row in enumerate(rows):
    # prevent out of index error on irregular cell
        if i < len(rows)-2:
            items=row.find_all('td')
            film[prop_map[i]] = clean_string(items[1].text)
            
def parse_finances2(film,page_soup):
    film['gross_domestic'] = get_movie_value(page_soup,'Domestic')
    film['gross_foreign'] = get_movie_value(page_soup,'Foreign')
    film['gross_worldwide'] = get_movie_value(page_soup,'Worldwide')
    
    


'''Parse cast table'''
def parse_cast(film,table,page_soup):
    rows=[row for row in table.find_all('tr')]
    #print(rows)

    film['director'] = rows[0].text.split(':')[1]
    cast = [row.text for row in rows[1].find_all('a')]
    cast.pop(0)
    film['cast'] = ", ".join(cast) # comma seperated list

'''Parse genre table'''
def parse_sub_genre(film,table):
    rows=[row for row in table.find_all('tr')]
    rows.pop(0) # remove header
    sub_genres = []
    for row in rows:
        g = row.find('a').text
        sub_genres.append(g)
    sg = clean_string(', '.join(sub_genres)) # comma seperated list
    film['sub_genres'] = sg

    
base_url = 'http://www.boxofficemojo.com/'
url = 'http://www.boxofficemojo.com/yearly/'

def get_movie_value(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    # this works for most of the values
    next_sibling = obj.findNextSibling()
    if next_sibling:
        return next_sibling.text 
    else:
        return None

def get_table_values(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    obj = soup.find('div',text=re.compile(field_name))
    if obj == None:
        return None
    table = obj.findNext('table')
    return table


'''Get film list by metric'''
film_list = get_film_list(url)

url = 'http://www.boxofficemojo.com//movies/?id=marvel2017b.htm'
films = []

for film,url in film_list.items():
    print(film)
    title = film
    film = get_film_details(url,title)
    films.append(film)
    #time.sleep(1)

pp.pprint(films)



    


Black Panther
Star Wars: The Last Jedi
Rogue One
Star Wars: The Force Awakens
American Sniper
Catching Fire
The Avengers
Harry Potter / Deathly Hallows (P2)
Toy Story 3
Avatar
The Dark Knight
Spider-Man 3
Dead Man's Chest
Revenge of the Sith
Shrek 2
Return of the King
Spider-Man
Harry Potter / Sorcerer's Stone
The Grinch
The Phantom Menace
Saving Private Ryan
Titanic
Independence Day
Toy Story
Forrest Gump
Jurassic Park
Aladdin
Terminator 2
Home Alone
Batman
Rain Man
Three Men and a Baby
Top Gun
Back to the Future
Beverly Hills Cop
Return of the Jedi
E.T.
Raiders / Lost Ark
The Empire Strikes Back
[   {   'budget': ' N/A',
        'cast': "Chadwick Boseman, Lupita Nyong'o, Michael B. Jordan, Angela "
                'Bassett, Martin Freeman, Forest Whitaker, Andy Serkis',
        'director': 'Ryan Coogler',
        'distributor': ' Buena Vista',
        'duration': ' 2 hrs. 20 min.',
        'genre': ' Action / Adventure',
        'gross_domestic': None,
        'gross_foreign': None,


In [33]:
films[7:]

[{'budget': ' N/A',
  'cast': 'Steve Kloves',
  'director': 'David Yates',
  'distributor': ' Warner Bros.',
  'duration': ' 2 hrs. 10 min.',
  'genre': ' Fantasy',
  'gross_domestic': '$381,011,219',
  'gross_foreign': None,
  'gross_worldwide': None,
  'image': 'https://ia.media-imdb.com/images/M/MV5BMjIyZGU4YzUtNDkzYi00ZDRhLTljYzctYTMxMDQ4M2E0Y2YxXkEyXkFqcGdeQXVyNTIzOTk5ODM@._V1_UY222_CR0,0,150,222_AL.jpg',
  'rating': ' PG-13',
  'release_date': ' July 15, 2011',
  'sub_genres': "3D, Dragon - Supporting Role, Family - Children's Book Adaptation, Fantasy - Live Action, IMAX (Feature-length), Young-Adult Book Adaptations",
  'title': 'Harry Potter / Deathly Hallows (P2)'},
 {'budget': ' $200 million',
  'cast': 'Michael Arndt, John Lasseter, Andrew Stanton, Lee Unkrich',
  'director': 'Lee Unkrich',
  'distributor': ' Buena Vista',
  'duration': ' 1 hrs. 43 min.',
  'genre': ' Animation',
  'gross_domestic': '$415,004,880',
  'gross_foreign': None,
  'gross_worldwide': None,
  'image

In [102]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException  
from selenium.webdriver.chrome.options import Options
import time
import os
import zipfile
from os import listdir
from os.path import isfile, join
from urllib.request import urlretrieve


chromedriver = "/Users/justinblinder/dev/drivers/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

'''instantiate a chrome options object so you can set the size and headless preference'''
chrome_options = Options()
chrome_options.add_argument("--headless")

''' add missing support for chrome "send_command"  to selenium webdriver (https://stackoverflow.com/a/47366981/190155)'''
#driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
#params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': '/Users/justinblinder/Downloads/srt/'}}
#command_result = driver.execute("send_command", params)

#driver = webdriver.Chrome(chromedriver,chrome_options=chrome_options)
driver = webdriver.Chrome(chromedriver)
driver.get("https://subscene.com/")

# set wait time
driver.implicitly_wait(4) 

In [None]:
'''Collect a download url for each film'''
# Chrome headless mode does not allow dowloading files
# Cannot curl file url because it's a cloudflare redirect

download_path = '/Users/justinblinder/Downloads/'
download_links = {}

'''Main subtitle search page'''
def search_film(film_title,year):
    form_input = driver.find_element_by_name("q")
    form_input.send_keys(film_title)
    '''Search for film'''
    form_input.send_keys(Keys.RETURN)
    get_filtered_film(film_title,year)

def parse_subtitle_section(film_title):
    try:
        return driver.find_element_by_xpath('//h2[text()="Exact"]/following-sibling::ul/descendant::div[@class="title"]/descendant::a')
    except NoSuchElementException:
        print("FILM WAS NOT EXACT",film_title)    
    try:
        print("FILM WAS CLOSE",film_title)
        
        link = driver.find_element_by_xpath('//h2[text()="Close"]/following-sibling::ul/descendant::div[@class="title"]/descendant::a')
    except NoSuchElementException:
        print("none")
    try:
        print("FILM WAS POPULAR",film_title)    
        return driver.find_element_by_xpath('//h2[text()="Popular"]/following-sibling::ul/descendant::div[@class="title"]/descendant::a')
    except NoSuchElementException:
        print("none")

def parse_subtitle_section_list(link,film_title,year):
    desired_link = None
    for i,li in enumerate(link):
        a_tag =  li.find_element_by_xpath('div[@class="title"]/descendant::a')
        li_title = a_tag.text.lower().replace(' ','-')
        b_missing = False
        if film_title not in li_title: 
            if year in li_title:
                desired_link = a_tag
                break
    print(desired_link)
    print(desired_link.get_attribute('href'))
    return desired_link

def parse_subtitle_section_test(film_title,year):
    try:
        return driver.find_element_by_xpath('//h2[text()="Exact"]/following-sibling::ul/descendant::div[@class="title"]/descendant::a')
    except NoSuchElementException:
        print("FILM WAS NOT EXACT",film_title)    
    
    try:
        print("FILM WAS CLOSE",film_title)
        film_list = driver.find_elements_by_xpath('//h2[text()="Popular"]/following-sibling::ul/li')
        link = parse_subtitle_section_list(film_list,film_title,year)
        return link
    except NoSuchElementException:
        print("none")

    try:
        print("FILM WAS POPULAR",film_title)    
        film_list = driver.find_element_by_xpath('//h2[text()="Popular"]/following-sibling::ul/descendant::div[@class="title"]/descendant::a')
        link = parse_subtitle_section_list(film_list,film_title,year)
        return link
    except NoSuchElementException:
        print("none")
    
def get_filtered_film(film_title,year):
    '''Similar film list page'''
    link = parse_subtitle_section_test(film_title,year)
    link.click()
    search_subtitles(film_title)


def check_exists_by_xpath(el,xpath):
    try:
        el.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True

'''Individual film subtitle list page'''

"""Find all engish subtitles that have a quality flag and aren't for hearing impaired"""
def search_subtitles(film_title):
    english_selector = '//a[contains(@href,"english")]'
    subtitle_links = driver.find_elements_by_xpath(english_selector)
    valid_links = []
    for link in subtitle_links:
        is_quality = check_exists_by_xpath(link,'span[contains(@class,"positive-icon")]')
        hearing_impaired_el = link.find_element_by_xpath('../following-sibling::td/following-sibling::td')     
        is_hearing_impaired = False if hearing_impaired_el.get_attribute('class') == "a40" else True
        if is_quality is True and is_hearing_impaired is False:
            valid_links.append(link)
    valid_links[0].click()
    get_download_link(film_title)
    
'''Find and click download link'''
def get_download_link(film_title):
    download_button = driver.find_element_by_id('downloadButton')
    # TODO: if link is zip then click, else go back to revious page and get a new one
    download_links[film_title] = download_button.get_attribute('href')
    download_button.click()
    unzip_file(film_title)

''' Locally unzip file'''
def unzip_file(film_title):
    time.sleep(2)
    for file in os.listdir(download_path):
        if file.endswith(".zip"):
            filename = os.path.join(download_path, file)
            unzip(filename,film_title)
    driver.get("https://subscene.com/")
    
''' Rename unzipped srt file / zip file and mov to seperate folders'''
def unzip(filename,film_title):
    # unzip file to srt dir
    zip_ref = zipfile.ZipFile(filename, 'r')
    zip_ref.extractall(download_path)
    zip_ref.close()

    # move zip to archive folder
    os.rename(filename, download_path+"archive/"+film_title.replace(' ','-').replace(':','')+'.zip')

    # get unzipped srt file
    srtfile = None
    for file in os.listdir(download_path):
        if file.endswith(".srt"):
            srtfile = os.path.join(download_path, file)
            os.rename(srtfile , download_path + "srt/"+film_title.replace(' ','-').replace(':','')+'.srt')



#films = ["almost famous", 'black panther','Star Wars: The Last Jedi','Rogue One']
for film in films[16:]:
    print(film)
    film_title = film['title']
    year = film['release_date'].split(',')[1].replace(' ', '')
    print(year)
    film['year'] = year #TODO: DO This ELSEWHERE
    search_film(film_title.replace('/',':'),film['year'])



{'title': 'Spider-Man', 'image': 'https://ia.media-imdb.com/images/M/MV5BZDEyN2NhMjgtMjdhNi00MmNlLWE5YTgtZGE4MzNjMTRlMGEwXkEyXkFqcGdeQXVyNDUyOTg3Njg@._V1_UY222_CR0,0,150,222_AL.jpg', 'distributor': ' Sony / Columbia', 'release_date': ' May 3, 2002', 'genre': ' Action / Adventure', 'duration': ' 2 hrs. 1 min.', 'rating': ' PG-13', 'budget': ' $139 million', 'gross_domestic': '$403,706,375', 'gross_foreign': None, 'gross_worldwide': None, 'director': 'Sam Raimi', 'cast': 'David Koepp', 'sub_genres': 'Comic Book Adaptation, Superhero, Superhero - Origin', 'year': '2002'}
2002
FILM WAS NOT EXACT Spider-Man
FILM WAS CLOSE Spider-Man
<selenium.webdriver.remote.webelement.WebElement (session="8319d6fcc4b3ebba0e51dcdb5b841dcb", element="0.7461602894815549-17")>
https://subscene.com/subtitles/spider-man-spiderman
{'title': "Harry Potter / Sorcerer's Stone", 'image': 'https://ia.media-imdb.com/images/M/MV5BNjQ3NWNlNmQtMTE5ZS00MDdmLTlkZjUtZTBlM2UxMGFiMTU3XkEyXkFqcGdeQXVyNjUwNzk3NDc@._V1_UY222_CR0

In [10]:
link = driver.find_element_by_xpath('//h2[text()="Close"]/following-sibling::ul/descendant::div[@class="title"]/descendant::a')
link

<selenium.webdriver.remote.webelement.WebElement (session="7f151f456adef4f7b08696ed8e20e0d8", element="0.8731757835243388-1")>

In [96]:
link = driver.find_elements_by_xpath('//h2[text()="Popular"]/following-sibling::ul/li')
desired_link = None
for i,li in enumerate(link):
    xval =  li.find_element_by_xpath('div[@class="title"]/descendant::a')
    
    t = xval.text.lower().replace(' ','-')
    b_missing = False
    if "spider-man" not in t: 
        b_missing
    if '2002' in t:
        print(t,i)
        desired_link = xval
        break
print(xval.get_attribute('href'))
    


spider-man-(spiderman)-(2002) 4
https://subscene.com/subtitles/spider-man-spiderman
