In [None]:
import pandas as pd
from tqdm import tqdm
import time
from splinter import Browser
from bs4 import BeautifulSoup as bs

In [None]:
#*   Prevents truncation of long cell values when DataFrames are printed/displayed.
#*   `None` means "no width limit". The legacy sentinel -1 was deprecated in pandas 1.0 and
#*   is rejected by newer pandas releases, so `None` is used instead.
pd.set_option('display.max_colwidth', None)

In [None]:
titleBasics = pd.read_csv("/Users/nicolespaar/Documents/title.basics.tsv", sep='\t', header=0, low_memory=False)

#   Build a lower-cased copy of every title so user input can be matched case-insensitively.
#   `.str.lower()` is vectorized and yields NaN for rows whose 'primaryTitle' is missing or
#   not a string; those rows are counted and then filled with the "n/a" placeholder. This
#   replaces a row-by-row `iterrows()` loop that relied on a bare `except:` for the same job.
lowered = titleBasics['primaryTitle'].str.lower()
count = int(lowered.isna().sum())
lowercased_titles = lowered.fillna("n/a")
print(f"{count} movies had no 'primaryTitle'")
titleBasics['lowercasedTitle'] = lowercased_titles

titleBasics.head(3)

In [None]:
#   Load the title -> crew table (tab-separated, first line is the header). Its `directors`
#   and `writers` columns are split on commas later in the notebook.
titleCrew = pd.read_csv(
    "/Users/nicolespaar/Documents/title.crew.tsv",
    sep='\t',
    header=0,
)
titleCrew.head(3)

In [None]:
#   Load the people table (tab-separated, first line is the header), indexed by `nconst` so
#   that names can be looked up with `nameBasics.loc[<nconst>]`.
nameBasics = pd.read_csv(
    "/Users/nicolespaar/Documents/name.basics.tsv",
    sep='\t',
    header=0,
    index_col='nconst',
)
nameBasics.head(3)

In [None]:
#   Load the title -> principal cast/crew table (tab-separated, first line is the header),
#   indexed by `nconst`; rows are later filtered by their `tconst` column.
titlePrincipals = pd.read_csv(
    "/Users/nicolespaar/Documents/title.principals.tsv",
    sep='\t',
    header=0,
    index_col='nconst',
)
titlePrincipals.head(3)

In [None]:
#*   User input of which movie to munge attributes for and then predict against the model.
#*   The input is lower-cased so that any user casing matches the lower-cased title column,
#*   and surrounding whitespace is stripped so a stray leading/trailing space does not
#*   defeat the exact-match lookup. (`input()` already returns a str, so no cast is needed.)
movieToSearch = input("Enter a movie title: ").strip().lower()  #"Toy Story 4"

In [None]:
def bugFixer(affectedList):
    '''
    bugFixer() receives a python list of strings and returns a new list in which any item
    that starts with a space character has that single leading space removed. Items that do
    not start with a space (including empty strings) are copied over unchanged.

    Only ONE leading space is removed per item, matching the historical behaviour; an item
    with two leading spaces still keeps one.

    :param affectedList: iterable of str items to clean.
    :return: new list with the cleaned items, in the same order.
    :raises TypeError: if an item does not support slicing (e.g. a float NaN).
    '''
    # `item[:1]` (a slice) is used instead of `item[0]` so that an empty string yields ""
    # rather than raising IndexError.
    return [item[1:] if item[:1] == " " else item for item in affectedList]

def lengths(x):
    '''
    lengths() is a generator that walks a (possibly nested) python list depth-first and
    yields the length of every list it encounters, starting with `x` itself. Anything that
    is not a `list` yields nothing.
    '''
    if not isinstance(x, list):
        return
    yield len(x)
    for child in x:
        for size in lengths(child):
            yield size

def iAmount(anArray):
    '''
    iAmount() returns the largest list length found anywhere inside `anArray` (the list
    itself and every nested list), as reported by lengths(). Because max() is applied to
    an empty sequence when `anArray` is not a list, that case raises ValueError.
    '''
    sizes = lengths(anArray)
    return max(sizes)

def init_splinter():
    '''
    init_splinter() starts a Chrome session through splinter, using the chromedriver binary
    at /usr/local/bin/chromedriver, with a visible window (headless=False) in incognito
    mode, and returns the splinter browser object. No proxy is configured here.
    '''
    return Browser(
        'chrome',
        executable_path='/usr/local/bin/chromedriver',
        headless=False,
        incognito=True,
    )

def simmer_soup(browser=None):
    '''
    simmer_soup() receives a browser object and returns the current page's raw html together
    with its parsed form ("Soup").

    :param browser: object exposing the current page source as `.html` (e.g. the splinter
        browser returned by init_splinter()). When omitted, the module-level `browser`
        variable is used, which keeps old zero-argument calls working.
    :return: tuple `(html, soup)` - the html string and the BeautifulSoup parse of it.
    :raises NameError: if no browser is passed and no module-level `browser` exists.
    '''
    if browser is None:
        # Backward-compatible fallback: earlier versions took no argument and read a
        # global named `browser`. The parameter shadows that name, so look it up explicitly
        # and keep raising NameError when it is absent.
        try:
            browser = globals()['browser']
        except KeyError:
            raise NameError("name 'browser' is not defined") from None
    html = browser.html
    soup = bs(html, 'html.parser')
    return html, soup

#!---- ---- ---- ---- If $movieToSearch exists in db ---- ---- ---- ----!#
def tconstMunge():
    '''
    tconstMunge() returns the tconst (title id) of the first row of the module-level
    `titleBasicsRow` frame as a string.

    The value is read directly from the cell. The previous approach rendered the column
    with `to_string(index=False)` and dropped the first character, which only works when
    pandas emits a leading space (version dependent), and which returns garbage when the
    frame is empty or holds more than one row.

    :return: the tconst string of the first matching row.
    :raises IndexError: if `titleBasicsRow` has no rows (no matching movie).
    '''
    return str(titleBasicsRow.tconst.iloc[0])

def genresMunge():
    '''
    genresMunge() returns the genres of the first row of the module-level `titleBasicsRow`
    frame as a list of strings.

    The `genres` cell is read directly and split on commas, and each piece is stripped of
    surrounding whitespace. Reading the cell (instead of rendering the column with
    `to_string`) avoids depending on how pandas pads its text output.

    :return: list of genre strings, in the order they appear in the cell.
    :raises IndexError: if `titleBasicsRow` has no rows (no matching movie).
    '''
    genres_raw = str(titleBasicsRow.genres.iloc[0]).split(",")
    return [genre.strip() for genre in genres_raw]

def entitiesMunge():
    '''
    entitiesMunge() collects the people attached to the movie identified by the module-level
    `tconst`, reading the module-level `titleCrew` and `titlePrincipals` frames.

    :return: a 6-tuple of lists
        (cast_actors, cast_actors_jobs, cast_other, cast_other_jobs,
         crew_directors, crew_writers)
        - cast_actors / cast_other: index labels (nconsts) of the `titlePrincipals` rows
          whose `category` is / is not 'actor' or 'actress';
        - cast_actors_jobs / cast_other_jobs: the matching `job` values, same order;
        - crew_directors / crew_writers: ids split out of the comma-separated `directors`
          and `writers` cells of the first matching `titleCrew` row (empty lists when the
          title has no crew row).
    '''
    def splitIds(cell):
        # A missing value (NaN, i.e. not a str) or the IMDb "\N" placeholder means
        # "no ids"; passing those on would later be looked up as if they were nconsts.
        if not isinstance(cell, str):
            return []
        pieces = (piece.strip() for piece in cell.split(","))
        return [piece for piece in pieces if piece and piece != "\\N"]

    titleCrewRow = titleCrew[titleCrew['tconst'] == tconst]
    if titleCrewRow.empty:
        crew_directors, crew_writers = [], []
    else:
        # Read the cells directly instead of rendering them with to_string(), whose
        # padding is pandas-version dependent.
        crew_directors = splitIds(titleCrewRow.directors.iloc[0])
        crew_writers = splitIds(titleCrewRow.writers.iloc[0])

    titlePrincipalsRow = titlePrincipals[titlePrincipals['tconst'] == tconst]

    # Split the principals into performers and everyone else with one boolean mask rather
    # than iterating row by row.
    isActor = titlePrincipalsRow['category'].isin(['actor', 'actress'])
    actorRows = titlePrincipalsRow[isActor]
    otherRows = titlePrincipalsRow[~isActor]

    cast_actors = actorRows.index.tolist()
    cast_actors_jobs = actorRows['job'].tolist()
    cast_other = otherRows.index.tolist()
    cast_other_jobs = otherRows['job'].tolist()

    return cast_actors, cast_actors_jobs, cast_other, cast_other_jobs, crew_directors, crew_writers
    
def entitiesMunge_Parser(cast_actors, cast_other, crew_directors, crew_writers):
    '''
    entitiesMunge_Parser() translates four lists of nconsts into four lists of names by
    looking each nconst up in the module-level `nameBasics` frame and taking its
    `primaryName`.

    :return: tuple `(actor_names, cast_names, director_names, writer_names)`, each list in
        the same order as the corresponding input list.
    :raises KeyError: if an nconst is not present in the `nameBasics` index.
    '''
    def namesFor(nconsts):
        # One lookup per id, preserving input order.
        return [nameBasics.loc[nconst].primaryName for nconst in nconsts]

    actor_names = namesFor(cast_actors)
    cast_names = namesFor(cast_other)
    director_names = namesFor(crew_directors)
    writer_names = namesFor(crew_writers)
    return actor_names, cast_names, director_names, writer_names
    
#!---- ---- ---- ---- If $movieToSearch does NOT exist in db ---- ---- ---- ----!#






In [None]:
#!---- ---- ---- ---- $movieToSearch regardless of existence in db ---- ---- ---- ----!#
def companiesGet(tconst):
    '''
    companiesGet() opens the IMDb title page for `tconst` in a fresh browser session and
    scrapes the link texts of one 'txt-block' div, returning them as a list.

    :param tconst: IMDb title id (e.g. "tt0114709"); appended to the title base URL.
    :return: list of link texts from the selected block, minus the last link.
    :raises IndexError: if the page has fewer than nine 'txt-block' divs.
    '''
    imdbURL_base = "https://www.imdb.com/title/"
    imdbURL = imdbURL_base + str(tconst)

    browser = init_splinter()
    try:
        browser.visit(imdbURL)
        # Parse the page source of THIS browser directly. simmer_soup() reads a global
        # `browser`, which is not the local one created above.
        soup = bs(browser.html, 'html.parser')
    finally:
        # Always close the browser, even if the visit or the parse fails.
        browser.quit()

    attribute_parents = soup.find_all('div', class_='txt-block')
    # NOTE(review): assumes the production-companies block is the 9th 'txt-block' from the
    # end of the page - this depends on IMDb's page layout; confirm it still holds.
    prod_companies_parent = attribute_parents[-9]
    links = prod_companies_parent.find_all('a')

    # The first character of each link text is dropped (presumably a leading space).
    prod_companies_raw = [link.text[1:] for link in links]

    # The final link is discarded (presumably a "See more"-style link, not a company).
    prod_companies = prod_companies_raw[0:-1]
    return prod_companies

In [None]:
#   Ad-hoc check of companiesGet(). `tconst` is only assigned by the movie lookup cell
#   further down, so on a fresh kernel (Restart & Run All) an unguarded call here fails with
#   a NameError. Skip the scrape until `tconst` exists.
if 'tconst' in globals():
    prod_companies = companiesGet(tconst)
    print(prod_companies)
else:
    print("[!] `tconst` is not defined yet; run the movie lookup cell first.")

In [None]:
#   Try to match the user's input movie title to an existing row in the IMDB dataset. If it is
#   found, the row is isolated and its tconst, genres, cast/crew ids and names, and production
#   companies are extracted and printed.
titleBasicsArray = titleBasics[titleBasics['lowercasedTitle'] == movieToSearch]
titleBasicsRow = titleBasicsArray[titleBasicsArray['titleType'] == 'movie']
if titleBasicsRow.empty:
    #   No row of type 'movie' carries this title: that is the real "not found" case, so it
    #   is checked explicitly instead of being inferred from whatever exception happens later.
    #todo:::   this branch should all be a function
    print("[!] Movie not Found, Searching Interwebs..")
else:
    try:
        #todo:::   this `try` should all be a function
        tconst = tconstMunge()
        genres = genresMunge()
        cast_actors, cast_actors_jobs, cast_other, cast_other_jobs, crew_directors, crew_writers = entitiesMunge()
        cast_actors, cast_other, crew_directors, crew_writers = bugFixer(cast_actors), bugFixer(cast_other), bugFixer(crew_directors), bugFixer(crew_writers)
        actor_names, cast_names, director_names, writer_names = entitiesMunge_Parser(cast_actors, cast_other, crew_directors, crew_writers)
        prod_companies = companiesGet(tconst)
        print("Movie Found!\n")
        print("title: ", movieToSearch, "\ntconst: ", tconst, "\ngenres: ",
                 genres, "\nactors' nconsts: ", cast_actors, "\nactors' names", actor_names, 
              "\nactors' jobs: ", cast_actors_jobs, "\nnon-actors' nconsts: ", cast_other,
              "\nnon-actors' names: ", cast_names, "\nnon-actors' jobs: ", cast_other_jobs, 
              "\ndirectors' nsconsts: ", crew_directors, "\ndirectors' names: ", director_names,
              "\nwriters' nconsts: ", crew_writers, "\nwriters' names: ", writer_names,
              "\nproduction companies: ", prod_companies)
    except Exception as error:
        #   `except Exception` (not a bare `except:`) lets KeyboardInterrupt / SystemExit
        #   through, and the failure reason is reported instead of being silently swallowed.
        #   The original fallback message is kept so the downstream flow is unchanged.
        #todo:::   this `except` should all be a function
        print(f"[!] Lookup failed ({type(error).__name__}: {error})")
        print("[!] Movie not Found, Searching Interwebs..")



