In [1]:
import pandas as pd
import time, requests, json, random, os
from splinter import Browser
from bs4 import BeautifulSoup as bs
from subprocess import call

In [2]:
#*   Prohibits truncatation of data in prints.
pd.set_option('display.max_colwidth', -1)

In [3]:
# All Academy Awards winners .csv loaded as a DataFrame 
# (SOURCE: https://datahub.io/rufuspollock/oscars-nominees-and-winners)
allWinAndNom = pd.read_csv("/Users/nicolespaar/Desktop/oscars_winners.csv")

def getWinnersDF_Film(allWinAndNOm):
    allWinners_year = []
    allWinners_category = []
    allWinners_entity = []
    
    # Rows are iterrated and those that have a True bool in the `winner` attribute are
    # appended to the above declared holding arrays.
    for row in allWinAndNom.iterrows():
        if row[1]['winner'] == True:
            allWinners_year.append(row[1]['year'])
            allWinners_category.append(row[1]['category'])
            allWinners_entity.append(row[1]['entity'])
    
    # A new $winners_data dict is created from the above iterrator's resulting arrays. Then a
    # DataFrame is created from the dict.
    winners_data = {'Year': allWinners_year, 'Category': allWinners_category, 'Entity': allWinners_entity}
    winnersDF = pd.DataFrame(data=winners_data)
    
    # The name of the current `Best Picture` Academy Award was called several things
    # throughout the years. Each variation is extracted as a new DataFrame.
    winnersDF_films_A = winnersDF[winnersDF['Category'] == 'BEST PICTURE'].copy()
    winnersDF_films_B = winnersDF[winnersDF['Category'] == 'OUTSTANDING PRODUCTION'].copy()
    winnersDF_films_C = winnersDF[winnersDF['Category'] == 'OUTSTANDING MOTION PICTURE'].copy()
    winnersDF_films_D = winnersDF[winnersDF['Category'] == 'BEST MOTION PICTURE'].copy()
    winnersDF_films_E = winnersDF[winnersDF['Category'] == 'OUTSTANDING PICTURE'].copy()
    
    winners_years = []
    winners_entities = []
    
    # The first two years have incorrect `Entity` values so they are manually altered.
    for row in winnersDF_films_E.iterrows():
        winners_years.append(row[1]['Year'])
    winners_entities.append("wings")
    winners_entities.append("the broadway melody")

    # Each of the other DataFrames holding best picture winners by varying category name are
    # iterrated and added to the above winners arrays. The entity is also changed to lowercase.
    for row in winnersDF_films_B.iterrows():
        winners_years.append(row[1]['Year'])
        winners_entities.append(row[1]['Entity'].lower())
    for row in winnersDF_films_C.iterrows():
        winners_years.append(row[1]['Year'])
        winners_entities.append(row[1]['Entity'].lower())
    for row in winnersDF_films_D.iterrows():
        winners_years.append(row[1]['Year'])
        winners_entities.append(row[1]['Entity'].lower())
    for row in winnersDF_films_A.iterrows():
        winners_years.append(row[1]['Year'])
        winners_entities.append(row[1]['Entity'].lower())
    winners_film_data = {'Year': winners_years, 'Entity': winners_entities}

    # A DataFrame of the cleaned and lowercased winners is created.
    winnersDF_films = pd.DataFrame(data=winners_film_data)
    
    return winnersDF_films, winners_entities, winners_years

In [4]:
winnersDF_Film, entities, years = getWinnersDF_Film(allWinAndNom)
winnersDF_Film.head(3)

Unnamed: 0,Year,Entity
0,1927,wings
1,1928,the broadway melody
2,1929,all quiet on the western front


In [5]:
#! --- --- --- This creates a csv containing all of the titleBasics' movie titles lowercased
#! --- --- --- and as a new column. It takes a while so I'm leaving it commented out for now.

#print("titleBasics..")
#titleBasics = pd.read_csv("/Users/nicolespaar/Desktop/oscarDataAnalysis/Spaar/resources/data/title.basics.tsv", sep='\t', header=0, low_memory=False)
#lowercased_titles = []
#count = 0
#for row in titleBasics.iterrows():
#    try:
#        lower = row[1].primaryTitle.lower()
#        lowercased_titles.append(lower)
#    except:
#        count += 1
#        lowercased_titles.append("n/a")
#print(f"{count} movies had no 'primaryTitle'")
#titleBasics['lowercasedTitle'] = lowercased_titles
#titleBasics.to_csv("titleBasics_spaar.csv")

In [6]:
# IMdB title information .csv + my own munging of `title.lower()` is loaded in as $titleBasics.
# (SOURCE: https://www.imdb.com/interfaces/)
titleBasics = pd.read_csv("/Users/nicolespaar/Desktop/titleBasics_spaar.csv", index_col='tconst', low_memory=False)
titleBasics.head(3)

Unnamed: 0_level_0,Unnamed: 0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,lowercasedTitle
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt0000001,0,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",carmencita
tt0000002,1,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",le clown et ses chiens
tt0000003,2,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",pauvre pierrot


In [8]:
#!---- ---- ---- ---- From the Past ---- ---- ---- ----!#

def init_splinter():
    '''
    init_splinter() initializes a selenium chrome webdriver using a Tor proxy and returns it
    as a splinter browser wrapped object.
    '''
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False, incognito=True)
    return browser



def simmer_soup():
    '''
    simmer_soup() receives the browser object and returns the current page's parsed html as "Soup".
    '''
    html = browser.html
    soup = bs(html, 'html.parser')
    return html, soup

#!---- ---- ---- ---- From Scratch ---- ---- ---- ----!#

bechdel_base = "http://bechdeltest.com/api/v1/getMoviesByTitle?title="
bechdel_bools = []

def callAPI(movieToSearch):
    '''
    callAPI() calls the above url (Bechdel test API) with a movie title argument. The
    response is returned as a JSON in a variable ($conn). If the length of the response
    is less than or equal to 2, the response is a False (Either hasn't been Bechdel rated,
    or is rated a False for passing) and a 0 is returned. Else a True 1 is returned.
    '''
    conn = str(requests.get(bechdel_base + movieToSearch).json())
    if len(conn) <= 2:
        return 0
    else:
        return 1
    
    
     
def monthToNum(MonthName):
    '''
    monthtoNum() receives a Month name string and returns its corresponding int.
    '''
    return{
        'January': 1,
        'February': 2,
        'March': 3,
        'April': 4,
        'May': 5,
        'June': 6,
        'July': 7,
        'August': 8,
        'September': 9,
        'October': 10,
        'November': 11,
        'December': 12
    }[MonthName]



def getRateLimit():
    '''
    getRateLimit() returns a random number between 2.66 and 6.33, rounded to 2 decimals 
    places. The result is used as a rate-limiter for our scraping calls as to slow down
    requests and not be rude to IMdB.
    '''
    return round(random.uniform(2.66, 6.33), 2)

In [9]:
scraped = []

for row in winnersDF_Film.iterrows():
    
    # A random amount of time to throttle our requests is generated for each row iterration.
    throttle = getRateLimit()
    movieToSearch = row[1].Entity
    movieYear = row[1].Year
    
    # The downloaded IMdB database >> DataFrame is searched for the current iterration's movie.
    titleBasicsArray = titleBasics[titleBasics['lowercasedTitle'] == movieToSearch]
    titleBasicsRow_RAW = titleBasicsArray[titleBasicsArray['titleType'] == 'movie']
    titleBasicsRow = titleBasicsRow_RAW[titleBasicsRow_RAW['startYear'] == str(movieYear)]
    tconst = titleBasicsRow.index.to_list()
    tconst_str = ''.join(tconst)
    tconst_str = tconst_str[0:9]
    
    # If the length of the result of the search's tconst string is NOT 0 (empty/no result
    # found), then the associated details as well as the result of its Bechdel rating API
    # caller are appended as a list. That list is then appended into the $scraped array.
    if len(tconst_str) != 0:
        print(f"APPENDING info of: '{row[1].Entity}' ({row[1].Year}) with a throttle of: {throttle}s")
        temp = []
        temp.append(movieToSearch)
        temp.append(tconst_str) 
        temp.append(movieYear)
        time.sleep(float(throttle / 2))
        bechdelBool = callAPI(movieToSearch)
        temp.append(bechdelBool)
        scraped.append(temp)  
        
    # If the length of the result of the search's tconst string IS 0 (empty/no results found),
    # then the needed details are scraped from IMdB's website.
    else:
        print(f"   [!] SCRAPING info of: '{row[1].Entity}' ({row[1].Year}) with a throttle of: {throttle}s")
    
        temp = []
        time.sleep(throttle)
        
        # The below URL can be used as an API of sorts by appending the $movieToSearch to it,
        # we must only replace the whitespace with a plus sign.
        base_url = "https://www.imdb.com/search/title/?title=" 
        movieToSearch = movieToSearch.replace(" ", "+")
        full_url = (base_url + movieToSearch)
        browser = init_splinter()
        browser.driver.minimize_window()
        browser.visit(full_url)
        
        # The results page is parsed with Beautifulsoup4 and a `soup` object returned. Then
        # the plus signs are replaced back with whitespace in our $movieToSearch. The `soup`
        # object is parsed and iterrated for the correct result that corresponds to the movie
        # we are after. Its <href> characters [7:16] are spliced as its `tconst` and appended
        # along with the rest of its associated information to a list. 
        html, soup = simmer_soup()
        movieToSearch = movieToSearch.replace("+", " ")
        results = soup.find_all('h3', class_='lister-item-header')
        for r in results:
            rYear = soup.select('h3 > span')[1].get_text(strip=True)
            rYear = rYear.replace("(", "")
            rYear = rYear.replace(")", "")
            if (r.a.text.lower() == movieToSearch) and movieToSearch not in temp:
                link_end = r.a['href']
                tconst_str = link_end[7:16]
                temp.append(movieToSearch)
                temp.append(tconst_str) 
                temp.append(movieYear)   
        browser.quit()
        
        #? The Bechdel rating for our new movie is acquired via the Bechdel callAPI() function,
        #? and appended to the rest of the information we scraped's list. Once complete, that
        #? list is then appended into the $scraped array.
        bechdelBool = callAPI(movieToSearch)
        temp.append(bechdelBool)
        scraped.append(temp)

print("\n\nHead of scrapes:")
print(scraped[0:5])
print("\nTail of scrapes:")
print(scraped[-6:-1])

APPENDING info of: 'wings' (1927) with a throttle of: 3.62s
   [!] SCRAPING info of: 'the broadway melody' (1928) with a throttle of: 3.54s
   [!] SCRAPING info of: 'all quiet on the western front' (1929) with a throttle of: 4.18s
   [!] SCRAPING info of: 'cimarron' (1930) with a throttle of: 4.44s
   [!] SCRAPING info of: 'grand hotel' (1931) with a throttle of: 4.87s
   [!] SCRAPING info of: 'cavalcade' (1932) with a throttle of: 5.54s
APPENDING info of: 'it happened one night' (1934) with a throttle of: 4.79s
APPENDING info of: 'mutiny on the bounty' (1935) with a throttle of: 5.78s
APPENDING info of: 'the great ziegfeld' (1936) with a throttle of: 4.07s
APPENDING info of: 'the life of emile zola' (1937) with a throttle of: 4.29s
APPENDING info of: 'you can't take it with you' (1938) with a throttle of: 3.72s
APPENDING info of: 'gone with the wind' (1939) with a throttle of: 2.79s
APPENDING info of: 'rebecca' (1940) with a throttle of: 2.73s
APPENDING info of: 'how green was my vall

In [10]:
# Check to see if TGF 2 && Cimarron are b0rked per usual:
print(scraped[45:48], "\n")
print(scraped[2:5])

[['the sting', 'tt0070735', 1973, 0], [0], ["one flew over the cuckoo's nest", 'tt0073486', 1975, 0]] 

[['all quiet on the western front', 'tt0020629', 1929, 1], ['cimarron', 'tt0053715', 1930, 1], ['grand hotel', 'tt7671068', 1931, 1]]


In [11]:
# Insert its data manually since it is.
scraped[46] = ['the godfather part ii', 'tt0071562', 1974, 0]
scraped[3] = ['cimarron', 'tt0021746', 1931, 1]

In [12]:
# Double check that it looks okay after manual insertion.
print(scraped[45:48], "\n")
print(scraped[2:5])

[['the sting', 'tt0070735', 1973, 0], ['the godfather part ii', 'tt0071562', 1974, 0], ["one flew over the cuckoo's nest", 'tt0073486', 1975, 0]] 

[['all quiet on the western front', 'tt0020629', 1929, 1], ['cimarron', 'tt0021746', 1931, 1], ['grand hotel', 'tt7671068', 1931, 1]]


In [13]:
Model_finalTitles = []
Model_finalTconsts = []
Model_finalYear = []
Model_finalBechdel = []

# The scraped data and Bechdel scores are appended to arrays, then they're made into a dict 
# that is used to create a DataFrame.
for winner in scraped:
    Model_finalTitles.append(winner[0])
    Model_finalTconsts.append(winner[1])
    Model_finalYear.append(winner[2])
    Model_finalBechdel.append(winner[3])

model_data = {'TCONST': Model_finalTconsts, 'TITLE': Model_finalTitles, 'YEAR': Model_finalYear, 'BECHDEL': Model_finalBechdel}    
modelDF = pd.DataFrame(data=model_data)
print("HEAD of [modelDF] results from `tconst` collection/scraping:\n")
print(modelDF.head())
print("\n\n\nTAIL of [modelDF] results from `tconst` collection/scraping:\n")
print(modelDF.tail())

HEAD of [modelDF] results from `tconst` collection/scraping:

      TCONST                           TITLE  YEAR  BECHDEL
0  tt0018578  wings                           1927  1      
1  tt0019729  the broadway melody             1928  0      
2  tt0020629  all quiet on the western front  1929  1      
3  tt0021746  cimarron                        1931  1      
4  tt7671068  grand hotel                     1931  1      



TAIL of [modelDF] results from `tconst` collection/scraping:

       TCONST                                            TITLE  YEAR  BECHDEL
85  tt2024544  12 years a slave                                 2013  1      
86  tt2562232  birdman or (the unexpected virtue of ignorance)  2014  0      
87  tt1895587  spotlight                                        2015  1      
88  tt4975722  moonlight                                        2016  1      
89  tt5580390  the shape of water                               2017  0      


In [14]:
def getModelDF_TITLE(count):
    return modelDF.loc[int(count - 1)].TITLE

def getModelDF_YEAR(count):
    return modelDF.loc[int(count - 1)].YEAR

def getModelDF_BECHDEL(count):
    return modelDF.loc[int(count - 1)].BECHDEL

tconstDoOvers = []
titleDoOvers = []
yearDoOvers = []
bechdelDoOvers = []
def changeTCONST_LIST(NEW_tconst, CHECK_title, CHECK_year, CHECK_bechdel):
    tconstDoOvers.append(NEW_tconst)
    titleDoOvers.append(CHECK_title)
    yearDoOvers.append(int(CHECK_year - 1))
    bechdelDoOvers.append(CHECK_bechdel)
    
def clear():
    _ = call('clear' if os.name == 'posix' else 'cls')

In [15]:
# The below [empty] arrays declared are what we go and scrape for with the above created
# Dataframe of movies (modelDF).
scores = []
votecount = []
budgets = []
companies = []
releasedDays = []
releasedMonths = []
releasedYears = []
releasedCountries = []

imdb_base = "https://www.imdb.com/title/"

count = 0
iterLength = len(Model_finalTconsts)

for pk in Model_finalTconsts:
    
    count += 1
    CHECK_title = getModelDF_TITLE(count)
    CHECK_year = getModelDF_YEAR(count)
    CHECK_bechdel = getModelDF_BECHDEL(count)
    
    # A random amount of time to throttle our requests is generated for each `tconst` 
    # iterration. That `tconst` is concactenated onto the base IMdB movie page URL, and
    # it is then visited. That page's html is parsed with Beautifulsoup4 and a `soup`
    # object is returned.
    throttle = getRateLimit()
    imdb_end = pk
    url = (imdb_base + imdb_end)
    print(f"SCRAPE {count}/{iterLength}   >>>>   tconst: '{imdb_end}'   throttled by: {throttle}s   title: '{CHECK_title}'")
    time.sleep(throttle)
    browser = init_splinter()
    browser.visit(url)
    html, soup = simmer_soup()
    
    # The movie's ratings and vote count parent tag is extracted from the `soup` object and 
    # declared as $ratings.
    ratings_parent = soup.find('div', class_='ratingValue').strong['title']
    ratings = ratings_parent.split()
    
    # The movie's rating's score is appended to the current iterration's extraction list.
    score = ratings[0]
    scores.append(score)

    # The movie's rating's vote count is appended to the current iterration's extraction list.
    totalVotes = ratings[3]
    votecount.append(totalVotes.replace(",", ""))
    
    # Booleans for each of the troublesome features are declared as nil before attempting to
    # extract their associated categorical data from IMdB.
    budgetBOOL = False
    dayBOOL = False
    monthBOOL = False
    yearBOOL = False
    countryBOOL = False
    
    # The movie's certs parent tag is extracted from the `soup` object and declared as
    # $cert_raw. It is then iterrated through looking for <h4> tags' text that equal
    # `Budget`, `Production Co`, and `Release Date` to extract as attributes to append
    # as associated information/features of the dataset.
    cert_raw = soup.find_all('div', class_='txt-block')
    for cert in cert_raw:
        try:
            
            # If the `Budget` <h4> text is encountered within the iterration of the <div>s of
            # .txt-block, and if its length is not null, its Boolean is set to true and its
            # value from the 8th character on are appended to the current iterration's 
            # extraction list.
            if cert.h4.text == "Budget:":
                budget_RAW = cert.text.split()
                budget = budget_RAW[0]               
                if len(budget[8:]) != 0:
                    budgetBOOL = True
                    budgets.append(budget[8:].replace(",", ""))

            # If the `Production Co:` <h4> text is encountered within the iterration of the 
            # <div>s of .txt-block, its Boolean is set to true and its value from the 3rd 
            # indice/element on are appended to the current iterration's extraction list.
            elif cert.h4.text == "Production Co:":
                companies_RAW = cert.text.split()
                company = companies_RAW[2]            
                companies.append(company)   

            # If the `Production Co:` <h4> text is encountered within the iterration of the 
            # <div>s of .txt-block, its 3rd, 4th, and 5th indices/elements are extracted out
            # as new $VARs. The month ($release2) is converted to a numerical representation
            # of its string value, too.
            elif cert.h4.text == "Release Date:":
                release_RAW = cert.text.split()
                release1 = release_RAW[2]
                release2 = release_RAW[3]
                release3 = release_RAW[4]
                release2 = monthToNum(release2)
             
                # If the day ($release1) is not null, its Boolean is set to true and its value
                # is appended to the current iterration's extraction list.
                if len(release1) != 0:
                    dayBOOL = True
                    releasedDays.append(release1)
                
                # If the month ($release2) is not 1-12, its Boolean is set to true and its 
                # value is appended to the current iterration's extraction list.
                if release2 in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]:
                    monthBOOL = True
                    releasedMonths.append(release2)
                
                # If the year ($release3) is not null, its Boolean is set to true and its 
                # value is appended to the current iterration's extraction list.
                if len(release3) != 0:
                    yearBOOL = True
                    releasedYears.append(release3)
                
                # The 6th indice/element of $release_RAW is extracted out as a new variable,
                # $release_country. It also has its parens removed from its contents.
                release_country = release_RAW[5]
                release_country = release_country.replace("(", "")
                release_country = release_country.replace(")", "")
                
                # If the country ($release_country) is not null, its Boolean is set to true
                # and its value is appended to the current iterration's extraction list.
                if len(release_country) != 0:
                    countryBOOL = True
                    releasedCountries.append(release_country)
                    
        # Each iterration errors out at some point, so this allows the program to continue
        # regardless of what errors are being thrown about it.
        except:
            pass
    
    # I tried many ways to make this more elegant of code, but had to cut my losses for this
    # version that, while ugly: does work as intended. This is a refactored version of what
    # was all individual $___BOOL checks. If any $___BOOL $VARs are not True, then the user
    # is prompted if the existing `tconst` is indeed correct.
    if budgetBOOL != True or dayBOOL!= True or monthBOOL != True or yearBOOL != True or countryBOOL != True:
        print(f"   [!] ERROR [!]")
        tconstCheck = None
        while tconstCheck not in ("y", "n"):
            tconstCheck = str(input(f"   [!] Is {pk} the correct TCONST for '{CHECK_title}' ({CHECK_year})? [!]    [y]/[n] "))
            
            # If YES (the existing `tconst` is correct), the user is prompted to manually
            # input the data that was unable to be scraped.
            if tconstCheck == "y":
                if budgetBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $BUDGET$   (As a whole number): [!] "))
                    budgets.append(manualInsertion)
                if dayBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE DAY$   (As a whole number, no leading zeros): [!] "))
                    releasedDays.append(manualInsertion)
                if monthBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE_MONTH$   (As a whole number, no leading zeros): [!] "))
                    releasedMonths.append(manualInsertion)
                if yearBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE_YEAR$   (As a whole number, 4 digits): [!] "))
                    releasedYears.append(manualInsertion)
                if countryBOOL != True:
                    manualInsertion = str(input(f"   [!] ENTER {pk}'s   $RELEASE_COUNTRY$   (As an uppercase abbr if +1 word eg: 'USA', else as a Titlecase string like 'Japan'): [!] "))
                    releasedCountries.append(manualInsertion)
            
            # If NO (the existing `tconst` is incorrect), the user is prompted to manually
            # input the correct `tconst`, and if it has not already been appended to the
            # $tconstDoOvers, then it is added. 
            elif tconstCheck == "n":
                NEW_tconst = str(input("   [!] Enter the correct TCONST: [!] "))         
                if NEW_tconst not in tconstDoOvers:
                    changeTCONST_LIST(NEW_tconst, CHECK_title, CHECK_year, CHECK_bechdel)
                    
                # Also, each bool that was left False (the scraper couldn't locate the correct
                # data) signals a value of `None` to be appended to the current iterration's
                # extraction list so that its row can later be dropped in favor of $tconstDoOvers    
                if budgetBOOL != True:
                    budgets.append(None)
                if dayBOOL != True:
                    releasedDays.append(None)
                if monthBOOL != True:
                    releasedMonths.append(None)
                if yearBOOL != True:
                    releasedYears.append(None)
                if countryBOOL != True:
                    releasedCountries.append(None)
            else:
                clear()
                print("      [!] [!]    You must enter 'y' or 'n'    [!] [!]")
                time.sleep(4)
                                    
    
    browser.quit()
    
os.system('say "skeet skeet"')

SCRAPE 1/90   >>>>   tconst: 'tt0018578'   throttled by: 3.72s   title: 'wings'
SCRAPE 2/90   >>>>   tconst: 'tt0019729'   throttled by: 2.99s   title: 'the broadway melody'
SCRAPE 3/90   >>>>   tconst: 'tt0020629'   throttled by: 6.21s   title: 'all quiet on the western front'
SCRAPE 4/90   >>>>   tconst: 'tt0021746'   throttled by: 4.37s   title: 'cimarron'
SCRAPE 5/90   >>>>   tconst: 'tt7671068'   throttled by: 5.24s   title: 'grand hotel'
   [!] ERROR [!]
   [!] Is tt7671068 the correct TCONST for 'grand hotel' (1931)? [!]    [y]/[n] n
   [!] Enter the correct TCONST: [!] tt0022958
SCRAPE 6/90   >>>>   tconst: 'tt0023876'   throttled by: 4.4s   title: 'cavalcade'
SCRAPE 7/90   >>>>   tconst: 'tt0025316'   throttled by: 4.3s   title: 'it happened one night'
SCRAPE 8/90   >>>>   tconst: 'tt0026752'   throttled by: 3.16s   title: 'mutiny on the bounty'
   [!] ERROR [!]
   [!] Is tt0026752 the correct TCONST for 'mutiny on the bounty' (1935)? [!]    [y]/[n] y
   [!] ENTER tt0026752's 

SCRAPE 55/90   >>>>   tconst: 'tt0083987'   throttled by: 3.17s   title: 'gandhi'
SCRAPE 56/90   >>>>   tconst: 'tt0086425'   throttled by: 3.9s   title: 'terms of endearment'
SCRAPE 57/90   >>>>   tconst: 'tt0086879'   throttled by: 5.96s   title: 'amadeus'
SCRAPE 58/90   >>>>   tconst: 'tt0089755'   throttled by: 2.82s   title: 'out of africa'
SCRAPE 59/90   >>>>   tconst: 'tt0091763'   throttled by: 4.85s   title: 'platoon'
SCRAPE 60/90   >>>>   tconst: 'tt0093389'   throttled by: 4.13s   title: 'the last emperor'
SCRAPE 61/90   >>>>   tconst: 'tt0095953'   throttled by: 4.99s   title: 'rain man'
SCRAPE 62/90   >>>>   tconst: 'tt0097239'   throttled by: 4.55s   title: 'driving miss daisy'
SCRAPE 63/90   >>>>   tconst: 'tt0099348'   throttled by: 3.17s   title: 'dances with wolves'
SCRAPE 64/90   >>>>   tconst: 'tt0102926'   throttled by: 2.93s   title: 'the silence of the lambs'
SCRAPE 65/90   >>>>   tconst: 'tt0105695'   throttled by: 4.39s   title: 'unforgiven'
SCRAPE 66/90   >>>>

In [16]:
modelDF['RATING'] = scores
modelDF['VOTES'] = votecount
modelDF['PRODUCTION_CO'] = companies

modelDF['BUDGET'] = budgets
modelDF['RELEASE_DAY'] = releasedDays
modelDF['RELEASE_MON'] = releasedMonths
modelDF['RELEASE_YEAR'] = releasedYears
modelDF['COUNTRY'] = releasedCountries

modelDF.head()

Unnamed: 0,TCONST,TITLE,YEAR,BECHDEL,RATING,VOTES,PRODUCTION_CO,BUDGET,RELEASE_DAY,RELEASE_MON,RELEASE_YEAR,COUNTRY
0,tt0018578,wings,1927,1,7.7,10234,Paramount,2000000.0,5,1.0,1929,USA
1,tt0019729,the broadway melody,1928,0,6.1,5836,Metro-Goldwyn-Mayer,379000.0,6,6.0,1929,USA
2,tt0020629,all quiet on the western front,1929,1,8.0,53729,Universal,1448864.0,24,8.0,1930,USA
3,tt0021746,cimarron,1931,1,5.9,4837,RKO,1433000.0,9,2.0,1931,USA
4,tt7671068,grand hotel,1931,1,6.6,1128,Unbelievable,,17,6.0,2019,USA


In [17]:
# The below [empty] arrays declared are what we go and scrape for with the $tconstDoOvers
# (they originally had the incorrect `tconst`.
scores = []
votecount = []
budgets = []
companies = []
releasedDays = []
releasedMonths = []
releasedYears = []
releasedCountries = []

imdb_base = "https://www.imdb.com/title/"

count = 0
iterLength = len(tconstDoOvers)

for pk in tconstDoOvers:
    
    count += 1
    
    # A random amount of time to throttle our requests is generated for each `tconst` 
    # iterration. That `tconst` is concactenated onto the base IMdB movie page URL, and
    # it is then visited. That page's html is parsed with Beautifulsoup4 and a `soup`
    # object is returned.
    throttle = getRateLimit()
    imdb_end = pk
    url = (imdb_base + imdb_end)
    print(f"SCRAPE {count}/{iterLength}   >>>>   tconst: '{imdb_end}'   throttled by: {throttle}s")   #title: '{CHECK_title}'")
    time.sleep(throttle)
    browser = init_splinter()
    browser.visit(url)
    html, soup = simmer_soup()
    
    # The movie's ratings and vote count parent tag is extracted from the `soup` object and 
    # declared as $ratings.
    ratings_parent = soup.find('div', class_='ratingValue').strong['title']
    ratings = ratings_parent.split()
    
    # The movie's rating's score is appended to the current iterration's extraction list.
    score = ratings[0]
    scores.append(score)

    # The movie's rating's vote count is appended to the current iterration's extraction list.
    totalVotes = ratings[3]
    votecount.append(totalVotes.replace(",", ""))
    
    # Booleans for each of the troublesome features are declared as nil before attempting to
    # extract their associated categorical data from IMdB.
    budgetBOOL = False
    dayBOOL = False
    monthBOOL = False
    yearBOOL = False
    countryBOOL = False
    
    # The movie's certs parent tag is extracted from the `soup` object and declared as
    # $cert_raw. It is then iterrated through looking for <h4> tags' text that equal
    # `Budget`, `Production Co`, and `Release Date` to extract as attributes to append
    # as associated information/features of the dataset.
    cert_raw = soup.find_all('div', class_='txt-block')
    for cert in cert_raw:
        try:
            
            # If the `Budget` <h4> text is encountered within the iterration of the <div>s of
            # .txt-block, and if its length is not null, its Boolean is set to true and its
            # value from the 8th character on are appended to the current iterration's 
            # extraction list.
            if cert.h4.text == "Budget:":
                budget_RAW = cert.text.split()
                budget = budget_RAW[0]               
                if len(budget[8:]) != 0:
                    budgetBOOL = True
                    budgets.append(budget[8:].replace(",", ""))

            # If the `Production Co:` <h4> text is encountered within the iterration of the 
            # <div>s of .txt-block, its Boolean is set to true and its value from the 3rd 
            # indice/element on are appended to the current iterration's extraction list.
            elif cert.h4.text == "Production Co:":
                companies_RAW = cert.text.split()
                company = companies_RAW[2]            
                companies.append(company)   

            # If the `Production Co:` <h4> text is encountered within the iterration of the 
            # <div>s of .txt-block, its 3rd, 4th, and 5th indices/elements are extracted out
            # as new $VARs. The month ($release2) is converted to a numerical representation
            # of its string value, too.
            elif cert.h4.text == "Release Date:":
                release_RAW = cert.text.split()
                release1 = release_RAW[2]
                release2 = release_RAW[3]
                release3 = release_RAW[4]
                release2 = monthToNum(release2)
             
                # If the day ($release1) is not null, its Boolean is set to true and its value
                # is appended to the current iterration's extraction list.
                if len(release1) != 0:
                    dayBOOL = True
                    releasedDays.append(release1)
                
                # If the month ($release2) is not 1-12, its Boolean is set to true and its 
                # value is appended to the current iterration's extraction list.
                if release2 in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]:
                    monthBOOL = True
                    releasedMonths.append(release2)
                
                # If the year ($release3) is not null, its Boolean is set to true and its 
                # value is appended to the current iterration's extraction list.
                if len(release3) != 0:
                    yearBOOL = True
                    releasedYears.append(release3)
                
                # The 6th indice/element of $release_RAW is extracted out as a new variable,
                # $release_country. It also has its parens removed from its contents.
                release_country = release_RAW[5]
                release_country = release_country.replace("(", "")
                release_country = release_country.replace(")", "")
                
                # If the country ($release_country) is not null, its Boolean is set to true
                # and its value is appended to the current iterration's extraction list.
                if len(release_country) != 0:
                    countryBOOL = True
                    releasedCountries.append(release_country)
                    
        # Each iterration errors out at some point, so this allows the program to continue
        # regardless of what errors are being thrown about it.
        except:
            pass
    
    # I tried many ways to make this more elegant of code, but had to cut my losses for this
    # version that, while ugly: does work as intended. This is a refactored version of what
    # was all individual $___BOOL checks. If any $___BOOL $VARs are not True, then the user
    # is prompted if the existing `tconst` is indeed correct.
    if budgetBOOL != True or dayBOOL!= True or monthBOOL != True or yearBOOL != True or countryBOOL != True:
        print(f"   [!] ERROR [!]")
        tconstCheck = None
        while tconstCheck not in ("y", "n"):
            tconstCheck = str(input(f"   [!] Is {pk} the correct TCONST for '{CHECK_title}' ({CHECK_year})? [!]    [y]/[n] "))
            
            # If YES (the existing `tconst` is correct), the user is prompted to manually
            # input the data that was unable to be scraped.
            if tconstCheck == "y":
                if budgetBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $BUDGET$   MANUALLY (As a whole number): [!] "))
                    budgets.append(manualInsertion)
                if dayBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE DAY$   MANUALLY (As a whole number, no leading zeros): [!] "))
                    releasedDays.append(manualInsertion)
                if monthBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE_MONTH$   MANUALLY (As a whole number, no leading zero): [!] "))
                    releasedMonths.append(manualInsertion)
                if yearBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE_YEAR$   MANUALLY (As a whole number, 4 digits): [!] "))
                    releasedYears.append(manualInsertion)
                if countryBOOL != True:
                    manualInsertion = str(input(f"   [!] ENTER {pk}'s   $RELEASE_COUNTRY$   MANUALLY (As a string abbreviation, eg: 'USA'): [!] "))
                    releasedCountries.append(manualInsertion)
            
            # If NO (the existing `tconst` is incorrect), the user is prompted to manually
            # input the correct `tconst`, and if it has not already been appended to the
            # $tconstDoOvers, then it is added. 
            elif tconstCheck == "n":
                NEW_tconst = str(input("   [!] Enter the correct TCONST: [!] "))         
                if NEW_tconst not in tconstDoOvers:
                    changeTCONST_LIST(NEW_tconst)
                    
                # Also, each bool that was left False (the scraper couldn't locate the correct
                # data) signals a value of `None` to be appended to the current iterration's
                # extraction list so that its row can later be dropped in favor of $tconstDoOvers    
                if budgetBOOL != True:
                    budgets.append(None)
                if dayBOOL != True:
                    releasedDays.append(None)
                if monthBOOL != True:
                    releasedMonths.append(None)
                if yearBOOL != True:
                    releasedYears.append(None)
                if countryBOOL != True:
                    releasedCountries.append(None)
            else:
                clear()
                print("      [!] [!]    You must enter 'y' or 'n'    [!] [!]")
                time.sleep(4)
                                    
    
    browser.quit()

os.system('say "skeet skeet"')

SCRAPE 1/1   >>>>   tconst: 'tt0022958'   throttled by: 5.62s


In [26]:
##########yearDoOvers[0] = 1930

In [27]:
# Peep at the DataFrame created from received information from previous (main) big iterrator.
redoData = {'TCONST': tconstDoOvers, 'TITLE': titleDoOvers, 'YEAR': yearDoOvers, 'BECHDEL': bechdelDoOvers}
redoDF = pd.DataFrame(data=redoData)
redoDF

Unnamed: 0,TCONST,TITLE,YEAR,BECHDEL
0,tt0022958,grand hotel,1930,1


In [28]:
# Add in the new data scraped from IMdB now with the correct `tconst` to the $redoDF.
redoDF['RATING'] = scores
redoDF['VOTES'] = votecount
redoDF['PRODUCTION_CO'] = companies

redoDF['BUDGET'] = budgets
redoDF['RELEASE_DAY'] = releasedDays
redoDF['RELEASE_MON'] = releasedMonths
redoDF['RELEASE_YEAR'] = releasedYears
redoDF['COUNTRY'] = releasedCountries

redoDF

Unnamed: 0,TCONST,TITLE,YEAR,BECHDEL,RATING,VOTES,PRODUCTION_CO,BUDGET,RELEASE_DAY,RELEASE_MON,RELEASE_YEAR,COUNTRY
0,tt0022958,grand hotel,1930,1,7.5,15822,Metro-Goldwyn-Mayer,700000,11,9,1932,USA


In [96]:
## All rows containing our appended value of $None (the rows that had an incorrect `tconst`)
## are dropped from the $modelDF dataframe.
#modelDF.dropna(inplace=True)
#modelDF.head()

In [97]:
## Both Dataframes are joined upon each other for the final encompassing dataset consisting of 
## $modelDF and $redoDF. We can now run a model through it.
#allDF = modelDF.append(redoDF)
#allDF.tail()

In [98]:
## The $allDF DataFrame is ordered by the values in the `YEAR` column
#allDF = allDF.sort_values(by = 'YEAR')
#allDF.head()

In [99]:
## Now we will label all of our entries as `winners` for classification.
#winningBools = []
#for i in range(len(allDF)):
#    winningBools.append(1)
#    
#winningRanks = []
#for i in range(len(allDF)):
#    winningRanks.append(1)
#
## The list of 1s we created are appended as a new column to $allDF.
#allDF['RANK'] = winningRanks
#allDF['WINNER'] = winningBools
#
#winnerTitles = allDF['TITLE'].tolist()
#
#allDF.to_csv("/Users/nicolespaar/Desktop/allDF.csv")
#allDF.head()

In [34]:
#////////////////////////////////////////////////////////////////////////////////////////////#
#////////////////////////////////////////////////////////////////////////////////////////////#
#////////////////////////////////////////////////////////////////////////////////////////////#
#////////////////////////////////////////////////////////////////////////////////////////////#
#////////////////////////////////////////////////////////////////////////////////////////////#

In [67]:
#! --- Now we load in the non-winners (9 more per year that did *not* win) obtained by Chris' 
#! --- scraping... then clean up and drop columns that wouldn't match existing data. A .csv
#! --- is output, so we'll comment this all out and load the .csv directly.
#! --- (SOURCE: http://www.films101.com/year10y.htm)

#def getLosers_Film(loserDF_RAW):
#    
#    # We drop all rows that aren't of year 1927 or higher before dropping the columns 
#    # `Directors` and `Countries`.
#    loserDF = loserDF_RAW[(loserDF_RAW.Year >= 1927)].copy()
#    loserDF = loserDF.drop(['Directors', 'Countries'], axis=1).copy()
#    
#    # This creates a csv containing all of the losers' titles lowercased instead of titlecased.
#    lowercased_losers = []
#    years_losers = []
#    rank_losers = []
#    count = 0
#    for row in loserDF.iterrows():   
#        # Ensure we are not doing duplicate work on our Best Picture winners already munged.
#        if row[1].Title.lower() not in winnerTitles:
#            try:
#                lower = row[1].Title.lower()
#                lowercased_losers.append(lower)        
#                year = row[1].Year
#                years_losers.append(year)       
#                rank = row[1].Rank
#                rank_losers.append(rank)
#            except:
#                count += 1
#                lowercased_losers.append(None)
#        else:
#            pass
#        
#    print(f"{count} appendments errored out..")
#    
#    # Now we create a fresh DataFrame from our cleaning, then return it.
#    losers_data = {'TITLE': lowercased_losers, 'YEAR': years_losers, 'RANK': rank_losers}
#    loser_DF = pd.DataFrame(data=losers_data)
#    return loser_DF
#
#
#
## The original scraped list is read in and passed into `getLosers_Film()` to return a cleaned
## up new DataFrame that we export into a new .csv file.
#loserDF_RAW = pd.read_csv("/Users/nicolespaar/Desktop/films101.csv")
#losersDF = getLosers_Film(loserDF_RAW)
#losersDF.to_csv("/Users/nicolespaar/Desktop/losers.csv")
#losersDF.head(3)

In [105]:
# We read in the above created `losers.csv`, drop the random column that appeared, then sort
# the csv's data as a DataFrame ($losersDF) by `YEAR` ascending.
losersDF = pd.read_csv("/Users/nicolespaar/Desktop/losers.csv")
losersDF = losersDF.drop(['Unnamed: 0'], axis=1)
losersDF = losersDF.sort_values(by = 'YEAR', ascending=True).copy()

winnerBools = []
for i in range(len(losersDF)):
    winnerBools.append(0)
losersDF['WINNER'] = winnerBools
losersDF.head()

Unnamed: 0,TITLE,YEAR,RANK,WINNER
861,it,1927,10,0
853,metropolis,1927,1,0
854,sunrise,1927,2,0
855,napoléon,1927,3,0
860,underworld,1927,9,0


In [106]:
loserTitles = losersDF['TITLE'].tolist()
loserYears = losersDF['YEAR'].tolist()
loserRanks = losersDF['RANK'].tolist()



scraped = []
count = 0
iterLen = len(loserTitles)

#! This will take ~1 hour with the rate limits throttling our HTTP as well as Bechdel API gets.
for row in losersDF.iterrows():
    
    count += 1
    
    # A random amount of time to throttle our requests is generated for each row iterration.
    throttle = getRateLimit()
    movieToSearch = row[1].TITLE
    movieYear = row[1].YEAR
    
    # The downloaded IMdB database >> DataFrame is searched for the current iterration's movie.
    titleBasicsArray = titleBasics[titleBasics['lowercasedTitle'] == movieToSearch]
    titleBasicsRow_RAW = titleBasicsArray[titleBasicsArray['titleType'] == 'movie']
    titleBasicsRow = titleBasicsRow_RAW[titleBasicsRow_RAW['startYear'] == str(movieYear)]
    tconst = titleBasicsRow.index.to_list()
    tconst_str = ''.join(tconst)
    tconst_str = tconst_str[0:9]
    
    # If the length of the result of the search's tconst string is NOT 0 (empty/no result
    # found), then the associated details as well as the result of its Bechdel rating API
    # caller are appended as a list. That list is then appended into the $scraped array.
    if len(tconst_str) != 0:
        print(f"APPENDING {count}/{iterLen} info of: '{row[1].TITLE}' ({row[1].YEAR}) with a reduced throttle of: {float(throttle / 2)}s")
        temp = []
        temp.append(movieToSearch)
        temp.append(tconst_str) 
        temp.append(movieYear)
        
        # The throttle is halved of its normal value when we call an API for a JSON instead of
        # sending an HTTP request for a whole bunch of html from IMdB's server.
        time.sleep(float(throttle / 2))
        bechdelBool = callAPI(movieToSearch)
        temp.append(bechdelBool)
        scraped.append(temp)
        
    # If the length of the result of the search's tconst string IS 0 (empty/no results found),
    # then the needed details are scraped from IMdB's website.
    else:
        print(f"   [!] SCRAPING {count}/{iterLen} info of: '{row[1].TITLE}' ({row[1].YEAR}) with a throttle of: {throttle}s")
        temp = []
        time.sleep(throttle)
        
        # The below URL can be used as an API of sorts by appending the $movieToSearch to it,
        # we must only replace the whitespace with a plus sign.
        base_url = "https://www.imdb.com/search/title/?title=" 
        movieToSearch = movieToSearch.replace(" ", "+")
        full_url = (base_url + movieToSearch)
        browser = init_splinter()
        browser.driver.minimize_window()
        browser.visit(full_url)
    
        # The results page is parsed with Beautifulsoup4 and a `soup` object returned. Then
        # the plus signs are replaced back with whitespace in our $movieToSearch. The `soup`
        # object is parsed and iterrated for the correct result that corresponds to the movie
        # we are after. Its <href> characters [7:16] are spliced as its `tconst` and appended
        # along with the rest of its associated information to a list. 
        html, soup = simmer_soup()
        movieToSearch = movieToSearch.replace("+", " ")
        results = soup.find_all('h3', class_='lister-item-header')
        for r in results:
            rYear = soup.select('h3 > span')[1].get_text(strip=True)
            rYear = rYear.replace("(", "")
            rYear = rYear.replace(")", "")
            if (r.a.text.lower() == movieToSearch) and movieToSearch not in temp:
                link_end = r.a['href']
                tconst_str = link_end[7:16]
                temp.append(movieToSearch)
                temp.append(tconst_str) 
                temp.append(movieYear)   
        browser.quit()
        
        #? The Bechdel rating for our new movie is acquired via the Bechdel callAPI() function,
        #? and appended to the rest of the information we scraped's list. Once complete, that
        #? list is then appended into the $scraped array.
        bechdelBool = callAPI(movieToSearch)
        temp.append(bechdelBool)
        scraped.append(temp)
        
print("\n\nHead of scrapes:")
print(scraped[0:5])
print("\nTail of scrapes:")
print(scraped[-6:-1])

os.system('say "to the WINDOW!"')

APPENDING 1/862 info of: 'it' (1927) with a reduced throttle of: 2.77s
APPENDING 2/862 info of: 'metropolis' (1927) with a reduced throttle of: 2.46s
APPENDING 3/862 info of: 'sunrise' (1927) with a reduced throttle of: 2.605s
   [!] SCRAPING 4/862 info of: 'napoléon' (1927) with a throttle of: 4.55s
APPENDING 5/862 info of: 'underworld' (1927) with a reduced throttle of: 2.0s
APPENDING 6/862 info of: 'the kid brother' (1927) with a reduced throttle of: 2.405s
   [!] SCRAPING 7/862 info of: 'the lodger' (1927) with a throttle of: 2.67s
APPENDING 8/862 info of: 'the jazz singer' (1927) with a reduced throttle of: 2.32s
   [!] SCRAPING 9/862 info of: 'seventh heaven' (1927) with a throttle of: 6.22s
APPENDING 10/862 info of: 'the docks of new york' (1928) with a reduced throttle of: 2.07s
APPENDING 11/862 info of: 'the last command' (1928) with a reduced throttle of: 2.92s
   [!] SCRAPING 12/862 info of: 'an italian straw hat' (1928) with a throttle of: 6.19s
   [!] SCRAPING 13/862 info 

APPENDING 99/862 info of: 'make way for tomorrow' (1937) with a reduced throttle of: 1.46s
APPENDING 100/862 info of: 'the good earth' (1937) with a reduced throttle of: 1.465s
APPENDING 101/862 info of: 'a star is born' (1937) with a reduced throttle of: 2.74s
APPENDING 102/862 info of: 'the prisoner of zenda' (1937) with a reduced throttle of: 2.845s
APPENDING 103/862 info of: 'lost horizon' (1937) with a reduced throttle of: 1.945s
APPENDING 104/862 info of: 'stage door' (1937) with a reduced throttle of: 1.425s
APPENDING 105/862 info of: 'alexander nevsky' (1938) with a reduced throttle of: 3.005s
APPENDING 106/862 info of: 'jezebel' (1938) with a reduced throttle of: 1.39s
APPENDING 107/862 info of: 'olympia part one: festival of the nations' (1938) with a reduced throttle of: 1.92s
APPENDING 108/862 info of: 'bringing up baby' (1938) with a reduced throttle of: 2.79s
APPENDING 109/862 info of: 'olympia part two: festival of beauty' (1938) with a reduced throttle of: 2.115s
APPEND

APPENDING 193/862 info of: 'miracle on 34th street' (1947) with a reduced throttle of: 2.45s
APPENDING 194/862 info of: 'body and soul' (1947) with a reduced throttle of: 1.745s
APPENDING 195/862 info of: 'boomerang!' (1947) with a reduced throttle of: 2.955s
APPENDING 196/862 info of: 'the lady from shanghai' (1947) with a reduced throttle of: 2.805s
APPENDING 197/862 info of: 'bicycle thieves' (1948) with a reduced throttle of: 1.79s
APPENDING 198/862 info of: 'the treasure of the sierra madre' (1948) with a reduced throttle of: 1.56s
APPENDING 199/862 info of: 'red river' (1948) with a reduced throttle of: 2.28s
APPENDING 200/862 info of: 'the red shoes' (1948) with a reduced throttle of: 2.825s
APPENDING 201/862 info of: 'force of evil' (1948) with a reduced throttle of: 1.67s
APPENDING 202/862 info of: 'oliver twist' (1948) with a reduced throttle of: 2.865s
APPENDING 203/862 info of: 'the snake pit' (1948) with a reduced throttle of: 2.745s
APPENDING 204/862 info of: 'the fallen 

APPENDING 288/862 info of: 'funny face' (1957) with a reduced throttle of: 2.57s
APPENDING 289/862 info of: 'vertigo' (1958) with a reduced throttle of: 1.67s
APPENDING 290/862 info of: 'touch of evil' (1958) with a reduced throttle of: 2.215s
APPENDING 291/862 info of: 'ashes and diamonds' (1958) with a reduced throttle of: 2.91s
APPENDING 292/862 info of: 'mon oncle' (1958) with a reduced throttle of: 2.7s
APPENDING 293/862 info of: 'horror of dracula' (1958) with a reduced throttle of: 1.95s
APPENDING 294/862 info of: 'a night to remember' (1958) with a reduced throttle of: 2.375s
APPENDING 295/862 info of: 'cat on a hot tin roof' (1958) with a reduced throttle of: 1.69s
   [!] SCRAPING 296/862 info of: 'ivan the terrible, part two' (1958) with a throttle of: 5.15s
APPENDING 297/862 info of: 'the defiant ones' (1958) with a reduced throttle of: 1.79s
APPENDING 298/862 info of: 'the world of apu' (1959) with a reduced throttle of: 3.03s
APPENDING 299/862 info of: 'the 400 blows' (195

   [!] SCRAPING 382/862 info of: 'if...' (1968) with a throttle of: 5.68s
APPENDING 383/862 info of: 'planet of the apes' (1968) with a reduced throttle of: 2.05s
APPENDING 384/862 info of: 'rosemary's baby' (1968) with a reduced throttle of: 2.055s
APPENDING 385/862 info of: 'shame' (1968) with a reduced throttle of: 1.815s
APPENDING 386/862 info of: 'stolen kisses' (1968) with a reduced throttle of: 3.025s
APPENDING 387/862 info of: 'bullitt' (1968) with a reduced throttle of: 1.99s
APPENDING 388/862 info of: 'faces' (1968) with a reduced throttle of: 1.8s
APPENDING 389/862 info of: 'the wild bunch' (1969) with a reduced throttle of: 2.585s
APPENDING 390/862 info of: 'butch cassidy and the sundance kid' (1969) with a reduced throttle of: 2.06s
APPENDING 391/862 info of: 'easy rider' (1969) with a reduced throttle of: 1.525s
APPENDING 392/862 info of: 'z' (1969) with a reduced throttle of: 1.625s
APPENDING 393/862 info of: 'kes' (1969) with a reduced throttle of: 2.045s
APPENDING 394/

APPENDING 477/862 info of: 'grease' (1978) with a reduced throttle of: 2.16s
APPENDING 478/862 info of: 'national lampoon's animal house' (1978) with a reduced throttle of: 2.215s
APPENDING 479/862 info of: 'apocalypse now' (1979) with a reduced throttle of: 1.41s
APPENDING 480/862 info of: 'manhattan' (1979) with a reduced throttle of: 2.995s
APPENDING 481/862 info of: 'alien' (1979) with a reduced throttle of: 1.4s
   [!] SCRAPING 482/862 info of: 'the marriage of maria braun' (1979) with a throttle of: 3.85s
APPENDING 483/862 info of: 'monty python's life of brian' (1979) with a reduced throttle of: 1.395s
APPENDING 484/862 info of: 'all that jazz' (1979) with a reduced throttle of: 2.09s
APPENDING 485/862 info of: 'the tin drum' (1979) with a reduced throttle of: 2.135s
APPENDING 486/862 info of: 'breaking away' (1979) with a reduced throttle of: 1.85s
APPENDING 487/862 info of: 'being there' (1979) with a reduced throttle of: 2.365s
APPENDING 488/862 info of: 'atlantic city' (1980

APPENDING 574/862 info of: 'sex, lies, and videotape' (1989) with a reduced throttle of: 2.895s
APPENDING 575/862 info of: 'born on the fourth of july' (1989) with a reduced throttle of: 2.58s
APPENDING 576/862 info of: 'my left foot' (1989) with a reduced throttle of: 1.945s
APPENDING 577/862 info of: 'the killer' (1989) with a reduced throttle of: 1.46s
APPENDING 578/862 info of: 'field of dreams' (1989) with a reduced throttle of: 1.71s
APPENDING 579/862 info of: 'goodfellas' (1990) with a reduced throttle of: 1.915s
APPENDING 580/862 info of: 'cyrano de bergerac' (1990) with a reduced throttle of: 1.74s
APPENDING 581/862 info of: 'close-up' (1990) with a reduced throttle of: 1.475s
APPENDING 582/862 info of: 'the grifters' (1990) with a reduced throttle of: 1.645s
APPENDING 583/862 info of: 'europa europa' (1990) with a reduced throttle of: 2.535s
APPENDING 584/862 info of: 'reversal of fortune' (1990) with a reduced throttle of: 1.49s
APPENDING 585/862 info of: 'the nasty girl' (1

APPENDING 671/862 info of: 'in the mood for love' (2000) with a reduced throttle of: 2.85s
APPENDING 672/862 info of: 'memento' (2000) with a reduced throttle of: 1.55s
APPENDING 673/862 info of: 'yi yi' (2000) with a reduced throttle of: 3.075s
APPENDING 674/862 info of: 'amores perros' (2000) with a reduced throttle of: 1.475s
APPENDING 675/862 info of: 'billy elliot' (2000) with a reduced throttle of: 1.685s
APPENDING 676/862 info of: 'chicken run' (2000) with a reduced throttle of: 2.185s
APPENDING 677/862 info of: 'almost famous' (2000) with a reduced throttle of: 1.85s
APPENDING 678/862 info of: 'traffic' (2000) with a reduced throttle of: 1.68s
APPENDING 679/862 info of: 'mulholland dr.' (2001) with a reduced throttle of: 3.055s
APPENDING 680/862 info of: 'spirited away' (2001) with a reduced throttle of: 1.515s
APPENDING 681/862 info of: 'amélie' (2001) with a reduced throttle of: 2.285s
   [!] SCRAPING 682/862 info of: 'the lord of the rings: fellowship of the ring' (2001) wit

APPENDING 766/862 info of: 'toy story 3' (2010) with a reduced throttle of: 2.635s
APPENDING 767/862 info of: 'the fighter' (2010) with a reduced throttle of: 2.56s
APPENDING 768/862 info of: 'incendies' (2010) with a reduced throttle of: 1.455s
APPENDING 769/862 info of: 'the kids are all right' (2010) with a reduced throttle of: 2.78s
APPENDING 770/862 info of: 'winter's bone' (2010) with a reduced throttle of: 1.37s
APPENDING 771/862 info of: 'a separation' (2011) with a reduced throttle of: 2.845s
APPENDING 772/862 info of: 'the tree of life' (2011) with a reduced throttle of: 1.56s
APPENDING 773/862 info of: 'hugo' (2011) with a reduced throttle of: 2.73s
APPENDING 774/862 info of: 'the descendants' (2011) with a reduced throttle of: 1.545s
APPENDING 775/862 info of: 'melancholia' (2011) with a reduced throttle of: 2.61s
APPENDING 776/862 info of: 'war horse' (2011) with a reduced throttle of: 2.77s
APPENDING 777/862 info of: 'the kid with a bike' (2011) with a reduced throttle of

0

In [129]:
b0rkedIndices = []
b0rkedBechdels = []

count = 0
for entry in scraped:
    count += 1
    if len(entry) == 1:
        b0rkedIndices.append(count)
        b0rkedBechdels.append(entry)
        
if len(b0rkedIndices) != len(b0rkedBechdels):
    print("   [!] Above For-Loop broke")

In [164]:
#goofsToFix = []
#
#for i in b0rkedIndices:
#    try:
#        goofsToFix.append([i-1, scraped[i-1], loserTitles[i-1]])
#    except:
#        goofsToFix.append([i-1, scraped[i-1], loserTitles[i-1]])
#
#for goof in goofsToFix:
#    print(goof, "\n")

In [161]:
scraped[11] = ['an italian straw hat', 'tt0018523', 1928, 0]
scraped[64] = ['the private life of henry viii', 'tt0024473', 1933, 0]
scraped[116] = ['le jour se lève', 'tt0031514', 1939, 0]
scraped[173] = ['ivan the terrible, part one', 'tt0037824', 1944, 0]
scraped[176] = ['rome open city', 'tt0038890', 1945, 0]
scraped[212] = ['whisky galore!', 'tt0042040', 1949, 0]
scraped[236] = ['europa \'51', 'tt0043511', 1952, 0]
scraped[285] = ['nights of cabiria', 'tt0050783', 1957, 1]
scraped[295] = ['ivan the terrible, part two', 'tt0051790', 1958, 0]
scraped[361] = ['the good, the bad, and the ugly', 'tt0060196', 1966, 0]
scraped[362] = ['andrei roublev', 'tt0060107', 1966, 0]
scraped[365] = ['au hasard balthazar', 'tt0060138', 1966, 0]
scraped[367] = ['masculin, féminin', 'tt0060675', 1966, 0]
scraped[381] = ['if...', 'tt0063850', 1968, 1]
scraped[419] = ['cries and whispers', 'tt0069467', 1972, 1]
scraped[438] = ['lacombe lucien', 'tt0071733', 1974, 0]
scraped[507] = ['fanny & alexander', 'tt0083922', 1982, 1]
scraped[551] = ['au revoir, les enfants', 'tt0092593', 1987, 0]
scraped[554] = ['yeelen', 'tt0094349', 1987, 0] #alternate title
scraped[617] = ['white', 'tt0111507', 1994, 1] # Drasticly diff titles if foreign
scraped[681] = ['the lord of the rings: fellowship of the ring', 'tt0120737', 2001, 0]
scraped[683] = ['lagaan', 'tt0169102', 2001, 0]
scraped[697] = ['the lord of the rings: return of the king', 'tt0167260', 2003, 0]
scraped[706] = ['good bye, lenin!', 'tt0301357', 2003, 0]
scraped[750] = ['wall•e', 'tt0910970', 2008, 0]
scraped[788] = ['blue is te warmest colour', 'tt2278871', 2013, 0]
scraped[844] = ['the mustang', 'tt5952594', 2019, 0] # bunch of these had hidden html of "\xa0" in the title
scraped[845] = ['apollo 11', 'tt8760684', 2019, 0]
scraped[846] = ['us', 'tt6857112', 2019, 0]
scraped[849] = ['knock down the house', 'tt9358052', 2019, 0]
scraped[850] = ['shazam!', 'tt0448115', 2019, 0]
scraped[854] = ['hail satan?', 'tt9358044', 2019, 0]
scraped[856] = ['long shot', 'tt2139881', 2019, 0]
scraped[857] = ['penguins', 'tt8080302', 2019, 0]
scraped[858] = ['pet sematary', 'tt0837563', 2019, 0]
scraped[861] = ['avengers: endgame', 'tt4154796', 2019, 0]

In [269]:
losers_titles = []
losers_tconsts = []
losers_years = []
losers_bechdels = []

for loser in scraped:
    losers_titles.append(loser[0])
    losers_tconsts.append(loser[1])
    losers_years.append(loser[2])
    losers_bechdels.append(loser[3])
    
d_losers = {'TCONST': losers_tconsts, 'TITLE': losers_titles, 'YEAR': losers_years, 'BECHDEL': losers_bechdels, 'RANK': loserRanks}
loseDF = pd.DataFrame(data=d_losers)
print("HEAD of [loseDF] results from `tconst` collection/scraping:\n")
print(loseDF.head())
print("\n\n\nTAIL of [loseDF] results from `tconst` collection/scraping:\n")
print(loseDF.tail())

HEAD of [loseDF] results from `tconst` collection/scraping:

      TCONST       TITLE  YEAR  BECHDEL  RANK
0  tt0018033  it          1927  1        10  
1  tt0017136  metropolis  1927  1        1   
2  tt0018455  sunrise     1927  1        2   
3  tt0253839  napoléon    1927  0        3   
4  tt0018526  underworld  1927  1        9   



TAIL of [loseDF] results from `tconst` collection/scraping:

        TCONST                    TITLE  YEAR  BECHDEL  RANK
857  tt8080302  penguins                 2019  0        9   
858  tt0837563  pet sematary             2019  0        9   
859  tt6513120  fighting with my family  2019  1        10  
860  tt8991268  honeyland                2019  0        7   
861  tt4154796  avengers: endgame        2019  0        1   


In [368]:
loser_Tconsts = loseDF['TCONST'].tolist()

def getloseDF_TITLE(count):
    return loseDF.loc[int(count - 1)].TITLE

def getloseDF_YEAR(count):
    return loseDF.loc[int(count - 1)].YEAR

def getloseDF_BECHDEL(count):
    return loseDF.loc[int(count - 1)].BECHDEL

tconstDoOvers = []
titleDoOvers = []
yearDoOvers = []
bechdelDoOvers = []
def changeTCONST_LIST(NEW_tconst, CHECK_title, CHECK_year, CHECK_bechdel):
    tconstDoOvers.append(NEW_tconst)
    titleDoOvers.append(CHECK_title)
    yearDoOvers.append(int(CHECK_year - 1))
    bechdelDoOvers.append(CHECK_bechdel)
    
    
    
loser_Tconsts[19] = "tt0019760"
loser_Tconsts[77] = "tt0026138"

In [369]:
# The below [empty] arrays declared are what we go and scrape for with the $tconstDoOvers
# (they originally had the incorrect `tconst`.
scores = []
votecount = []
budgets = []
companies = []
releasedDays = []
releasedMonths = []
releasedYears = []
releasedCountries = []

imdb_base = "https://www.imdb.com/title/"

count = 833
iterLength = len(loser_Tconsts)

for pk in loser_Tconsts[833:]:
    
    count += 1
    CHECK_title = getloseDF_TITLE(count)
    CHECK_year = getloseDF_YEAR(count)
    CHECK_bechdel = getloseDF_BECHDEL(count)
    
    # A random amount of time to throttle our requests is generated for each `tconst` 
    # iterration. That `tconst` is concactenated onto the base IMdB movie page URL, and
    # it is then visited. That page's html is parsed with Beautifulsoup4 and a `soup`
    # object is returned.
    throttle = getRateLimit()
    imdb_end = pk
    url = (imdb_base + imdb_end)
    print(f"SCRAPE {count}/{iterLength} >>>> tconst: '{imdb_end}' throttled by: {throttle}s title: '{CHECK_title}'")
    time.sleep(throttle)
    browser = init_splinter()
    browser.visit(url)
    html, soup = simmer_soup()
    
    # The movie's ratings and vote count parent tag is extracted from the `soup` object and 
    # declared as $ratings.
    ratings_parent = soup.find('div', class_='ratingValue').strong['title']
    ratings = ratings_parent.split()
    
    # The movie's rating's score is appended to the current iterration's extraction list.
    score = ratings[0]
    scores.append(score)

    # The movie's rating's vote count is appended to the current iterration's extraction list.
    totalVotes = ratings[3]
    votecount.append(totalVotes.replace(",", ""))
    
    # Booleans for each of the troublesome features are declared as nil before attempting to
    # extract their associated categorical data from IMdB.
    budgetBOOL = False
    dayBOOL = False
    monthBOOL = False
    yearBOOL = False
    countryBOOL = False
    
    # The movie's certs parent tag is extracted from the `soup` object and declared as
    # $cert_raw. It is then iterrated through looking for <h4> tags' text that equal
    # `Budget`, `Production Co`, and `Release Date` to extract as attributes to append
    # as associated information/features of the dataset.
    cert_raw = soup.find_all('div', class_='txt-block')
    for cert in cert_raw:
        try:
            
            # If the `Budget` <h4> text is encountered within the iterration of the <div>s of
            # .txt-block, and if its length is not null, its Boolean is set to true and its
            # value from the 8th character on are appended to the current iterration's 
            # extraction list.
            if cert.h4.text == "Budget:":
                budget_RAW = cert.text.split()
                budget = budget_RAW[0]               
                if len(budget[8:]) != 0 and len(budget[8:]) > 5: # New antiStr check
                    budgetBOOL = True
                    budgets.append(budget[8:].replace(",", ""))

            # If the `Production Co:` <h4> text is encountered within the iterration of the 
            # <div>s of .txt-block, its Boolean is set to true and its value from the 3rd 
            # indice/element on are appended to the current iterration's extraction list.
            elif cert.h4.text == "Production Co:":
                companies_RAW = cert.text.split()
                company = companies_RAW[2]            
                companies.append(company)   

            # If the `Production Co:` <h4> text is encountered within the iterration of the 
            # <div>s of .txt-block, its 3rd, 4th, and 5th indices/elements are extracted out
            # as new $VARs. The month ($release2) is converted to a numerical representation
            # of its string value, too.
            elif cert.h4.text == "Release Date:":
                release_RAW = cert.text.split()
                release1 = release_RAW[2]
                release2 = release_RAW[3]
                release3 = release_RAW[4]
                release2 = monthToNum(release2)
             
                # If the day ($release1) is not null, its Boolean is set to true and its value
                # is appended to the current iterration's extraction list.
                if len(release1) != 0:
                    dayBOOL = True
                    releasedDays.append(release1)
                
                # If the month ($release2) is not 1-12, its Boolean is set to true and its 
                # value is appended to the current iterration's extraction list.
                if release2 in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]:
                    monthBOOL = True
                    releasedMonths.append(release2)
                
                # If the year ($release3) is not null, its Boolean is set to true and its 
                # value is appended to the current iterration's extraction list.
                if len(release3) != 0:
                    yearBOOL = True
                    releasedYears.append(release3)
                
                # The 6th indice/element of $release_RAW is extracted out as a new variable,
                # $release_country. It also has its parens removed from its contents.
                release_country = release_RAW[5]
                release_country = release_country.replace("(", "")
                release_country = release_country.replace(")", "")
                
                # If the country ($release_country) is not null, its Boolean is set to true
                # and its value is appended to the current iterration's extraction list.
                if len(release_country) != 0:
                    countryBOOL = True
                    releasedCountries.append(release_country)
                    
        # Each iterration errors out at some point, so this allows the program to continue
        # regardless of what errors are being thrown about it.
        except:
            pass
    
    # I tried many ways to make this more elegant of code, but had to cut my losses for this
    # version that, while ugly: does work as intended. This is a refactored version of what
    # was all individual $___BOOL checks. If any $___BOOL $VARs are not True, then the user
    # is prompted if the existing `tconst` is indeed correct.
    if budgetBOOL != True or dayBOOL!= True or monthBOOL != True or yearBOOL != True or countryBOOL != True:
        print(f"   [!] ERROR [!]")
        tconstCheck = None
        while tconstCheck not in ("y", "n"):
            tconstCheck = str(input(f"   [!] Is {pk} the correct TCONST for '{CHECK_title}' ({CHECK_year})? [!]    [y]/[n] "))
            
            # If YES (the existing `tconst` is correct), the user is prompted to manually
            # input the data that was unable to be scraped.
            if tconstCheck == "y":
                if budgetBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $BUDGET$   MANUALLY (As a whole number): [!] "))
                    budgets.append(manualInsertion)
                if dayBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE DAY$   MANUALLY (As a whole number, no leading zeros): [!] "))
                    releasedDays.append(manualInsertion)
                if monthBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE_MONTH$   MANUALLY (As a whole number, no leading zero): [!] "))
                    releasedMonths.append(manualInsertion)
                if yearBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE_YEAR$   MANUALLY (As a whole number, 4 digits): [!] "))
                    releasedYears.append(manualInsertion)
                if countryBOOL != True:
                    manualInsertion = str(input(f"   [!] ENTER {pk}'s   $RELEASE_COUNTRY$   MANUALLY (As a string abbreviation, eg: 'USA'): [!] "))
                    releasedCountries.append(manualInsertion)
            
            # If NO (the existing `tconst` is incorrect), the user is prompted to manually
            # input the correct `tconst`, and if it has not already been appended to the
            # $tconstDoOvers, then it is added. 
            elif tconstCheck == "n":
                NEW_tconst = str(input("   [!] Enter the correct TCONST: [!] "))         
                if NEW_tconst not in tconstDoOvers:
                    changeTCONST_LIST(NEW_tconst, CHECK_title, CHECK_year, CHECK_bechdel)
            
                # Also, each bool that was left False (the scraper couldn't locate the correct
                # data) signals a value of `None` to be appended to the current iterration's
                # extraction list so that its row can later be dropped in favor of $tconstDoOvers    
                if budgetBOOL != True:
                    budgets.append(None)
                if dayBOOL != True:
                    releasedDays.append(None)
                if monthBOOL != True:
                    releasedMonths.append(None)
                if yearBOOL != True:
                    releasedYears.append(None)
                if countryBOOL != True:
                    releasedCountries.append(None)
            else:
                clear()
                print("      [!] [!]    You must enter 'y' or 'n'    [!] [!]")
                time.sleep(4)
                                    
    
    browser.quit()

os.system('say "skeet skeet skeet skeet skeet"')

SCRAPE 834/862 >>>> tconst: 'tt6135348' throttled by: 4.85s title: 'bpm (beats per minute)'
   [!] ERROR [!]
   [!] Is tt6135348 the correct TCONST for 'bpm (beats per minute)' (2017)? [!]    [y]/[n] y
   [!] ENTER tt6135348's   $BUDGET$   MANUALLY (As a whole number): [!] 6200000
SCRAPE 835/862 >>>> tconst: 'tt6155172' throttled by: 5.66s title: 'roma'
   [!] ERROR [!]
   [!] Is tt6155172 the correct TCONST for 'roma' (2018)? [!]    [y]/[n] y
   [!] ENTER tt6155172's   $BUDGET$   MANUALLY (As a whole number): [!] 15000000
SCRAPE 836/862 >>>> tconst: 'tt6543652' throttled by: 3.36s title: 'cold war'
SCRAPE 837/862 >>>> tconst: 'tt8075192' throttled by: 4.88s title: 'shoplifters'
   [!] ERROR [!]
   [!] Is tt8075192 the correct TCONST for 'shoplifters' (2018)? [!]    [y]/[n] y
   [!] ENTER tt8075192's   $BUDGET$   MANUALLY (As a whole number): [!] 200000
SCRAPE 838/862 >>>> tconst: 'tt6966692' throttled by: 5.74s title: 'green book'
SCRAPE 839/862 >>>> tconst: 'tt7349662' throttled by: 

0

In [496]:
#!   BATCH #1 == 0-77 .  !#
# Big attribution scraper errored out at #78, so I'm saving progress with what I do have since
# These older movies take a long time to track down the missing budget info.
#
#batch1_data = {'TCONST': loser_Tconsts[0:77], 'TITLE': losers_titles[0:77], 'YEAR': losers_years[0:77], 'BECHDEL': losers_bechdels[0:77]}
#batch1DF = pd.DataFrame(data=batch1_data)
#
#- - - - - - - - - - - - - - - - - - -#
####################batch1_companies = companies   # missing 2 values
#batch1_tconstDoOvers = tconstDoOvers
#batch1_titleDoOvers = titleDoOvers
#batch1_yearDoOvers = yearDoOvers
#batch1_bechdelDoOvers = bechdelDoOvers
#- - - - - - - - - - - - - - - - - - -#
#
#batch1DF['SCORES'] = scores
#batch1DF['VOTES'] = votecount
#batch1DF['BUDGETS'] = budgets
#batch1DF['RELEASED_DAY'] = releasedDays
#batch1DF['RELEASED_MON'] = releasedMonths
#batch1DF['RELEASED_YEAR'] = releasedYears
#batch1DF['COUNTRY'] = releasedCountries

#batch1DF.head()

In [468]:
#!   BATCH #2 == 77-214 .  !#
# Big attribution scraper errored out at #215, so I'm saving progress with what I do have since
# These older movies take a long time to track down the missing budget info.
#
#batch2_data = {'TCONST': loser_Tconsts[77:214], 'TITLE': losers_titles[77:214], 'YEAR': losers_years[77:214], 'BECHDEL': losers_bechdels[77:214]}
#batch2DF = pd.DataFrame(data=batch2_data)
#
#- - - - - - - - - - - - - - - - - - -#  
#batch2_tconstDoOvers = tconstDoOvers
#batch2_titleDoOvers = titleDoOvers
#batch2_yearDoOvers = yearDoOvers
#batch2_bechdelDoOvers = bechdelDoOvers
#- - - - - - - - - - - - - - - - - - -#

#batch2DF['SCORES'] = scores
#batch2DF['VOTES'] = votecount
#batch2DF['BUDGETS'] = budgets
#batch2DF['PRODUCTION_CO'] = companies
#batch2DF['RELEASED_DAY'] = releasedDays
#batch2DF['RELEASED_MON'] = releasedMonths
#batch2DF['RELEASED_YEAR'] = releasedYears
#batch2DF['COUNTRY'] = releasedCountries

# DROPPING PRODUCTION_CO FROM ALL FOR THE SAKE OF TIME
#batch2DF = batch2DF.drop(['PRODUCTION_CO'], axis=1).copy()
#batch2DF.head()

In [495]:
batches_1_2 = batch1DF.append(batch2DF)
#batches_1_2.head(10)

In [469]:
#!   BATCH #3 == 217-332 .  !#
# Big attribution scraper errored out at #333, so I'm saving progress with what I do have since
# These older movies take a long time to track down the missing budget info.
#
#batch3_data = {'TCONST': loser_Tconsts[217:333], 'TITLE': losers_titles[217:333], 'YEAR': losers_years[217:333], 'BECHDEL': losers_bechdels[217:333]}
#batch3DF = pd.DataFrame(data=batch3_data)
#
#batch3DF['SCORES'] = scores
#batch3DF['VOTES'] = votecount
#batch3DF['BUDGETS'] = budgets
#batch3DF['PRODUCTION_CO'] = companies
#batch3DF['RELEASED_DAY'] = releasedDays
#batch3DF['RELEASED_MON'] = releasedMonths
#batch3DF['RELEASED_YEAR'] = releasedYears
#batch3DF['COUNTRY'] = releasedCountries

# DROPPING PRODUCTION_CO FROM ALL FOR THE SAKE OF TIME
#batch3DF = batch3DF.drop(['PRODUCTION_CO'], axis=1).copy()
#batch3DF.head()

In [470]:
#!   BATCH #4 == 334-500 .  !#
# Big attribution scraper errored out at #500, so I'm saving progress with what I do have since
# These older movies take a long time to track down the missing budget info.
#
#batch4_data = {'TCONST': loser_Tconsts[334:500], 'TITLE': losers_titles[334:500], 'YEAR': losers_years[334:500], 'BECHDEL': losers_bechdels[334:500]}
#batch4DF = pd.DataFrame(data=batch4_data)
#
#- - - - - - - - - - - - - - - - - - -# 
########################batch4_companies = companies   # missing 2 values
#batch4_tconstDoOvers = tconstDoOvers
#batch4_titleDoOvers = titleDoOvers
#batch4_yearDoOvers = yearDoOvers
#batch4_bechdelDoOvers = bechdelDoOvers
#- - - - - - - - - - - - - - - - - - -#

#batch4DF['SCORES'] = scores
#batch4DF['VOTES'] = votecount
#batch4DF['BUDGETS'] = budgets
#batch4DF['RELEASED_DAY'] = releasedDays
#batch4DF['RELEASED_MON'] = releasedMonths
#batch4DF['RELEASED_YEAR'] = releasedYears
#batch4DF['COUNTRY'] = releasedCountries

#batch4DF.head()

In [497]:
batches_3_4 = batch3DF.append(batch4DF)
#batches_3_4.head(10)

In [500]:
batches_1234 = batches_1_2.append(batches_3_4)
#batches_1234.head(10)

In [473]:
#!   BATCH #5 == 501-570 .  !#
# Big attribution scraper errored out at #78, so I'm saving progress with what I do have since
# These older movies take a long time to track down the missing budget info.
#
#batch5_data = {'TCONST': loser_Tconsts[501:571], 'TITLE': losers_titles[501:571], 'YEAR': losers_years[501:571], 'BECHDEL': losers_bechdels[501:571]}
#batch5DF = pd.DataFrame(data=batch5_data)
#
#- - - - - - - - - - - - - - - - - - -#
################################batch5_companies = companies   # missing 1 value
#batch5_tconstDoOvers = tconstDoOvers
#batch5_titleDoOvers = titleDoOvers
#batch5_yearDoOvers = yearDoOvers
#batch5_bechdelDoOvers = bechdelDoOvers
#- - - - - - - - - - - - - - - - - - -#
#
#batch5DF['SCORES'] = scores
#batch5DF['VOTES'] = votecount
#batch5DF['BUDGETS'] = budgets
#batch5DF['RELEASED_DAY'] = releasedDays
#batch5DF['RELEASED_MON'] = releasedMonths
#batch5DF['RELEASED_YEAR'] = releasedYears
#batch5DF['COUNTRY'] = releasedCountries

#batch5DF.head()

In [474]:
#!   BATCH #6 == 572-831 .  !#
# Big attribution scraper errored out at #78, so I'm saving progress with what I do have since
# These older movies take a long time to track down the missing budget info.
#
#batch6_data = {'TCONST': loser_Tconsts[572:832], 'TITLE': losers_titles[572:832], 'YEAR': losers_years[572:832], 'BECHDEL': losers_bechdels[572:832]}
#batch6DF = pd.DataFrame(data=batch6_data)
#
#- - - - - - - - - - - - - - - - - - -#
#####################################batch6_companies = companies   # missing 1 value
#batch6_tconstDoOvers = tconstDoOvers
#batch6_titleDoOvers = titleDoOvers
#batch6_yearDoOvers = yearDoOvers
#batch6_bechdelDoOvers = bechdelDoOvers
#- - - - - - - - - - - - - - - - - - -#
#
#batch6DF['SCORES'] = scores
#batch6DF['VOTES'] = votecount
#batch6DF['BUDGETS'] = budgets
#batch6DF['RELEASED_DAY'] = releasedDays
#batch6DF['RELEASED_MON'] = releasedMonths
#batch6DF['RELEASED_YEAR'] = releasedYears
#batch6DF['COUNTRY'] = releasedCountries


#batch6DF.head()

In [501]:
batches_5_6 = batch5DF.append(batch6DF)
#batches_5_6.head(10)

In [503]:
batches_123456 = batches_1234.append(batches_5_6)
#batches_123456.head(10)

In [504]:
#!   BATCH #7 == 833-862 .  !#
#
#batch7_data = {'TCONST': loser_Tconsts[833:], 'TITLE': losers_titles[833:], 'YEAR': losers_years[833:], 'BECHDEL': losers_bechdels[833:]}
#batch7DF = pd.DataFrame(data=batch7_data)
#
#- - - - - - - - - - - - - - - - - - -#
#batch7_tconstDoOvers = tconstDoOvers
#batch7_titleDoOvers = titleDoOvers
#batch7_yearDoOvers = yearDoOvers
#batch7_bechdelDoOvers = bechdelDoOvers
#- - - - - - - - - - - - - - - - - - -#
#
#batch7DF['SCORES'] = scores
#batch7DF['VOTES'] = votecount
#batch7DF['BUDGETS'] = budgets
#batch7DF['PRODUCTION_CO'] = companies
#batch7DF['RELEASED_DAY'] = releasedDays
#batch7DF['RELEASED_MON'] = releasedMonths
#batch7DF['RELEASED_YEAR'] = releasedYears
#batch7DF['COUNTRY'] = releasedCountries

# DROPPING PRODUCTION_CO FROM ALL FOR THE SAKE OF TIME
#batch7DF = batch7DF.drop(['PRODUCTION_CO'], axis=1).copy()
#batch7DF.head(5)

In [516]:
batches_1234567 = batches_123456.append(batch7DF)
#batches_1234567.head(10)

In [520]:
# We'll fix all the DoOvers we happened upon and saved for this moment. Each batch's lists
# are extended into a brand new one before final cleaning is performed and they're then set
# into a new DataFrame $doOversDF before scraping for their attributes again now with correct
# data associated with them!
#redo_tconsts = []
#redo_tconsts.extend(batch1_tconstDoOvers)
#redo_tconsts.extend(batch2_tconstDoOvers)
#redo_tconsts.extend(batch3_tconstDoOvers)
#redo_tconsts.extend(batch4_tconstDoOvers)
#redo_tconsts.extend(batch5_tconstDoOvers)
#redo_tconsts.extend(batch6_tconstDoOvers)
#redo_tconsts.extend(batch7_tconstDoOvers)
#
#redo_titles = []
#redo_titles.extend(batch1_titleDoOvers)
#redo_titles.extend(batch2_titleDoOvers)
#redo_titles.extend(batch3_titleDoOvers)
#redo_titles.extend(batch4_titleDoOvers)
#redo_titles.extend(batch5_titleDoOvers)
#redo_titles.extend(batch6_titleDoOvers)
#redo_titles.extend(batch7_titleDoOvers)
#
#redo_years = []
#redo_years.extend(batch1_yearDoOvers)
#redo_years.extend(batch2_yearDoOvers)
#redo_years.extend(batch3_yearDoOvers)
#redo_years.extend(batch4_yearDoOvers)
#redo_years.extend(batch5_yearDoOvers)
#redo_years.extend(batch6_yearDoOvers)
#edo_years.extend(batch7_yearDoOvers)
#
#redo_bechdels = []
#redo_bechdels.extend(batch1_bechdelDoOvers)
#redo_bechdels.extend(batch2_bechdelDoOvers)
#edo_bechdels.extend(batch3_bechdelDoOvers)
#redo_bechdels.extend(batch4_bechdelDoOvers)
#edo_bechdels.extend(batch5_bechdelDoOvers)
#edo_bechdels.extend(batch6_bechdelDoOvers)
#edo_bechdels.extend(batch7_bechdelDoOvers)
#
#redo_tconsts[0] = "tt0018379"

#doOvers_data = {'TCONST': redo_tconsts, 'TITLE': redo_titles, 'YEAR': redo_years, 'BECHDEL': redo_bechdels}
#doOversDF = pd.DataFrame(data=doOvers_data)
#doOversDF.head()


In [521]:
batches_1234567.head()

Unnamed: 0,TCONST,TITLE,YEAR,BECHDEL,SCORES,VOTES,BUDGETS,RELEASED_DAY,RELEASED_MON,RELEASED_YEAR,COUNTRY
0,tt0018033,it,1927,1,7.4,2726,2000000.0,15,2.0,1927,USA
1,tt0017136,metropolis,1927,1,8.3,145916,3407040.0,13,3.0,1927,USA
2,tt0018455,sunrise,1927,1,8.1,43717,200000.0,4,11.0,1927,USA
3,tt0253839,napoléon,1927,0,7.4,3851,41000000.0,17,9.0,2002,Italy
4,tt0018526,underworld,1927,1,7.6,2228,100000.0,29,10.0,1927,USA


In [413]:
# The below [empty] arrays declared are what we go and scrape for with the $redo_tconsts
# (they originally had the incorrect `tconst` values in the big scrape).
scores = []
votecount = []
budgets = []
companies = []
releasedDays = []
releasedMonths = []
releasedYears = []
releasedCountries = []

imdb_base = "https://www.imdb.com/title/"

count = 0
iterLength = len(redo_tconsts)

for pk in redo_tconsts:
    
    count += 1
    
    # A random amount of time to throttle our requests is generated for each `tconst` 
    # iterration. That `tconst` is concactenated onto the base IMdB movie page URL, and
    # it is then visited. That page's html is parsed with Beautifulsoup4 and a `soup`
    # object is returned.
    throttle = getRateLimit()
    imdb_end = pk
    url = (imdb_base + imdb_end)
    print(f"SCRAPE {count}/{iterLength}   >>>>   tconst: '{imdb_end}'   throttled by: {throttle}s")
    time.sleep(throttle)
    browser = init_splinter()
    browser.visit(url)
    html, soup = simmer_soup()
    
    # The movie's ratings and vote count parent tag is extracted from the `soup` object and 
    # declared as $ratings.
    ratings_parent = soup.find('div', class_='ratingValue').strong['title']
    ratings = ratings_parent.split()
    
    # The movie's rating's score is appended to the current iterration's extraction list.
    score = ratings[0]
    scores.append(score)

    # The movie's rating's vote count is appended to the current iterration's extraction list.
    totalVotes = ratings[3]
    votecount.append(totalVotes.replace(",", ""))
    
    # Booleans for each of the troublesome features are declared as nil before attempting to
    # extract their associated categorical data from IMdB.
    budgetBOOL = False
    dayBOOL = False
    monthBOOL = False
    yearBOOL = False
    countryBOOL = False
    
    # The movie's certs parent tag is extracted from the `soup` object and declared as
    # $cert_raw. It is then iterrated through looking for <h4> tags' text that equal
    # `Budget`, `Production Co`, and `Release Date` to extract as attributes to append
    # as associated information/features of the dataset.
    cert_raw = soup.find_all('div', class_='txt-block')
    for cert in cert_raw:
        try:
            
            # If the `Budget` <h4> text is encountered within the iterration of the <div>s of
            # .txt-block, and if its length is not null, its Boolean is set to true and its
            # value from the 8th character on are appended to the current iterration's 
            # extraction list.
            if cert.h4.text == "Budget:":
                budget_RAW = cert.text.split()
                budget = budget_RAW[0]               
                if len(budget[8:]) != 0:
                    budgetBOOL = True
                    budgets.append(budget[8:].replace(",", ""))

            # If the `Production Co:` <h4> text is encountered within the iterration of the 
            # <div>s of .txt-block, its Boolean is set to true and its value from the 3rd 
            # indice/element on are appended to the current iterration's extraction list.
            elif cert.h4.text == "Production Co:":
                companies_RAW = cert.text.split()
                company = companies_RAW[2]            
                companies.append(company)   

            # If the `Production Co:` <h4> text is encountered within the iterration of the 
            # <div>s of .txt-block, its 3rd, 4th, and 5th indices/elements are extracted out
            # as new $VARs. The month ($release2) is converted to a numerical representation
            # of its string value, too.
            elif cert.h4.text == "Release Date:":
                release_RAW = cert.text.split()
                release1 = release_RAW[2]
                release2 = release_RAW[3]
                release3 = release_RAW[4]
                release2 = monthToNum(release2)
             
                # If the day ($release1) is not null, its Boolean is set to true and its value
                # is appended to the current iterration's extraction list.
                if len(release1) != 0:
                    dayBOOL = True
                    releasedDays.append(release1)
                
                # If the month ($release2) is not 1-12, its Boolean is set to true and its 
                # value is appended to the current iterration's extraction list.
                if release2 in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]:
                    monthBOOL = True
                    releasedMonths.append(release2)
                
                # If the year ($release3) is not null, its Boolean is set to true and its 
                # value is appended to the current iterration's extraction list.
                if len(release3) != 0:
                    yearBOOL = True
                    releasedYears.append(release3)
                
                # The 6th indice/element of $release_RAW is extracted out as a new variable,
                # $release_country. It also has its parens removed from its contents.
                release_country = release_RAW[5]
                release_country = release_country.replace("(", "")
                release_country = release_country.replace(")", "")
                
                # If the country ($release_country) is not null, its Boolean is set to true
                # and its value is appended to the current iterration's extraction list.
                if len(release_country) != 0:
                    countryBOOL = True
                    releasedCountries.append(release_country)
                    
        # Each iterration errors out at some point, so this allows the program to continue
        # regardless of what errors are being thrown about it.
        except:
            pass
    
    # I tried many ways to make this more elegant of code, but had to cut my losses for this
    # version that, while ugly: does work as intended. This is a refactored version of what
    # was all individual $___BOOL checks. If any $___BOOL $VARs are not True, then the user
    # is prompted if the existing `tconst` is indeed correct.
    if budgetBOOL != True or dayBOOL!= True or monthBOOL != True or yearBOOL != True or countryBOOL != True:
        print(f"   [!] ERROR [!]")
        tconstCheck = None
        while tconstCheck not in ("y", "n"):
            tconstCheck = str(input(f"   [!] Is {pk} the correct TCONST? [!]    [y]/[n] "))
            
            # If YES (the existing `tconst` is correct), the user is prompted to manually
            # input the data that was unable to be scraped.
            if tconstCheck == "y":
                if budgetBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $BUDGET$   MANUALLY (As a whole number): [!] "))
                    budgets.append(manualInsertion)
                if dayBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE DAY$   MANUALLY (As a whole number, no leading zeros): [!] "))
                    releasedDays.append(manualInsertion)
                if monthBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE_MONTH$   MANUALLY (As a whole number, no leading zero): [!] "))
                    releasedMonths.append(manualInsertion)
                if yearBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE_YEAR$   MANUALLY (As a whole number, 4 digits): [!] "))
                    releasedYears.append(manualInsertion)
                if countryBOOL != True:
                    manualInsertion = str(input(f"   [!] ENTER {pk}'s   $RELEASE_COUNTRY$   MANUALLY (As a string abbreviation, eg: 'USA'): [!] "))
                    releasedCountries.append(manualInsertion)
            
            # If NO (the existing `tconst` is incorrect), the user is prompted to manually
            # input the correct `tconst`, and if it has not already been appended to the
            # $redo_tconsts, then it is added. 
            elif tconstCheck == "n":
                NEW_tconst = str(input("   [!] Enter the correct TCONST: [!] "))         
                if NEW_tconst not in redo_tconsts:
                    changeTCONST_LIST(NEW_tconst)
                    
                # Also, each bool that was left False (the scraper couldn't locate the correct
                # data) signals a value of `None` to be appended to the current iterration's
                # extraction list so that its row can later be dropped in favor of $redo_tconsts    
                if budgetBOOL != True:
                    budgets.append(None)
                if dayBOOL != True:
                    releasedDays.append(None)
                if monthBOOL != True:
                    releasedMonths.append(None)
                if yearBOOL != True:
                    releasedYears.append(None)
                if countryBOOL != True:
                    releasedCountries.append(None)
            else:
                clear()
                print("      [!] [!]    You must enter 'y' or 'n'    [!] [!]")
                time.sleep(4)
                                    
    
    browser.quit()

os.system('say "to the WINDOW!"')

SCRAPE 1/9   >>>>   tconst: 'tt0018379'   throttled by: 3.29s
SCRAPE 2/9   >>>>   tconst: 'tt0019688'   throttled by: 4.47s
   [!] ERROR [!]
   [!] Is tt0019688 the correct TCONST? [!]    [y]/[n] y
   [!] ENTER tt0019688's   $BUDGET$   MANUALLY (As a whole number): [!] 2500
SCRAPE 3/9   >>>>   tconst: 'tt0021577'   throttled by: 4.83s
   [!] ERROR [!]
   [!] Is tt0021577 the correct TCONST? [!]    [y]/[n] y
   [!] ENTER tt0021577's   $BUDGET$   MANUALLY (As a whole number): [!] 169309
SCRAPE 4/9   >>>>   tconst: 'tt0024803'   throttled by: 3.27s
   [!] ERROR [!]
   [!] Is tt0024803 the correct TCONST? [!]    [y]/[n] y
   [!] ENTER tt0024803's   $BUDGET$   MANUALLY (As a whole number): [!] 33862
SCRAPE 5/9   >>>>   tconst: 'tt0028950'   throttled by: 3.58s
   [!] ERROR [!]
   [!] Is tt0028950 the correct TCONST? [!]    [y]/[n] y
   [!] ENTER tt0028950's   $BUDGET$   MANUALLY (As a whole number): [!] 70000
SCRAPE 6/9   >>>>   tconst: 'tt0066026'   throttled by: 4.49s
   [!] ERROR [!]
   

0

In [526]:
#batches_1234567.head(2)

In [525]:
#doOversDF['RATING'] = scores
#doOversDF['VOTES'] = votecount
#doOversDF['PRODUCTION_CO'] = companies
#doOversDF['BUDGET'] = budgets
#doOversDF['RELEASE_DAY'] = releasedDays
#doOversDF['RELEASE_MON'] = releasedMonths
#doOversDF['RELEASE_YEAR'] = releasedYears
#doOversDF['COUNTRY'] = releasedCountries

# DROPPING PRODUCTION_CO FROM ALL FOR THE SAKE OF TIME
#doOversDF = doOversDF.drop(['PRODUCTION_CO'], axis=1).copy()
doOversDF.rename(columns={'RATING': 'SCORES',
                         'BUDGET': 'BUDGETS',
                         'RELEASE_DAY': 'RELEASED_DAY',
                         'RELEASE_MON': 'RELEASED_MON',
                         'RELEASE_YEAR': 'RELEASED_YEAR'},
                inplace=True)
#doOversDF.head(2)

In [528]:
allBatches_AND_doOvers = batches_1234567.append(doOversDF)
#allBatches_AND_doOvers.head()

Unnamed: 0,TCONST,TITLE,YEAR,BECHDEL,SCORES,VOTES,BUDGETS,RELEASED_DAY,RELEASED_MON,RELEASED_YEAR,COUNTRY
0,tt0018033,it,1927,1,7.4,2726,2000000.0,15,2.0,1927,USA
1,tt0017136,metropolis,1927,1,8.3,145916,3407040.0,13,3.0,1927,USA
2,tt0018455,sunrise,1927,1,8.1,43717,200000.0,4,11.0,1927,USA
3,tt0253839,napoléon,1927,0,7.4,3851,41000000.0,17,9.0,2002,Italy
4,tt0018526,underworld,1927,1,7.6,2228,100000.0,29,10.0,1927,USA


In [430]:
# Now we make one out of all the duds we encountered along the way....

#-[Trouble Indices]-------[My remedies found]-#
#loser_Tconsts[214]     correct pk => tt0042876
#loser_Tconsts[215]     correct pk => tt0043014
#loser_Tconsts[216]     correct pk => tt0042804
#loser_Tconsts[333]     correct pk => tt0056801
#loser_Tconsts[500]     correct pk => tt0082694
#loser_Tconsts[571]     correct pk => tt0098635
#loser_Tconsts[831]     correct pk => tt6304162

formerDuds = ['tt0042876', 'tt0043014', 'tt0042804', 'tt0056801', 'tt0082694', 'tt0098635', 'tt6304162']
duds_titles = [losers_titles[214], losers_titles[215], losers_titles[216], losers_titles[333], losers_titles[500], losers_titles[571], losers_titles[831]]
duds_years = [losers_years[214], losers_years[215], losers_years[216], losers_years[333], losers_years[500], losers_years[571], losers_years[831]]
duds_bechdels = [losers_bechdels[214], losers_bechdels[215], losers_bechdels[216], losers_bechdels[333], losers_bechdels[500], losers_bechdels[571], losers_bechdels[831]]

duds_data = {'TCONST': formerDuds, 'TITLE': duds_titles, 'YEAR': duds_years, 'BECHDEL': duds_bechdels}
dudsDF = pd.DataFrame(data=duds_data)
#dudsDF

In [431]:
# The below [empty] arrays declared are what we go and scrape for with the $redo_tconsts
# (they originally had the incorrect `tconst` values in the big scrape).
scores = []
votecount = []
budgets = []
companies = []
releasedDays = []
releasedMonths = []
releasedYears = []
releasedCountries = []

imdb_base = "https://www.imdb.com/title/"

count = 0
iterLength = len(formerDuds)

for pk in formerDuds:
    
    count += 1
    
    # A random amount of time to throttle our requests is generated for each `tconst` 
    # iterration. That `tconst` is concactenated onto the base IMdB movie page URL, and
    # it is then visited. That page's html is parsed with Beautifulsoup4 and a `soup`
    # object is returned.
    throttle = getRateLimit()
    imdb_end = pk
    url = (imdb_base + imdb_end)
    print(f"SCRAPE {count}/{iterLength}   >>>>   tconst: '{imdb_end}'   throttled by: {throttle}s")
    time.sleep(throttle)
    browser = init_splinter()
    browser.visit(url)
    html, soup = simmer_soup()
    
    # The movie's ratings and vote count parent tag is extracted from the `soup` object and 
    # declared as $ratings.
    ratings_parent = soup.find('div', class_='ratingValue').strong['title']
    ratings = ratings_parent.split()
    
    # The movie's rating's score is appended to the current iterration's extraction list.
    score = ratings[0]
    scores.append(score)

    # The movie's rating's vote count is appended to the current iterration's extraction list.
    totalVotes = ratings[3]
    votecount.append(totalVotes.replace(",", ""))
    
    # Booleans for each of the troublesome features are declared as nil before attempting to
    # extract their associated categorical data from IMdB.
    budgetBOOL = False
    dayBOOL = False
    monthBOOL = False
    yearBOOL = False
    countryBOOL = False
    
    # The movie's certs parent tag is extracted from the `soup` object and declared as
    # $cert_raw. It is then iterrated through looking for <h4> tags' text that equal
    # `Budget`, `Production Co`, and `Release Date` to extract as attributes to append
    # as associated information/features of the dataset.
    cert_raw = soup.find_all('div', class_='txt-block')
    for cert in cert_raw:
        try:
            
            # If the `Budget` <h4> text is encountered within the iterration of the <div>s of
            # .txt-block, and if its length is not null, its Boolean is set to true and its
            # value from the 8th character on are appended to the current iterration's 
            # extraction list.
            if cert.h4.text == "Budget:":
                budget_RAW = cert.text.split()
                budget = budget_RAW[0]               
                if len(budget[8:]) != 0:
                    budgetBOOL = True
                    budgets.append(budget[8:].replace(",", ""))

            # If the `Production Co:` <h4> text is encountered within the iterration of the 
            # <div>s of .txt-block, its Boolean is set to true and its value from the 3rd 
            # indice/element on are appended to the current iterration's extraction list.
            elif cert.h4.text == "Production Co:":
                companies_RAW = cert.text.split()
                company = companies_RAW[2]            
                companies.append(company)   

            # If the `Production Co:` <h4> text is encountered within the iterration of the 
            # <div>s of .txt-block, its 3rd, 4th, and 5th indices/elements are extracted out
            # as new $VARs. The month ($release2) is converted to a numerical representation
            # of its string value, too.
            elif cert.h4.text == "Release Date:":
                release_RAW = cert.text.split()
                release1 = release_RAW[2]
                release2 = release_RAW[3]
                release3 = release_RAW[4]
                release2 = monthToNum(release2)
             
                # If the day ($release1) is not null, its Boolean is set to true and its value
                # is appended to the current iterration's extraction list.
                if len(release1) != 0:
                    dayBOOL = True
                    releasedDays.append(release1)
                
                # If the month ($release2) is not 1-12, its Boolean is set to true and its 
                # value is appended to the current iterration's extraction list.
                if release2 in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]:
                    monthBOOL = True
                    releasedMonths.append(release2)
                
                # If the year ($release3) is not null, its Boolean is set to true and its 
                # value is appended to the current iterration's extraction list.
                if len(release3) != 0:
                    yearBOOL = True
                    releasedYears.append(release3)
                
                # The 6th indice/element of $release_RAW is extracted out as a new variable,
                # $release_country. It also has its parens removed from its contents.
                release_country = release_RAW[5]
                release_country = release_country.replace("(", "")
                release_country = release_country.replace(")", "")
                
                # If the country ($release_country) is not null, its Boolean is set to true
                # and its value is appended to the current iterration's extraction list.
                if len(release_country) != 0:
                    countryBOOL = True
                    releasedCountries.append(release_country)
                    
        # Each iterration errors out at some point, so this allows the program to continue
        # regardless of what errors are being thrown about it.
        except:
            pass
    
    # I tried many ways to make this more elegant of code, but had to cut my losses for this
    # version that, while ugly: does work as intended. This is a refactored version of what
    # was all individual $___BOOL checks. If any $___BOOL $VARs are not True, then the user
    # is prompted if the existing `tconst` is indeed correct.
    if budgetBOOL != True or dayBOOL!= True or monthBOOL != True or yearBOOL != True or countryBOOL != True:
        print(f"   [!] ERROR [!]")
        tconstCheck = None
        while tconstCheck not in ("y", "n"):
            tconstCheck = str(input(f"   [!] Is {pk} the correct TCONST? [!]    [y]/[n] "))
            
            # If YES (the existing `tconst` is correct), the user is prompted to manually
            # input the data that was unable to be scraped.
            if tconstCheck == "y":
                if budgetBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $BUDGET$   MANUALLY (As a whole number): [!] "))
                    budgets.append(manualInsertion)
                if dayBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE DAY$   MANUALLY (As a whole number, no leading zeros): [!] "))
                    releasedDays.append(manualInsertion)
                if monthBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE_MONTH$   MANUALLY (As a whole number, no leading zero): [!] "))
                    releasedMonths.append(manualInsertion)
                if yearBOOL != True:
                    manualInsertion = float(input(f"   [!] ENTER {pk}'s   $RELEASE_YEAR$   MANUALLY (As a whole number, 4 digits): [!] "))
                    releasedYears.append(manualInsertion)
                if countryBOOL != True:
                    manualInsertion = str(input(f"   [!] ENTER {pk}'s   $RELEASE_COUNTRY$   MANUALLY (As a string abbreviation, eg: 'USA'): [!] "))
                    releasedCountries.append(manualInsertion)
            
            # If NO (the existing `tconst` is incorrect), the user is prompted to manually
            # input the correct `tconst`, and if it has not already been appended to the
            # $formerDuds, then it is added. 
            elif tconstCheck == "n":
                NEW_tconst = str(input("   [!] Enter the correct TCONST: [!] "))         
                if NEW_tconst not in formerDuds:
                    changeTCONST_LIST(NEW_tconst)
                    
                # Also, each bool that was left False (the scraper couldn't locate the correct
                # data) signals a value of `None` to be appended to the current iterration's
                # extraction list so that its row can later be dropped in favor of $formerDuds    
                if budgetBOOL != True:
                    budgets.append(None)
                if dayBOOL != True:
                    releasedDays.append(None)
                if monthBOOL != True:
                    releasedMonths.append(None)
                if yearBOOL != True:
                    releasedYears.append(None)
                if countryBOOL != True:
                    releasedCountries.append(None)
            else:
                clear()
                print("      [!] [!]    You must enter 'y' or 'n'    [!] [!]")
                time.sleep(4)
                                    
    
    browser.quit()

os.system('say "to the WALL!"')

SCRAPE 1/7   >>>>   tconst: 'tt0042876'   throttled by: 5.92s
SCRAPE 2/7   >>>>   tconst: 'tt0043014'   throttled by: 2.71s
SCRAPE 3/7   >>>>   tconst: 'tt0042804'   throttled by: 3.76s
   [!] ERROR [!]
   [!] Is tt0042804 the correct TCONST? [!]    [y]/[n] y
   [!] ENTER tt0042804's   $BUDGET$   MANUALLY (As a whole number): [!] 23092
SCRAPE 4/7   >>>>   tconst: 'tt0056801'   throttled by: 4.15s
   [!] ERROR [!]
   [!] Is tt0056801 the correct TCONST? [!]    [y]/[n] y
   [!] ENTER tt0056801's   $BUDGET$   MANUALLY (As a whole number): [!] 900000
SCRAPE 5/7   >>>>   tconst: 'tt0082694'   throttled by: 4.15s
SCRAPE 6/7   >>>>   tconst: 'tt0098635'   throttled by: 3.69s
SCRAPE 7/7   >>>>   tconst: 'tt6304162'   throttled by: 5.6s
   [!] ERROR [!]
   [!] Is tt6304162 the correct TCONST? [!]    [y]/[n] y
   [!] ENTER tt6304162's   $BUDGET$   MANUALLY (As a whole number): [!] 6000000


0

In [532]:
#allBatches_AND_doOvers.head(2)

In [533]:
#dudsDF['RATING'] = scores
#dudsDF['VOTES'] = votecount
#dudsDF['PRODUCTION_CO'] = companies
#dudsDF['BUDGET'] = budgets
#dudsDF['RELEASE_DAY'] = releasedDays
#dudsDF['RELEASE_MON'] = releasedMonths
#dudsDF['RELEASE_YEAR'] = releasedYears
#dudsDF['COUNTRY'] = releasedCountries

# DROPPING PRODUCTION_CO FROM ALL FOR THE SAKE OF TIME
#dudsDF = dudsDF.drop(['PRODUCTION_CO'], axis=1).copy()
dudsDF.rename(columns={'RATING': 'SCORES',
                         'BUDGET': 'BUDGETS',
                         'RELEASE_DAY': 'RELEASED_DAY',
                         'RELEASE_MON': 'RELEASED_MON',
                         'RELEASE_YEAR': 'RELEASED_YEAR'},
                inplace=True)
#dudsDF.head(2)

In [569]:
# This is like a periscope:   allLosers[allLosers['TITLE'] == "meet me in st. louis"]

# This is now commented out so that we can use its csv output instead. All DataFrames created
# from the non-winning (LOSERS) films (~9 in addition the the winner for each year) are joined
# , if any Null values are found, their entire row is dropped by design. Corrected rows are
# then appeneded, various features are cleaned/manipulated, before once again droping any more
# Null rows, sorting the entire DataFrame by 'YEAR', reseting the index, and finally adding
# on the 'WINNER' column with appropriate bools.

#allLosers = allBatches_AND_doOvers.append(dudsDF)
#allLosers.dropna(inplace=True)
#
#allLosers = allLosers.sort_values(by='YEAR').copy()
#allLosers.reset_index(drop=True, inplace=True)
#
#
#
#allLosers.set_value(846, 'BUDGETS', 40000000)
#
#allLosers.set_value(5, 'TCONST', 'tt0018192')
#allLosers.set_value(5, 'SCORES', 7.7)
#allLosers.set_value(5, 'VOTES', 6295)
#allLosers.set_value(5, 'BUDGETS', 152000)
#allLosers.set_value(5, 'RELEASED_DAY', 7)
#allLosers.set_value(5, 'RELEASED_MON', 4)
#allLosers.set_value(5, 'RELEASED_YEAR', 1927)
#allLosers.set_value(5, 'COUNTRY', 'Germany')
#
#allLosers.set_value(167, 'RELEASED_DAY', 22)
#
#
#
#allLosers = allLosers.sort_values(by='YEAR').copy()
#allLosers.reset_index(drop=True, inplace=True)
#
#winnersCol = []
#for row in allLosers.iterrows():
#    if row[1].TITLE != "green book":
#        winnersCol.append(0)
#    else:
#        winnersCol.append(1)
#allLosers['WINNER'] = winnersCol
#    
#allLosers.to_csv("/Users/nicolespaar/Desktop/allLosers.csv")

  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  del sys.path[0]
  
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [576]:
# X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X #
# X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X #
# X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X #
# X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X #
# X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X #
# X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X #
# X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X #
# X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X #
# X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X #
# X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X #
# X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X - X #

In [3]:
# I import the Best Picture losing data's csv, ensuring all numerical representaions are
# brought in as type float (the machine learning models require it to be as such.)
LOSE = pd.read_csv("/Users/nicolespaar/Desktop/allLosers.csv", dtype={'YEAR': float,
                                                                      'BECHDEL': float,
                                                                      'SCORES': float,
                                                                      'VOTES': float,
                                                                      'BUDGETS': float,
                                                                      'RELEASED_DAY': float,
                                                                      'RELEASED_MON': float,
                                                                      'RELEASED_YEAR': float,
                                                                      'WINNER': float})
DF_LOSE = LOSE.drop(['Unnamed: 0'], axis=1).copy()
DF_LOSE.head()

Unnamed: 0,TCONST,TITLE,YEAR,BECHDEL,SCORES,VOTES,BUDGETS,RELEASED_DAY,RELEASED_MON,RELEASED_YEAR,COUNTRY,WINNER
0,tt0018379,seventh heaven,1926.0,0.0,7.7,2801.0,1300000.0,30.0,10.0,1927.0,USA,0.0
1,tt0018037,the jazz singer,1927.0,0.0,6.7,8254.0,422000.0,6.0,10.0,1927.0,USA,0.0
2,tt0037024,the lodger,1927.0,0.0,7.2,2751.0,800000.0,19.0,1.0,1944.0,USA,0.0
3,tt0018051,the kid brother,1927.0,0.0,7.6,3448.0,250000.0,17.0,1.0,1927.0,USA,0.0
4,tt0018033,it,1927.0,1.0,7.4,2726.0,2000000.0,15.0,2.0,1927.0,USA,0.0


In [4]:
# I import the Best Picture winning data's csv, ensuring all numerical representaions are
# brought in as type float (the machine learning models require it to be as such.)
WIN = pd.read_csv("/Users/nicolespaar/Desktop/allDF.csv", dtype={'YEAR': float,
                                                                'BECHDEL': float,
                                                                'RATING': float,
                                                                'VOTES': float,
                                                                'BUDGET': float,
                                                                'RELEASE_DAY': float,
                                                                'RELEASE_MON': float,
                                                                'RELEASE_YEAR': float,
                                                                'WINNER': float,
                                                                'RANK': float})
WIN.rename(columns={'RATING': 'SCORES',
                    'BUDGET': 'BUDGETS',
                    'RELEASE_DAY': 'RELEASED_DAY',
                    'RELEASE_MON': 'RELEASED_MON',
                    'RELEASE_YEAR': 'RELEASED_YEAR'}, 
           inplace=True)

# DROPPING PRODUCTION_CO & RANK FROM ALL FOR THE SAKE OF TIME
DF_WIN = WIN.drop(['Unnamed: 0', 'PRODUCTION_CO', 'RANK'], axis=1).copy()
DF_WIN.head()

Unnamed: 0,TCONST,TITLE,YEAR,BECHDEL,SCORES,VOTES,BUDGETS,RELEASED_DAY,RELEASED_MON,RELEASED_YEAR,COUNTRY,WINNER
0,tt0018578,wings,1927.0,1.0,7.7,10234.0,2000000.0,5.0,1.0,1929.0,USA,1.0
1,tt0019729,the broadway melody,1928.0,0.0,6.1,5836.0,379000.0,6.0,6.0,1929.0,USA,1.0
2,tt0020629,all quiet on the western front,1929.0,1.0,8.0,53729.0,1448864.0,24.0,8.0,1930.0,USA,1.0
3,tt0022958,grand hotel,1930.0,1.0,7.5,15822.0,700000.0,11.0,9.0,1932.0,USA,1.0
4,tt0021746,cimarron,1931.0,1.0,5.9,4837.0,1433000.0,9.0,2.0,1931.0,USA,1.0


In [5]:
# Create the big mamba jam DataFrame of everything.
DF = DF_WIN.append(DF_LOSE)
#DF.columns
DF.dropna(inplace=True)
DF = DF.sort_values(by='YEAR').copy()
DF.reset_index(drop=True, inplace=True)
DF.head(5)
#DF[DF.WINNER == 1]

Unnamed: 0,TCONST,TITLE,YEAR,BECHDEL,SCORES,VOTES,BUDGETS,RELEASED_DAY,RELEASED_MON,RELEASED_YEAR,COUNTRY,WINNER
0,tt0018379,seventh heaven,1926.0,0.0,7.7,2801.0,1300000.0,30.0,10.0,1927.0,USA,0.0
1,tt0018578,wings,1927.0,1.0,7.7,10234.0,2000000.0,5.0,1.0,1929.0,USA,1.0
2,tt0018037,the jazz singer,1927.0,0.0,6.7,8254.0,422000.0,6.0,10.0,1927.0,USA,0.0
3,tt0037024,the lodger,1927.0,0.0,7.2,2751.0,800000.0,19.0,1.0,1944.0,USA,0.0
4,tt0018051,the kid brother,1927.0,0.0,7.6,3448.0,250000.0,17.0,1.0,1927.0,USA,0.0


In [6]:
    allCountries = DF['COUNTRY'].tolist()
    allCountries = list(dict.fromkeys(allCountries))

In [7]:
def country2number(Country):
    return{
        'USA': 35,
        'Germany': 1,
        'France': 2,
        'Denmark': 3,
        'Sweden': 4,
        'Argentina': 5,
        'Japan': 6,
        'Portugal': 7,
        'Brazil': 8,
        'Uruguay': 9,
        'Scotland': 10,
        'Finland': 11,
        'Italy': 12,
        'Australia': 13,
        'UK': 14,
        'West': 15,
        'India': 16,
        'West Germany': 1,
        'Soviet': 17,
        'Czechoslovakia': 18,
        'USSR': 17,
        'Canada': 19,
        'Spain': 20,
        'Poland': 21,
        'Iceland': 22,
        'Serbia': 23,
        'Hungary': 24,
        'Greece': 25,
        'Taiwan': 26,
        'Ireland': 27,
        'South': 28,
        'Romania': 29,
        'Israel': 30,
        'Belgium': 31,
        'Iran': 32,
        'Russia': 17,
        'Philippines': 33,
        'Lebanon': 34,
        'Mexico': 36
    }[Country]

In [8]:
numCountries = []
for row in DF.iterrows():
    stringCountry = row[1].COUNTRY
    encodedCountry = country2number(stringCountry)
    numCountries.append(encodedCountry)

In [9]:
DF = DF.drop(['COUNTRY'], axis=1).copy()
DF['COUNTRY'] = numCountries
DF = DF.sort_values(by='YEAR').copy()
DF.to_csv("/Users/nicolespaar/Desktop/combinedData_NEW.csv")
DF.head(50)

Unnamed: 0,TCONST,TITLE,YEAR,BECHDEL,SCORES,VOTES,BUDGETS,RELEASED_DAY,RELEASED_MON,RELEASED_YEAR,WINNER,COUNTRY
0,tt0018379,seventh heaven,1926.0,0.0,7.7,2801.0,1300000.0,30.0,10.0,1927.0,0.0,35
1,tt0018578,wings,1927.0,1.0,7.7,10234.0,2000000.0,5.0,1.0,1929.0,1.0,35
2,tt0018037,the jazz singer,1927.0,0.0,6.7,8254.0,422000.0,6.0,10.0,1927.0,0.0,35
3,tt0037024,the lodger,1927.0,0.0,7.2,2751.0,800000.0,19.0,1.0,1944.0,0.0,35
4,tt0018051,the kid brother,1927.0,0.0,7.6,3448.0,250000.0,17.0,1.0,1927.0,0.0,35
5,tt0018033,it,1927.0,1.0,7.4,2726.0,2000000.0,15.0,2.0,1927.0,0.0,35
6,tt0018192,napoléon,1927.0,0.0,7.7,6295.0,152000.0,7.0,4.0,1927.0,0.0,1
7,tt0018455,sunrise,1927.0,1.0,8.1,43717.0,200000.0,4.0,11.0,1927.0,0.0,35
8,tt0017136,metropolis,1927.0,1.0,8.3,145916.0,3407043.0,13.0,3.0,1927.0,0.0,35
9,tt0018526,underworld,1927.0,1.0,7.6,2228.0,100000.0,29.0,10.0,1927.0,0.0,35
