In [1]:
import requests
import pandas as pd
import math, json
from bs4 import BeautifulSoup
import re
from IPython.display import clear_output
import numpy as np

from concurrent.futures import ThreadPoolExecutor, wait

In [2]:
# Silence the InsecureRequestWarning that urllib3 can emit for requests sent
# with verify=False, so the scraping cells do not flood the console.

from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

In [3]:
def getNamesOfMovies(genre):
    """Return the link titles listed on the IMSDb page of one genre.

    Downloads ``http://imsdb.com/genre/{genre}`` and, for every ``<p>``
    element of the page, takes the ``title`` attribute of the first ``<a>``
    found inside it.

    Args:
        genre: Genre name as used in the IMSDb URL (e.g. ``'Action'``).

    Returns:
        list: The collected ``title`` values in page order. Paragraphs that
        contain no link, or whose link has no (or an empty) ``title``, are
        skipped.
    """
    headers = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer': 'https://imsdb.com/',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
    }
    response = requests.get(f"http://imsdb.com/genre/{genre}", headers=headers, verify=False)
    soup = BeautifulSoup(response.text, "html.parser")

    movie_names = []
    for row in soup.find_all('p'):
        link = row.find('a')
        # A paragraph without a link used to raise AttributeError here and
        # abort the whole scrape; skip it instead.
        if link is None:
            continue
        title = link.get('title')
        # Links without a title would otherwise put None into the result.
        if title:
            movie_names.append(title)
    return movie_names
    

In [20]:
def getMovieInfos(movie_name):
    """Scrape poster, title, genres and writers from a movie's IMSDb info page.

    Downloads ``http://imsdb.com/Movie%20Scripts/<movie_name>.html`` (the name
    is percent-encoded) and reads the element with class ``script-details``.

    Args:
        movie_name: Page name of the movie, e.g. ``'Hellraiser Script'``.

    Returns:
        dict: ``imgSrc`` (``src`` of the first image in the details block, or
        ``None`` when there is no image), ``title`` (``movie_name`` without a
        trailing ``' Script'``), ``genres`` (texts of links whose href starts
        with ``/genre``) and ``authors`` (texts of links whose href starts
        with ``/writer``).

    Raises:
        ValueError: If the page contains no ``script-details`` element.
    """
    from urllib import parse
    headers = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer': 'https://imsdb.com/',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
    }
    response = requests.get(f"http://imsdb.com/Movie%20Scripts/{parse.quote(movie_name)}.html", headers=headers, verify=False)
    soup = BeautifulSoup(response.text, "html.parser")

    movie_detail = soup.find(class_='script-details')
    if movie_detail is None:
        # Fail with a message that names the movie instead of an opaque
        # "'NoneType' object has no attribute 'find_all'".
        raise ValueError(f"no script-details block found for {movie_name!r}")

    genres = []
    authors = []
    for a in movie_detail.find_all('a'):
        # Anchors without an href must not abort the whole movie.
        href = a.get('href', '') or ''
        if href.startswith('/writer'):
            authors.append(a.text)
        if href.startswith('/genre'):
            genres.append(a.text)

    img = movie_detail.find('img')

    # Only strip the trailing " Script" marker; a plain replace() would also
    # remove the same text from the middle of a title.
    suffix = ' Script'
    title = movie_name[:-len(suffix)] if movie_name.endswith(suffix) else movie_name

    movie_info = {
        'imgSrc': img.get('src') if img is not None else None,
        'title': title,
        'genres': genres,
        'authors': authors,
    }

    return movie_info

In [5]:
genres = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Short', 'Thriller', 'War', 'Western']

def collect_movie_names_for_all_genres():
    """Gather the distinct movie names listed under every entry of ``genres``.

    For each genre the names are fetched with ``getNamesOfMovies`` and the
    genre is printed together with the number of names found.

    Returns:
        set: Union of the names returned for all genres.
    """
    unique_names = set()
    for genre_name in genres:
        found = getNamesOfMovies(genre_name)
        print(genre_name, len(found))
        # Updating with an empty result is a no-op, so no length guard is needed.
        unique_names.update(found)
    return unique_names


In [6]:
# NOTE: despite its name, movie_name_set is a *list* built from the de-duplicated
# set of names (it is indexed and enumerated by later cells); its order is
# whatever iteration order the set happens to yield.
movie_name_set = list(collect_movie_names_for_all_genres())

Action 336
Adventure 203
Animation 45
Comedy 394
Crime 231
Drama 652
Family 53
Fantasy 130
Film-Noir 4
Horror 158
Musical 27
Mystery 123
Romance 211
Sci-Fi 180
Short 3
Thriller 406
War 32
Western 17


In [15]:
# Smoke test: fetch and display the details of the first collected movie.
getMovieInfos(movie_name_set[0])

{'imgSrc': '/images/no-poster.gif',
 'title': 'Hellraiser',
 'genres': ['Horror'],
 'authors': ['Clive Barker']}

In [18]:
def get_all_movie_infos():
    """Fetch the info dict of every movie in the global ``movie_name_set``.

    Movies are processed one after another with ``getMovieInfos``; progress
    and the outcome of each movie are printed. A movie whose lookup raises is
    reported and skipped, so one bad page does not stop the run.

    Returns:
        list: The info dicts of all movies that were fetched successfully, in
        the order of ``movie_name_set``.
    """
    movie_info_list = []
    total = len(movie_name_set)
    print(f"total {total}")
    for i, movie in enumerate(movie_name_set):
        print(f'{i}/{total}---- start')
        try:
            movie_info = getMovieInfos(movie)
        # Catch Exception rather than BaseException: KeyboardInterrupt and
        # SystemExit must still be able to stop this long-running loop.
        except Exception as e:
            print(f"failed {movie} due to {str(e)}")
        else:
            movie_info_list.append(movie_info)
            print(f'{movie} success')
    return movie_info_list
            

In [19]:
# Scrape the info page of every collected movie (sequentially, one HTTP request each).
movie_infos = get_all_movie_infos()

total 1209
0/1209---- start
Hellraiser Script success
1/1209---- start
Artist, The Script success
2/1209---- start
Pacifier, The Script success
3/1209---- start
Terminator 2: Judgement Day Script success
4/1209---- start
Game, The Script success
5/1209---- start
American Pie Script success
6/1209---- start
Superbad Script success
7/1209---- start
Ghost Ship Script success
8/1209---- start
Sweeney Todd: The Demon Barber of Fleet Street Script success
9/1209---- start
Fruitvale Station Script success
10/1209---- start
Beasts of the Southern Wild Script success
11/1209---- start
Godfather Part II Script success
12/1209---- start
Shampoo Script success
13/1209---- start
Zero Dark Thirty Script success
14/1209---- start
Unknown Script success
15/1209---- start
Drive Angry Script success
16/1209---- start
My Mother Dreams the Satan's Disciples in New York Script success
17/1209---- start
G.I. Jane Script success
18/1209---- start
Lord of the Rings: Fellowship of the Ring, The Script success


Who Framed Roger Rabbit? Script success
166/1209---- start
Iron Lady, The Script success
167/1209---- start
Colombiana Script success
168/1209---- start
Midnight in Paris Script success
169/1209---- start
Shakespeare in Love Script success
170/1209---- start
So I Married an Axe Murderer Script success
171/1209---- start
Darkman Script success
172/1209---- start
Time Machine, The Script success
173/1209---- start
Bringing Out the Dead Script success
174/1209---- start
Rocky Script success
175/1209---- start
Traffic Script success
176/1209---- start
Arthur Script success
177/1209---- start
Malcolm X Script success
178/1209---- start
Timber Falls Script success
179/1209---- start
Feast Script success
180/1209---- start
Anniversary Party, The Script success
181/1209---- start
Remember Me Script success
182/1209---- start
Fright Night Script success
183/1209---- start
He's Just Not That Into You Script success
184/1209---- start
Hall Pass Script success
185/1209---- start
Pineapple Express 

Station West Script success
333/1209---- start
Man on the Moon Script success
334/1209---- start
Gang Related Script success
335/1209---- start
Lone Star Script success
336/1209---- start
Priest Script success
337/1209---- start
Princess Bride, The Script success
338/1209---- start
Alien vs. Predator Script success
339/1209---- start
Tin Cup Script success
340/1209---- start
Horrible Bosses Script success
341/1209---- start
American Beauty Script success
342/1209---- start
Tamara Drewe Script success
343/1209---- start
Mini's First Time Script success
344/1209---- start
Spartan Script success
345/1209---- start
Lost in Translation Script success
346/1209---- start
French Connection, The Script success
347/1209---- start
Philadelphia Script success
348/1209---- start
Theory of Everything, The Script success
349/1209---- start
Surrogates Script success
350/1209---- start
Scream Script success
351/1209---- start
Dry White Season, A Script success
352/1209---- start
Hellboy Script success


Belle Script success
498/1209---- start
Good Girl, The Script success
499/1209---- start
44 Inch Chest Script success
500/1209---- start
Case 39 Script success
501/1209---- start
Sweet Hereafter, The Script success
502/1209---- start
Innerspace Script success
503/1209---- start
Blind Side, The Script success
504/1209---- start
ParaNorman Script success
505/1209---- start
Village, The Script success
506/1209---- start
Sicario Script success
507/1209---- start
His Girl Friday Script success
508/1209---- start
Basquiat Script success
509/1209---- start
What About Bob? Script success
510/1209---- start
Day the Clown Cried, The Script success
511/1209---- start
Benny & Joon Script success
512/1209---- start
Extract Script success
513/1209---- start
Blade: Trinity Script success
514/1209---- start
Insidious Script success
515/1209---- start
Warrior Script success
516/1209---- start
Double Indemnity Script success
517/1209---- start
Wonder Boys Script success
518/1209---- start
Dave Barry's C

Revenant, The Script success
665/1209---- start
Hudson Hawk Script success
666/1209---- start
Unbreakable Script success
667/1209---- start
Being There Script success
668/1209---- start
Major League Script success
669/1209---- start
Meet John Doe Script success
670/1209---- start
Eyes Wide Shut Script success
671/1209---- start
Take Shelter Script success
672/1209---- start
Alien: Resurrection Script success
673/1209---- start
Wild At Heart Script success
674/1209---- start
Smokin' Aces Script success
675/1209---- start
Devil Wears Prada, The Script success
676/1209---- start
Bruce Almighty Script success
677/1209---- start
A Most Violent Year Script success
678/1209---- start
Interstellar Script success
679/1209---- start
St. Elmo's Fire Script success
680/1209---- start
Jacob's Ladder Script success
681/1209---- start
High Fidelity Script success
682/1209---- start
Repo Man Script success
683/1209---- start
Forrest Gump Script success
684/1209---- start
Harry Potter and the Half-Bloo

Bling Ring, The Script success
831/1209---- start
Talented Mr. Ripley, The Script success
832/1209---- start
Mute Witness Script success
833/1209---- start
Stir of Echoes Script success
834/1209---- start
Friday the 13th Script success
835/1209---- start
Barton Fink Script success
836/1209---- start
American History X Script success
837/1209---- start
Jaws Script success
838/1209---- start
Crow: City of Angels, The Script success
839/1209---- start
Alien Script success
840/1209---- start
Bonfire of the Vanities Script success
841/1209---- start
Space Milkshake Script success
842/1209---- start
50-50 Script success
843/1209---- start
Seventh Seal, The Script success
844/1209---- start
Shrek the Third Script success
845/1209---- start
Gattaca Script success
846/1209---- start
Spare Me Script success
847/1209---- start
Erik the Viking Script success
848/1209---- start
Tremors Script success
849/1209---- start
Rush Script success
850/1209---- start
Blade II Script success
851/1209---- star

Nightmare on Elm Street, A Script success
996/1209---- start
Crank Script success
997/1209---- start
Heavenly Creatures Script success
998/1209---- start
Cold Mountain Script success
999/1209---- start
Three Kings Script success
1000/1209---- start
Unforgiven Script success
1001/1209---- start
Charlie's Angels Script success
1002/1209---- start
Hanna Script success
1003/1209---- start
Star Trek Script success
1004/1209---- start
28 Days Later Script success
1005/1209---- start
No Country for Old Men Script success
1006/1209---- start
Ghostbusters 2 Script success
1007/1209---- start
Paul Script success
1008/1209---- start
Candle to Water Script success
1009/1209---- start
Gods and Monsters Script success
1010/1209---- start
Gremlins 2 Script success
1011/1209---- start
Ghost World Script success
1012/1209---- start
Sixth Sense, The Script success
1013/1209---- start
Ace Ventura: Pet Detective Script success
1014/1209---- start
Top Gun Script success
1015/1209---- start
Punch-Drunk Love

Ed Wood Script success
1164/1209---- start
Entrapment Script success
1165/1209---- start
White Ribbon, The Script success
1166/1209---- start
Life Script success
1167/1209---- start
Devil's Advocate Script success
1168/1209---- start
American Psycho Script success
1169/1209---- start
Liar Liar Script success
1170/1209---- start
Jason X Script success
1171/1209---- start
Hannibal Script success
1172/1209---- start
Monkeybone Script success
1173/1209---- start
42 Script success
1174/1209---- start
Inglourious Basterds Script success
1175/1209---- start
Wild Wild West Script success
1176/1209---- start
Pretty Woman Script success
1177/1209---- start
Ruins, The Script success
1178/1209---- start
I Still Know What You Did Last Summer Script success
1179/1209---- start
Wonder Woman Script success
1180/1209---- start
Austin Powers - International Man of Mystery Script success
1181/1209---- start
True Grit Script success
1182/1209---- start
Withnail and I Script success
1183/1209---- start
Up 

In [21]:
import json

# Persist the scraped movie metadata as JSON. The context manager closes the
# file even when serialisation raises, which the manual open()/close() pair
# did not guarantee.
with open("./data/movie_default.json", 'w') as f:
    json.dump(movie_infos, f)

In [4]:
stop_words = "a's, able, im, uh, about, cod't, cont'd, above, according, accordingly, across, actually, after, afterwards, again, against, ain’t, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, appear, appreciate, appropriate, are, aren’t, around, as, aside, ask, asking, associated, at, available, away, awfully, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, both, brief, but, by, c’mon, c’s, came, can, can’t, cannot, cant, cause, causes, certain, certainly, changes, clearly, co, com, come, comes, concerning, consequently, consider, considering, contain, containing, contains, corresponding, could, couldn’t, course, currently, definitely, described, despite, did, didn’t, different, do, does, doesn’t, doing, don’t, done, down, downwards, during, each, edu, eg, eight, either, else, elsewhere, enough, entirely, especially, et, etc, even, ever, every, everybody, everyone, everything, everywhere, ex, exactly, example, except, far, few, fifth, first, five, followed, following, follows, for, former, formerly, forth, four, from, further, furthermore, get, gets, getting, given, gives, go, goes, going, gone, got, gotten, greetings, had, hadn’t, happens, hardly, has, hasn’t, have, haven’t, having, he, he’s, hello, help, hence, her, here, here’s, hereafter, hereby, herein, hereupon, hers, herself, hi, him, himself, his, hither, hopefully, how, howbeit, however, i, i’d, i’ll, i’m, i’ve, ie, if, ignored, immediate, in, inasmuch, inc, indeed, indicate, indicated, indicates, inner, insofar, instead, into, inward, is, isn’t, it, it’d, it’ll, it’s, its, itself, just, keep, keeps, kept, know, knows, known, last, lately, later, latter, latterly, least, less, lest, let, let’s, like, liked, likely, little, look, looking, looks, ltd, mainly, many, 
may, maybe, me, mean, meanwhile, merely, might, more, moreover, most, mostly, much, must, my, myself, name, namely, nd, near, nearly, necessary, need, needs, neither, never, nevertheless, new, next, nine, no, nobody, non, none, noone, nor, normally, not, nothing, novel, now, nowhere, obviously, of, off, often, oh, ok, okay, old, on, once, one, ones, only, onto, or, other, others, otherwise, ought, our, ours, ourselves, out, outside, over, overall, own, particular, particularly, per, perhaps, placed, please, plus, possible, presumably, probably, provides, que, quite, qv, rather, rd, re, really, reasonably, regarding, regardless, regards, relatively, respectively, right, said, same, saw, say, saying, says, second, secondly, see, seeing, seem, seemed, seeming, seems, seen, self, selves, sensible, sent, serious, seriously, seven, several, shall, she, should, shouldn’t, since, six, so, some, somebody, somehow, someone, something, sometime, sometimes, somewhat, somewhere, soon, sorry, specified, specify, specifying, still, sub, such, sup, sure, t’s, take, taken, tell, tends, th, than, thank, thanks, thanx, that, that’s, thats, the, their, theirs, them, themselves, then, thence, there, there’s, thereafter, thereby, therefore, therein, theres, thereupon, these, they, they’d, they’ll, they’re, they’ve, think, third, this, thorough, thoroughly, those, though, three, through, throughout, thru, thus, to, together, too, took, toward, towards, tried, tries, truly, try, trying, twice, two, un, under, unfortunately, unless, unlikely, until, unto, up, upon, us, use, used, useful, uses, using, usually, value, various, very, via, viz, vs, want, wants, was, wasn’t, way, we, we’d, we’ll, we’re, we’ve, welcome, well, went, were, weren’t, what, what’s, whatever, when, whence, whenever, where, where’s, whereafter, whereas, whereby, wherein, whereupon, wherever, whether, which, while, whither, who, who’s, whoever, whole, whom, whose, why, will, willing, wish, with, within, without, won't, 
wonder, would, would, wouldn't, yes, yet, you, you'd, you'll, you're, you've, your, yours, yourself, yourselves, zero".replace('’',"'").split(', ')
# Regex alternation of the punctuation marks, control characters and the
# possessive "'s" that are replaced by a space before tokenising a script.
# Raw strings are used for the backslash-escaped punctuation: non-raw literals
# such as '\,' are invalid escape sequences that newer Python versions warn
# about. '\r', '\n' and '\t' stay non-raw on purpose (real control characters).
# The resulting pattern string is identical to the previous one.
end_words = '|'.join([
    r'\,', r'\.', r'\?', r'\!', r'\-', r'\:', r'\;',
    '\r', '\n', '\t',
    r'\(', r'\)', r'\{', r'\}',
    "'s", r'\_', r'\&', r'\#', r'\@', r'\#',
    r'\^', r'\*', r'\/', r'\[', r'\]',
])

In [5]:
def process_texts(raw_script):
    """Lower-case a raw script and reduce it to its non-stop-word tokens.

    Every match of the ``end_words`` pattern is replaced by a space, the text
    is split on single spaces, and tokens that are one character or shorter or
    that appear in ``stop_words`` are dropped.

    Args:
        raw_script: Script text as a string.

    Returns:
        list: The remaining tokens in their original order. A list (rather
        than a one-shot ``filter`` iterator) is returned so the result can be
        iterated more than once, e.g. by ``make_keyword_dict_and_count``.
    """
    end_replaced = re.sub(end_words, " ", raw_script.lower())
    # Set membership avoids scanning the whole stop-word list for every token.
    stop_set = set(stop_words)
    # Splitting on ' ' yields empty strings for runs of spaces; the length
    # check removes them together with one-letter tokens.
    return [
        token
        for token in end_replaced.split(' ')
        if len(token) > 1 and token not in stop_set
    ]

In [6]:
def make_keyword_dict_and_count(keywords):
    """Count how often each keyword occurs after stripping quote characters.

    Double and single quotes are removed from every keyword before counting,
    so e.g. ``don't`` and ``dont`` are counted under the same key.

    Args:
        keywords: Iterable of keyword strings. It is traversed only once, so
            one-shot iterators (such as a ``filter`` object) work as well.

    Returns:
        tuple: ``(keyword_list, keyword_dict)`` where ``keyword_list`` holds
        each distinct normalised keyword once (in order of first appearance)
        and ``keyword_dict`` maps every such keyword to its occurrence count.
    """
    keyword_dict = {}
    for raw_keyword in keywords:
        keyword = raw_keyword.replace('"', "").replace("'", "")
        keyword_dict[keyword] = keyword_dict.get(keyword, 0) + 1
    # Deriving the list from the dict keeps both results consistent and free
    # of duplicates that quote stripping could otherwise introduce.
    keyword_list = list(keyword_dict)
    return (keyword_list, keyword_dict)

In [7]:
def getMovieScripts(movie_name):
    """Download the script text of one movie from IMSDb.

    Requests ``http://imsdb.com/scripts/<movie_name>.html`` (the name is
    percent-encoded, as in ``getMovieInfos``, so characters such as ``?`` stay
    part of the path) and extracts the text of the ``<td class="scrtext">``
    cell.

    Args:
        movie_name: Page name of the script.

    Returns:
        tuple: ``(movie_name, raw_script)``. ``raw_script`` is the text of the
        ``<pre>`` nested inside the cell's first ``<pre>`` when present, else
        of that first ``<pre>``, else of the whole cell, and ``""`` when the
        page has no ``scrtext`` cell.
    """
    from urllib import parse
    headers = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer': 'https://imsdb.com/',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
    }
    # NOTE(review): storeScriptByGenres passes the link titles returned by
    # getNamesOfMovies (e.g. "Hellraiser Script") - confirm that this is the
    # page name the /scripts/ URLs actually use.
    url = f"http://imsdb.com/scripts/{parse.quote(str(movie_name))}.html"
    response = requests.get(url, headers=headers, verify=False)
    soup = BeautifulSoup(response.text, "html.parser")

    # Explicit None checks replace the nested bare ``except:`` blocks, which
    # selected the same fallbacks but also swallowed every other error
    # (including KeyboardInterrupt).
    script_cell = soup.find("td", class_='scrtext')
    if script_cell is None:
        return movie_name, ""

    outer_pre = script_cell.find('pre')
    if outer_pre is None:
        return movie_name, script_cell.text

    inner_pre = outer_pre.find('pre')
    script_source = outer_pre if inner_pre is None else inner_pre
    return movie_name, script_source.text

In [13]:
def downloads(results, filename):
    """Serialise ``results`` as JSON into the file ``<filename>.json``.

    Args:
        results: JSON-serialisable object to store.
        filename: Target path without the ``.json`` extension.
    """
    target_path = f"{filename}.json"
    # The with-statement closes the file; no explicit close() is required.
    with open(target_path, "w") as out_file:
        json.dump(results, out_file)
    
#     with open(f"{filename}_doc_infos.json", "w") as json_file:
#         json.dump([item for item in doc_data.items()], json_file)
#         json_file.close()


In [15]:
def storeScriptByGenres(genres=['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Short', 'Thriller', 'War', 'Western']):
    """Download the scripts of every movie of each genre and save them per genre.

    For each genre the movie names are fetched with ``getNamesOfMovies``, the
    scripts are downloaded concurrently with ``getMovieScripts`` on a pool of
    four worker threads, and the resulting ``(name, script)`` pairs are
    written with ``downloads(movie_data, genre)``. A line
    ``count <stored>/<listed>`` is printed per genre.

    A movie whose download raises is reported and left out of that genre's
    data instead of aborting the whole run.

    Args:
        genres: Iterable of genre names to process. The default list is only
            read, never modified.

    Returns:
        None
    """
    N_threads = 4
    for genre in genres:
        names = getNamesOfMovies(genre)

        with ThreadPoolExecutor(max_workers=N_threads) as executor:
            future_list = [executor.submit(getMovieScripts, name) for name in names]
        # Leaving the ``with`` block has waited for all submitted tasks.

        # Walk the futures in submission order so the stored data follows the
        # order of ``names`` (the set returned by wait() has no defined order).
        movie_data = []
        for name, future in zip(names, future_list):
            try:
                movie_data.append(future.result())
            except Exception as error:
                print(str(error), f'while {genre} {name}')

        count = 0
        try:
            downloads(movie_data, genre)
            count = len(movie_data)
        except Exception as error:
            print(str(error), f'while saving {genre}')
        print(f"count {count}/{len(names)}")


In [None]:
# Download the scripts of all default genres; each genre is written to "<genre>.json".
storeScriptByGenres()

count 336/336
count 203/203
count 45/45
count 394/394
count 231/231
count 652/652
count 53/53
count 130/130
count 4/4
count 158/158
count 27/27
count 123/123
count 211/211
count 180/180
count 3/3


In [10]:
# NOTE(review): `r` and `d` are not defined in any cell of this notebook as shown,
# and storeScriptByGenres no longer returns a (results, doc_data) pair (that
# return is commented out) - this looks like a leftover call; confirm before re-running.
downloads(r,d)

In [10]:
import pandas as pd
import json
genres = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Short', 'Thriller', 'War', 'Western']
counts = []

# Count the entries stored in each genre's preprocessed JSON file.
for genre in genres:
    # The context manager closes the handle even if the JSON fails to parse.
    with open(f"./data/grouped_by_movie/preprocessed/{genre}.json") as f:
        data = json.load(f)
    counts.append(len(data))

# Two-row frame: genre names in row 0, entry counts in row 1.
pd.DataFrame([genres, counts])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,Action,Adventure,Animation,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Short,Thriller,War,Western
1,336,203,44,393,231,652,53,129,4,158,26,123,210,180,3,406,32,17
