In [2]:
from bs4 import BeautifulSoup, NavigableString, Tag
from datascience import *
from collections import Counter

In [3]:
data = Table.read_table('scripts_metadata.csv')
data.show(5)

title,Genres,Average user rating,IMSDb rating,IMSDb opinion,Script Date,Movie Release Date,Writers,Submitted by,script_path
10 Things I Hate About You Script,Comedy;Romance;,(8.76 out of 10),(7 out of 10),A better-than-most teen film.,: November 1997,,Karen McCullah Lutz;Kirsten Smith;William Shakespeare;,,scripts/10 Things I Hate About You Script.html
12 Script,"Comedy;Read ""12"" Script;",None available,Not available,None available,,,Lawrence Bridges;,,scripts/12 Script.html
12 and Holding Script,Drama;,(7.00 out of 10),Not available,None available,: April 2004,: May 2006,Anthony Cipriano;,,scripts/12 and Holding Script.html
12 Monkeys Script,Drama;Sci-Fi;Thriller;,(9.25 out of 10),Not available,None available,: June 1994,,David Peoples;Janet Peoples;,,scripts/12 Monkeys Script.html
12 Years a Slave Script,Drama;,None available,Not available,None available,,: November 2013,John Ridley;,: XXyTurtle,scripts/12 Years a Slave Script.html


In [4]:
data = data.where('script_path', are.not_equal_to('nan'))
data.show(5)

title,Genres,Average user rating,IMSDb rating,IMSDb opinion,Script Date,Movie Release Date,Writers,Submitted by,script_path
10 Things I Hate About You Script,Comedy;Romance;,(8.76 out of 10),(7 out of 10),A better-than-most teen film.,: November 1997,,Karen McCullah Lutz;Kirsten Smith;William Shakespeare;,,scripts/10 Things I Hate About You Script.html
12 Script,"Comedy;Read ""12"" Script;",None available,Not available,None available,,,Lawrence Bridges;,,scripts/12 Script.html
12 and Holding Script,Drama;,(7.00 out of 10),Not available,None available,: April 2004,: May 2006,Anthony Cipriano;,,scripts/12 and Holding Script.html
12 Monkeys Script,Drama;Sci-Fi;Thriller;,(9.25 out of 10),Not available,None available,: June 1994,,David Peoples;Janet Peoples;,,scripts/12 Monkeys Script.html
12 Years a Slave Script,Drama;,None available,Not available,None available,,: November 2013,John Ridley;,: XXyTurtle,scripts/12 Years a Slave Script.html


In [5]:
all_scripts = {}

for fname in data['script_path']:
    
    print(fname)
    with open(fname, 'r') as f:
        raw = f.read()
    soup = BeautifulSoup(raw, 'html5lib')

    try:
        bolded = soup.find('td', {'class': 'scrtext'} ).find_all('b') #find all bolded elements 
        text = soup.find('td', {'class': 'scrtext'} ).text
        b_text = [b.text.strip() for b in bolded]
        bolded_text = [b for b in b_text if len(b) > 0] 
        sift_out = ['INT.', "EXT.", "-"] 
        characters = []
        scenes = []
        for c in bolded_text:
            character = True
            for s in sift_out:
                if s in c:
                    character = False
            if character == True:
                characters.append(c)
            elif len(c) > 4:
                scenes.append(c)
        
        characters = [c[0] for c in Counter(characters).most_common() if c[1] > 5]
        
        scenes.extend([c[0] for c in Counter(characters).most_common() if c[1] <= 5 and len(c[0].split()) > 3])

        movie_name = fname.split('/')[-1][:-5].replace(' Script', '')

        
        all_scripts[movie_name] = {}
        all_scripts[movie_name]['cast'] = characters
        all_scripts[movie_name]['scenes'] = scenes
        all_scripts[movie_name]['text'] = text
        
    except:
        pass

scripts/10 Things I Hate About You Script.html
scripts/12 Script.html
scripts/12 and Holding Script.html
scripts/12 Monkeys Script.html
scripts/12 Years a Slave Script.html


KeyboardInterrupt: 

In [8]:
all_scripts.keys()

dict_keys(['10 Things I Hate About You', '12', '12 and Holding', '12 Monkeys'])

In [12]:
import numpy as np
import networkx as nx
from lxml import etree
import itertools
from datascience import *
import matplotlib.pyplot as plt
from nltk.util import ngrams

def make_graph(c_dict):
    '''
    This function accepts a dictionary with number of lines and scenes to create a
    NetworkX graph object
    '''
    # setup graph object
    G = nx.Graph()
    
    # add nodes with attributes of number of lines and scenes
    for c in c_dict.keys():
            G.add_node(
                c,
                scenes=c_dict[c]
                #scenes=c_dict[c]['scenes']
            )

    # make edges by iterating over all combinations of nodes
    for (node1, data1), (node2, data2) in itertools.combinations(G.nodes(data=True), 2):

        # count scenes together by getting union of their sets
        scenes_together = len(set(data1['scenes']) & set(data2['scenes']))
        
        if scenes_together:
            # add more weight for more scenes together
            G.add_edge(node1, node2, weight=scenes_together)
            
    return G

In [13]:
def gini(array):
    """Calculate the Gini coefficient of a numpy array."""
    # https://github.com/oliviaguest/gini
    array = np.sort(array) # values must be sorted
    index = np.arange(1, array.shape[0] + 1) # index per array element
    n = array.shape[0] # number of array elements
    return ((np.sum((2 * index - n  - 1) * array)) / (n * np.sum(array))) #Gini coefficient

In [15]:
gini_array = []
for i in all_scripts.keys():
    scene_index_list = []
    count = 0
    for scene in set(all_scripts[i]['scenes']):
        indices = [m.start() for m in re.finditer(scene, all_scripts[i]['text'])]
        scene_index_list.extend(indices)
    scene_texts = []
    for n in ngrams(sorted(scene_index_list), 2):
        scene_texts.append(all_scripts[i]['text'][n[0]:n[1]])
    cast_dict = {}
    for c in all_scripts[i]['cast']:
        cast_dict[c] = []
        for i, scene in enumerate(scene_texts):
            if scene.count(c) > 0:
                cast_dict[c].append(i)
    
    G = make_graph(cast_dict)
    
    network_tab = Table()
    network_tab.append_column(label="Characters", values=[c for c in sorted(cast_dict.keys())])
    dc = [x[1] for x in sorted(nx.degree_centrality(G).items(), key=lambda x: x[0])]
    network_tab.append_column(label="Degree Centrality", values=dc)
    bc = [x[1] for x in sorted(nx.betweenness_centrality(G).items(), key=lambda x: x[0])]
    network_tab.append_column(label="Betweenness Centrality", values=bc)
    ec = [x[1] for x in sorted(nx.eigenvector_centrality(G).items(), key=lambda x: x[0])]
    network_tab.append_column(label="Eigenvector Centrality", values=ec)
    network_tab.show()
    print('The Gini Coefficient of this movie is', gini(network_tab.column('Eigenvector Centrality')))
    
    gini_array = np.append(gini_array, gini(network_tab.column('Eigenvector Centrality')))


Characters,Degree Centrality,Betweenness Centrality,Eigenvector Centrality
BIANCA,0.833333,0.0454545,0.413741
BRUCE,0.25,0.0,0.0208809
CAMERON,0.833333,0.0454545,0.385503
CHASTITY,0.5,0.0,0.115439
JOEY,0.833333,0.0671717,0.304199
KAT,1.0,0.159091,0.49326
MANDELLA,0.666667,0.030303,0.181087
MICHAEL,0.666667,0.0123737,0.309785
MISS PERKY,0.416667,0.0,0.0908165
MRS. BLAISE,0.25,0.0,0.0384913


The Gini Coefficient of this movie is 0.395583967833


Characters,Degree Centrality,Betweenness Centrality,Eigenvector Centrality
ALLEN,0.681818,0.185848,0.520027
AUGUST,0.0909091,0.0,0.0239186
AUNT DEBBIE,0.318182,0.00408163,0.115686
BARRY,0.0909091,0.0,0.0773191
BRENDA,0.272727,0.0263863,0.0536598
DAVE,0.181818,0.0,0.0698129
DOOLITTLE,0.181818,0.0,0.0822354
FLASHBACK:,0.227273,0.0,0.0320583
H. TOM,0.363636,0.00588538,0.114859
INTERVIEWER,0.136364,0.0,0.0583514


The Gini Coefficient of this movie is 0.529555446655


Characters,Degree Centrality,Betweenness Centrality,Eigenvector Centrality
(CONTINUED),0.888889,0.141301,0.548588
ASHLEY,0.333333,0.0,0.0858596
COACH,0.222222,0.0,0.0271151
CONTINUED:,0.888889,0.141301,0.548588
CONTINUED: (2),0.777778,0.0764862,0.223348
DOCTOR,0.222222,0.0,0.0120642
GABE,0.111111,0.0,0.00413949
GRACE,0.388889,0.00762527,0.164789
GUS,0.388889,0.00473856,0.13709
JACOB,0.777778,0.135707,0.312262


The Gini Coefficient of this movie is 0.504689140843


Characters,Degree Centrality,Betweenness Centrality,Eigenvector Centrality
ASTROPHYSICIST,0.555556,0.0148874,0.208181
BEN,0.333333,0.00326797,0.0968154
BILLINGS,0.222222,0.0,0.090714
BOTANIST,0.5,0.000726216,0.149263
COLE,1.0,0.298439,0.635163
DR. FLETCHER,0.611111,0.0330065,0.111897
DR. MASON,0.555556,0.0183007,0.0520864
DR. PETERS,0.277778,0.00653595,0.0972781
ENGINEER,0.555556,0.0070443,0.157991
FALE,0.277778,0.0,0.0732269


The Gini Coefficient of this movie is 0.408470882305


In [16]:
movie_array = []
for i in all_scripts.keys():
    thing = str(i) + ' Script'
    movie_array = np.append(movie_array, thing)
gini_table = Table().with_column('Movie', movie_array).with_column("Gini", gini_array)
gini_table.show()

Movie,Gini
10 Things I Hate About You Script,0.395584
12 Script,0.529555
12 and Holding Script,0.504689
12 Monkeys Script,0.408471


In [18]:
correct_data = data.where('title', movie_array)
final_table = correct_data.join('title', gini_table, 'Movie')
final_table.show()

title,Genres,Average user rating,IMSDb rating,IMSDb opinion,Script Date,Movie Release Date,Writers,Submitted by,script_path,Gini
10 Things I Hate About You Script,Comedy;Romance;,(8.76 out of 10),(7 out of 10),A better-than-most teen film.,: November 1997,,Karen McCullah Lutz;Kirsten Smith;William Shakespeare;,,scripts/10 Things I Hate About You Script.html,0.395584
12 Monkeys Script,Drama;Sci-Fi;Thriller;,(9.25 out of 10),Not available,None available,: June 1994,,David Peoples;Janet Peoples;,,scripts/12 Monkeys Script.html,0.408471
12 Script,"Comedy;Read ""12"" Script;",None available,Not available,None available,,,Lawrence Bridges;,,scripts/12 Script.html,0.529555
12 and Holding Script,Drama;,(7.00 out of 10),Not available,None available,: April 2004,: May 2006,Anthony Cipriano;,,scripts/12 and Holding Script.html,0.504689
