In [1]:
import os
import re
import nltk
import itertools
import csv

# directory holding the raw script text files (relative to the notebook's cwd;
# the process working directory itself is not changed)
wd = os.path.join(os.getcwd(), 'marvel_scripts')
print('%s \n' % wd)

# list everything in the scripts directory
files = os.listdir(wd)
print('%s \n' % (files,))

# IMSDB scripts are the files ending in '_imsdb.txt'; keep the name without
# its '.txt' extension. The comprehension variable is deliberately not called
# `file`: Python 2 list comprehensions leak their variable into the enclosing
# scope, which would shadow the builtin for the rest of the notebook.
scripts_imsdb = [name[:-4] for name in files if name.endswith('_imsdb.txt')]
print('%s \n' % (scripts_imsdb,))

# preview the first 10 lines of each script
for s in scripts_imsdb:
    with open(os.path.join(wd, s + '.txt')) as script:
        # islice stops quietly at end of file, so a script shorter than
        # 10 lines is previewed as-is instead of raising StopIteration
        head = list(itertools.islice(script, 10))
    print('%s \n%s \n' % (s, head))
    
# Per-script layout parameters, keyed by script name (file name without '.txt').
# Every script indents its text differently, so each non-empty value is a list
# [character_indent, dialogue_indent, header_lines]:
#   [0] number of leading whitespace characters tested by is_character()
#   [1] number of leading whitespace characters tested by is_dialogue()
#   [2] number of lines skipped at the top of the file before parsing starts
# A script with an empty list is skipped by the preprocessing loop.
parameter_dict = {
    'fantastic_four_imsdb': [35, 25, 18],
    'ghost_rider_imsdb': [],
    'spider-man_imsdb': [16, 8, 28],
    'x-men_imsdb': [27, 17, 35],
    'x-men_origins_wolverine_imsdb': [],
}


C:\Users\rohan_000\Documents\Berkeley\W266-1 - NLP\Final project\w266_project\marvel_scripts 

['ant-man_tw.txt', 'avengers_age_of_ultron_tw.txt', 'captain_america_civil_war_tw.txt', 'captain_america_the_first_avenger_tw.txt', 'captain_america_the_winter_soldier_tw.txt', 'fantastic_four_imsdb.txt', 'ghost_rider_imsdb.txt', 'guardians_of_the_galaxy_tw.txt', 'iron_man_3_tw.txt', 'lego_marvel_super_heroes_tw.txt', 'no_char_tags', 'spider-man_imsdb.txt', 'the_amazing_spider-man_2_tw.txt', 'the_amazing_spider-man_tw.txt', 'the_avengers_tw.txt', 'the_wolverine_tw.txt', 'thor_the_dark_world_tw.txt', 'thor_tw.txt', 'x-men_apocalypse_tw.txt', 'x-men_days_of_future_past_tw.txt', 'x-men_first_class_tw.txt', 'x-men_imsdb.txt', 'x-men_origins_wolverine_imsdb.txt', 'x-men_the_last_stand_tw.txt'] 

['fantastic_four_imsdb', 'ghost_rider_imsdb', 'spider-man_imsdb', 'x-men_imsdb', 'x-men_origins_wolverine_imsdb'] 

fantastic_four_imsdb 
['FANTASTIC FOUR\n', '\n', '\n', '                                 

## Preprocess Scripts

In [2]:
# encoding: utf-8

### preprocess scripts from imsdb

# preprocessed scripts, filled in by the preprocessing loop further down
# key = script name, value = list of [character, dialogue] pairs
scripts_raw = dict()

### preprocess_line()
# - strip '*' and newline characters
# - classify the line as blank, character cue, dialogue or narration
# - use 'narrator' as the speaker for narration (scene descriptions)

def preprocess_line(line, title, nip, dip, character):
    """Clean one raw script line and classify it.

    Args:
        line: raw line of text read from the script file.
        title: script name; is_character() and is_dialogue() use it to look
            up the indentation widths in parameter_dict.
        nip: True if narration was in progress before this line.
        dip: True if dialogue was in progress before this line.
        character: speaker that was current before this line.

    Returns:
        A 4-tuple (text, dip, nip, character):
          text      - cleaned text with whitespace runs collapsed, or '' for
                      blank lines and character cues
          dip       - True if dialogue is in progress after this line
          nip       - True if narration is in progress after this line
          character - speaker to use from this line on
    """
    # Strip asterisks and newlines only; leading whitespace is kept because
    # the classification below is based on indentation.
    for c in ['*', '\n']:
        line = line.replace(c, '')

    # The blank test must come first. The indent patterns end in '.', which
    # also matches a space, so a whitespace-only line longer than the indent
    # would otherwise be taken for a character cue or a line of dialogue.
    if is_blank(line):
        if dip:
            # a blank line ends the current speech and clears the speaker
            return '', False, False, ''
        # outside dialogue a blank line leaves the state untouched
        return '', dip, nip, character

    if is_character(line, title):
        # character cue: drop bracketed/parenthesised remarks from the name
        return '', False, False, re.sub(r'[\(\[].*?[\)\]]', '', line).strip()

    if is_dialogue(line, title):
        return re.sub(r'\s+', ' ', line).strip(), True, False, character

    # anything else is narration, attributed to 'narrator'
    return re.sub(r'\s+', ' ', line).strip(), False, True, 'narrator'

def is_dialogue(line, title):
    """Return True if line starts with the dialogue indent of script title.

    The indent width is parameter_dict[title][1]; the line must begin with
    that many whitespace characters followed by at least one more character.
    """
    indent = parameter_dict[title][1]
    pattern = r'\s{' + str(indent) + '}.'
    found = re.match(pattern, line)
    return found is not None

def is_character(line, title):
    """Return True if line starts with the character-cue indent of script title.

    The indent width is parameter_dict[title][0]; the line must begin with
    that many whitespace characters followed by at least one more character.
    """
    indent = parameter_dict[title][0]
    pattern = r'\s{' + str(indent) + '}.'
    found = re.match(pattern, line)
    return found is not None

def is_blank(line):
    """Return True if line is empty or consists only of whitespace."""
    stripped = line.strip()
    return len(stripped) == 0
    
### print_script() - print specific lines of script
def print_script(title, script, start=0, end=1):
    """Print title, then entries start..end (inclusive) of script.

    Args:
        title: heading printed before the entries.
        script: indexable sequence of entries to print.
        start: index of the first entry to print.
        end: index of the last entry to print (inclusive).

    If script has fewer than end + 1 entries, printing stops at the last
    available entry instead of raising IndexError.
    """
    # single-argument print(...) produces the same output as the print statement
    print(title)
    # clamp the upper bound to the number of entries actually present
    stop = min(end + 1, len(script))
    for i in range(start, stop):
        print(script[i])

# preprocess scripts
#
# Each script is reduced to a list of [speaker, text] records: consecutive
# dialogue lines are merged into one record per speech, and consecutive
# narration lines into one record spoken by 'narrator'.

for s in scripts_imsdb:
    # Only scripts with layout parameters can be parsed. .get() also skips a
    # script that has no parameter_dict entry at all instead of raising KeyError.
    if not parameter_dict.get(s):
        continue

    with open(os.path.join(wd, s + '.txt')) as script:
        lines = script.readlines()

    lines_processed = []

    # parser state carried from one line to the next
    cur_character = 'narrator'
    dialogue_in_progress = False
    narration_in_progress = True

    # text accumulated for the block currently being read
    dialogue_builder = ''
    narration_builder = ''

    # parameter_dict[s][2] is the number of leading lines to skip
    for line in lines[parameter_dict[s][2]:]:
        line, dip, nip, character = preprocess_line(line, s, narration_in_progress,
                                                    dialogue_in_progress, cur_character)

        # A block ends when its in-progress flag drops. Only the text gathered
        # so far is emitted: the line that ended the block belongs to whatever
        # comes next and must not be appended to the block being closed.
        # Blocks with no text are not recorded.
        if dialogue_in_progress and not dip:
            if dialogue_builder.strip():
                lines_processed.append([cur_character, dialogue_builder.strip()])
            dialogue_builder = ''
        elif narration_in_progress and not nip:
            if narration_builder.strip():
                lines_processed.append([cur_character, narration_builder.strip()])
            narration_builder = ''

        cur_character = character
        dialogue_in_progress = dip
        narration_in_progress = nip

        # add this line's text to the block it belongs to
        if nip and line.strip():
            narration_builder = narration_builder + ' ' + line

        if dip and line.strip():
            dialogue_builder = dialogue_builder + ' ' + line

    # The file can end while a block is still open; emit it so the last
    # speech or narration of the script is not lost.
    if dialogue_in_progress and dialogue_builder.strip():
        lines_processed.append([cur_character, dialogue_builder.strip()])
    elif narration_in_progress and narration_builder.strip():
        lines_processed.append([cur_character, narration_builder.strip()])

    # add lines to script dict
    scripts_raw[s] = lines_processed

# show entries start..end of every preprocessed script
start = 0
end = 20
for title in scripts_raw:
    print_script(title, scripts_raw[title], start, end)
    print('\n')
    
# output directory for the preprocessed scripts
# NOTE(review): this rebinds `wd`, which until here pointed at the raw script
# directory. After this line the preprocessing loop above cannot be re-run
# without first re-running the cell that sets `wd` to 'marvel_scripts'.
wd = os.path.join(os.getcwd(), 'prep_scripts')
print('%s \n' % wd)

# create the output directory on first run; open() below would fail without it
if not os.path.isdir(wd):
    os.makedirs(wd)

# write each script to <name>.csv, one [character, text] row per record
for k, v in scripts_raw.items():
    # binary mode is what the Python 2 csv module expects for its file object
    with open(os.path.join(wd, k + '.csv'), 'wb') as f:
        writer = csv.writer(f)
        writer.writerows(v)


fantastic_four_imsdb
['narrator', "FADE IN: CLOSE ON A MASSIVE STEEL HEAD Our first thought: DR. DOOM? But it's not moving. A welder's torch sparks into frame in the hands of a sculptor on scaffolding. This is art, an epic 20 foot statue going up of a business mogul (VICTOR VON DOOM) in whose generously extended hands sit two intertwined columns of DNA. His face is chiseled, angular, perfect (too perfect). Past sparks, we MOVE down to pick up... EXT. STREET/VON DOOM INDUSTRIES TOWER - DAY REED RICHARDS and BEN GRIMM head toward the soaring glass-box atrium of VDI Headquarters. Designed to inspire awe, it does."]
['REED', 'High open space, exposed structural elements. Obviously aimed at first time visitors to create feelings of... smallness, inadequacy.']
['narrator', 'Ben glances at Reed, who looks a little nervous.']
['BEN', "Good thing it ain't workin... Reed, what are we doing here? This guy's fast-food, strip-mall science --"]
['REED', "This wasn't our first stop, in case you forgo