In [None]:
# GEREKLİ KÜTÜPHANELER

import os 
import re
import numpy as np
import codecs
import argparse

from os.path import isfile, join, sep, getsize, exists
from tqdm import tqdm

In [None]:
# ARGUMENT 

def read_args():
    """Parse the command-line options of the script parser.

    Two mandatory options are accepted: ``-i/--input`` (the script TXT file)
    and ``-o/--output`` (the directory that receives the results).

    Returns:
        tuple: ``(input_path, output_path)``, each converted to an absolute
        path with ``os.path.abspath``.
    """
    cli = argparse.ArgumentParser(
        description='Script that parses a movie script txt into its constituent classes')
    cli.add_argument("-i", "--input", required=True,
                     help="Path to script TXT to be parsed")
    cli.add_argument("-o", "--output", required=True,
                     help="Path to directory for saving output")
    options = cli.parse_args()

    input_path = os.path.abspath(options.input)
    output_path = os.path.abspath(options.output)
    return input_path, output_path

In [None]:
# SATIR POZİSYONLARI 

def get_offset(script_lines, script_str):
    """Locate every line of the script inside the raw text.

    Each line is searched for in ``script_str`` starting right after the
    previous line. The recorded span runs from the first to one past the last
    character that is not a space (only ``' '`` is skipped, not tabs).

    Args:
        script_lines: Lines of the script, in order (e.g. from
            ``str.splitlines``).
        script_str: The full text the lines were taken from.

    Returns:
        numpy.ndarray: Integer array of shape ``(len(script_lines), 2)`` with
        one ``[start, end]`` row per line. All values are shifted by +1
        relative to Python string indices. Empty lines and lines that contain
        only spaces get a one-character span.

    Raises:
        ValueError: If a non-empty line cannot be found in ``script_str``.
    """
    spans = []
    pos_init = 0
    for line_val in script_lines:
        if line_val == '':
            # Blank line: reserve one character (the line break).
            spans.append((pos_init, pos_init + 1))
            pos_init += 1
            continue

        line_start = script_str.find(line_val, pos_init)
        if line_start < 0:
            raise ValueError(
                'line not found in script text: {!r}'.format(line_val))

        valid_indices = [line_start + i
                         for i, x in enumerate(line_val) if x != ' ']
        if valid_indices:
            spans.append((valid_indices[0], valid_indices[-1] + 1))
        else:
            # Line made only of spaces: min()/max() would fail on the empty
            # list, so treat it like a blank line anchored at its start.
            spans.append((line_start, line_start + 1))

        pos_init = line_start + len(line_val) + 1

    # Build the matrix once instead of re-allocating it for every line.
    offset_mat = np.empty((0, 2), dtype=int)
    if spans:
        offset_mat = np.append(offset_mat, np.array(spans, dtype=int), axis=0)

    return offset_mat + 1

In [None]:
# DOSYAYI OKU

def read_script(file_path):
    """Read a UTF-8 script file and compute the offsets of its lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        tuple: ``(script_lines, script_offsets)`` where ``script_lines`` is
        the file content split with ``str.splitlines`` and ``script_offsets``
        is the matrix returned by ``get_offset`` for those lines.
    """
    # The context manager closes the handle even when read() fails
    # (for example on a decoding error).
    with codecs.open(file_path, mode='r', encoding='utf-8') as fid:
        script_file = fid.read()

    script_lines = script_file.splitlines()
    script_offsets = get_offset(script_lines, script_file)
    return script_lines, script_offsets

In [None]:
# SAHNE SINIRLARINI ETİKETLE (“INT.”, “EXT.”)

def get_scene_bound(script_noind, tag_vec, tag_set, bound_set):
    """Tag scene-boundary lines (e.g. "INT.", "EXT.") with 'S'.

    A line qualifies when it has no tag from ``tag_set`` yet, is upper-case
    according to ``str.isupper`` and its lower-cased text contains at least
    one marker from ``bound_set``.

    Args:
        script_noind: Script lines.
        tag_vec: Per-line tags; modified in place.
        tag_set: Tags that mark a line as already classified.
        bound_set: Lower-case substrings that identify a scene boundary.

    Returns:
        tuple: ``(tag_vec, bound_ind)`` where ``bound_ind`` lists the indices
        that were tagged 'S'.
    """
    bound_ind = []
    for idx, line in enumerate(script_noind):
        if tag_vec[idx] in tag_set:
            continue
        if not line.isupper():
            continue
        lowered = line.lower()
        if any(marker in lowered for marker in bound_set):
            bound_ind.append(idx)

    for idx in bound_ind:
        tag_vec[idx] = 'S'

    return tag_vec, bound_ind

In [None]:
# GEÇİŞLERİ ETİKETLE (“CUT”, “FADE”)

def get_trans(script_noind, tag_vec, tag_set, trans_thresh, trans_set):
    """Tag transition lines (e.g. "CUT", "FADE") with 'T'.

    A line qualifies when it has no tag from ``tag_set`` yet, has fewer than
    ``trans_thresh`` words once everything except ASCII letters and spaces is
    removed, and its lower-cased text contains a keyword from ``trans_set``.

    Args:
        script_noind: Script lines.
        tag_vec: Per-line tags; modified in place.
        tag_set: Tags that mark a line as already classified.
        trans_thresh: Exclusive upper bound on the word count.
        trans_set: Lower-case transition keywords.

    Returns:
        tuple: ``(tag_vec, trans_ind)`` where ``trans_ind`` lists the indices
        that were tagged 'T'.
    """
    non_alpha = re.compile('[^a-zA-Z ]')

    trans_ind = []
    for idx, line in enumerate(script_noind):
        if tag_vec[idx] in tag_set:
            continue
        word_count = len(non_alpha.sub('', line).split())
        if word_count >= trans_thresh:
            continue
        lowered = line.lower()
        if any(keyword in lowered for keyword in trans_set):
            trans_ind.append(idx)

    for idx in trans_ind:
        tag_vec[idx] = 'T'

    return tag_vec, trans_ind

In [None]:
# METADATALARI ETİKETLE ("BLACK SCREEN", "darkness")

def get_meta(script_noind, tag_vec, tag_set, meta_thresh, meta_set, sent_thresh, bound_ind, trans_ind):
    """Tag the leading metadata lines of a script with 'M'.

    The function finds the first "anchor" line and tags every non-blank line
    that precedes it as metadata. Anchors are:

    * short keyword lines ("BLACK SCREEN", "darkness"): untagged inner lines
      with fewer than ``meta_thresh`` words, containing a keyword from
      ``meta_set``, whose neighbours hold no ASCII letters;
    * sentence starts: untagged inner lines with more than ``sent_thresh``
      words, preceded by a blank line and followed by a non-blank one;
    * the scene boundaries in ``bound_ind`` and transitions in ``trans_ind``.

    Args:
        script_noind: Script lines.
        tag_vec: Per-line tags; modified in place.
        tag_set: Tags that mark a line as already classified.
        meta_thresh: Exclusive upper bound on words of a keyword line.
        meta_set: Case-sensitive metadata keywords.
        sent_thresh: Exclusive lower bound on words of a sentence line.
        bound_ind: List of scene-boundary indices.
        trans_ind: List of transition indices.

    Returns:
        The updated ``tag_vec``.
    """
    non_alpha = re.compile('[^a-zA-Z ]')
    last_idx = len(script_noind) - 1

    def letter_words(text):
        # Number of words left after dropping everything but letters/spaces.
        return len(non_alpha.sub('', text).split())

    met_ind = []
    sent_ind = []
    for idx, line in enumerate(script_noind):
        # First and last lines have no neighbour on one side; skip them.
        if tag_vec[idx] in tag_set or idx == 0 or idx == last_idx:
            continue

        before = script_noind[idx - 1]
        after = script_noind[idx + 1]
        word_count = len(line.split())

        if (word_count < meta_thresh
                and letter_words(before) == 0
                and letter_words(after) == 0
                and any(keyword in line for keyword in meta_set)):
            met_ind.append(idx)

        if (word_count > sent_thresh
                and len(before.split()) == 0
                and len(after.split()) > 0):
            sent_ind.append(idx)

    meta_ind = sorted(met_ind + bound_ind + trans_ind + sent_ind)
    if meta_ind:
        first_anchor = meta_ind[0]
        for idx, line in enumerate(script_noind[:first_anchor]):
            if line.split():
                tag_vec[idx] = 'M'

    return tag_vec

In [None]:
# DİYALOG VE DİYALOG METADATALARINI İÇEREN SATIRI AYRI SINIFLARA AYIR

def separate_dial_meta(line_str):
    """Split a line into the text before, inside and after a parenthetical.

    The split only happens when the line contains both '(' and ')'.

    Args:
        line_str: The line to split.

    Returns:
        tuple: ``(bef_par_str, in_par_str, rem_str)`` where

        * ``bef_par_str`` is the whitespace-normalised text before the first
          '(' (or the untouched line when there is no parenthetical),
        * ``in_par_str`` is the whitespace-normalised text that follows the
          first '(' up to the next '(' or ')',
        * ``rem_str`` is the raw text after the first ')'.
    """
    if '(' not in line_str or ')' not in line_str:
        return line_str, '', ''

    head, _, tail = line_str.partition('(')
    inner = tail.partition('(')[0].partition(')')[0]

    bef_par_str = ' '.join(head.split())
    in_par_str = ' '.join(inner.split())
    rem_str = line_str.partition(')')[2]
    return bef_par_str, in_par_str, rem_str

In [None]:
# KARAKTER VE DİYALOGLARI ETİKETLE

def get_char_dial(script_noind, tag_vec, tag_set, char_max_words):
    """Tag character cues ('C'), dialogue ('D') and dialogue metadata ('E').

    A character cue is an untagged inner line whose words are all upper-case,
    that has fewer than ``char_max_words`` words, follows a blank line, is
    followed by a non-blank line and has text outside its parenthetical.
    The non-blank lines that follow a cue (up to the next cue) are tagged 'D',
    or 'E' when the whole line is a parenthetical.

    Args:
        script_noind: Script lines.
        tag_vec: Per-line tags; modified in place.
        tag_set: Tags that mark a line as already classified.
        char_max_words: Exclusive upper bound on the words of a cue.

    Returns:
        The updated ``tag_vec``. It is returned unchanged when the script
        contains no character cue.
    """
    last_line = len(script_noind) - 1
    char_ind = [i for i, x in enumerate(script_noind)
                if tag_vec[i] not in tag_set
                and all([y.isupper() for y in x.split()])
                and i != 0 and i != last_line
                and len(script_noind[i - 1].split()) == 0
                and len(script_noind[i + 1].split()) > 0
                and len(x.split()) < char_max_words
                and any([separate_dial_meta(x)[y] for y in [0, 2]])]

    # Scripts without any character cue used to crash on char_ind[-1].
    if not char_ind:
        return tag_vec

    # Sentinel that closes the block of the last speaker.
    # NOTE(review): with the sentinel at the last index, the final line of
    # the script is never tagged as dialogue; kept as in the original.
    if char_ind[-1] < last_line:
        char_ind.append(last_line)
    else:
        char_ind.append(len(script_noind))

    for x in range(len(char_ind) - 1):
        tag_vec[char_ind[x]] = 'C'
        line_ind = char_ind[x] + 1
        # Bound check first so script_noind is never indexed out of range.
        while line_ind < char_ind[x + 1] and len(script_noind[line_ind].split()) > 0:
            dial_str, _, rem_str = separate_dial_meta(script_noind[line_ind])
            if dial_str != '' or rem_str != '':
                tag_vec[line_ind] = 'D'
            else:
                # The line is nothing but a parenthetical.
                tag_vec[line_ind] = 'E'

            line_ind += 1

    return tag_vec

In [None]:
# SAHNE BETİMLEMELERİNİ ETİKETLE

def get_scene_desc(script_noind, tag_vec, tag_set):
    """Tag the remaining text lines as scene descriptions ('N').

    A line qualifies when it has no tag from ``tag_set`` yet, is not blank
    and is not just a number (optionally wrapped in dots, e.g. "12.").

    Args:
        script_noind: Script lines.
        tag_vec: Per-line tags; modified in place.
        tag_set: Tags that mark a line as already classified.

    Returns:
        The updated ``tag_vec``.
    """
    def is_description(idx, line):
        if tag_vec[idx] in tag_set:
            return False
        if not line.split():
            return False
        return not line.strip('.').isdigit()

    desc_ind = [idx for idx, line in enumerate(script_noind)
                if is_description(idx, line)]
    for idx in desc_ind:
        tag_vec[idx] = 'N'

    return tag_vec

In [None]:
# SATIRDA PARANTEZ BAŞI VEYA PARANTEZ SONU KONTROLÜ

def par_start(line_set):
    """Return the indices of the lines that contain an opening parenthesis."""
    hits = []
    for idx, line in enumerate(line_set):
        if '(' in line:
            hits.append(idx)
    return hits

def par_end(line_set):
    """Return the indices of the lines that contain a closing parenthesis."""
    hits = []
    for idx, line in enumerate(line_set):
        if ')' in line:
            hits.append(idx)
    return hits

In [None]:
# ÇOK SATIRLI SINIFLARI BİRLEŞTİR - ÇOK SINIFLI SATIRLARI AYIR

def combine_tag_lines(tag_valid, script_valid):
    """Join multi-line classes and split lines that mix two classes.

    Walks over the tagged lines once:

    * 'M', 'T' and 'S' lines are copied unchanged.
    * A run of consecutive lines with the same 'C', 'D' or 'N' tag is joined
      into a single string (words separated by one space). 'N' runs are
      emitted as they are. For 'C' and 'D' runs that contain a parenthetical,
      the parenthetical text is pulled out into a separate 'E' entry: after
      the text for 'C', before the text for 'D'.
    * 'E' lines are emitted without their parentheses.

    Args:
        tag_valid: Tags of the non-empty lines.
        script_valid: Text of the same lines.

    Returns:
        tuple: ``(tag_final, script_final, changed_tags)``. The first two are
        the combined tags and texts. ``changed_tags`` is a copy of
        ``tag_valid`` (same length) in which every line spanned by a
        parenthetical found inside a 'C'/'D' run is re-tagged 'E'.
    """
    tag_final = []
    script_final = []
    changed_tags = [x for x in tag_valid]
    for i, x in enumerate(tag_valid):
        if x in ['M', 'T', 'S']:
            # metadata, transitions and scene boundaries are appended as they are
            tag_final.append(x)
            script_final.append(script_valid[i])
        elif x in ['C', 'D', 'N']:
            # character, dialogue or scene description: join it if it spans several lines
            if i == 0 or x != tag_valid[i - 1]:
                # first line of a run: start new accumulators
                to_combine = []
                comb_ind = []

            to_combine += script_valid[i].split()
            comb_ind.append(i)
            if i == (len(tag_valid) - 1) or x != tag_valid[i + 1]:
                # last line of the run: emit the combined text
                combined_str = ' '.join(to_combine)
                if x == 'N':
                    # scene description: write it as it is
                    tag_final.append(x)
                    script_final.append(combined_str)
                else:
                    _, in_par, _ = separate_dial_meta(combined_str)
                    if in_par != '':
                        # find the lines that hold dialogue metadata
                        comb_lines = [script_valid[j] for j in comb_ind]
                        dial_meta_ind = []
                        while len(par_start(comb_lines)) > 0 and len(par_end(comb_lines)) > 0:
                            # pair the first line with '(' and the first line with ')',
                            # then continue with the lines after the closing one
                            start_ind = comb_ind[par_start(comb_lines)[0]]
                            end_ind = comb_ind[par_end(comb_lines)[0]]
                            dial_meta_ind.append([start_ind, end_ind])
                            comb_ind = [x for x in comb_ind if x > end_ind]
                            comb_lines = [script_valid[j] for j in comb_ind]

                        # replace the old tags with the dialogue-metadata tag
                        for dial_ind in dial_meta_ind:
                            for change_ind in range(dial_ind[0], (dial_ind[1] + 1)):
                                changed_tags[change_ind] = 'E'

                        # extract the dialogue metadata, one parenthetical per pass
                        dial_meta_str = ''
                        char_dial_str = ''
                        while '(' in combined_str and ')' in combined_str:
                            before_par, in_par, combined_str = separate_dial_meta(
                                combined_str)
                            char_dial_str += ' ' + before_par
                            dial_meta_str += ' ' + in_par

                        # whatever is left after the last parenthetical is plain text
                        char_dial_str += ' ' + combined_str
                        char_dial_str = ' '.join(char_dial_str.split())
                        dial_meta_str = ' '.join(dial_meta_str.split())
                        if x == 'C':
                            # character line: append the dialogue metadata after it
                            tag_final.append(x)
                            script_final.append(
                                ' '.join(char_dial_str.split()))
                            tag_final.append('E')
                            script_final.append(dial_meta_str)
                        elif x == 'D':
                            # dialogue line: put the dialogue metadata in front of it
                            tag_final.append('E')
                            script_final.append(dial_meta_str)
                            tag_final.append(x)
                            script_final.append(
                                ' '.join(char_dial_str.split()))
                    else:
                        # no dialogue metadata: write it as it is
                        tag_final.append(x)
                        script_final.append(combined_str)
        elif x == 'E':
            # dialogue metadata: write it without the parentheses
            # (text between the first '(' and the following ')')
            split_1 = script_valid[i].split('(')
            split_2 = split_1[1].split(')')
            dial_met = split_2[0]
            tag_final.append('E')
            script_final.append(dial_met)

    return tag_final, script_final, changed_tags

In [None]:
# BİRLEŞTİRİLMEMİŞ SINIFLARIN KONTROLÜ

def find_same(tag_valid):
    """Find runs of consecutive identical tags that still need merging.

    Runs of 'M' tags are ignored.

    Args:
        tag_valid: Sequence of tags.

    Returns:
        numpy.ndarray: Integer array of shape ``(n_runs, 2)``; each row holds
        the inclusive ``[first, last]`` indices of a run of length >= 2.
    """
    total = len(tag_valid)
    runs = []
    cursor = 0
    while cursor < total - 1:
        tag = tag_valid[cursor]
        stop = cursor + 1
        if tag != 'M' and tag_valid[stop] == tag:
            # Extend the run until the tag changes or the input ends.
            while stop < total and tag_valid[stop] == tag:
                stop += 1
            runs.append([cursor, stop - 1])
            cursor = stop
        else:
            cursor += 1

    same_ind_mat = np.empty((0, 2), dtype=int)
    if runs:
        same_ind_mat = np.append(
            same_ind_mat, np.array(runs, dtype=int), axis=0)

    return same_ind_mat

In [None]:
# ARDIŞIK AYNI SINIFLARI BİRLEŞTİR

def merge_tag_lines(tag_final, script_final):
    """Collapse every run of consecutive identical tags into one entry.

    The runs are the ones reported by ``find_same``. The texts of a run are
    joined with a single space; entries outside of runs are copied unchanged.

    Args:
        tag_final: List of tags.
        script_final: List of texts, parallel to ``tag_final``.

    Returns:
        tuple: ``(tag_merged, script_merged)``. When there is nothing to
        merge, the input lists themselves are returned.
    """
    runs = find_same(tag_final)
    if runs.shape[0] == 0:
        return tag_final, script_final

    tag_merged = []
    script_merged = []
    cursor = 0
    for first, last in runs:
        # Entries between the previous run and this one stay as they are.
        tag_merged.extend(tag_final[cursor:first])
        script_merged.extend(script_final[cursor:first])
        # The run itself becomes a single entry.
        tag_merged.append(tag_final[first])
        script_merged.append(' '.join(script_final[first:(last + 1)]))
        cursor = last + 1

    # Tail after the last run.
    tag_merged.extend(tag_final[cursor:])
    script_merged.extend(script_final[cursor:])

    return tag_merged, script_merged

In [None]:
# DİYALOĞUN ÖNCESİNDE YER ALAN DİYALOG META VERİSİNİN KONTROLÜ

def find_arrange(tag_valid):
    """Split the tags into character blocks and flag the ones to reorder.

    The tag list is cut at every 'C' tag; tags that precede the first 'C'
    form a block of their own. A block is flagged when it contains the
    consecutive pattern C-E-D or C-D-E-D, i.e. dialogue metadata that
    appears before dialogue.

    Args:
        tag_valid: List of tags.

    Returns:
        tuple: ``(c_segs, arrange_ind)`` where ``c_segs`` is the list of
        blocks (empty when there is no 'C' tag) and ``arrange_ind`` holds the
        positions in ``c_segs`` of the flagged blocks.
    """
    invalid_set = (['C', 'E', 'D'], ['C', 'D', 'E', 'D'])

    def has_pattern(block, pattern):
        # True when `pattern` occurs as a contiguous slice of `block`.
        width = len(pattern)
        return any(block[k:(k + width)] == pattern
                   for k in range(len(block) - width + 1))

    c_ind = [pos for pos, tag in enumerate(tag_valid) if tag == 'C']
    if not c_ind:
        return [], []

    # split into C-* blocks
    c_segs = []
    if c_ind[0] != 0:
        c_segs.append(tag_valid[:c_ind[0]])

    block_ends = c_ind[1:] + [len(tag_valid)]
    for begin, end in zip(c_ind, block_ends):
        c_segs.append(tag_valid[begin:end])

    # blocks that need to be rearranged
    arrange_ind = [pos for pos, block in enumerate(c_segs)
                   if any(has_pattern(block, pattern) for pattern in invalid_set)]

    return c_segs, arrange_ind

In [None]:
# DİYALOG METADATASINI HER ZAMAN DİYALOGDAN SONRA GELECEK ŞEKİLDE DÜZENLE

def rearrange_tag_lines(tag_merged, script_merged):
    """Reorder character blocks so dialogue metadata follows the dialogue.

    The blocks and the ones that need fixing come from ``find_arrange``.
    A flagged block is rewritten as: the character entry, all its 'D' texts
    joined with a space, all its 'E' texts joined with a space, then every
    other entry of the block in its original order. Other blocks are copied
    unchanged.

    Args:
        tag_merged: List of tags.
        script_merged: List of texts, parallel to ``tag_merged``.

    Returns:
        tuple: ``(tag_rear, script_rear)``, new lists. When no block needs to
        be reordered they are plain copies of the inputs.
    """
    char_blocks, dial_met_ind = find_arrange(tag_merged)

    # Nothing to reorder: hand the data back untouched. Returning empty
    # lists here (as before) silently discarded the whole script.
    if len(dial_met_ind) == 0:
        return list(tag_merged), list(script_merged)

    tag_rear = []
    script_rear = []
    to_fix = set(dial_met_ind)
    last_ind = 0  # index in script_merged of the current block's first entry
    for ind, block in enumerate(char_blocks):
        if ind in to_fix:
            # add the character (flagged blocks always start with 'C')
            tag_rear.append('C')
            script_rear.append(script_merged[last_ind])

            # add the dialogue
            if 'D' in block:
                tag_rear.append('D')
                script_rear.append(' '.join(
                    [script_merged[last_ind + i]
                     for i, x in enumerate(block) if x == 'D']))

            # add the dialogue metadata
            if 'E' in block:
                tag_rear.append('E')
                script_rear.append(' '.join(
                    [script_merged[last_ind + i]
                     for i, x in enumerate(block) if x == 'E']))

            # add everything else
            for i, x in enumerate(block):
                if x not in ['C', 'D', 'E']:
                    tag_rear.append(x)
                    script_rear.append(script_merged[last_ind + i])
        else:
            tag_rear += block
            script_rear += script_merged[last_ind: (last_ind + len(block))]

        last_ind += len(block)

    return tag_rear, script_rear

In [None]:
# PARSE FONKSİYONU

def parse(file_path, save_dir, tag_flag, off_flag, save_name=None, tag_name=None, offset_name=None):
    """Tag every line of a movie script and write the parsed result.

    Tags used: 'S' scene boundary, 'N' scene description, 'C' character,
    'D' dialogue, 'E' dialogue metadata, 'T' transition, 'M' metadata.
    Lines that receive no tag are dropped from the parsed output.

    Args:
        file_path: Path of the UTF-8 script text file.
        save_dir: Directory that receives the output files.
        tag_flag: The per-line tags are saved only when this equals ``'on'``.
        off_flag: The line offsets are saved only when this equals ``'on'``.
        save_name: File name of the parsed script inside ``save_dir``;
            defaults to ``<input stem>_parsed.txt``.
        tag_name: File name of the tag file; defaults to
            ``<input stem>_tags.txt``.
        offset_name: File name of the offset file; defaults to
            ``<input stem>_offsets.txt``.

    Raises:
        AssertionError: If the merge/rearrange loop does not settle within
            1000 revisions.
    """
    # Definitions
    tag_set = ['S', 'N', 'C', 'D', 'E', 'T', 'M']
    meta_set = ['BLACK', 'darkness']
    bound_set = ['int.', 'ext.', 'int ', 'ext ', 'exterior ', 'interior ']
    trans_set = ['cut', 'fade', 'transition', 'dissolve']
    char_max_words = 5
    meta_thresh = 2
    sent_thresh = 5
    trans_thresh = 6

    # Input file name without its last extension; used for default outputs.
    # os.path.basename keeps the three default names consistent (the original
    # mixed '/' and os.sep when splitting the path).
    file_stem = '.'.join(os.path.basename(file_path).split('.')[:-1])

    # read the text file
    script_orig, script_offsets = read_script(file_path)

    # remove indentation; lines without any alphanumeric character become ''
    alnum_filter = re.compile(r'[\W_]+', re.UNICODE)
    script_noind = []
    for script_line in script_orig:
        if len(script_line.split()) > 0 and alnum_filter.sub('', script_line) != '':
            script_noind.append(' '.join(script_line.split()))
        else:
            script_noind.append('')

    num_lines = len(script_noind)
    # Explicit string dtype so an empty script does not yield a float array.
    tag_vec = np.array(['0'] * num_lines, dtype='<U1')

    # Tagging steps
    # scene boundaries
    tag_vec, bound_ind = get_scene_bound(
        script_noind, tag_vec, tag_set, bound_set)
    # transitions
    tag_vec, trans_ind = get_trans(
        script_noind, tag_vec, tag_set, trans_thresh, trans_set)
    # metadata
    tag_vec = get_meta(script_noind, tag_vec, tag_set, meta_thresh,
                       meta_set, sent_thresh, bound_ind, trans_ind)
    # characters and dialogue
    tag_vec = get_char_dial(script_noind, tag_vec, tag_set, char_max_words)
    # scene descriptions
    tag_vec = get_scene_desc(script_noind, tag_vec, tag_set)

    # drop the untagged lines
    nz_ind_vec = np.where(tag_vec != '0')[0]
    tag_valid = []
    script_valid = []
    for i, x in enumerate(tag_vec):
        if x != '0':
            tag_valid.append(x)
            script_valid.append(script_noind[i])

    # update the tags: dialogue lines that turned out to hold dialogue
    # metadata take the tag reported by combine_tag_lines
    tag_valid, script_valid, changed_tags = combine_tag_lines(
        tag_valid, script_valid)
    for change_ind in range(len(nz_ind_vec)):
        if tag_vec[nz_ind_vec[change_ind]] == 'D':
            tag_vec[nz_ind_vec[change_ind]] = changed_tags[change_ind]

    # save the tags
    if tag_flag == 'on':
        if tag_name is None:
            tag_name = os.path.join(save_dir, file_stem + '_tags.txt')
        else:
            tag_name = os.path.join(save_dir, tag_name)

        np.savetxt(tag_name, tag_vec, fmt='%s', delimiter='\n')

    # save the offsets
    if off_flag == 'on':
        if offset_name is None:
            offset_name = os.path.join(save_dir, file_stem + '_offsets.txt')
        else:
            offset_name = os.path.join(save_dir, offset_name)

        np.savetxt(offset_name, script_offsets, fmt='%s', delimiter=',')

    # format tags and lines until nothing is left to merge or reorder
    max_rev = 0
    while find_same(tag_valid).shape[0] > 0 or len(find_arrange(tag_valid)[1]) > 0:
        tag_valid, script_valid = merge_tag_lines(tag_valid, script_valid)
        # Only reorder when a block actually needs it, so a pass with nothing
        # to rearrange can never discard the script.
        if len(find_arrange(tag_valid)[1]) > 0:
            tag_valid, script_valid = rearrange_tag_lines(
                tag_valid, script_valid)
        max_rev += 1
        if max_rev == 1000:
            raise AssertionError(
                "Too many revisions. Something must be wrong.")

    # save the parsed script
    if save_name is None:
        save_name = os.path.join(save_dir, file_stem + '_parsed.txt')
    else:
        save_name = os.path.join(save_dir, save_name)

    # The script is read as UTF-8, so write it as UTF-8 too instead of the
    # platform default; the context manager closes the file on errors.
    with open(save_name, 'w', encoding='utf-8') as fid:
        for tag, text in zip(tag_valid, script_valid):
            fid.write(tag + ': ' + text + '\n')

In [None]:
# MAIN FONKSİYONU:

if __name__ == "__main__":
    DIR_FINAL = join("scripts", "filtered")
    DIR_OUT = join("scripts", "parsed")
    DIR_OUT_FULL = join(DIR_OUT, "tagged")
    # Files of this size (in bytes) or smaller are skipped.
    MIN_SCRIPT_BYTES = 3000

    # create the output folders; exist_ok avoids the check-then-create race
    os.makedirs(DIR_OUT, exist_ok=True)
    os.makedirs(DIR_OUT_FULL, exist_ok=True)

    # collect the files of "scripts/filtered" (sorted for a stable order)
    files = sorted(
        path for path in (join(DIR_FINAL, f) for f in os.listdir(DIR_FINAL))
        if isfile(path) and os.path.getsize(path) > MIN_SCRIPT_BYTES)

    # process the files
    for file_path in tqdm(files):
        file_name = os.path.splitext(os.path.basename(file_path))[0]

        # names of the output files
        save_name = file_name + "_parsed.txt"
        tag_name = None  # no tag file is written while tag_flag is "off"

        # Best effort: a script that fails to parse is reported together
        # with its path and the loop moves on to the next file.
        try:
            parse(file_path, DIR_OUT, tag_flag="off", off_flag="off",
                  save_name=save_name, tag_name=tag_name, offset_name=None)
        except Exception as err:
            print(f"Hata oluştu ({file_path}): {err}")