## Importing subtitles data

In [1]:
import glob
import os
import sys

### Import SRT subtitles data

In [2]:
# Seasons whose subtitle files are read into subtitles_data_srt.
seasons = [str(season) for season in (1, 6, 7, 10)]
subtitles_data_srt = []

for season in seasons:
    path = './data/friends-season-' + season + '/'
    # os.listdir returns entries in arbitrary order; sorting keeps the file
    # order (and therefore subtitles_data_srt[0]) reproducible across runs.
    for file in sorted(os.listdir(path)):
        # The context manager closes the handle even if reading/decoding fails.
        with open(path + file, 'rb') as f:
            # errors='ignore' silently drops bytes that are not valid UTF-8.
            content = f.read().decode('utf8', 'ignore')
        subtitles_data_srt.append(content)

### Import SUB subtitles data

In [3]:
# Seasons whose subtitle files are read into subtitles_data_sub.
seasons = [str(season) for season in (2, 3, 4, 5, 8, 9)]
subtitles_data_sub = []

for season in seasons:
    path = './data/friends-season-' + season + '/'
    # os.listdir returns entries in arbitrary order; sorting keeps the file
    # order reproducible across runs.
    for file in sorted(os.listdir(path)):
        # The context manager closes the handle even if reading/decoding fails.
        with open(path + file, 'rb') as f:
            # errors='ignore' silently drops bytes that are not valid UTF-8.
            content = f.read().decode('utf8', 'ignore')
        subtitles_data_sub.append(content)

## Extracting only the dialogue lines (대사) from the files

In [4]:
import re

In [5]:
def extract_dialogue_sub(file_content):
    """Return the text of each line of a SUB subtitle file, without markup.

    Every ``{...}`` group is removed from the line and every ``|`` is
    replaced by a single space (presumably MicroDVD-style frame/format tags
    and in-cue line breaks -- confirm against the data files).

    Args:
        file_content: Whole file as one decoded string. Lines may be
            terminated by CRLF or by a bare LF.

    Returns:
        list[str]: One cleaned string per input line, in file order. A
        trailing line terminator yields a final empty string.
    """
    dialogue = []
    # Accept both '\r\n' and '\n' so files saved with Unix line endings are
    # split into lines as well.
    for line in re.split(r'\r?\n', file_content):
        # Non-greedy match so each {...} group is removed on its own.
        s = re.sub(r'\{.*?\}', '', line)
        s = s.replace('|', ' ')
        dialogue.append(s)
    return dialogue

In [6]:
# One list of cleaned dialogue strings per SUB file, in the order the files were loaded.
subtitles_lines_sub = [
    extract_dialogue_sub(sub_text) for sub_text in subtitles_data_sub
]

In [7]:
def extract_dialogue_srt(file_content):
    """Return the dialogue lines of an SRT subtitle file.

    The content is split into blocks on blank lines; within each block the
    first two lines are skipped (presumably the cue index and the timestamp
    line -- confirm against the data files) and the rest is kept.

    Args:
        file_content: Whole file as one decoded string. Lines may be
            terminated by CRLF or by a bare LF.

    Returns:
        list[str]: The kept lines of all blocks, in file order.
    """
    # Normalise CRLF to LF first so files saved with Unix line endings are
    # split into blocks and lines exactly like CRLF files.
    normalized = file_content.replace('\r\n', '\n')
    dialogue = []
    for block in normalized.split('\n\n'):
        dialogue.extend(block.split('\n')[2:])
    return dialogue

In [8]:
# One list of dialogue strings per SRT file, in the order the files were loaded.
subtitles_lines_srt = [
    extract_dialogue_srt(srt_text) for srt_text in subtitles_data_srt
]

In [35]:
# All per-file dialogue lists: SRT-derived files first, then SUB-derived files.
subtitles_lines = [*subtitles_lines_srt, *subtitles_lines_sub]

In [39]:
# Save pickle data for all subtitles_lines
import pickle

# Create the output directory first so the dump does not fail with
# FileNotFoundError when ./results does not exist yet.
os.makedirs('./results', exist_ok=True)
with open('./results/subtitles_lines.pkl', 'wb') as f:
    pickle.dump(subtitles_lines, f)

# To reload the saved lines instead of re-parsing the subtitle files
# (pickle.load can execute arbitrary code: only load files you created yourself):
# with open('./results/subtitles_lines.pkl', 'rb') as f:
#     subtitles_lines = pickle.load(f)

## Recognizing proper nouns via POS tagging

In [10]:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

In [11]:
# Collect every token tagged 'NNP' from the first subtitle file, skipping its
# first three lines and its last line.
pps = []
for line in subtitles_lines[0][3:-1]:
    # Tokens are tagged in their original case (a lower-cased variant was
    # tried and left disabled).
    tagged_sent = pos_tag(word_tokenize(line))
    for word, pos in tagged_sent:
        if pos == 'NNP':
            pps.append(word)

## Removing stop words

In [13]:
def remove_stopwords(pps, stopwords_path='./stopwords.txt'):
    """Return the entries of ``pps`` that are not stop words.

    Args:
        pps: Iterable of strings to filter.
        stopwords_path: Whitespace-separated stop-word file. Defaults to
            ``./stopwords.txt``.

    Returns:
        list[str]: Entries of ``pps`` whose lower-cased form is not in the
        stop-word file, in their original order and original case.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # The context manager closes the file even if reading fails.
    with open(stopwords_path, 'r') as f:
        # Entries are compared as written, so the file is expected to hold
        # lower-case words.
        stoplist = set(f.read().split())
    return [pp for pp in pps if pp.lower() not in stoplist]

## Using `nltk.tree.Tree` to recognize named entities (NE)

NLTK book chapter describing `ne_chunk` (`ne` is short for "named entity"):
http://www.nltk.org/book/ch07.html

```
NE Type	Examples
ORGANIZATION	Georgia-Pacific Corp., WHO
PERSON	Eddy Bonte, President Obama
LOCATION	Murray River, Mount Everest
DATE	June, 2008-06-29
TIME	two fifty a m, 1:30 p.m.
MONEY	175 million Canadian Dollars, GBP 10.40
PERCENT	twenty pct, 18.75 %
FACILITY	Washington Monument, Stonehenge
GPE	South East Asia, Midlothian
```

In [14]:
from nltk.tree import Tree
from nltk.chunk import ne_chunk

In [22]:
def extract_ne(ne_type, lines=None):
    """Collect named entities of one type from subtitle lines.

    Each line is tokenized, POS-tagged and passed to ``ne_chunk``; for every
    resulting subtree whose label equals ``ne_type`` the first token is kept.
    The collected tokens are then filtered through ``remove_stopwords``.

    Args:
        ne_type: Chunk label to keep, e.g. ``'PERSON'``.
        lines: Iterable of text lines to scan. Defaults to the first file in
            ``subtitles_lines``.

    Returns:
        list[str]: First tokens of the matching chunks, stop words removed.
        Duplicates are kept.
    """
    if lines is None:
        lines = subtitles_lines[0]

    pps = []
    for line in lines:
        tagged_sent = pos_tag(word_tokenize(line))
        # NOTE(review): only the first token of a chunk is kept, so a
        # multi-word entity is reduced to its first word -- confirm this is
        # intended.
        pps.extend(
            chunk[0][0]
            for chunk in ne_chunk(tagged_sent)
            # Use the public label() accessor rather than the private _label.
            if isinstance(chunk, Tree) and chunk.label() == ne_type
        )
    return remove_stopwords(pps)

In [17]:
# Rebind pps to the PERSON entities found by extract_ne.
pps = extract_ne(ne_type='PERSON')

## Making JSON files for objects

In [23]:
import json

In [40]:
def make_objects_json(objs, class_idx):
    """Serialize objects as JSON records joined by ``", "``.

    Each object becomes one JSON object with the keys ``id`` (``"O"`` plus a
    1-based position), ``class`` (``"C"`` plus ``class_idx``), ``type``
    (always ``"object"``) and ``title`` (``str(obj)``).

    Args:
        objs: Iterable of objects to serialize.
        class_idx: Index of the class the objects belong to.

    Returns:
        str: The records separated by ``", "``; an empty string when ``objs``
        is empty. The records are not wrapped in ``[...]``, so the result is
        not a JSON array by itself.
    """
    class_id = "C" + str(class_idx)
    records = [
        json.dumps({
            "id": "O" + str(position),
            "class": class_id,
            "type": "object",
            "title": str(obj),
        })
        for position, obj in enumerate(objs, start=1)
    ]
    return ", ".join(records)

In [41]:
ne_types = ['ORGANIZATION', 'PERSON', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENT', 'FACILITY', 'GPE']

# Write one JSON record per named-entity type, one record per line.
# Create the output directory first so open() does not fail when it is missing.
os.makedirs('./results', exist_ok=True)
# The context manager closes the file even if extract_ne raises mid-loop.
with open('./results/object_class.json', 'w') as f:
    for index, ne_type in enumerate(ne_types, start=1):
        pps = extract_ne(ne_type)
        # De-duplicate once; sorting makes the O<n> ids reproducible, because
        # set iteration order for strings can differ between runs.
        unique_pps = sorted(set(pps))
        print(str(ne_type) + ", " + str(len(unique_pps)))
        record = {
            "id": "C" + str(index),
            "type": "object-class",
            "title": ne_type,
            # make_objects_json returns a string, so "objects" is stored as
            # a JSON string value rather than a nested array.
            "objects": make_objects_json(unique_pps, index),
        }
        f.write(json.dumps(record) + "\n")

ORGANIZATION, 5
PERSON, 18
LOCATION, 0
DATE, 0
TIME, 0
MONEY, 0
PERCENT, 0
FACILITY, 0
GPE, 18


## Extracting verbs from the sentence

In [None]:
# Tag each line of the first subtitle file (skipping its first three lines and
# its last line); keep every tagged sentence, and every (word, tag) pair whose
# tag starts with 'VB'.
tagged_sents = []
verbs = []
for line in subtitles_lines[0][3:-1]:
    tagged_sent = pos_tag(word_tokenize(line))
    tagged_sents.append(tagged_sent)
    for word, pos in tagged_sent:
        if pos.startswith('VB'):
            verbs.append((word, pos))