## Importing subtitles data

In [2]:
import glob
import os
import sys

In [3]:
# Season numbers to load, stored as strings so they can be spliced into paths.
seasons = list(map(str, [1, 6, 7, 10]))

In [4]:
# Read every subtitle file of the selected seasons into memory as text.
subtitles_data = []

for season in seasons:
    path = './data/friends-season-' + season + '/'
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # position of each file in subtitles_data reproducible between runs.
    for file in sorted(os.listdir(path)):
        # The context manager guarantees the handle is closed, even if
        # reading fails part-way through.
        with open(path + file, 'rb') as f:
            # Decode leniently: undecodable bytes are dropped, not raised on.
            content = f.read().decode('utf8', 'ignore')
        subtitles_data.append(content)

## Extracting only the dialogue lines (대사) from the files

In [5]:
def extract_dialogue(file_content):
    """Return the dialogue lines contained in one subtitle file.

    The text is split into blocks on blank lines, and the first two lines of
    every block are discarded (presumably the cue number and the timestamp of
    an SRT-style file -- confirm against the data).

    Both CRLF and LF line endings are accepted; CRLF input gives exactly the
    same result as before.

    Args:
        file_content: full text of a subtitle file.

    Returns:
        A flat list of the remaining lines, in file order. Empty strings
        produced by trailing line breaks are kept.
    """
    # Normalise line endings first so that LF-only files are not silently
    # reduced to an empty result.
    normalized = file_content.replace('\r\n', '\n')
    dialogue = []
    for block in normalized.split('\n\n'):
        dialogue.extend(block.split('\n')[2:])
    return dialogue

In [6]:
# One list of dialogue lines per loaded file, in the order of subtitles_data.
subtitles_lines = [extract_dialogue(content) for content in subtitles_data]

## Recognizing Proper Nouns via POS tagging

In [7]:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

In [8]:
# Gather every token tagged 'NNP' from the first loaded file, skipping its
# first three dialogue lines and its last one.
pps = [
    word
    for line in subtitles_lines[0][3:-1]
    for word, pos in pos_tag(word_tokenize(line))
    if pos == 'NNP'
]

In [9]:
# Preview the first 20 collected words.
pps[:20]

['Notice',
 'Freud',
 'Magic',
 'Eva',
 'Butt',
 'GELULA',
 'CO.',
 'INC',
 'Did',
 'Ross',
 'Come',
 'Ross',
 'Okay',
 'Chandler',
 'Thank',
 'Chandler',
 'Come',
 'Aurora',
 'Estelle',
 'Leonard']

## Removing stop words

In [10]:
def remove_stopwords(pps, stopwords_path='./stopwords.txt'):
    """Drop every word whose lowercase form appears in the stop-word file.

    Args:
        pps: iterable of word strings to filter.
        stopwords_path: path of a whitespace-separated stop-word list.
            Defaults to './stopwords.txt'.

    Returns:
        A new list holding the surviving words in their original order;
        duplicates are kept.

    Note:
        Words are lowercased before the lookup but stop-list entries are used
        as written, so an entry only takes effect if it is lowercase.
    """
    # The context manager closes the file even if reading raises.
    with open(stopwords_path, 'r') as f:
        stoplist = set(f.read().split())
    return [pp for pp in pps if pp.lower() not in stoplist]

## Using NLTK.Tree for recognizing NE

Reference for NLTK's `ne_chunk` (`ne` is short for "named entity"):
http://www.nltk.org/book/ch07.html

```
NE Type	Examples
ORGANIZATION	Georgia-Pacific Corp., WHO
PERSON	Eddy Bonte, President Obama
LOCATION	Murray River, Mount Everest
DATE	June, 2008-06-29
TIME	two fifty a m, 1:30 p.m.
MONEY	175 million Canadian Dollars, GBP 10.40
PERCENT	twenty pct, 18.75 %
FACILITY	Washington Monument, Stonehenge
GPE	South East Asia, Midlothian
```

In [11]:
from nltk.tree import Tree
from nltk.chunk import ne_chunk

In [12]:
def extract_ne(ne_type, lines=None):
    """Collect named entities of one type from dialogue lines.

    Each line is tokenized, POS-tagged and passed through ``ne_chunk``. For
    every resulting subtree whose label equals ``ne_type``, the word of its
    first token is kept, and the collected words are then filtered with
    ``remove_stopwords``.

    Args:
        ne_type: chunk label to select, e.g. 'PERSON' or 'GPE'.
        lines: iterable of text lines to scan. When omitted, the first loaded
            subtitle file is used, minus its first three dialogue lines and
            its last one.

    Returns:
        List of entity words in order of appearance; duplicates are kept.
    """
    if lines is None:
        lines = subtitles_lines[0][3:-1]

    pps = []
    for line in lines:
        tagged_sent = pos_tag(word_tokenize(line))
        pps.extend(
            # chunk[0] is the subtree's first (word, tag) leaf, so only the
            # first word of a multi-word entity is recorded.
            chunk[0][0]
            for chunk in ne_chunk(tagged_sent)
            # Use the public Tree.label() accessor, not the private _label.
            if isinstance(chunk, Tree) and chunk.label() == ne_type
        )
    return remove_stopwords(pps)

In [13]:
# Rebind pps to the 'PERSON' entities returned by extract_ne.
pps = extract_ne('PERSON')

In [16]:
# Preview the first 10 extracted entities.
pps[:10]

['Freud',
 'Eva',
 'Ross',
 'Ross',
 'Chandler',
 'Chandler',
 'Aurora',
 'Estelle',
 'Chandler',
 'Rick']

## Making json files for objects

In [17]:
def make_objects_json(objs, class_idx):
    """Serialise objects as back-to-back JSON records.

    Every item of ``objs`` becomes one JSON object with the keys "id"
    ("O1", "O2", ... in iteration order), "class" ("C" followed by
    ``class_idx``), "type" (always "object") and "title" (``str`` of the item).

    Args:
        objs: iterable of objects to describe.
        class_idx: index of the class the objects belong to.

    Returns:
        The records concatenated with no separator; an empty string when
        ``objs`` is empty.

    NOTE(review): the concatenated records do not form a single valid JSON
    document -- confirm that consumers expect this format.
    """
    class_id = "C" + str(class_idx)
    records = [
        {
            "id": "O" + str(position),
            "class": class_id,
            "type": "object",
            "title": str(obj),
        }
        for position, obj in enumerate(objs, 1)
    ]
    return "".join(json.dumps(record) for record in records)

'{"title": "Aurora", "id": "O1", "type": "object", "class": "C1"}{"title": "Raggedy", "id": "O2", "type": "object", "class": "C1"}{"title": "Ross", "id": "O3", "type": "object", "class": "C1"}{"title": "Al", "id": "O4", "type": "object", "class": "C1"}{"title": "Rick", "id": "O5", "type": "object", "class": "C1"}{"title": "Ethan", "id": "O6", "type": "object", "class": "C1"}{"title": "Richard", "id": "O7", "type": "object", "class": "C1"}{"title": "Water", "id": "O8", "type": "object", "class": "C1"}{"title": "Chandler", "id": "O9", "type": "object", "class": "C1"}{"title": "Estelle", "id": "O10", "type": "object", "class": "C1"}{"title": "Andrew", "id": "O11", "type": "object", "class": "C1"}{"title": "Eva", "id": "O12", "type": "object", "class": "C1"}{"title": "Monica", "id": "O13", "type": "object", "class": "C1"}{"title": "Tribbiani", "id": "O14", "type": "object", "class": "C1"}{"title": "Freud", "id": "O15", "type": "object", "class": "C1"}{"title": "Joey", "id": "O16", "type": 

In [14]:
import json

In [25]:
# Write one JSON line per named-entity type to ./results/object_class.json.
# Each line describes the class and embeds its objects as the string returned
# by make_objects_json.
ne_types = ['ORGANIZATION', 'PERSON', 'LOCATION', 'DATE', 'TIME', 'MONEY', 'PERCENT', 'FACILITY', 'GPE']

# Make sure the output directory exists so open() cannot fail on a fresh checkout.
os.makedirs('./results', exist_ok=True)

# The context manager closes the file even if extraction fails mid-loop.
with open('./results/object_class.json', 'w') as f:
    for index, ne_type in enumerate(ne_types, 1):
        pps = extract_ne(ne_type)
        # De-duplicate once and sort: plain set iteration order depends on
        # string hashing, which would make the object ids differ between runs.
        unique_pps = sorted(set(pps))
        print (str(ne_type) + ", " + str(len(unique_pps)))
        d = {
            "id": "C" + str(index),
            "type": "object-class",
            "title": ne_type,
            "objects": make_objects_json(unique_pps, index),
        }
        f.write(json.dumps(d) + "\n")

ORGANIZATION, 5
PERSON, 18
LOCATION, 0
DATE, 0
TIME, 0
MONEY, 0
PERCENT, 0
FACILITY, 0
GPE, 18


## Extracting verbs from the sentence

In [18]:
# POS-tag the dialogue lines of the first loaded file (minus its first three
# lines and its last one). All tagged sentences are kept, together with the
# (word, tag) pairs whose tag starts with 'VB'.
tagged_sents = [
    pos_tag(word_tokenize(line))
    for line in subtitles_lines[0][3:-1]
]
verbs = [
    (word, pos)
    for tagged_sent in tagged_sents
    for word, pos in tagged_sent
    if pos.startswith('VB')
]