## Importing subtitles data

In [6]:
import glob
import os
import sys

In [7]:
# Season numbers to load, kept as strings because they are concatenated
# into the data directory names.
seasons = list(map(str, (1, 6, 7, 10)))

In [8]:
# Read every subtitle file of the selected seasons into memory as text,
# one string per file.
subtitles_data = []

for season in seasons:
    path = './data/friends-season-' + season + '/'
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # position of each file in subtitles_data reproducible across runs.
    for filename in sorted(os.listdir(path)):
        # The context manager closes the handle even if reading fails.
        with open(path + filename, 'rb') as f:
            # Lenient decode: bytes that are not valid UTF-8 are dropped.
            content = f.read().decode('utf8', 'ignore')
        subtitles_data.append(content)

## Extracting only the dialogue lines (대사) from the files

In [9]:
def extract_dialogue(file_content):
    """Return the dialogue lines contained in one subtitle file.

    The text is cut into blocks at blank lines, and the first two lines of
    every block are discarded (presumably the cue index and the timestamp
    of an SRT-style file -- confirm against the data). Whatever remains in
    each block is collected in order.

    Both CRLF ('\\r\\n') and LF ('\\n') line endings are accepted; CRLF is
    normalized to LF before splitting.

    Args:
        file_content: full text of a subtitle file (str).

    Returns:
        A flat list of strings, one per remaining line. A trailing line
        break in the input produces an empty string in the result.
    """
    # Normalize line endings so LF-only files are split the same way as
    # CRLF files.
    text = file_content.replace('\r\n', '\n')

    dialogue = []
    for block in text.split('\n\n'):
        # Skip the two header lines of the block; keep the rest.
        dialogue.extend(block.split('\n')[2:])
    return dialogue

In [10]:
# One list of dialogue lines per loaded subtitle file, in the same order
# as subtitles_data.
subtitles_lines = [extract_dialogue(content) for content in subtitles_data]

## Recognizing proper nouns via POS tagging

In [11]:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

In [22]:
# Collect every token tagged 'NNP' in the first loaded file, skipping its
# first three dialogue lines and its last one. Tokens are tagged with their
# original casing.
pps = [
    word
    for line in subtitles_lines[0][3:-1]
    for word, pos in pos_tag(word_tokenize(line))
    if pos == 'NNP'
]

In [25]:
# Preview the first 20 collected tokens.
pps[0:20]

['Notice',
 'Freud',
 'Magic',
 'Eva',
 'Butt',
 'GELULA',
 'CO.',
 'INC',
 'Did',
 'Ross',
 'Come',
 'Ross',
 'Okay',
 'Chandler',
 'Thank',
 'Chandler',
 'Come',
 'Aurora',
 'Estelle',
 'Leonard']

## Removing stop words

In [38]:
def remove_stopwords(pps, stopwords_path='./stopwords.txt'):
    """Drop the words whose lowercase form appears in a stop word file.

    Args:
        pps: iterable of word strings to filter.
        stopwords_path: path of a text file holding whitespace-separated
            stop words. Entries are matched exactly as written in the file
            against the lowercased candidate, so they are expected to be
            lowercase. Defaults to './stopwords.txt'.

    Returns:
        A new list with the surviving words, in their original order and
        casing.
    """
    # The context manager closes the file even if reading raises.
    with open(stopwords_path, 'r') as f:
        stoplist = set(f.read().split())
    return [pp for pp in pps if pp.lower() not in stoplist]

## Using `nltk.tree.Tree` to recognize named entities (NE)

NLTK book chapter describing `ne_chunk` (`ne` is short for "named entity"):
http://www.nltk.org/book/ch07.html

```
NE Type	Examples
ORGANIZATION	Georgia-Pacific Corp., WHO
PERSON	Eddy Bonte, President Obama
LOCATION	Murray River, Mount Everest
DATE	June, 2008-06-29
TIME	two fifty a m, 1:30 p.m.
MONEY	175 million Canadian Dollars, GBP 10.40
PERCENT	twenty pct, 18.75 %
FACILITY	Washington Monument, Stonehenge
GPE	South East Asia, Midlothian
```

In [35]:
from nltk.tree import Tree
from nltk.chunk import ne_chunk

In [37]:
def extract_ne(ne_type, lines=None):
    """Collect the first word of every named entity of a given type.

    Each line is tokenized, POS-tagged and passed through ``ne_chunk``.
    For every resulting subtree whose label equals ``ne_type``, only the
    first word of the entity is kept (a multi-word entity contributes one
    word). The collected words are then filtered with ``remove_stopwords``.

    Args:
        ne_type: entity label to keep, e.g. 'PERSON'.
        lines: iterable of sentence strings to scan. When omitted, the
            first loaded subtitle file is used, without its first three
            dialogue lines and its last one (``subtitles_lines[0][3:-1]``).

    Returns:
        List of entity head words that survive stop word removal.
    """
    if lines is None:
        lines = subtitles_lines[0][3:-1]

    entities = []
    for line in lines:
        tagged_sent = pos_tag(word_tokenize(line))
        for chunk in ne_chunk(tagged_sent):
            # Only Tree children carry a label; use the public accessor
            # rather than the private _label attribute.
            if isinstance(chunk, Tree) and chunk.label() == ne_type:
                # chunk[0] is the entity's first (word, tag) leaf.
                entities.append(chunk[0][0])
    return remove_stopwords(entities)

In [39]:
# Rebinds `pps`: it now holds PERSON entity words instead of the NNP-tagged
# tokens collected earlier in the notebook.
pps = extract_ne(ne_type='PERSON')

## Extracting verbs from the sentence

In [18]:
# POS-tag each dialogue line of the first loaded file (minus its first three
# lines and its last one) and keep the tagged sentences.
tagged_sents = [
    pos_tag(word_tokenize(line))
    for line in subtitles_lines[0][3:-1]
]

# Flatten to the (word, tag) pairs whose tag starts with 'VB'.
verbs = [
    (word, pos)
    for tagged_sent in tagged_sents
    for word, pos in tagged_sent
    if pos.startswith('VB')
]