## Get positive and negative samples from the labeled files

In [1]:
import random
import curses.ascii as ca
import re

In [2]:
def splitText(text):
    """Split a text containing tagged names into a list of tokens.

    Input:
    text: a string in which names are wrapped as <name>...</name>

    Output:
    list_text: a list of strings. Each <name>...</name> span is kept as one
        token (tags included), each ASCII punctuation character is its own
        token, and each run of ASCII alphanumeric characters is one token.
        Every other character (whitespace, non-ASCII, ...) is dropped.
        If an opening <name> has no closing </name>, a message is printed
        and None is returned.
    """
    open_tag = '<name>'
    close_tag = '</name>'
    list_text = []
    n = len(text)
    i = 0
    while i < n:
        if text.startswith(open_tag, i):
            # Slicing past the end of a string never raises, so the closing
            # tag must be searched for explicitly; otherwise an unclosed tag
            # would make the scan run forever.
            end = text.find(close_tag, i + len(open_tag))
            if end == -1:
                print("Wrongly tagged!! Without closing tag")
                return None
            end += len(close_tag)
            list_text.append(text[i:end])
            i = end
        elif ca.ispunct(text[i]):
            list_text.append(text[i])
            i += 1
        elif ca.isalnum(text[i]):
            j = i + 1
            while j < n and ca.isalnum(text[j]):
                j += 1
            list_text.append(text[i:j])
            i = j
        else:
            i += 1
    return list_text

In [3]:
def figureSample(list_text):
    """Collect positive and negative samples from a list of tokens.

    Input:
    list_text: a list of strings with mentions (tagged names appear as
        single '<name>...</name>' tokens)

    Output:
    samples: a list of dictionaries, one per sample, as returned by
        findWordsAround. A tagged token yields one sample of length 1.
        A title-cased untagged token yields a sample of length 1, plus
        samples of length 2 and 3 for as long as the following tokens are
        title-cased too. If list_text is not a list, a message is printed
        and None is returned.
    """
    if not isinstance(list_text, list):
        print("Please use a list of string as the parameter of function figureSample!!")
        return None
    n = len(list_text)
    samples = []
    for i, token in enumerate(list_text):
        if re.search('<name>(.*)</name>', token) is not None:
            samples.append(findWordsAround(list_text, i, 1))
            continue
        # Grow the candidate one token at a time (up to 3 tokens) while each
        # added token is title-cased; stop at the end of the list instead of
        # indexing past it.
        for length in range(1, 4):
            last = i + length - 1
            if last >= n or not list_text[last].istitle():
                break
            samples.append(findWordsAround(list_text, i, length))
    return samples
     

In [4]:
def findWordsAround(list_text, index, length):
    """Return a dictionary describing the context of the selected words.

    Input:
    list_text: a list of strings with mentions
    index: begin index of the sample
    length: number of tokens in the sample

    Output:
    words: a dictionary containing the following information
        sample: the words we select (tag content if the token is tagged)
        before: the word before sample in this sentence. If no word before, get *
        after: the word after sample in this sentence. If no word after, get *
        comma_before: if there is a comma right before sample. 1 yes 0 no
        comma_after: if there is a comma right after sample. 1 yes 0 no
        y: 1 positive sample 0 negative sample
    """
    stop_punct = ('.', '!', '?')
    prefix = ('Mr', 'Mrs', 'Miss', 'Dr', 'Prof', 'Ms')
    tagged = re.compile('<name>(.*)</name>')
    total = len(list_text)

    words = {
        'sample': ' '.join(list_text[index:(index + length)]),
        'before': '*',
        'after': '*',
        'comma_before': 0,
        'comma_after': 0,
        'y': 0,
    }

    own_tag = tagged.match(list_text[index])
    if own_tag is not None:
        words['y'] = 1
        words['sample'] = own_tag.group(1)

    # ---- word before the sample ----
    if index > 0:
        j = index - 1
        if list_text[j] == ",":
            words['comma_before'] = 1
        neighbour = tagged.match(list_text[j])
        if neighbour is not None:
            words['before'] = neighbour.group(1)
        else:
            # Walk left over tokens that are neither alphanumeric words nor
            # sentence-ending punctuation. NOTE: tagged tokens further left
            # than the immediate neighbour are stepped over here as well.
            while j >= 0 and not list_text[j].isalnum() and list_text[j] not in stop_punct:
                j -= 1
            # j == -1 means the start of the list was reached; do not index
            # with it, since negative indices wrap to the end of the list.
            if j >= 0:
                if list_text[j] not in stop_punct:
                    words['before'] = list_text[j]
                elif list_text[j] == '.' and j >= 1 and list_text[j - 1] in prefix:
                    # A period after a title such as "Mr" does not end the
                    # sentence; use the title as the preceding word.
                    words['before'] = list_text[j - 1]

    # ---- word after the sample ----
    j = index + length
    if j < total:
        if list_text[j] == ",":
            words['comma_after'] = 1
        neighbour = tagged.match(list_text[j])
        if neighbour is not None:
            words['after'] = neighbour.group(1)
        else:
            while j < total and not list_text[j].isalnum() and list_text[j] not in stop_punct:
                j += 1
            if j < total and list_text[j] not in stop_punct:
                words['after'] = list_text[j]
    return words

In [5]:
def getSample(file):
    """Get positive and negative samples from a list of files.

    Input:
    file: a non-empty list of file names

    Output:
    samples: a list of dictionaries containing the samples of all files,
        in the order the files are given. If file is not a list or is
        empty, a message is printed and None is returned. A file for which
        no sample list could be produced (for example because splitText
        rejected its tagging) is reported and skipped.
    """
    if not isinstance(file, list):
        print("Please enter a list!!")
        return None
    if not file:
        print("List is empty!")
        return None
    samples = []
    for path in file:
        with open(path, 'r') as f:
            # Lines keep their own newline; joining with a space matches the
            # text layout the tokenizer has always been fed.
            text = ' '.join(f.readlines())
        file_samples = figureSample(splitText(text))
        if file_samples is None:
            # splitText/figureSample signal failure with None; extending the
            # result with None would raise TypeError and lose all samples.
            print("Skipping file without usable samples: " + str(path))
            continue
        samples += file_samples
    return samples

#### Randomly split the files into training and test sets

In [6]:
# Paths of the tagged documents, numbered 1 through 300.
file = ['../data/tagged/document (' + str(doc_no) + ').txt' for doc_no in range(1, 301)]

# Randomly pick one third of the files for the test set; the remaining
# files, in their original order, form the training set.
n = len(file)
n_test = n // 3
n_train = n - n_test
random.seed(123)  # fixed seed keeps the split reproducible
index_test = random.sample(range(n), n_test)
test_lookup = set(index_test)
index_train = [x for x in range(n) if x not in test_lookup]
file_test = [file[x] for x in index_test]
file_train = [file[x] for x in index_train]

#### Save the names of test files into a text file

In [7]:
# Store one test file name per line; the slice drops the 15-character
# '../data/tagged/' directory prefix from each path.
with open('../data/file_test.txt', 'w') as f:
    f.writelines(path[15:] + '\n' for path in file_test)

#### Get positive and negative samples from test and training files

In [8]:
# Extract the samples for each split by running getSample on the list of
# test file paths and on the list of training file paths.
dict_test = getSample(file_test)
dict_train = getSample(file_train)

#### Write samples into JSON files

In [9]:
import json

# Serialize each sample list to its own JSON file (test first, then train).
for out_path, payload in (
    ('../data/dict_test.json', dict_test),
    ('../data/dict_train.json', dict_train),
):
    with open(out_path, 'w') as f:
        json.dump(payload, f)