In [1]:
import re
from six import iteritems
import xgboost as xgb
from math import log
import math
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

with open('train.txt', encoding='utf-8') as f:
    train_data = f.readlines()
    f.close
with open('test.txt', encoding='utf-8') as f:
    test_data = f.readlines()
    f.close
with open('Dream_of_the_Red_Chamber_seg.txt', encoding='utf-8') as f:
    corpus_seg = f.readlines()
    f.close
with open('Dream_of_the_Red_Chamber.txt', encoding='utf-8') as f:
    corpus = f.readlines()
    f.close

In [2]:
# Test & Train Array Converter
TRAIN = []
index = 0
for row in train_data:
    if index == 0:
        index += 1
        continue
    index += 1
    x = re.split('\t|\n', row)
    TRAIN.append([x[1], x[2], x[3]])
    
TEST = []
index = 0
for row in test_data:
    if index == 0:
        index += 1
        continue
    index += 1
    x = re.split('\t|\n', row)
    TEST.append([x[1], x[2], x[3]])

In [3]:
# Build The Word Vector
TERM_LIST = []
for paragraph in corpus_seg:
    tokens = paragraph.split()
    for token in tokens:
        if '_P' in token:
            continue
        term = re.sub('_[A-Z | a-z | 0-9]*', '', token)
        if term not in TERM_LIST:
            TERM_LIST.append(term)

In [4]:
RELATION = {
    '祖孫': 0,
    '母子': 1,
    '母女': 2,
    '父子': 3,
    '父女': 4,
    '兄弟姊妹': 5,
    '夫妻': 6,
    '姑叔舅姨甥侄': 7,
    '遠親': 8,
    '主僕': 9,
    '師徒': 10,
    '居處': 11,
}

PERSON = {}
index = 1
for x in TRAIN:
    per1 = x[0]
    per2 = x[1]
    if per1 not in PERSON:
        PERSON[per1] = index
        index += 1
    if per2 not in PERSON:
        PERSON[per2] = index
        index += 1
for x in TEST:
    per1 = x[0]
    per2 = x[1]
    if per1 not in PERSON:
        PERSON[per1] = index
        index += 1
    if per2 not in PERSON:
        PERSON[per2] = index
        index += 1
        
GENERALNAME = [
    '婆子', '夫人', '大姐', '小姐', '嫂子', '姨娘',  '姨媽', '嬸娘', '嫂子', '老娘', '嬤嬤', '奶奶'
]

Rule1 = ['嫁','娶','婚','買','嫡夫','婦','嫡','妻','妾','連理','太太','夫妻']
Rule2 = ['喚作','取名','生','有了','得了','養','懷','爹','娘','父','母','兒','女','女兒','子','孩','乳名','小名']
Rule3 = ['請','給','來','請安','磕頭','問好','跪','稟明','奉','喚來','叫','祖','奶','孫','老太太','帶','領']
Rule4 = ['長','次','大']
Rule5 = ['兄','哥','弟','姊','姐','妹']
Rule6 = ['姑','叔','舅','姨','甥','侄','親']
Rule7 = ['帶','領','教','徒','門生','師父']
Rule8 = ['主','僕','丫','丫頭','丫鬟','心腹','小的','下人','主僕']
Rule9 = ['使喚','謝','領','接','扇','差','命','遣','迎','打發','吩咐','喚','罵']

RULES = {
    '婚配': Rule1, 
    '直系': Rule2, 
    '尊卑': Rule3, 
    '旁系': Rule4, 
    '手足': Rule5, 
    '遠親': Rule6, 
    '師徒': Rule7, 
    '主僕': Rule8,
    '命令': Rule9
}

CORPUS = corpus

PRIORITY = [8, 4, 2, 1]


In [11]:
def isGeneralName(name):
    if len(name) == 2:
        return False
    else:
        subname = name[1:]
        if subname in GENERALNAME:
            return True

def isContainName(content, general_tag, name):
    if name in content:
        return True
    elif(general_tag):
        first_name = name[0]
        last_name = name[1:]
        if first_name in content and last_name in content:
            return True
    return False

class FeatureExtractor:
    
    def __init__(self, per1, per2, rel, file):
        self.per1 = per1
        self.per2 = per2
        self.rel = rel
        self.file = file
        self.word_vector = {}
        self.features = {}
        self.initialize()
    def initialize(self):
        
        # FEATURE0 - RELATION
        self.features['關係'] = 12
        
        # Intialized the word vector
        for term in TERM_LIST:
            self.word_vector[term] = 0
            
        # Added Person Number(e.g. 曹雪芹: 1, 賈寶玉: 2...)
        p1 =  PERSON[self.per2] if PERSON[self.per1] > PERSON[self.per2] else PERSON[self.per1]
        p2 =  PERSON[self.per1] if PERSON[self.per1] > PERSON[self.per2] else PERSON[self.per2]
        
        # FEATURE1 - PER_1
        # FEATURE2 - PER_2
        self.features['角色一'] = p1
        self.features['角色二'] = p2
        
        # FEATURE3 - LAST_NAME
        # Determine the last name is same or not
        if self.per1[0] == self.per2[0]:
            self.features['姓'] = 100
        else:
            self.features['姓'] = 0
            
        #FEATURE4 ~ FEATURE13 - RULE
        for feature, value in iteritems(RULES):
            self.features[feature] = 0
            
    def extract(self, content, priority):
        weight = priority
        tokens = content.split()
        # Relation in the content
        if self.rel in tokens:
            self.features['關係'] = RELATION[self.rel]
        for token in tokens:
            
            if '_P' in token:
                continue
            elif '_DE' in token:
                continue
            elif '_T' in token:
                continue
            elif '_SHI' in token:
                continue
            
            term = re.sub('_[A-Z | a-z | 0-9]*', '', token)
            for feature, rule in iteritems(RULES):
                weight = priority
                if feature == '婚配':
                    weight = priority
                elif feature == '尊卑':
                    weight = priority
                if term in rule:
                    self.features[feature] += 1 * weight                
            self.word_vector[term] += 1 * priority
    def save(self):
        word_freq = ''
        feature_str = ''
        
        for word, freq in iteritems(self.word_vector):
            word_freq = word_freq + str(freq) + ','
            
        for feature, score in iteritems(self.features):
            feature_str = feature_str + str(score) + ','
            
        #self.file.write(str(RELATION[self.REL]) + ',' + word_freq + "," + feature_str[:-1] + '\n')
        self.file.write(str(RELATION[self.rel]) + ',' + feature_str[:-1] + '\n')
        #self.file.write(str(RELATION[self.REL]) + ',' + word_freq[:-1] + '\n')
        
        
class Preprocessor:
    
    def __init__(self, data, corpus, filename):
        self.file = open(filename, 'w');
        self.data = data
        self.extracted = [None] * len(data)
        self.corpus = corpus
        self.vector = {}
        
    def close(self):
        self.file.close();
        
    def transform(self):
        index = 0
        for row in self.data:
            extracted = []
            extractor = FeatureExtractor(row[0], row[1], row[2], self.file)
            
            per1 = row[0]
            per2 = row[1]
            per1_general = isGeneralName(per1)
            per2_general = isGeneralName(per2)
            
            tag = False
            
            # Sentence
            for paragraph in self.corpus:
                sentences = re.split('，|。|？|！|；', paragraph)
                for i in range(len(sentences)):
                    if isContainName(sentences[i], per1_general, per1) and isContainName(sentences[i], per2_general, per2):
                        extractor.extract(sentences[i], PRIORITY[0])
                        extracted.append('S: ' + sentences[i])
                        if (tag == False):
                            tag = True
            
            # Context
            if tag == False:
                for paragraph in self.corpus:
                    sentences = re.split('，|。|？|！|；', paragraph)
                    for i in range(len(sentences)-2):
                        context = sentences[i] + sentences[i+1] + sentences[i+2]
                        if isContainName(context, per1_general, per1) and isContainName(context, per2_general, per2):
                            extractor.extract(context, PRIORITY[1])
                            extracted.append('C: ' + context)
                            if (tag == False):
                                tag = True
            # Otherwise
            if tag == False:
                temp = ['', '']
                for paragraph in self.corpus:
                    sentences = re.split('，|。|？|！|；', paragraph)
                    for sentence in sentences:
                        if isContainName(sentence, per1_general, per1) and temp[0] == '':
                            temp[0] = sentence
                        if isContainName(sentence, per2_general, per2) and temp[1] == '':
                            temp[1] = sentence
                    if temp[0] != '' and temp[1] != '':
                        extractor.extract(temp[0] + ' ' + temp[1], PRIORITY[3])
                        extracted.append('O: ' + temp[0] + ' ' + temp[1])
                        if tag == False:
                            tag = True
                        break;
                        
            # Otherwise
            if tag == False:
                temp = ['', '']
                for paragraph in CORPUS:
                    sentences = re.split('，|。|？|！|；', paragraph)
                    for sentence in sentences:
                        if per1 in sentence and temp[0] == '':
                            temp[0] = sentence
                        if per2 in sentence and temp[1] == '':
                            temp[1] = sentence
                    if temp[0] != '' and temp[1] != '':
                        extracted_content = temp[0] + ' ' + temp[1]
                        extracted.append('R: ' + extracted_content)
                        break;
                
            self.extracted[index] = extracted
            index +=1
            extractor.save()

In [25]:
generateFeatureFile(TRAIN, TEST, corpus_seg)
xgboost_preds = xgboostTraining()

rightRate=0.41071428571428603


In [None]:
rule_based_result = ruleBase()

In [29]:
result = mergeEvaluation(xgboost_preds, rule_based_result, 0.140624)
print(result)

0.158679
0.158679
0.158679
0.140799
0.158679
0.140799
0.158679
0.140799
0.158679
0.158679
0.140799
0.140799
0.140799
0.140799
0.140799
0.140799
0.155635
0.4910714285714286


In [6]:
def generateFeatureFile(train, test, corpus):
    # Training Data & Testing Data Transformation
    pre1 = Preprocessor(train, corpus, 'ftrain.txt')
    pre1.transform()
    pre1.close()

    pre2 = Preprocessor(test, corpus, 'ftest.txt')
    pre2.transform()
    pre2.close()

In [7]:
def xgboostTraining():
    dtrain = xgb.DMatrix('ftrain.txt')
    dtest = xgb.DMatrix('ftest.txt')
    # specify parameters via map
    param = {}
    # use softmax multi-class classification
    param['objective'] = 'multi:softprob'
    # scale weight of positive examples
    param['eta'] = 0.1126
    param['max_depth'] = 2
    param['silent'] = 1
    param['num_class'] = 12
    num_round = 2
    bst = xgb.train(param, dtrain, num_round)
    # make prediction
    bst.save_model('temp.txt')
    bst = xgb.Booster(param)
    bst.load_model('temp.txt')
    preds = bst.predict(dtest)
    return preds

In [8]:
def mergeEvaluation(xgboost_preds, rule_based_result, threshold):
    error = 0
    preds = xgboost_preds
    result = rule_based_result
    
    for i in range(112):
        prob = np.amax(preds[i])
        label = preds[i].tolist().index(prob)
        test_label = TEST[i][2]
        if prob < threshold:
            if result[i] != test_label:
                error += 1;
        else:
            if (label != RELATION[test_label]):
                error += 1;
                print(prob)
    return(1 - error/112)

In [23]:
def tf_idf(corpus,term_dic): #Create function (arg1,arg2....)
    # treat every paragraph as a document : N=num_of_para
    num_of_para=len(corpus)
    temp_dic={}
    for term in term_dic:#有的兩個最終都=0，為什麼？？！？！
        doc_freq=0
        term_freq=0
        for paragraph in corpus:
            if (paragraph.find(term)>=0):
                doc_freq+=1
                term_freq+=paragraph.count(term)     
        #weight=(1+math.log10(term_freq))*(math.log10(num_of_para/doc_freq))
        if doc_freq==0:
            weight=0
        else:
            weight=(1+math.log10(term_freq))*math.log10(num_of_para/doc_freq)
        temp_dic[term] = weight
    return temp_dic

def ruleBase():
    # Build The Term Dictinary
    term_dic = {} # {term:代號,...}
    term_weight_dic = {}
    
    for paragraph in corpus_seg:
        tokens = paragraph.split()
        for token in tokens:
            # Normal Norm
            if '_Na' in token:
                pair = token.split('_')
                if pair[0] not in term_dic:
                    term_dic[pair[0]]=pair[1]
            # 沒有加Nb: 專有名詞！
            # Location
            elif '_Nc' in token:
                pair = token.split('_')
                if pair[0] not in term_dic:
                    term_dic[pair[0]]=pair[1]
            # Time
            elif '_Nd' in token:
                pair = token.split('_')
                if pair[0] not in term_dic:
                    term_dic[pair[0]]=pair[1]
            elif '_V' in token:
                pair = token.split('_')
                if pair[0] not in term_dic:
                    term_dic[pair[0]]=pair[1]
                    
    term_weight_dic = tf_idf(corpus,term_dic)

    featureDic={}
    featureDic['婚配']=['嫁','娶','婚','買','嫡夫','婦','嫡','妻','妾','連理','太太','夫妻']
    featureDic['直系']=['喚作','取名','生','有了','得了','養','懷','爹','娘','父','母','兒','女','女兒','子','孩','乳名','小名']
    featureDic['尊卑']=['請','給','來','請安','磕頭','問好','跪','稟明','奉','喚來','叫','祖','奶','孫','老太太','帶','領']
    featureDic['旁系']=['長','次','大']
    featureDic['手足']=['兄','哥','弟','姊','姐','妹']
    featureDic['遠親']=['姑','叔','舅','姨','甥','侄','親']
    featureDic['師徒']=['帶','領','教','徒','門生','師父']
    featureDic['主僕']=['主','僕','丫','丫頭','丫鬟','心腹','小的','下人','主僕']
    featureDic['命令']=['使喚','謝','領','接','扇','差','命','遣','迎','打發','吩咐','喚','罵']
    featureDic['地點']=[]
    for key in term_dic:
        if 'Nc' in term_dic[key]:
            featureDic['地點'].append(key)
    relationDic={'祖孫':0, '母子':1, '母女':2, '父子':3, '父女':4, '兄弟姊妹':5,'夫妻':6,
                 '姑叔舅姨甥侄':7,'遠親':8,'主僕':9, '師徒':10,'居處':11 }
    featureDic['女性']=['嬤','母','姐','姊','妹','太','夫人','氏','娘','女','姑','姨']
    # 若冠夫姓或父姓，可能會出現的稱呼
    featureDic['父姓']=['姐','母','娘','媽','奶','嬤']


    # store data to judge
    train = [] # form:[[E1 E2 R],[E1,E2,R]......]
    index = 0
    for row in test_data:
        if index == 0:
            index += 1
            continue
        index += 1
        x = re.split('\t|\n', row)
        # E1 E2 R
        train.append([x[1], x[2], x[3]])

    preFile = open('Segpreprocess.txt','w') 

    rightRelation=[]
    judgeRelation=[]
    # search corpus by two entities and relationship
    for row in TEST:
        oneLine=[]
        oneSentence=[]
        threeSentences=[]
        oneParagraph=[]
        # 待補：上一段末句和下一段首句

        rightRelation.append(row[2])

        # 其中一個是地方就不用做feature list判斷了，一定是居處
        if (row[0] in featureDic['地點']) or (row[1] in featureDic['地點']):
            judgeRelation.append('居處')        
            continue
        # 三個字人名如果有姓，去掉比較好找。若最後一個字是「娘」代表是姨娘，姓不可以省略
        else:
            entity1=row[0]
            entity2=row[1]
            if (len(row[0])==3 and ('娘' not in row[0]) and ('嬤' not in row[0])):
                entity1=row[0][1:]
            if (len(row[1])==3 and ('娘' not in row[0]) and ('嬤' not in row[0])):
                entity2=row[1][1:]


        preFile.write(row[0]+' '+row[1]+' '+row[2]+'\n')

        for paragraph in corpus:
            #paragraph.replace(" ","")
            if ((entity1 in paragraph) and (entity2 in paragraph)):
                inLine = False
                inSentence = False
                inThreeSentences = False
                lines = re.split('[，；。？！]', paragraph)
                for line in lines:
                    if ((entity1 in line) and (entity2 in line)):
                        inLine = True
                        oneLine.append(line)
                        preFile.write("LINE:"+line+'\n')

                sentences = re.split('[。？！]', paragraph)
                thrSentences = []

                for sentence in sentences:
                    idx=sentences.index(sentence)
                    # create 3-sentences group
                    if idx>1:
                        thrSentences.append(sentences[idx-2]+"。"+sentences[idx-1]+"。"+sentences[idx])#中間標點統一以。代替
                    # judge if in the list
                    if ((entity1 in sentence) and (entity2 in sentence)):
                        inSentence = True
                        # 不能跟前面重複，中間沒逗點再加
                        commaLoc=[m.start() for m in re.finditer('[，；]', sentence)] # get all locations of '，；' 
                        hasCommaBetween = False
                        for cLoc in commaLoc:#暫不考慮同一人名一句話出現兩次的特例
                            a = sentence.find(row[0])
                            b = sentence.find(row[1])
                            if ((a<cLoc and cLoc<b) or (b<cLoc and cLoc<a)):
                                hasCommaBetween = True
                                break
                        if (hasCommaBetween == True):
                            oneSentence.append(sentence)
                            preFile.write("SENTENCE:"+sentence+'\n')

                for context in thrSentences:
                    if ((entity1 in context) and (entity2 in context)):
                        inThreeSentences = True
                        # 不能跟前面重複，中間沒。再加
                        periodLoc=[m.start() for m in re.finditer('[。]', context)] # get all locations of '。' 

                        hasPeriodBetween = False
                        for pLoc in periodLoc:#暫不考慮同一人名一句話出現兩次的特例
                            a = context.find(entity1)
                            b = context.find(entity2)
                            if ((a<pLoc and pLoc<b) or (b<pLoc and pLoc<a)):
                                hasPeriodBetween = True
                                break
                        if (hasPeriodBetween == True):
                            threeSentences.append(context)
                            preFile.write("CONTEXT:"+context+'\n')


                if not (inLine or inSentence or inThreeSentences):
                    oneParagraph.append(paragraph)
                    preFile.write("PARAGRAPH:"+paragraph+'\n')

        # create a dictionary to store appear phrase weight            
        term_weight_vector={}

        for line in oneLine:
            tempLine = line
            for term in term_dic:
                if term in tempLine:            
                    if term not in term_weight_vector:
                        term_weight_vector[term]=0
                    term_weight_vector[term]+=tempLine.count(term)*16
                    #s = '_'+term_dic[term]
                    #tempLine = tempLine.replace(term,s)
        for context in threeSentences:
            tempContext = context
            for term in term_dic:
                if term in tempContext:            
                    if term not in term_weight_vector:
                        term_weight_vector[term]=0
                    term_weight_vector[term]+=tempContext.count(term)*4
        for sentence in oneSentence:
            tempSentence = sentence
            for term in term_dic:
                if term in tempSentence:            
                    if term not in term_weight_vector:
                        term_weight_vector[term]=0
                    term_weight_vector[term]+=tempSentence.count(term)*2
        for paragraph in oneParagraph:
            tempParagraph = paragraph
            for term in term_dic:
                if term in tempParagraph:            
                    if term not in term_weight_vector:
                        term_weight_vector[term]=0
                    term_weight_vector[term]+=tempParagraph.count(term)*1
        #print (term_weight_vector)

        # create feature list that symbolize different relationship
        featureList=[0]*12
        for term in term_weight_vector:
            if term in featureDic['婚配']:
                featureList[relationDic['夫妻']]+=term_weight_vector[term]*term_weight_dic[term]
            elif term in featureDic['直系']:
                featureList[relationDic['父子']]+=term_weight_vector[term]*term_weight_dic[term]
                featureList[relationDic['父女']]+=term_weight_vector[term]*term_weight_dic[term]
                featureList[relationDic['母子']]+=term_weight_vector[term]*term_weight_dic[term]
                featureList[relationDic['母女']]+=term_weight_vector[term]*term_weight_dic[term]
            elif term in featureDic['尊卑']:
                featureList[relationDic['祖孫']]+=term_weight_vector[term]*term_weight_dic[term]
                featureList[relationDic['主僕']]+=term_weight_vector[term]*term_weight_dic[term]
                featureList[relationDic['夫妻']]+=term_weight_vector[term]*term_weight_dic[term]
            elif term in featureDic['旁系']:
                featureList[relationDic['兄弟姊妹']]+=term_weight_vector[term]*term_weight_dic[term]
            elif term in featureDic['手足']:
                featureList[relationDic['兄弟姊妹']]+=term_weight_vector[term]*term_weight_dic[term]
                featureList[relationDic['主僕']]+=term_weight_vector[term]*term_weight_dic[term]
            elif term in featureDic['主僕']:
                featureList[relationDic['主僕']]+=term_weight_vector[term]*term_weight_dic[term]
            elif term in featureDic['命令']:
                featureList[relationDic['夫妻']]+=term_weight_vector[term]*term_weight_dic[term]
                featureList[relationDic['主僕']]+=term_weight_vector[term]*term_weight_dic[term]
                featureList[relationDic['父子']]+=term_weight_vector[term]*term_weight_dic[term]
                featureList[relationDic['父女']]+=term_weight_vector[term]*term_weight_dic[term]
                featureList[relationDic['母子']]+=term_weight_vector[term]*term_weight_dic[term]
                featureList[relationDic['母女']]+=term_weight_vector[term]*term_weight_dic[term]
            elif term in featureDic['遠親']:
                featureList[relationDic['遠親']]+=term_weight_vector[term]*term_weight_dic[term]
            elif term in featureDic['師徒']:
                featureList[relationDic['師徒']]+=term_weight_vector[term]*term_weight_dic[term]




        # 偵測是否冠夫姓或父姓
        hasMaleHead=False
        for f in featureDic['父姓']:
            if (f in row[0]) or (f in row[1]):
                hasMaleHead=True
                break

        # 同姓：不會是主僕（僕人通常是暱稱），且更有可能是父子、父女、祖孫、兄弟姊妹、姑叔舅姨甥侄、遠親
        if (row[0][0]==row[1][0]):
            featureList[relationDic['主僕']]=0

            featureList[relationDic['遠親']]+=100
            featureList[relationDic['父子']]+=100 # 數字我亂設的
            featureList[relationDic['父女']]+=100
            featureList[relationDic['祖孫']]+=100
            featureList[relationDic['兄弟姊妹']]+=100
            featureList[relationDic['姑叔舅姨甥侄']]+=100

            # 若冠夫姓或父姓，同姓仍有可能是母子或母女
            if not hasMaleHead:
                featureList[relationDic['母子']]=0
                featureList[relationDic['母女']]=0
            else:
                featureList[relationDic['母子']]+=100
                featureList[relationDic['母女']]+=100

        # 偵測兩人中是否有女性
        hasFemale=False
        for f in featureDic['女性']:
            if (f in row[0]) or (f in row[1]):
                hasFemale=True
                break

        # 有女性就不會是父子、師徒
        if hasFemale:
            featureList[relationDic['父子']]=0
            featureList[relationDic['師徒']]=0


        # 還沒設想值相同的狀況
        max_value=featureList.index(max(featureList))


        #judgeRelation.append(relationDic.keys()[relationDic.values().index(max_value)])
        judgeRelation.append(list(relationDic.keys())[list(relationDic.values()).index(max_value)])
    preFile.close()            

    rightRate=0
    for i in range(len(judgeRelation)):
        if judgeRelation[i]==rightRelation[i]:
            #print(judgeRelation[i]+"="+rightRelation[i])
            rightRate+=1/len(judgeRelation)
    print ("rightRate="+str(rightRate))
    return judgeRelation