In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from itertools import chain
import nltk
import sklearn
import scipy. stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import pandas as pd
import numpy as np
from itertools import groupby
import pickle

In [None]:
# Load the pickled train / test / validation splits of the relation-extraction corpus.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
with open('训练语料/relation extraction dataset/re_train.pkl', 'rb') as f:
    rtrain = pickle.load(f)
with open('训练语料/relation extraction dataset/re_test.pkl', 'rb') as f:
    rtest = pickle.load(f)
with open('训练语料/relation extraction dataset/re_valid.pkl', 'rb') as f:
    rvalid = pickle.load(f)

In [None]:
def tag2code(tags):
    """Convert a BIO tag sequence into per-token entity codes and entity spans.

    Parameters
    ----------
    tags : sequence of str
        One tag per token, e.g. ``'O'``, ``'B-X'``, ``'I-X'``.

    Returns
    -------
    (ents, code) : tuple of two lists
        ``code`` has one entry per tag: ``'0'`` (the digit zero) for tokens
        outside any entity and ``'T<k>'`` for tokens of the k-th entity.
        ``ents`` has one ``(code, type, start, end)`` tuple per entity, where
        ``type`` is the tag text after its two-character prefix (taken from the
        entity's last token) and ``start`` / ``end`` are inclusive token indices.

    Notes
    -----
    Compared with the earlier version this one
    * does not index past the end when the sequence finishes on an ``I-`` tag,
    * computes ``start`` per entity instead of from a counter that was never
      reset between entities,
    * also records entities that consist of a single ``B-`` token,
    * codes any tag that starts with neither ``B`` nor ``I`` as ``'0'`` so that
      ``code`` always has the same length as ``tags``.
    An ``I-`` tag with no preceding ``B-`` keeps the current counter value
    (``'T0'`` if no entity has been opened yet) and opens a span at its own index.
    """
    code = []
    ents = []
    n = 0         # running entity counter; the k-th entity is coded 'T<k>'
    start = None  # index of the first token of the entity currently open
    for i, tag in enumerate(tags):
        prefix = tag[:1]
        if prefix == 'B':
            n += 1
            start = i
        elif prefix == 'I':
            if start is None:
                start = i
        else:
            code.append('0')
            start = None
            continue

        code.append('T' + str(n))
        # The entity ends here unless the next token continues it with an I- tag.
        next_prefix = tags[i + 1][:1] if i + 1 < len(tags) else ''
        if next_prefix != 'I':
            ents.append((code[-1], tag[2:], start, i))
            start = None
    return ents, code

In [None]:
# Sanity check: run tag2code on the tag sequence of the first training record
# and display the (ents, code) result.
taglist=rtrain[0]['tags']
tag2code(taglist)

In [None]:
# Build one dict per training record that carries the original fields plus the
# entity spans ('relations') and per-token entity codes ('code') from tag2code.
end2end_dict_list=[]
for i in rtrain:
    # Fields are read by key, as in the rest of the notebook. The earlier
    # positional access (i[0], i[1], i[2]) raises KeyError on dict records.
    # NOTE(review): assumes positions 0/1/2 were meant as HADM_ID/token/tags — confirm.
    relations, code = tag2code(i['tags'])  # one call instead of two per record
    end2end_dict_list.append({
        'HADM_ID': i['HADM_ID'],
        'token': i['token'],
        'tags': i['tags'],
        'relations': relations,
        'code': code,
    })

In [None]:
def file2set(file):
    """Turn dataset records into per-sentence lists of (token, code, tag) triples.

    Parameters
    ----------
    file : iterable of records
        Each record is indexed with the keys ``'token'`` and ``'tags'``.

    Returns
    -------
    list of list of tuple
        One inner list per record; each tuple is ``(token, code, tag)`` where
        ``code`` is the second value returned by ``tag2code`` for that record.
        Note the order: the BIO tag is the third element, which is the slot
        ``sent2labels`` reads.

    The previously collected (and never returned) list of HADM_ID values has
    been dropped.
    """
    rset = []
    for record in file:
        tokens = record['token']
        tags = record['tags']
        codes = tag2code(tags)[1]
        # Index-based on purpose: a length mismatch raises instead of truncating.
        rset.append([(tokens[j], codes[j], tags[j]) for j in range(len(tokens))])
    return rset

In [None]:
# Build (token, code, tag) sentence lists for the three splits.
# NOTE(review): the following cells rebind rtrainset / rtestset / rvalidset with the
# tuple order (token, tag, code), so these results are overwritten — confirm which
# order is intended.
rtrainset=file2set(rtrain)
rtestset=file2set(rtest)
rvalidset=file2set(rvalid)

training set

In [None]:
# Training split: collect the record ids and, per record, a list of
# (token, tag, code) triples where code comes from tag2code.
rtrainset, trainpid = [], []
for record in rtrain:
    trainpid.append(record['HADM_ID'])
    record_codes = tag2code(record['tags'])[1]
    rtrainset.append([
        (record['token'][pos], record['tags'][pos], record_codes[pos])
        for pos in range(len(record['token']))
    ])
rtrainset[0:3]

testing set

In [None]:
# Test split: collect the record ids and, per record, a list of
# (token, tag, code) triples where code comes from tag2code.
rtestset, testpid = [], []
for record in rtest:
    testpid.append(record['HADM_ID'])
    record_codes = tag2code(record['tags'])[1]
    rtestset.append([
        (record['token'][pos], record['tags'][pos], record_codes[pos])
        for pos in range(len(record['token']))
    ])
rtestset[0:3]

validating set

In [None]:
# Validation split: collect the record ids and, per record, a list of
# (token, tag, code) triples where code comes from tag2code.
rvalidset, validpid = [], []
for record in rvalid:
    validpid.append(record['HADM_ID'])
    record_codes = tag2code(record['tags'])[1]
    rvalidset.append([
        (record['token'][pos], record['tags'][pos], record_codes[pos])
        for pos in range(len(record['token']))
    ])
rvalidset[0:3]

In [None]:
def word2features(sent,i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Only the first element of each item in ``sent`` (the token, converted with
    ``str``) is used. The features describe the token itself (lower-cased form,
    last three / two characters, upper / title / digit flags, constant bias) and
    the lower-cased form and title / upper flags of the previous and next token.
    ``'BOS'`` is set when there is no previous token, ``'EOS'`` when there is no
    next token.
    """
    def _context(prefix, position):
        # Features of a neighbouring token, with keys prefixed by '-1:' or '+1:'.
        neighbour = str(sent[position][0])
        return {
            prefix + 'word.lower()': neighbour.lower(),
            prefix + 'word.istitle()': neighbour.istitle(),
            prefix + 'word.isupper()': neighbour.isupper(),
        }

    current = str(sent[i][0])
    features = {'bias': 1.0}
    features['word.lower()'] = current.lower()
    features['word[-3:]'] = current[-3:]
    features['word[-2:]'] = current[-2:]
    features['word.isupper()'] = current.isupper()
    features['word.istitle()'] = current.istitle()
    features['word.isdigit()'] = current.isdigit()

    if i > 0:
        features.update(_context('-1:', i - 1))
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        features.update(_context('+1:', i + 1))
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    """Return the list of word2features dicts, one per position in ``sent``."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the third element of every 3-tuple in ``sent``, in order.

    Which quantity sits in the third slot depends on how the sentence list was
    built (the notebook builds both (token, code, tag) and (token, tag, code)).
    """
    collected = []
    for _token, _postag, label in sent:
        collected.append(label)
    return collected

def sent2tokens(sent):
    """Return the first element (the token) of every 3-tuple in ``sent``, in order."""
    collected = []
    for token, _postag, _label in sent:
        collected.append(token)
    return collected

In [None]:
%%time
# Per-sentence feature dicts (X) and label sequences (y) for the training split.
X_rtrain=[sent2features(s) for s in rtrainset]
y_rtrain=[sent2labels(s) for s in rtrainset]

# Same for the test split.
X_rtest=[sent2features(s) for s in rtestset]
y_rtest=[sent2labels(s) for s in rtestset]

In [None]:
# Per-sentence feature dicts (X) and label sequences (y) for the validation split.
X_rvalid=[sent2features(s) for s in rvalidset]
y_rvalid=[sent2labels(s) for s in rvalidset]

In [None]:
%%time
# Baseline CRF with fixed regularisation coefficients (c1, c2), fitted on the training split.
crf=sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_rtrain, y_rtrain)

In [None]:
# Labels known to the fitted CRF, minus the outside tag 'O'.
# NOTE(review): list.remove raises ValueError if 'O' is absent. The dataset cells
# above put the tag2code value (which uses '0', the digit) in the label slot read
# by sent2labels — confirm the label sequences really are BIO tags here.
labels=list(crf.classes_)
labels.remove('O')
# Second '-'-separated field of every remaining label (e.g. 'B-X' -> 'X').
# NOTE(review): the B- and I- variants of one type give the same value, so this list
# can contain duplicates; entity2label uses the LAST '-' field instead — confirm no
# type name contains '-'.
label_set=[x.split('-')[1] for x in labels]
labels

In [None]:
# Predict on the test split and show the weighted token-level F1 over `labels`
# (i.e. excluding 'O').
y_rpred=crf.predict(X_rtest)
metrics.flat_f1_score(y_rtest, y_rpred, average='weighted',labels=labels)

In [None]:
# Display the predicted label sequences for the test split.
y_rpred

In [None]:
# Sort by everything after the first character, then by the first character, so
# that labels sharing a suffix (e.g. 'B-X' / 'I-X') are listed next to each other.
sorted_labels=sorted(labels, key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_rtest, y_rpred, labels=sorted_labels, digits=3))

In [None]:
%%time
# Randomised hyper-parameter search over the CRF regularisation coefficients.
# NOTE(review): no random_state is passed, so the sampled (c1, c2) values differ
# between runs.
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels)

# search: 50 sampled settings, each scored with 3-fold cross-validation
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
rs.fit(X_rtrain, y_rtrain)

In [None]:
# Rebind `crf` to the best estimator found by the search and report its
# token-level scores on the validation split.
crf = rs.best_estimator_
y_rpred_valid = crf.predict(X_rvalid)
print(metrics.flat_classification_report(
    y_rvalid, y_rpred_valid, labels=sorted_labels, digits=3
))

In [None]:
def sent2tag(sent_list):
    """Flatten a list of per-sentence tag sequences into one flat list of tags.

    Uses ``itertools.chain`` so the cost is linear in the total number of tags;
    the earlier repeated ``list + list`` copied the growing result on every
    sentence. The result is always a new list.
    """
    return list(chain.from_iterable(sent_list))

def tag2entity(sent_list):
    """Group a flattened tag stream into consecutive chunks.

    All sentences in ``sent_list`` are concatenated first, so grouping runs
    across sentence boundaries. Each ``'O'`` tag and each tag starting with
    ``'B'`` opens a new group; a tag starting with ``'I'`` is appended to the
    most recent group. Tags that match none of these are skipped, as before.

    An ``'I'`` tag arriving before any group exists now opens a group of its
    own instead of raising IndexError, and an empty-string tag is skipped
    instead of raising.

    Returns a list of lists of tags.
    """
    tag2entity_list = []
    for j in chain.from_iterable(sent_list):
        if j == 'O' or j.startswith('B'):
            tag2entity_list.append([j])
        elif j.startswith('I'):
            if tag2entity_list:
                tag2entity_list[-1].append(j)
            else:
                # No group to continue yet: treat the stray I- tag as a start.
                tag2entity_list.append([j])
    return tag2entity_list

#def tag2entity_pred(sent_list,y_pred):
#    tag2entity_pred_list=[]
#    x=0
#    for i in tag2entity(sent_list):
#        tag2entity_pred_list.append(sent2tag(y_pred)[x:x+len(i)])
#        x+=len(i)
#    return tag2entity_pred_list
        
def entity2label(tag2entity_list):
    """Reduce every tag group to a single label string.

    For each group the text after the last ``'-'`` of every tag is collected
    (a tag without ``'-'``, such as ``'O'``, is used as is). A group with one
    distinct value yields that value; a group with several yields them joined
    by ``','`` in sorted order, so the same mix always produces the same string
    (joining an unsorted ``set`` gave an unspecified order). Empty groups
    contribute nothing, as before.
    """
    entity2label_list = []
    for group in tag2entity_list:
        distinct = {tag.split('-')[-1] for tag in group}
        if distinct:
            entity2label_list.append(','.join(sorted(distinct)))
    return entity2label_list

In [None]:
def tag2entity_pred(sent_list_test, sent_list_pred):
    """Cut the flattened predictions into chunks that mirror the gold groups.

    The gold sequences are grouped with ``tag2entity``; the predictions are
    flattened with ``sent2tag`` and sliced into consecutive pieces whose lengths
    equal the lengths of the gold groups, in order.
    """
    gold_groups = tag2entity(sent_list_test)
    flat_pred = sent2tag(sent_list_pred)
    chunks = []
    offset = 0
    for gold_group in gold_groups:
        stop = offset + len(gold_group)
        chunks.append(flat_pred[offset:stop])
        offset = stop
    return chunks

In [None]:
# Slice the test-set predictions into chunks aligned with the gold entity groups.
# This calls tag2entity_pred rather than repeating its loop inline.
tag2entity_pred0_list=tag2entity_pred(y_rtest, y_rpred)

In [None]:
# One label string per gold group of the test split.
y_test0_label=entity2label(tag2entity(y_rtest))

In [None]:
# One label string per predicted chunk (chunks follow the gold group boundaries).
y_pred0_label=entity2label(tag2entity_pred0_list)

In [None]:
# The two lists are compared index by index below, so their lengths should match.
len(y_test0_label),len(y_pred0_label)

In [None]:
# Display the gold entity-level labels.
y_test0_label

In [None]:
# NOTE(review): this repeats the previous cell verbatim; it was possibly meant to
# display y_pred0_label — confirm.
y_test0_label

Calculate precision and recall

Precision: correctly predicted NIHSS entities / all entities predicted as NIHSS

In [None]:
# Per-label counters, all starting at zero: true positives, predicted positives
# and real (gold) positives.
TP_dict = {name: 0 for name in label_set}
PP_dict = {name: 0 for name in label_set}
RP_dict = {name: 0 for name in label_set}

* TP: number of entities correctly predicted as A,
* PP: number of entities predicted as A, correct or not, 
* RP: number of entities whose true (gold) label is A

In [None]:
# TP: positions where gold and predicted label agree on a non-'O' label.
# Labels without a counter (e.g. a mixed 'X,Y' label produced by entity2label) are
# skipped, matching the `in label_set` guard of the PP / RP loops, instead of
# raising KeyError.
for i in range(len(y_test0_label)):
    gold = y_test0_label[i]
    if gold == y_pred0_label[i] and gold != 'O' and gold in TP_dict:
        TP_dict[gold] += 1

In [None]:
# PP: how many predicted chunks carry each label in label_set.
for predicted in y_pred0_label:
    if predicted not in label_set:
        continue
    PP_dict[predicted] += 1

In [None]:
# RP: how many gold groups carry each label in label_set.
for gold in y_test0_label:
    if gold not in label_set:
        continue
    RP_dict[gold] += 1

In [None]:
# Per-label precision, recall and F1 from the TP / PP / RP counters.
# A zero denominator yields 0 (the convention perfm uses) instead of raising
# ZeroDivisionError when a label was never predicted or never occurs in the gold data.
precision_dict=dict()
recall_dict=dict()
f1_dict=dict()
for i in label_set:
    precision_dict[i]=TP_dict[i]/PP_dict[i] if PP_dict[i] else 0
    recall_dict[i]=TP_dict[i]/RP_dict[i] if RP_dict[i] else 0
    pr_sum=precision_dict[i]+recall_dict[i]
    f1_dict[i]=2*(precision_dict[i]*recall_dict[i])/pr_sum if pr_sum else 0

In [None]:
# Display per-label precision.
precision_dict

In [None]:
# Display per-label recall.
recall_dict

In [None]:
# Display per-label F1.
f1_dict

In [None]:
# One single-row frame per metric (columns are the labels), stacked into a
# 3-row table with rows 'precision', 'recall' and 'f1'.
precision_df=pd.DataFrame(precision_dict, index=['precision'])
recall_df=pd.DataFrame(recall_dict, index=['recall'])
f1_df=pd.DataFrame(f1_dict, index=['f1'])
pd.concat([precision_df, recall_df,f1_df])

In [None]:
def overall_perf(test_label,pred_label):
    """Compute precision, recall and F1 over all non-'O' labels together.

    Parameters
    ----------
    test_label : list of str
        Gold label per position.
    pred_label : list of str
        Predicted label per position, aligned with ``test_label``.

    Returns
    -------
    pandas.DataFrame
        One row indexed ``'overall'`` with columns ``precision``, ``recall``
        and ``f1``.

    A true positive is a position where both labels are equal and not ``'O'``.
    The number of predicted / real positives is the number of non-``'O'``
    entries in ``pred_label`` / ``test_label``.

    Unlike the earlier version this one does not remove the ``'O'`` entries
    from the caller's lists, and a zero denominator yields 0.0 instead of
    raising ZeroDivisionError.
    """
    tp = 0
    for idx in range(len(test_label)):
        if test_label[idx] == pred_label[idx] and test_label[idx] != 'O':
            tp += 1

    # Count instead of repeatedly calling list.remove: linear and non-mutating.
    rp = sum(1 for label in test_label if label != 'O')
    pp = sum(1 for label in pred_label if label != 'O')

    precision = tp / pp if pp else 0.0
    recall = tp / rp if rp else 0.0
    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum else 0.0

    overall_df = pd.DataFrame(
        data={'precision': precision, 'recall': recall, 'f1': f1},
        index=['overall'],
    )
    return overall_df

In [None]:
def train_test(traindf,testdf):
    """Train a CRF on one fold and report entity-level scores on its test part.

    Parameters
    ----------
    traindf, testdf
        Inputs accepted by ``df2list``, which must return sentence lists usable
        by ``sent2features`` / ``sent2labels``.
        NOTE(review): ``df2list`` is not defined in the visible part of the
        notebook — confirm it is available before calling.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the fold's label set plus an ``'overall'`` row
        (from ``overall_perf``); columns ``precision``, ``recall``, ``f1``.

    Also prints the weighted token-level F1 over all labels except ``'O'``.

    Changes from the earlier version: a zero denominator yields 0.0 instead of
    raising ZeroDivisionError, gold labels without a counter are skipped when
    counting true positives instead of raising KeyError, and the final row is
    added with ``pd.concat`` because ``DataFrame.append`` no longer exists in
    pandas 2.x.
    """
    def _ratio(numerator, denominator):
        # 0.0 for an empty denominator rather than ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    train_set=df2list(traindf)
    test_set=df2list(testdf)

    X_train=[sent2features(s) for s in train_set]
    y_train=[sent2labels(s) for s in train_set]
    X_test=[sent2features(s) for s in test_set]
    y_test=[sent2labels(s) for s in test_set]

    crf=sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(X_train, y_train)

    # Labels known to the model without 'O', and the second '-' field of each.
    labels=list(crf.classes_)
    labels.remove('O')
    label_set=[x.split('-')[1] for x in labels]

    y_pred=crf.predict(X_test)
    f1_score=metrics.flat_f1_score(y_test, y_pred, average='weighted',labels=labels)
    print('f1:',f1_score)

    # Entity-level labels: gold groups and the predictions sliced to match them.
    y_test_label=entity2label(tag2entity(y_test))
    y_pred_label=entity2label(tag2entity_pred(y_test, y_pred))

    TP_dict=dict.fromkeys(label_set, 0)
    PP_dict=dict.fromkeys(label_set, 0)
    RP_dict=dict.fromkeys(label_set, 0)

    for l in range(len(y_test_label)):
        gold=y_test_label[l]
        if gold==y_pred_label[l] and gold!='O' and gold in TP_dict:
            TP_dict[gold]+=1

    for m in y_pred_label:
        if m in PP_dict:
            PP_dict[m]+=1

    for n in y_test_label:
        if n in RP_dict:
            RP_dict[n]+=1

    precision_dict=dict()
    recall_dict=dict()
    f1_dict=dict()
    for s in label_set:
        precision_dict[s]=_ratio(TP_dict[s], PP_dict[s])
        recall_dict[s]=_ratio(TP_dict[s], RP_dict[s])
        f1_dict[s]=_ratio(2*(precision_dict[s]*recall_dict[s]),
                          precision_dict[s]+recall_dict[s])

    precision_df=pd.DataFrame(precision_dict, index=['precision'])
    recall_df=pd.DataFrame(recall_dict, index=['recall'])
    f1_df=pd.DataFrame(f1_dict, index=['f1'])
    performance=pd.concat([precision_df, recall_df,f1_df])
    # Transpose so that labels become rows and metrics become columns.
    performancet=pd.DataFrame(performance.values.T, index=performance.columns, columns=performance.index)

    overalldf=overall_perf(y_test_label,y_pred_label)
    performancet=pd.concat([performancet, overalldf])
    return performancet

In [None]:
# Run a single fold and display its score table.
# NOTE(review): traindf0 / testdf0 are not defined in the visible cells.
train_test(traindf0,testdf0)

In [None]:
# Train and evaluate each fold and write its score table to its own sheet of
# performance.xlsx. The context manager saves and closes the workbook on exit
# (also when a fold fails); ExcelWriter.save() no longer exists in pandas 2.x.
with pd.ExcelWriter('performance.xlsx') as writer:
    for i,j,k in [(traindf0,testdf0,'fold0'),(traindf1,testdf1,'fold1'),(traindf2,testdf2,'fold2'),(traindf3,testdf3,'fold3'),(traindf4,testdf4,'fold4')]:
        df=train_test(i,j)
        df.to_excel(writer, sheet_name=k)
        print(k)

In [None]:
def train_test_vali(traindf,testdf,validf):
    """Tune a CRF on one fold's training data and score it on the validation set.

    Parameters
    ----------
    traindf, validf
        Inputs accepted by ``df2list``, which must return sentence lists usable
        by ``sent2features`` / ``sent2labels``.
        NOTE(review): ``df2list`` is not defined in the visible part of the
        notebook — confirm it is available before calling.
    testdf
        Not used; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The per-label table from ``perfm`` followed by the ``'overall'`` row
        from ``overall_perf``. ``perfm`` reads the module-level ``label_set``,
        not a list derived from this fold.

    Changes from the earlier version:
    * the search is scored on the BIO labels (e.g. ``'B-X'``); it was previously
      given the bare type names, which never occur in the label sequences;
    * the BIO label list is taken from the training labels directly instead of
      fitting a throw-away CRF just to read ``classes_``;
    * features for the unused test fold are no longer computed;
    * rows are combined with ``pd.concat`` because ``DataFrame.append`` no
      longer exists in pandas 2.x.
    """
    train_set=df2list(traindf)
    vali_set=df2list(validf)

    X_train=[sent2features(s) for s in train_set]
    y_train=[sent2labels(s) for s in train_set]
    X_vali=[sent2features(s) for s in vali_set]
    y_vali=[sent2labels(s) for s in vali_set]

    # Every label that occurs in the training data except the outside tag.
    labels=sorted(set(chain.from_iterable(y_train)) - {'O'})

    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)

    # search (cv is left at the RandomizedSearchCV default)
    rs = RandomizedSearchCV(crf, params_space,
                            verbose=1,
                            n_jobs=-1,
                            n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)
    crf=rs.best_estimator_
    y_pred_vali=crf.predict(X_vali)

    # Entity-level labels: gold groups and the predictions sliced to match them.
    y_vali_label=entity2label(tag2entity(y_vali))
    y_pred_vali_label=entity2label(tag2entity_pred(y_vali, y_pred_vali))

    performance=perfm(y_vali_label,y_pred_vali_label)
    overalldf=overall_perf(y_vali_label,y_pred_vali_label)
    performance=pd.concat([performance, overalldf])

    return performance

In [None]:
# Run the tuned pipeline on one fold and display its validation scores.
# NOTE(review): traindf1 / testdf1 / validf are not defined in the visible cells.
train_test_vali(traindf1,testdf1,validf)

In [None]:
# Score every fold on the validation set and place the per-fold tables side by side.
# The frames are collected first and concatenated once; the rows are then aligned
# to the first fold's index with reindex, because the join_axes argument of
# pd.concat no longer exists in pandas 1.0 and later.
fold_frames=[]
for y,(i,j,k) in enumerate([(traindf0,testdf0,'fold0'),(traindf1,testdf1,'fold1'),(traindf2,testdf2,'fold2'),(traindf3,testdf3,'fold3'),(traindf4,testdf4,'fold4')]):
    print(y)
    fold_frames.append(train_test_vali(i,j,validf))
df=pd.concat(fold_frames, axis=1).reindex(fold_frames[0].index)
df

In [None]:
# Save the combined validation table to performance_vali.csv.
df.to_csv('performance_vali.csv')

In [None]:
%%time
# Same randomised search as earlier in the notebook, fitted on X_train0 / y_train0
# and used to predict on X_vali.
# NOTE(review): X_train0, y_train0 and X_vali are not defined in the visible cells,
# and no random_state is passed, so results differ between runs.
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
rs.fit(X_train0, y_train0)
crf=rs.best_estimator_
y_pred_t=crf.predict(X_vali)

In [None]:
def perfm(y_real,y_pred,label_names=None):
    """Per-label precision, recall and F1 for aligned entity-level label lists.

    Parameters
    ----------
    y_real : list of str
        Gold label per position.
    y_pred : list of str
        Predicted label per position, aligned with ``y_real``.
    label_names : iterable of str, optional
        Labels to report. When omitted, the module-level ``label_set`` is used,
        which is what this function always did before.

    Returns
    -------
    pandas.DataFrame
        One row per label, columns ``precision``, ``recall``, ``f1``. A metric
        whose denominator is zero is reported as 0.

    A position counts as a true positive when both labels are equal, not
    ``'O'`` and among the reported labels; matching labels outside that set
    (e.g. a mixed ``'X,Y'`` label) are ignored instead of raising KeyError.
    """
    if label_names is None:
        label_names = label_set

    TP_dict=dict.fromkeys(label_names, 0)
    PP_dict=dict.fromkeys(label_names, 0)
    RP_dict=dict.fromkeys(label_names, 0)

    for i in range(len(y_real)):
        if y_real[i]==y_pred[i] and y_real[i]!='O' and y_real[i] in TP_dict:
            TP_dict[y_real[i]]+=1
    for i in y_pred:
        if i in PP_dict:
            PP_dict[i]+=1
    for i in y_real:
        if i in RP_dict:
            RP_dict[i]+=1

    precision_dict=dict()
    recall_dict=dict()
    f1_dict=dict()
    for i in label_names:
        if PP_dict[i]==0:
            precision_dict[i]=0
        else:
            precision_dict[i]=TP_dict[i]/PP_dict[i]
        if RP_dict[i]==0:
            recall_dict[i]=0
        else:
            recall_dict[i]=TP_dict[i]/RP_dict[i]
        if (precision_dict[i]+recall_dict[i])==0:
            f1_dict[i]=0
        else:
            f1_dict[i]=2*(precision_dict[i]*recall_dict[i])/(precision_dict[i]+recall_dict[i])

    precision_df=pd.DataFrame(precision_dict, index=['precision'])
    recall_df=pd.DataFrame(recall_dict, index=['recall'])
    f1_df=pd.DataFrame(f1_dict, index=['f1'])
    perf_df=pd.concat([precision_df, recall_df,f1_df])
    # Transpose so that labels become rows and metrics become columns.
    perf_dft=pd.DataFrame(perf_df.values.T, index=perf_df.columns, columns=perf_df.index)
    return perf_dft

In [None]:
# Entity-level gold labels and aligned predicted labels for the validation data.
# NOTE(review): y_vali and y_pred_t come from other cells; y_vali is not defined at
# top level in the visible part of the notebook.
y_vali_list=entity2label(tag2entity(y_vali))
y_pred_t_list=entity2label(tag2entity_pred(y_vali, y_pred_t))

In [None]:
# Display the per-label precision / recall / F1 table for the validation data.
perfm(y_vali_list,y_pred_t_list)