In [4]:
from corpus_reader import read_dataset
#from n_gram_naive_bayes import trained_naive_bayes, extract_features_and_classify

from word_extraction import remove_puntuations
from numpy.random import choice

from sklearn import svm

import sys
import numpy as np

import re

from time import time

In [5]:
data = read_dataset()
n_docs = len(data)

np.random.seed(42)

train_amount = 0.7
train_ids = choice(n_docs, int(train_amount * n_docs), replace=False)
test_ids = [i for i in range(n_docs) if i not in train_ids]

training = [data[i] for i in train_ids]
test = [data[i] for i in test_ids]

print("%i training documents" % len(training))
print("%i test documents" % len(test))


499 training documents
215 test documents


In [10]:
#given a list of document, iterate over the sentences and output a dictionnary counting words occuring
#between drugs and also the number of pairs

class SVM:
    def __init__(self):
        self.nbPairs = 0
        self.count_words = {}
        
        self.nb_feature_words = 20
        self.feature_words = []
        self.feature_index = {}
        
        self.clf = None
        
        
        #given a class, gives the corresponding value
        self.class_index = {
            'no_interaction': 0,
            'advise' : 1,
            'effect':2,
            'int':3,
            'mechanism':4
        }
        
        #given a class value, gives the corresponding label
        self.class_mapping = {v: k for k, v in self.class_index.items()}


    def get_interaction_text_list(self, pair):
        text = re.split("\W+", pair.textBetween)
        return " ".join([w for w in text if w != ""])
    
    #count specific words in doc list
    def count_words_doc(self, doc_list, nb=-1, verbose = False):
        nb = len(doc_list) if nb == -1 else nb
        for doc in doc_list[0:nb]:
            for sentence in doc.sentences:
                if len(sentence.entities) >= 2:
                    self.nbPairs += len(sentence.pairs)
                    all_interaction_text = [self.get_interaction_text_list(p) for p in sentence.pairs]
                    if verbose:
                        print(sentence.text)
                        print(all_interaction_text)
                        print([e.text for e in sentence.entities])
                        print([str(p) for p in sentence.pairs])

                        print("\n-------\n")
                    
                    for interaction_text in all_interaction_text:
                        for w in interaction_text:
                            self.count_words[w] = self.count_words.get(w,0) + 1
    
    def create_feature_word_list(self, nb_feature_words = 20):
        srt = sorted([(self.count_words[w], w) for w in self.count_words], reverse = True)
        
        self.nb_feature_words = nb_feature_words
        self.feature_words = [w for (n,w) in srt[:nb_feature_words]]
        
        self.feature_index = {}

        for (i,feat) in enumerate(self.feature_words):
            self.feature_index[feat] = i
    
    def get_feature_from_text(self, wordList):
        output = np.zeros(self.nb_feature_words)
        for w in wordList:
            if w in self.feature_index:
                output[self.feature_index[w]] = 1
        
        return output
    
    def get_features_from_pair(self, pair):
        text = re.split("\W+", pair.textBetween)
        text = " ".join([w for w in text if w != ""])
        
        return self.get_feature_from_text(text)
    
    def create_feature_matrix_and_labels(self,doc_list, nb = -1, verbose = False):
        currPair = 0
        
        ncol = self.nb_feature_words
        nrow = self.nbPairs

        matrixFeature = np.zeros(shape=(nrow, ncol))
        
        labels = [0]*self.nbPairs
        
        nb = len(doc_list) if nb == -1 else nb
        for doc in doc_list[0:nb]:
            for sentence in doc.sentences:
                if len(sentence.entities) >= 2:
                    
                    for p in sentence.pairs:
                        matrixFeature[currPair] = self.get_features_from_pair(p)
                        labels[currPair] = self.class_index[p.getLabel()]
                        currPair += 1
                    
                    if verbose == True:
                        print(sentence.text)
                        print([e.text for e in sentence.entities])
                        #print([get_necessary_info(p, sentence.text) for p in sentence.pairs])
                    
                        
                        #print(all_interaction_text)
                        print("\n-------\n")
        
        if verbose == "minim":
            print("Done.")
        
        return [matrixFeature, labels]
    

In [11]:
psvm = SVM()

In [13]:
psvm.count_words_doc(training)

print("Done")
print("Total number of pairs: " + str(psvm.nbPairs))

Done
Total number of pairs: 20916


In [14]:
psvm.create_feature_word_list(nb_feature_words = 20)
#print("Showing most "+ str(nb_feature_words) + " frequent words.")

#for e in srt[:nb_feature_words]:
#    print(e)

In [15]:
temp = psvm.create_feature_matrix_and_labels(training)

trainingFeature = temp[0]
labels = np.array(temp[1])

print("Done")

Done


In [16]:
def count_balancing(labels):
    c = {}
    for l in labels:
        c[l] = c.get(l,0) + 1
    
    return c

In [18]:
#SVC (but not NuSVC) implement a keyword class_weight in the fit method.
#It’s a dictionary of the form {class_label : value}, where value is a floating point number > 0
#as it is unbalanced, balance the classes
c = count_balancing(labels)
weight = {i: psvm.nbPairs/float(c[i]) for i in c.keys()}

#define the SVM
psvm.clf = svm.LinearSVC(class_weight=weight)
#psvm.clf = svm.SVC(probability=True, class_weight=weight)


In [19]:
#train it
start = time()
psvm.clf.fit(trainingFeature,labels)
print ("Done in " + str(time() - start) + "s")

Done in 86.8259999752s


# TESTING CLASSIFIERS

In [222]:
true_positives = 0
false_positives = 0
false_negatives = 0

correct = 0
total = 0

In [223]:

for tdoc in test:
    for sentence in tdoc.sentences:
        if len(sentence.entities) >= 2:
            for p in sentence.pairs:
                s = p.textBetween
                s = s.lower().split(' ')
                f = get_features_from_text(s).reshape(1,-1)
                
                pred = classe_mapping[psvm.clf.predict(f)[0]]
                true = p.getLabel()
                
                total += 1
                
                if pred == true:
                    correct += 1



In [224]:

precision = round(100*100*float(correct)/total)/100
print("correct: "+str(correct))
print("total: "+str(total))
print("precision: "+str(precision)+ "%")


correct: 5834
total: 6923
precision: 84.27%
