In [1]:
import numpy as np
import spacy

In [2]:
import pickle

In [3]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from string import punctuation

In [4]:
# Load the spaCy pipeline named "en_core_web_sm". `nlp` is not referenced
# again in the cells visible in this notebook export.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Read the training and development corpora fully into memory.
# The context managers close each handle as soon as its text has been read,
# instead of leaving both files open for the lifetime of the kernel.
with open('./final_train.txt', 'r') as train_file:
    traininput = train_file.read()

with open('./final_dev.txt', 'r') as dev_file:
    devinput = dev_file.read()

In [6]:
def parse(input_):
    """Split a tab-separated, blank-line-delimited corpus into words and tags.

    Sentences are separated by an empty line and hold one token per line.
    Each token line is split on tabs; column index 1 is taken as the word and
    column index 7 as its tag.

    Blank (empty or whitespace-only) lines are ignored, so a trailing newline
    at the end of the file, extra empty lines between sentences, or an empty
    input no longer raise ``IndexError``.

    Args:
        input_: Full text of the corpus.

    Returns:
        A ``(lexicons, tags)`` tuple. ``lexicons`` lists the words of every
        sentence in order, with a ``"</s>"`` marker between consecutive
        sentences (none after the last one). ``tags`` holds one tag per word
        and contains no markers.

    Raises:
        IndexError: If a non-blank line has fewer than eight tab-separated
            columns.
    """
    tags = []
    lexicons = []
    for sentence in input_.split("\n\n"):
        # Drop blank lines up front: they carry no token and would otherwise
        # fail on the column lookup below.
        rows = [row for row in sentence.split("\n") if row.strip()]
        if not rows:
            continue
        # Emit the separator *before* every sentence except the first, which
        # yields the same "between sentences only" layout without a final pop.
        if lexicons:
            lexicons.append("</s>")
        for row in rows:
            columns = row.split("\t")
            tags.append(columns[7])
            lexicons.append(columns[1])
    return lexicons, tags

In [15]:
def generateSamples(train_words, output_path="training_sample_window_5.txt"):
    """Build a 5-token context window around every word of every sentence.

    ``train_words`` is a flat word list in which the entries ``"</s>"``,
    ``""`` and ``" "`` act as sentence boundaries. For each remaining word a
    window ``[w-2, w-1, w, w+1, w+2]`` is produced; positions that fall
    outside the sentence are filled with the literal tokens ``"start"`` (left)
    and ``"end"`` (right).

    Words are kept whole: they are grouped directly rather than being joined
    on spaces and split again, so a word containing a space stays one token.

    Args:
        train_words: Flat iterable of words with sentence-boundary markers.
        output_path: Text file that receives one window per line, tokens
            separated by single spaces. Pass ``None`` to skip writing.
            Defaults to ``"training_sample_window_5.txt"``.

    Returns:
        A NumPy string array of shape ``(n_words, 5)``; shape ``(0, 5)`` when
        there are no words.
    """
    boundary_tokens = ("</s>", "", " ")
    left_pad = ["start", "start"]
    right_pad = ["end", "end"]

    # Regroup the flat word stream into one list per sentence.
    sentences = [[]]
    for word in train_words:
        if word in boundary_tokens:
            sentences.append([])
        else:
            sentences[-1].append(word)

    # Padding each sentence once makes every window a plain slice, which
    # covers the short-sentence cases without exception-driven branching.
    windows = []
    for sentence in sentences:
        padded = left_pad + sentence + right_pad
        for position in range(len(sentence)):
            windows.append(padded[position:position + 5])

    # Build the array in one step; stacking row by row is quadratic in the
    # number of words.
    if windows:
        training_samples = np.asarray(windows, dtype=str)
    else:
        training_samples = np.empty((0, 5), dtype=str)

    if output_path is not None:
        # A text-mode file only accepts str, so each window is serialised
        # explicitly instead of handing the array to write().
        with open(output_path, "w") as file:
            for window in windows:
                file.write(" ".join(window) + "\n")

    return training_samples

In [13]:
def vectorise_using_tfidf(training_samples, test_samples, training_labels, dev_labels):
    """Vectorise window samples with TF-IDF, classify them, and print metrics.

    Each sample (a sequence of tokens) is joined with single spaces into one
    string. A ``TfidfVectorizer`` over unigrams and bigrams is fitted on the
    training strings and applied to the test strings. The resulting matrices
    are passed to ``svm`` and the predicted labels are scored against
    ``dev_labels`` with ``calculate_cm_recall_precision``.

    Args:
        training_samples: Iterable of token sequences used to fit the vectorizer.
        test_samples: Iterable of token sequences to predict labels for.
        training_labels: Labels aligned with ``training_samples``.
        dev_labels: True labels aligned with ``test_samples``.

    Returns:
        None. The confusion matrix, per-label recall/precision/F1 and their
        means are printed.
    """
    import re  # local import: only needed to build the token pattern

    # A token is either a run of word characters or a single punctuation
    # character, each with optional surrounding whitespace. Raw strings avoid
    # the invalid "\w" escape in a normal literal, and re.escape keeps every
    # punctuation character literal inside the class. Unescaped, the "\]" in
    # `punctuation` was read as an escaped bracket, so a backslash was never
    # matched as a token.
    token_pattern = r'(?u)\s?\w+\s?|\s?[' + re.escape(punctuation) + r']\s?'
    vectorizer = TfidfVectorizer(token_pattern=token_pattern, ngram_range=(1, 2))

    print("Vectorising train samples")
    train_documents = [' '.join(sample) for sample in training_samples]
    train_matrix = vectorizer.fit_transform(train_documents)

    print("Vectorising test samples")
    test_documents = [' '.join(sample) for sample in test_samples]
    # transform (not fit_transform): reuse the vocabulary learned on train.
    test_matrix = vectorizer.transform(test_documents)

    test_labels = svm(train_matrix, training_labels, test_matrix)

    cm, recall, precision, f1 = calculate_cm_recall_precision(test_labels, dev_labels)
    print("cm", cm, "\nrecall", recall, "\nprecision", precision, "\nf1:", f1)
    print("meanf1:", np.mean(f1))
    print("meanr:", np.mean(recall))
    print("meanp:", np.mean(precision))

In [8]:
def svm(training_data, training_labels, test_samples):
    '''
    Trains a linear-kernel SVC, saves it, and predicts labels for test samples.

    The fitted classifier is pickled to "./annCorra_model" before predicting.

    Args:
        training_data: Feature matrix used to fit the classifier.
        training_labels: Labels aligned with the rows of ``training_data``.
        test_samples: Feature matrix to predict labels for.

    Returns:
        The labels predicted by the fitted classifier for ``test_samples``.
    '''
    clf = SVC(kernel="linear")
    print("TRAINING...")
    clf.fit(training_data, training_labels)
    # Persist the classifier that was just fitted. The previous code dumped
    # an undefined name `model`, and never closed the output file.
    with open("./annCorra_model", 'wb') as model_file:
        pickle.dump(clf, model_file)
    return clf.predict(test_samples)

In [9]:
def calculate_cm_recall_precision(pred_labels, true_labels):
    """Compute the confusion matrix and per-label recall, precision and F1.

    The label set is the sorted union of the values seen in either input, so
    every returned per-label array and the confusion matrix share one order.

    Args:
        pred_labels: Predicted labels (any iterable, e.g. a list or ndarray).
        true_labels: True labels aligned with ``pred_labels``.

    Returns:
        A ``(confusion_matrix, recall, precision, f1)`` tuple; the three
        scores are per-label arrays (``average=None``).
    """
    # Convert both inputs to lists: `list + ndarray` is not concatenation,
    # and callers pass the true labels as a NumPy array.
    y_pred = list(pred_labels)
    y_true = list(true_labels)
    labels = list(np.unique(np.array(y_pred + y_true)))

    # `labels` must be passed by keyword; scikit-learn's metric functions
    # accept only y_true and y_pred positionally.
    recall = recall_score(y_true, y_pred, labels=labels, average=None)
    precision = precision_score(y_true, y_pred, labels=labels, average=None)
    f1 = f1_score(y_true, y_pred, labels=labels, average=None)
    return confusion_matrix(y_true, y_pred, labels=labels), recall, precision, f1

In [10]:
# Split the raw training text into a flat word list (with "</s>" between
# sentences) and the parallel list of tags.
train_words, train_tags = parse(traininput)

In [14]:
# Build one 5-token context window per training word, and turn the tag list
# into an array to use as labels.
training_samples = generateSamples(train_words)
training_labels = np.asarray(train_tags)

KeyboardInterrupt: 

In [None]:
# Show the label and sample array shapes side by side.
# NOTE(review): the first dimensions are presumably expected to match — confirm.
print(training_labels.shape, training_samples.shape)

In [None]:
# Split the raw development text into words and tags, same as for training.
dev_words, dev_tags = parse(devinput)

In [None]:
# Build the context windows and label array for the development set.
# NOTE(review): called like this, generateSamples writes to its default file
# "training_sample_window_5.txt", so any training dump there is overwritten.
dev_samples = generateSamples(dev_words)
dev_labels = np.asarray(dev_tags)

In [None]:
# Show the development label and sample array shapes side by side.
print(dev_labels.shape, dev_samples.shape)

In [None]:
# Dump the development windows, one per line with tokens separated by spaces.
# Each sample is a row of tokens, not a str, so it is joined before writing
# (a text-mode write() rejects arrays). The context manager closes the file
# even if a write fails.
with open("dev_sample_window_5.txt", "w") as file:
    for sample in dev_samples:
        file.write(" ".join(sample))
        file.write("\n")

In [None]:
# print("hello")