In [2476]:
import string
import numpy
import sklearn.svm as svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import io
from collections import defaultdict
import random

In [2477]:
tagsetDict = {"N" : 1,
              "Np" : 2,
              "Nc" : 3,
              "Nu" : 4,
              "V" : 5,
              "A" : 6,
              "P" : 7,
              "L" : 8,
              "M" : 9,
              "R" : 10,
              "E" : 11,
              "C" : 12,
              "I" : 13,
              "T" : 14,
              "B" : 15,
              "Y" : 16,
              "S" : 17,
              "X" : 19 }
inverseTagsetDict = {tagsetDict[k]:k for k in tagsetDict}

In [2478]:
wordBank = defaultdict()
bigramBank = defaultdict()
bigramDict = defaultdict(int)

bigramFreq = {x:[0]*18 for x in tagsetDict}
mostCommonBigrams = defaultdict()

prevParts = None

f_tagged_train = io.open("vi_train.txt", encoding='utf-8').readlines()
f_tagged_test = io.open("vi_test.txt", encoding='utf-8').readlines()
# f = io.open("corpus/VNTQcorpus-small.txt", encoding='utf-8').readlines()

# Separate train and test set
# train = f_tagged[:int(len(f)/2)]
# test = f_tagged[int(len(f)/2):]
train = f_tagged_train[:4500]
test = f_tagged_test[:10000]
print(train)
print(test)

['Trên/E đường/N đi/V ,/, có/V một/M lần/N xe/N cô/N suýt/R rơi/V xuống/R vực/N ở/E đèo/N Ngoạn_Mục/Np ./.\n', '\n', 'Trong/E một/M trận/N đánh/V ác_liệt/A bên/N thành/N cổ/N Quảng_Trị/Np ,/, một/M loạt/N đạn/N pháo/N của/E kẻ_thù/N đã/R rơi/V trúng/A chỗ/N chiến_sĩ/N thông_tin/N Nguyễn_Văn_Thạc/Np .../... Hôm/N đó/P ngày/N 30/M -/- 7/M -/- 1972/M ./.\n', '\n', 'Sau/E khi/N H./Ny “/“ AK/Ny ”/” bị/V xộ/V khám/V ,/, Hoàng/Np đã/R hoàn_lương/V ,/, bây_giờ/P đang/R phụ/V việc/N cho/E gia_đình/N Quân/Np ./.\n', '\n', 'Hà/Np ,/, 21/M tuổi/N -/- làm/V nghề/N hớt_tóc/V ,/, khai/V :/: “/“ Ban_đầu/N nghe/V mấy/L ảnh/N nói/V thuốc/N lắc/V không/R nghiện/V nên/C em/N uống/V thử/V ./.\n', '\n', 'Theo/E anh/Nc Thông/Np ,/, người/N già/A ở/E Đức/Np thường/R sống/V một_mình/Ny và/C sống/V với/E những/L kỷ_vật/N của/E riêng/A mình/P mang/V về/R từ/E những/L chuyến/N du_lịch/V hay/C những/L tặng_vật/N của/E cả/P một/M đời/N ./.\n', '\n', 'Người/N đến/V muộn/A một_chút/L ,/, phà/N vừa/R rời/V bến/N thì/C

In [2479]:
# Get training data
for line in train:
    l_split = line.split()
    for i,w in enumerate(l_split):
        parts = w.split("/")
        if i >= 1:
            prevParts = l_split[i-1].split("/")
        if len(parts) == 1 or \
            parts[1] not in tagsetDict:
            continue

        # if prev parts is a punctuation get the one before that
        if i >= 2 and prevParts[1] in string.punctuation:
            prevParts = l_split[i-1].split("/")

        word = parts[0]
        pos = parts[1]

        if i >= 1:
            prevWord = prevParts[0]
            prevPos = prevParts[1]
            bigramBank[ word ] = (pos, prevWord, prevPos)
            bigramDict[ (pos, prevPos) ] += 1
            #calculate pos after pos
            #bigramFreq[prevPos][tagsetDict[pos]] += 1

        # Makes word map to list of pos tags
        if word not in wordBank:
            wordBank[word] = [pos]
        else:
            wordBank[word] += [pos]

In [2480]:
for k in bigramFreq:
    maxFreq = 0
    maxPos = "N"
    for i,x in enumerate(bigramFreq[k]):
        if x > maxFreq:
            maxFreq = x
            maxPos = inverseTagsetDict[i]
    mostCommonBigrams[k] = maxPos


print ("finished getting training data")

finished getting training data


In [2481]:
#####################################################################
# Features
def feature(word,wordIdx, lineSize, line):
    feat = [1]

    # print("word is : " + str(word))
    # print("wordIdx is : " + str(wordIdx))
    # print("linesize is : "  + str(lineSize))

    # how far word is into sentence
    sentPercent = float(wordIdx)/float(lineSize)
    feat.append(sentPercent)

    # 0.8516463095298431

    if word[0].isupper() and wordIdx != 0:
        feat.append(1)
    else:
        feat.append(0)

    # label pos tags for each word
    posIdx_array = ([0] * len(tagsetDict))
    posSet = []
    if word in wordBank:
        posSet = wordBank[word]
    else:
        # see word we don't know
        if wordIdx == 0:
            posSet = list(tagsetDict.keys())[0] # Naive: always choose Np for word we havent seen
            posIdx_array[tagsetDict[posSet]] = 1
            return feat + posIdx_array + [0]
        else:
            prevWord = line[wordIdx-1]  #TODO this is questionable
            if prevWord in wordBank:
                prevPos = wordBank[prevWord]
                maxPos = mostCommonBigrams[prevPos]
                if prevPos == "E":
                    maxPos = 3
                posIdx_array[tagsetDict[maxPos]] = 1
                feat += posIdx_array + [tagsetDict[maxPos]]
                return feat
            else:
                posSet = list(tagsetDict.keys())[0]  # Naive: always choose Np for word we havent seen
                posIdx_array[tagsetDict[posSet]] = 1
                feat += posIdx_array + [0]
                return feat

    for pos in posSet:
        posIdx = tagsetDict[pos]
        posIdx_array[posIdx] += 1.0/len(wordBank[word])
    feat += (posIdx_array) + [0]

    # Don't put features here!

    return feat
#####################################################################

In [2482]:
# Create y (list of pos tags) and x (feature) data
y = []
X_train = []
for line in train:
    l_split = line.split()
    for i,w in enumerate(l_split):
        parts = w.split("/")
        word = parts[0]
        len_line = len(l_split)

        if len(parts) == 1 or parts[1] not in tagsetDict:
            continue
        y.append(wordBank[word][0]) # [0] for first word tagged pos
        X_train.append(feature(word,i,len_line, l_split))

print ("finished creating y and xtrain")

finished creating y and xtrain


In [2483]:
print ( len(X_train) )
print ( len(y) )

print(X_train[0:10])
print(y[0:10])
train_fit = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train,y)

print ("finished fitting")

43222
43222
[[1, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0], [1, 0.058823529411764705, 0, 0, 0.9999999999999971, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0.11764705882352941, 0, 0, 0, 0, 0, 0, 0.8992248062015519, 0, 0, 0, 0, 0.09689922480620161, 0, 0, 0, 0.003875968992248062, 0, 0, 0, 0], [1, 0.23529411764705882, 0, 0, 0, 0, 0, 0, 0.9954022988505814, 0, 0, 0, 0, 0.0022988505747126436, 0, 0, 0, 0.0022988505747126436, 0, 0, 0, 0], [1, 0.29411764705882354, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.9999999999999907, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0.35294117647058826, 0, 0, 0.9870129870129872, 0, 0, 0, 0.012987012987012988, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0.4117647058823529, 0, 0, 1.0000000000000004, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0.47058823529411764, 0, 0, 0.7058823529411765, 0, 0.29411764705882365, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0.5294117647058824, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 

In [2484]:
X_test = []
correct_results = []
# Get testing data
for line in test:
    l_split = line.split()
    for i,w in enumerate(l_split):
        len_line = len(l_split)
        parts = w.split("/")
        if len(parts) == 1 or parts[1] not in tagsetDict:
            continue
        word = parts[0]
        pos = parts[1]
        if word in string.punctuation or word == ":.":
            continue
        X_test.append(feature(word,i,len_line,l_split))
        correct_results.append(pos)

print("finished getting testing data")

finished getting testing data


In [2485]:
predicted_results = train_fit.predict(X_test)
print(predicted_results[:10])
print(correct_results[:10])

numCorrect = 0
for cor,pred in zip(correct_results, predicted_results):
    if cor == pred:
        numCorrect += 1

accuracy = 1.0*numCorrect/len(correct_results)
print(accuracy)

['E' 'N' 'V' 'A' 'N' 'N' 'Nc' 'N' 'Nc' 'C']
['E', 'N', 'V', 'A', 'N', 'N', 'N', 'N', 'N', 'C']
0.8466577950660252
