In [1]:
import csv


# Read the raw training corpus into memory, one list entry per line
# (line terminators are kept).
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm the file's encoding and pass it explicitly if needed.
content = []
with open('data-train.csv') as train_file:
    content = list(train_file)

# Show the first raw line to illustrate the "id,(word|TAG) (word|TAG) ..." layout.
print(content[0])

1,(Алим`ентов|S) (он`а|S-PRO) (никогд`а|ADV-PRO) (от|PR) (нег`о|S-PRO) (не|PART) (получ`ала|V) (и|CONJ) (т`олько|PART) (сл`ышала|V) (от|PR) (`общих|A) (знак`омых|S) (,|PUNCT) (что|CONJ) (он|S-PRO) (жив|A) (.|PUNCT)



In [2]:
# Number of tag classes; equals the number of entries in tag_map below.
class_cnt = 17
# word -> number of occurrences; updated by parse() and by the test-file cell.
words_freq = {}

# Tag label -> 1-based integer class id, numbered in the order listed here.
tag_map = {
    label: class_id
    for class_id, label in enumerate(
        [
            'S', 'A', 'NUM', 'A-NUM',
            'V', 'ADV', 'PRAEDIC', 'PARENTH',
            'S-PRO', 'A-PRO', 'ADV-PRO', 'PRAEDIC-PRO',
            'PR', 'CONJ', 'PART', 'INTJ',
            'PUNCT',
        ],
        start=1,
    )
}

def parse_pair(pair):
    """Split one ``(word|TAG)`` token into its word and its tag.

    The first and last characters (the surrounding parentheses) are dropped
    and the remainder is split on the *last* ``|``, so a word that itself
    contains ``|`` is kept intact.

    Args:
        pair: A token such as ``"(word|S)"``.

    Returns:
        A ``(word, tag)`` tuple of strings.

    Raises:
        ValueError: If there is no ``|`` between the outer characters.
            The previous character-by-character scan walked into negative
            indices in that case and ended in an opaque ``IndexError``.
    """
    inner = pair[1:-1]
    # rpartition splits on the right-most separator, matching the original
    # right-to-left scan for '|'.
    word, separator, tag = inner.rpartition('|')
    if not separator:
        raise ValueError("no '|' separator in pair: %r" % (pair,))
    return word, tag

def parse(line):
    """Parse one corpus line of the form ``id,(word|TAG) (word|TAG) ...``.

    Everything before the first comma is converted to the integer sentence
    id. The remainder is split on whitespace; each token that starts with
    ``(`` and ends with ``)`` is decoded with ``parse_pair`` and any other
    token is ignored. Every decoded word is also counted in the module-level
    ``words_freq`` dictionary (a side effect of calling this function).

    Args:
        line: One raw line of the training file.

    Returns:
        ``(l_id, words, tags)``: the id as ``int`` plus two parallel lists
        holding the words and their tags.

    Raises:
        ValueError: If the text before the first comma is not an integer.
    """
    id_text, _, remainder = line.partition(',')
    l_id = int(id_text)

    words, tags = [], []
    for token in remainder.split():
        # Only fully parenthesised tokens are treated as (word|TAG) pairs.
        if not (token.startswith('(') and token.endswith(')')):
            continue
        word, tag = parse_pair(token)
        words_freq[word] = words_freq.get(word, 0) + 1
        words.append(word)
        tags.append(tag)
    return l_id, words, tags
    

In [3]:
# Parallel per-sentence containers: ids[k], words[k] and tags[k] all
# describe the same training sentence.
ids, words, tags = [], [], []

for line in content:
    l_id, ws, ts = parse(line)
    if not ws:
        # Lines that yielded no (word|TAG) pairs are left out entirely.
        continue
    ids.append(l_id)
    words.append(ws)
    tags.append(ts)

In [4]:
# One entry per kept sentence: a list of (word, tag) tuples, which is the
# shape passed to the taggers' training calls in later cells.
train_data = [
    list(zip(words[k], tags[k]))
    for k in range(len(ids))
]

In [5]:
# Sanity check: show the first training sentence as a list of (word, tag) tuples.
print(train_data[0])

[('Алим`ентов', 'S'), ('он`а', 'S-PRO'), ('никогд`а', 'ADV-PRO'), ('от', 'PR'), ('нег`о', 'S-PRO'), ('не', 'PART'), ('получ`ала', 'V'), ('и', 'CONJ'), ('т`олько', 'PART'), ('сл`ышала', 'V'), ('от', 'PR'), ('`общих', 'A'), ('знак`омых', 'S'), (',', 'PUNCT'), ('что', 'CONJ'), ('он', 'S-PRO'), ('жив', 'A'), ('.', 'PUNCT')]


In [6]:
import nltk
from nltk.tag import hmm


# Fit a supervised HMM tagger on the first 5000 training sentences only.
hmm_train_subset = train_data[:5000]
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(hmm_train_subset)

In [7]:
# Smoke test of the HMM tagger on the first three words of the first training line.
print(tagger.tag("Алим`ентов он`а никогд`а".split()))

[('Алим`ентов', 'S'), ('он`а', 'S-PRO'), ('никогд`а', 'ADV-PRO')]


In [8]:
# Read the raw test file, one list entry per line (line terminators kept).
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm the file's encoding and pass it explicitly if needed.
test_content = []
with open('data-test.csv') as test_file:
    test_content = test_file.readlines()

# Add every test-set token to the shared word-frequency table. The text
# after the first comma is the sentence; a line without a comma adds nothing.
for line in test_content:
    sentence = line.partition(',')[2]
    for w in sentence.split():
        words_freq[w] = words_freq.get(w, 0) + 1

# Show the first raw test line ("id,word word ...").
print(test_content[0])

1,Благотвор`ительный баз`ар состо`ится в четвёртом отдел`ении п`осле `ужина .



In [9]:
import pycrfsuite
from nltk.tag import CRFTagger


# Train a CRF tagger on the full training set. The second argument of
# CRFTagger.train is the model file path.
crf_model_path = 'model.crf.tagger'
ct = CRFTagger()
ct.train(train_data, crf_model_path)

In [10]:
# Smoke test of the CRF tagger on the first three words of the first training line.
print(ct.tag_sents(["Алим`ентов он`а никогд`а".split()]))

[[('Алим`ентов', 'S'), ('он`а', 'S-PRO'), ('никогд`а', 'ADV-PRO')]]


In [11]:
# Write the predictions: a header row, then one "id,TAG TAG ..." row per test line.
with open('output.csv', 'w') as out:
    out.write('id,tags\n')
    for line in test_content:
        # Text before the first comma is the sentence id; the rest is the sentence.
        l_id, _sep, sentence = line.partition(',')
        out.write(l_id + ',')

        # tag_sents takes a list of tokenised sentences; a single one is passed here.
        pairs = ct.tag_sents([sentence.split()])[0]
        out.write(' '.join(tag for _word, tag in pairs))
        out.write('\n')