In [44]:
# Install the third-party packages imported below (pymystem3, tqdm); versions are not pinned.
!pip install pymystem3 tqdm

[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [23]:
import os
import tensorflow as tf
from collections import namedtuple
import pandas as pd
from pymystem3 import Mystem
import re
from tqdm import tqdm, tqdm_notebook

In [24]:
# Path of the source CSV that is loaded into `df`, and the directory that
# receives the generated TFRecord and size files.
input_csv_path = 'data/restroom-dataset.v2.csv'
output_dir = 'wrk'

In [47]:
def replace_extension(src_path, new_suffix):
    """Return `src_path` with its last extension swapped for `new_suffix`.

    Only the final path component is inspected for an extension; the
    directory part is kept. `new_suffix` is appended verbatim, so it must
    carry its own separator (for example '.txt' or '-size.txt').
    """
    parent, filename = os.path.split(src_path)
    stem = os.path.splitext(filename)[0]
    return os.path.join(parent, stem + new_suffix)

def make_out_path(out_base, src_path, infix, suffix='tfrecords'):
    """Build an output path of the form `<out_base>/<stem>.<infix>.<suffix>`.

    `<stem>` is the file name of `src_path` with its directory and last
    extension removed.
    """
    stem = os.path.splitext(os.path.basename(src_path))[0]
    out_name = '.'.join((stem, infix, suffix))
    return os.path.join(out_base, out_name)

In [48]:
# Derive the TFRecord output path for each split, plus a sibling "-size.txt"
# path next to it (written later with the number of examples in that split).
train_out_path = make_out_path(output_dir, input_csv_path, 'train')
train_size_path = replace_extension(train_out_path, '-size.txt')
valid_out_path = make_out_path(output_dir, input_csv_path, 'valid')
valid_size_path = replace_extension(valid_out_path, '-size.txt')
print(train_out_path, train_size_path, valid_out_path, valid_size_path, sep='\n')

wrk/restroom-dataset.v2.train.tfrecords
wrk/restroom-dataset.v2.train-size.txt
wrk/restroom-dataset.v2.valid.tfrecords
wrk/restroom-dataset.v2.valid-size.txt


In [27]:
# Load the raw dataset from the configured CSV file.
df = pd.read_csv(input_csv_path)

In [29]:
# Class balance: number of rows per value of the 'User Intent' column.
df['User Intent'].value_counts()

Book           48
OMG            42
Fail           14
CheckStatus    10
GoToHell        5
Name: User Intent, dtype: int64

In [32]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 3 columns):
User Intent     119 non-null object
Message Text    119 non-null object
Unnamed: 2      2 non-null object
dtypes: object(3)
memory usage: 2.9+ KB


In [31]:
# Load the POS-tag mapping used by tokenize(): each useful line of the map file
# holds a source tag followed by its replacement tag, separated by whitespace.
rnc2univ_mapping = {}
with open('data/ru-rnc.map') as inp:
    map_txt = inp.read()
for line in map_txt.split('\n'):
    # Argument-less str.split() splits on any run of whitespace and ignores
    # leading/trailing whitespace, so indented lines cannot produce an empty
    # key and a trailing space cannot produce an empty value. It also avoids
    # the invalid '\s' escape of a non-raw regex string.
    fields = line.split()
    # Blank lines and lines with a single column carry no mapping; skip them.
    if len(fields) > 1:
        rnc2univ_mapping[fields[0]] = fields[1]
print(rnc2univ_mapping)

{'A': 'ADJ', 'ADV': 'ADV', 'ADVPRO': 'ADV', 'ANUM': 'ADJ', 'APRO': 'DET', 'COM': 'ADJ', 'CONJ': 'SCONJ', 'INTJ': 'INTJ', 'NONLEX': 'X', 'NUM': 'NUM', 'PART': 'PART', 'PR': 'ADP', 'S': 'NOUN', 'SPRO': 'PRON', 'UNKN': 'X', 'V': 'VERB'}


In [9]:
# Create a single Mystem analyzer; tokenize() reuses it for every message.
mystem = Mystem()

In [10]:
def tokenize(txt):
    """Analyze `txt` with the module-level `mystem` and return 'lemma_TAG' tokens.

    For every item returned by `mystem.analyze` that carries at least one
    analysis, the first analysis is used: its "lex" value (lower-cased and
    stripped) becomes the lemma, and the leading part of its "gr" value
    (up to the first ',' or '=') is the source tag. The tag is translated
    through `rnc2univ_mapping`; tags missing from the mapping become 'X'.

    Items without a usable analysis are skipped silently.

    Returns:
        list of str, one 'lemma_TAG' entry per kept item, in input order.
    """
    tokens = []
    for item in mystem.analyze(txt):
        try:
            first = item["analysis"][0]
            lemma = first["lex"].lower().strip()
            src_tag = first["gr"].split(',')[0].split('=')[0].strip()
        except (KeyError, IndexError):
            # No analysis for this item (punctuation marks, per the original note).
            continue
        # Convert the tag here; 'X' covers tags that are absent from the mapping.
        tokens.append(lemma + '_' + rnc2univ_mapping.get(src_tag, 'X'))
    return tokens

In [12]:
# Register tqdm with pandas so that `progress_apply` is available on Series.
tqdm.pandas()

In [34]:
# Tokenize every message; each cell of the new 'tokens' column holds the list
# returned by tokenize() for that row.
df['tokens'] = df['Message Text'].progress_apply(tokenize)

100%|██████████| 119/119 [00:00<00:00, 4528.83it/s]


In [36]:
# Keep exactly these three columns, in this order: the TFRecord-writing loop
# unpacks each row as (intent, text, tokens). This also drops the mostly-empty
# 'Unnamed: 2' column reported by df.info() above.
df = df[['User Intent', 'Message Text', 'tokens']]

In [37]:
# Confirm which columns remain after the selection.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 3 columns):
User Intent     119 non-null object
Message Text    119 non-null object
tokens          119 non-null object
dtypes: object(3)
memory usage: 2.9+ KB


In [38]:
# Shuffle all rows (frac=1) with a fixed seed so the row order, and therefore
# the position-based train/valid split below, is reproducible.
df = df.sample(frac=1, random_state=41352)

In [39]:
# Eyeball ten random rows (no random_state is passed, so the rows shown vary between runs).
df.sample(10)

Unnamed: 0,User Intent,Message Text,tokens
93,OMG,сейрьезно!? пошел в ботинок нассу,"[сейрьезно_ADV, пойти_VERB, в_ADP, ботинок_NOU..."
102,OMG,кого опять прорвало,"[кто_PRON, опять_ADV, прорвать_VERB]"
97,Fail,Нихуя у него двойной!,"[ниховать_VERB, у_ADP, он_PRON, двойной_ADJ]"
77,CheckStatus,как вообще,"[как_SCONJ, вообще_ADV]"
72,OMG,ну опять,"[ну_PART, опять_ADV]"
7,GoToHell,Пошел нахуй,"[пойти_VERB, нахуй_ADV]"
113,Fail,в шкафчик насрал,"[в_ADP, шкафчик_NOUN, насрать_VERB]"
23,Book,я хочу,"[я_PRON, хотеть_VERB]"
5,CheckStatus,Когда уже?,"[когда_SCONJ, уже_ADV]"
41,Book,хочу звонить,"[хотеть_VERB, звонить_VERB]"


In [44]:
def _convert_row_to_example(intent, text, tokens):
    """Pack one dataset row into a `tf.train.SequenceExample`.

    Context features:
        'text'   - `text` encoded to bytes
        'intent' - `intent` encoded to bytes
    Feature lists:
        'token'  - one bytes feature per element of `tokens`, in order

    Returns:
        The populated `tf.train.SequenceExample`.
    """
    example = tf.train.SequenceExample()

    # Per-example (context) features.
    context = example.context.feature
    context['text'].bytes_list.value.append(text.encode())
    context['intent'].bytes_list.value.append(intent.encode())

    # Per-token (sequence) features.
    encoded_tokens = [
        tf.train.Feature(bytes_list=tf.train.BytesList(value=[tok.encode()]))
        for tok in tokens
    ]
    example.feature_lists.feature_list['token'].feature.extend(encoded_tokens)
    return example

In [50]:
# Serialize every row of `df` into one of two TFRecord files and count how many
# examples land in each. Rows are numbered from 1; every 4th row goes to the
# validation file, all others to the training file.
with tf.python_io.TFRecordWriter(train_out_path) as train_writer, \
    tf.python_io.TFRecordWriter(valid_out_path) as valid_writer:
    train_size = 0
    valid_size = 0
    for row_no, (intent, text, tokens) in enumerate(df.itertuples(index=False), start=1):
        if row_no % 4 == 0:
            valid_size += 1
            target = valid_writer
        else:
            train_size += 1
            target = train_writer
        target.write(_convert_row_to_example(intent, text, tokens).SerializeToString())

# Store each split's example count in its "-size.txt" file.
for size_path, size in ((train_size_path, train_size), (valid_size_path, valid_size)):
    with open(size_path, mode='w') as out:
        out.write(str(size))

In [51]:
!ls -alh wrk

total 56K
drwxr-xr-x 3 ec2-user ec2-user 4.0K Jan  1 22:01 .
drwxr-xr-x 9 ec2-user ec2-user 4.0K Jan  1 22:01 ..
drwxr-xr-x 2 ec2-user ec2-user 4.0K Dec 20 19:50 baseline-v0
-rw-rw-r-- 1 ec2-user ec2-user 6.9K Dec 20 23:07 restroom-dataset.v1.train.tfrecords
-rw-rw-r-- 1 ec2-user ec2-user 2.3K Dec 20 23:07 restroom-dataset.v1.valid.tfrecords
-rw-rw-r-- 1 ec2-user ec2-user    2 Jan  1 22:01 restroom-dataset.v2.train-size.txt
-rw-rw-r-- 1 ec2-user ec2-user  14K Jan  1 22:01 restroom-dataset.v2.train.tfrecords
-rw-rw-r-- 1 ec2-user ec2-user    2 Jan  1 22:01 restroom-dataset.v2.valid-size.txt
-rw-rw-r-- 1 ec2-user ec2-user 4.3K Jan  1 22:01 restroom-dataset.v2.valid.tfrecords


In [54]:
!cat wrk/restroom-dataset.v2.train-size.txt

90

In [55]:
!cat wrk/restroom-dataset.v2.valid-size.txt

29