In [None]:
import re
import csv
import string
import random
import warnings
from collections import Counter
from functools import lru_cache, partial
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine

import gensim
import fastText
import pymorphy2


from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm_notebook

from keras import backend as K 
from keras.models import load_model
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Concatenate, Conv1D, MaxPooling1D, BatchNormalization
from keras.models import Model
from keras.layers import Conv2D, MaxPool1D, Permute, Add

# Seed Python's and NumPy's global random generators so repeated runs draw the same numbers.
random.seed(42)
np.random.seed(42)

# Pre-trained stuff

There are two difficult-to-replicate parts of the solution: 1) the fastText embeddings trained on OpenSubtitles, and 2) the neural net. Set these flags to `True` and put the OpenSubtitles data in the `data/ext` folder to redo these calculations.

In [None]:
# True: retrain the fastText vectors from the OpenSubtitles dump; False: load the saved model.
REDO_VECTORS = False
# True: retrain the neural net; False: load the saved network from disk.
REDO_NN = False

# OpenSubtitles Vectors

Here I trained fastText embeddings on the OpenSubtitles 2018 dataset. To replicate the results you'll need to redo the training. The dataset was lightly preprocessed with the `process_OS_data` function.

In [None]:
 def process_OS_data(source, total):
        """Write a lowercased, punctuation-free copy of a text file.

        The copy is created next to the input as ``source + '.processed'``.

        Args:
            source: path of the UTF-8 text file to preprocess.
            total: expected number of lines; only used to size the progress bar.

        Returns:
            Path of the processed file, so callers do not have to rebuild it.
        """
        # Translation table that deletes every character in string.punctuation.
        table = str.maketrans({key: None for key in string.punctuation})
        processed = source + '.processed'

        with open(source, encoding='utf-8') as source_f, \
                open(processed, 'w', encoding='utf-8') as processed_f:
            for line in tqdm_notebook(source_f, total=total):
                processed_f.write(line.lower().translate(table))

        return processed

In [None]:
if REDO_VECTORS:
    os_source = 'data/ext/OpenSubtitles2018.en-ru.ru'
    # process_OS_data writes the cleaned text to `<source>.processed`.
    process_OS_data(os_source, 25910105)
    # Train on the cleaned copy; previously the raw dump was passed here and the
    # preprocessing output was never used.
    fasttext = fastText.train_unsupervised(
        os_source + '.processed',
        model='skipgram',
        thread=4,
        dim=200
    )
    fasttext.save_model('data/ext/os_model_2018.bin')
else:
    fasttext = fastText.load_model('data/ext/os_model_2018.bin')

# Embedding width, reused by the feature functions below.
fasttext_dim = fasttext.get_dimension()

In [None]:
# Column names for the header-less TSV files; the train file carries two extra columns.
TEST_COLUMNS = ['context_id', 'context_2', 'context_1', 'context_0', 'reply_id', 'reply']
TRAIN_COLUMNS = TEST_COLUMNS + ['label', 'confidence']

# Integer encoding of the textual labels.
LABELS_MAPPING = {
    'good': 2,
    'neutral': 1,
    'bad': 0, 
}

# csv.QUOTE_NONE: quote characters in the text are read as ordinary characters.
train = pd.read_csv('data/train.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE, names=TRAIN_COLUMNS)
final = pd.read_csv('data/final.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE, names=TEST_COLUMNS)

# Labels that are not keys of LABELS_MAPPING become NaN here (and '' after the fillna below).
train['label'] = train.label.map(LABELS_MAPPING)

# Missing cells become empty strings, so the text helpers can call str methods on every cell.
train.fillna('', inplace=True)
final.fillna('', inplace=True)

In [None]:
def clear_text(text):
    """Normalize one utterance.

    Lowercases the text, folds 'ё' into 'е', turns every run of characters
    that are neither word characters nor hyphens into a single space, and
    trims the result.
    """
    normalized = text.lower().replace('ё', 'е')
    normalized = re.sub(r'[^\w-]+', ' ', normalized)
    return re.sub(r'\s+', ' ', normalized).strip()

def clear_data_inplace(df):
    """Apply `clear_text` to the three context columns and the reply column of `df`.

    The frame is modified in place; nothing is returned.
    """
    for column in ('context_2', 'context_1', 'context_0', 'reply'):
        df[column] = df[column].map(clear_text)

In [None]:
# Normalize the text columns of both frames (in place).
clear_data_inplace(train)
clear_data_inplace(final)

## Vectors of important words

In [None]:
# Fit TF-IDF on train and final together, one document per row (contexts + reply joined
# with spaces), so that the vocabulary and IDF weights cover both sets.
vectorizer = TfidfVectorizer().fit(pd.concat([
    train.context_2 + ' ' + train.context_1 + ' ' + train.context_0 + ' ' + train.reply,
    final.context_2 + ' ' + final.context_1 + ' ' + final.context_0 + ' ' + final.reply,
]))

In [None]:
def get_top_important_words(sent, vocab, idf_dict, n):
    """Return at most `n` in-vocabulary words of `sent`, in sentence order.

    Words missing from `vocab` are ignored. If more than `n` words remain,
    only those whose IDF is at least the n-th highest IDF are kept, and the
    result is cut to the first `n` of them.

    Note: when several words tie at the cut-off, earlier tied words can push
    out a later word with a higher IDF. Downstream features depend on this
    behaviour, so it is preserved as is.
    """
    scored = [(word, idf_dict[word]) for word in sent.split() if word in vocab]
    if len(scored) > n:
        # IDF of the n-th best word acts as the admission threshold.
        threshold = sorted((score for _, score in scored), reverse=True)[:n][-1]
        scored = [pair for pair in scored if pair[1] >= threshold][:n]
    return [word for word, _ in scored]
    
    
def get_vectors(words, fasttext, n):
    """Stack the embeddings of `words` into a zero-padded (n, dim) matrix.

    Args:
        words: sequence of at most `n` words.
        fasttext: model exposing `get_word_vector(word)` and `get_dimension()`.
        n: number of rows in the result.

    Returns:
        Array of shape (n, fasttext.get_dimension()); row i holds the vector
        of words[i], and rows past len(words) stay zero.

    Raises:
        IndexError: if `words` has more than `n` items.
    """
    # Take the width from the model that was passed in, not from a module-level
    # global, so the matrix always matches the vectors written into it.
    vectors = np.zeros((n, fasttext.get_dimension()))
    for row, word in enumerate(words):
        vectors[row] = fasttext.get_word_vector(word)
    return vectors


def get_important_words(df, vectorizer, fasttext, n):
    """Embed the top-IDF words of every context/reply cell.

    For each of context_2, context_1, context_0 and reply, the cell's words
    are reduced with `get_top_important_words` and embedded with
    `get_vectors`. The four (rows, n, dim) blocks are joined along axis 1,
    giving an array of shape (rows, 4 * n, dim).
    """
    vocab = vectorizer.vocabulary_
    # Pair the alphabetically sorted vocabulary terms with the entries of idf_.
    idf_dict = dict(zip(sorted(vocab), vectorizer.idf_))

    def embed_top_words(sent):
        top_words = get_top_important_words(sent, vocab=vocab, idf_dict=idf_dict, n=n)
        return get_vectors(top_words, fasttext=fasttext, n=n)

    blocks = [
        np.array(df[column].map(embed_top_words).tolist())
        for column in ('context_2', 'context_1', 'context_0', 'reply')
    ]
    return np.concatenate(blocks, axis=1)

In [None]:
# Five words per utterance, four utterances per row -> 20 word vectors per row.
train_top_vectors = get_important_words(train, vectorizer, fasttext, n=5)
final_top_vectors = get_important_words(final, vectorizer, fasttext, n=5)

## Grammems features

In [None]:
# Morphological analyzer used by get_tags below.
morph = pymorphy2.MorphAnalyzer()
# The 100 grammeme names that become feature columns, in this fixed order.
# Grammemes not listed here are dropped when the feature frames are built.
grammes_keys = [
    'NOUN', 'gen2', 'Fimp', 'Surn', 'Fixd', 'GNdr', 'impr', 'datv', 'ANim', 'Prdx', 
    'Sgtm', 'Adjx', 'ADJS', 'Pltm', 'accs', 'NUMB', 'Slng', 'past', 'Dmns', 'ADJF', 
    'Supr', 'Impx', 'voct', 'gent', 'Anum', 'LATN', 'Inmx', 'Impe', 'Dist', 'Abbr', 
    'actv', 'loc2', 'V-en', 'NPRO', 'plur', '3per', 'nomn', 'V-ey', 'sing', 'Litr', 
    'Ques', 'Prnt', 'incl', 'masc', 'Ms-f', 'Geox', 'PRTS', 'inan', 'Cmp2', 'INFN', 
    'ablt', 'tran', 'perf', 'indc', 'Coun', 'GRND', 'V-oy', 'PNCT', 'impf', 'PRCL', 
    'intr', 'Name', 'pres', 'Orgn', 'loct', 'Poss', 'Af-p', 'Anph', 'anim', 'Subx', 
    'ADVB', 'Apro', 'V-sh', 'excl', 'V-be', 'neut', 'Erro', 'Infr', 'Vpre', 'femn', 
    'futr', 'Arch', 'PREP', 'CONJ', 'UNKN', 'ROMN', 'Coll', 'PRTF', 'PRED', 'INTJ', 
    'VERB', 'pssv', 'Patr', '2per', 'V-ej', 'intg', 'NUMR', 'COMP', 'Qual', '1per',
]

@lru_cache(maxsize=2**32)
def get_tags(word):
    """Return the grammemes of the first tag `morph` proposes for `word`.

    Results are memoized, so each distinct word is analyzed only once.
    """
    first_tag = morph.tag(word)[0]
    return first_tag.grammemes

def get_grammems_sentence(sent):
    """Map each grammeme seen in `sent` to its occurrence count per word.

    `sent` is a list of words. An empty list yields an empty dict.
    """
    counts = Counter()
    for word in sent:
        for tag in get_tags(word):
            counts[tag] += 1
    n_words = len(sent)
    return {tag: count / n_words for tag, count in counts.items()}

def get_grammems(df):
    """Build grammeme-frequency features for every row of `df`.

    Each of context_2, context_1, context_0 and reply contributes one column
    per entry of `grammes_keys` (0 where the grammeme is absent). The four
    blocks are joined side by side, in that order.
    """
    blocks = []
    for column in ('context_2', 'context_1', 'context_0', 'reply'):
        shares = df[column].str.split().map(get_grammems_sentence).tolist()
        frame = pd.DataFrame(shares, columns=grammes_keys).fillna(0)
        blocks.append(frame.values)
    return np.concatenate(blocks, axis=1)

In [None]:
# Grammeme-frequency features for both data sets.
train_grammems = get_grammems(train)
final_grammems = get_grammems(final)

## Embeddings diff features

In [None]:
def similarity(v1, v2):
    """Cosine similarity of two vectors: one minus SciPy's cosine distance."""
    distance = cosine(v1, v2)
    return 1 - distance

def get_cos_sentence(sent):
    """Sum of cosine similarities between neighbouring words, per word.

    The similarities of consecutive word vectors are summed and divided by
    the number of words (not by the number of pairs). An empty sentence
    gives 0.
    """
    tokens = sent.split()
    if not tokens:
        return 0
    vectors = [fasttext.get_word_vector(token) for token in tokens]
    total = sum(
        similarity(left, right)
        for left, right in zip(vectors, vectors[1:])
    )
    return total / len(tokens)

def get_cos(df):
    """Return a (rows, 4) array with `get_cos_sentence` applied to
    context_2, context_1, context_0 and reply, in that column order."""
    columns = [
        df[name].map(get_cos_sentence)
        for name in ('context_2', 'context_1', 'context_0', 'reply')
    ]
    return pd.concat(columns, axis=1).values

In [None]:
# Neighbouring-word similarity features for both data sets.
train_cos_diff = get_cos(train)
final_cos_diff = get_cos(final)

## Sentence scalar multiplication feature

In [None]:
def get_sum_sentence(sent):
    """Add up the fastText vectors of all words in `sent`.

    An empty sentence gives a zero vector of length `fasttext_dim`.
    """
    total = np.zeros(fasttext_dim)
    for word in sent.split():
        total = total + fasttext.get_word_vector(word)
    return total

def get_scalar_mul(df):
    """Element-wise product of the summed sentence vectors of every row.

    Returns an array of shape (rows, fasttext_dim).
    """
    def stacked_sums(column):
        return np.array(column.map(get_sum_sentence).tolist())

    c2 = stacked_sums(df.context_2)
    c1 = stacked_sums(df.context_1)
    c0 = stacked_sums(df.context_0)
    r = stacked_sums(df.reply)

    # Known issue, kept on purpose: the intended product was c2 * c1 * c0 * r,
    # but c2 was used twice by mistake and c0 is never multiplied in.
    # The original author found this only after the fact and decided it was
    # too late to change, so the expression stays as it was.
    return c2 * c1 * c2 * r

In [None]:
# Element-wise products of the summed sentence vectors for both data sets.
train_scalar_mul = get_scalar_mul(train)
final_scalar_mul = get_scalar_mul(final)

## Idf counts

In [None]:
def get_idf_sentence(sent, vocab, idf_dict):
    """Total IDF of the in-vocabulary words of `sent`, divided by len(sent).

    `sent` is a sized collection of words (list or set). Words outside
    `vocab` add nothing to the total but still count in the divisor. An
    empty collection gives 0.
    """
    total = 0
    for word in sent:
        if word in vocab:
            total += idf_dict[word]
    if not sent:
        return 0
    return total / len(sent)

def get_idf(df, vectorizer):
    """Return a (rows, 8) array of IDF and length features.

    For each of context_2, context_1, context_0 and reply (in that order)
    two columns are emitted: the `get_idf_sentence` value of the cell's
    words, followed by the cell's length in characters.
    """
    vocab = vectorizer.vocabulary_
    # Pair the alphabetically sorted vocabulary terms with the entries of idf_.
    idf_dict = dict(zip(sorted(vocab), vectorizer.idf_))

    def mean_idf(sent):
        return get_idf_sentence(sent.split(), vocab=vocab, idf_dict=idf_dict)

    features = []
    for name in ('context_2', 'context_1', 'context_0', 'reply'):
        features.append(df[name].map(mean_idf))
        features.append(df[name].map(len))
    return pd.concat(features, axis=1).values

In [None]:
# IDF / length features for both data sets.
train_idf_weight = get_idf(train, vectorizer)
final_idf_weight = get_idf(final, vectorizer)

## fasttext cosine

In [None]:
def similarity_vetorized(v1, v2):
    """Row-wise cosine similarity of two equally shaped 2-D arrays.

    Returns a 1-D array with one value per row. Rows with a zero norm give
    NaN (division by zero) rather than raising.
    """
    dot = (v1 * v2).sum(axis=1)
    norm1 = np.sqrt(np.square(v1).sum(axis=1))
    norm2 = np.sqrt(np.square(v2).sum(axis=1))
    return dot / norm1 / norm2

def get_fasttext_cos_data(df):
    """Cosine similarity between the reply and its contexts, per row.

    Uses fastText sentence vectors. Returns a (rows, 4) array whose columns
    compare the reply with context_2, context_1, context_0 and the sum of
    the three context vectors. NaN values are replaced via `np.nan_to_num`.
    """
    def sentence_vectors(column):
        return np.array(column.map(fasttext.get_sentence_vector).tolist())

    c2 = sentence_vectors(df.context_2)
    c1 = sentence_vectors(df.context_1)
    c0 = sentence_vectors(df.context_0)
    r = sentence_vectors(df.reply)

    scores = np.vstack([
        similarity_vetorized(context, r)
        for context in (c2, c1, c0, c2 + c1 + c0)
    ])
    return np.nan_to_num(scores.T)

In [None]:
# Reply-vs-context sentence-vector similarities for both data sets.
train_fasttext_cos = get_fasttext_cos_data(train)
final_fasttext_cos = get_fasttext_cos_data(final)

## fasttext sums

In [None]:
def get_fasttext_sum_sent(sent, vocab, idf_dict):
    """IDF-weighted sum of the fastText vectors of the words in `sent`.

    `sent` is an iterable of words; words outside `vocab` are skipped. With
    no usable words the result is a zero vector of length `fasttext_dim`.
    """
    total = np.zeros(fasttext_dim)
    for word in sent:
        if word in vocab:
            total = total + fasttext.get_word_vector(word) * idf_dict[word]
    return total

def get_fasttext_sum(df, vectorizer=None):
    """IDF-weighted sentence vectors for the four text columns of `df`.

    Returns a 4-tuple of (rows, fasttext_dim) arrays for context_2,
    context_1, context_0 and reply, in that order.

    Despite the `None` default, a fitted vectorizer must be supplied: its
    `vocabulary_` and `idf_` attributes are read unconditionally.
    """
    vocab = vectorizer.vocabulary_
    # Pair the alphabetically sorted vocabulary terms with the entries of idf_.
    idf_dict = dict(zip(sorted(vocab), vectorizer.idf_))

    def weighted_sum(sent):
        return get_fasttext_sum_sent(sent.split(), vocab=vocab, idf_dict=idf_dict)

    return tuple(
        np.array(df[name].map(weighted_sum).tolist())
        for name in ('context_2', 'context_1', 'context_0', 'reply')
    )

In [None]:
# Unpack the four IDF-weighted sentence-vector arrays (context_2, context_1, context_0, reply).
train_fasttext_sum_context_2, \
train_fasttext_sum_context_1, \
train_fasttext_sum_context_0, \
train_fasttext_sum_reply = get_fasttext_sum(train, vectorizer)

# Same features for the final (test) set.
final_fasttext_sum_context_2, \
final_fasttext_sum_context_1, \
final_fasttext_sum_context_0, \
final_fasttext_sum_reply = get_fasttext_sum(final, vectorizer)

## Intersections

In [None]:
def get_intersection(df, vectorizer):
    """Features describing the words each context shares with the reply.

    For every row and each of context_2, context_1, context_0, the set of
    words also present in the reply is turned into one vector: the
    IDF-weighted fastText sum of those words, followed by their
    `get_idf_sentence` value and the number of shared words.

    Returns a lazy `map` yielding three arrays (context_2, context_1,
    context_0), each with one row per row of `df`.
    """
    vocab = vectorizer.vocabulary_
    # Pair the alphabetically sorted vocabulary terms with the entries of idf_.
    idf_dict = dict(zip(sorted(vocab), vectorizer.idf_))

    def describe_overlap(shared):
        vector = get_fasttext_sum_sent(shared, vocab, idf_dict)
        mean_idf = get_idf_sentence(shared, vocab, idf_dict)
        return np.concatenate([vector, [mean_idf, len(shared)]])

    rows = []
    for row in df.itertuples():
        reply_words = set(row.reply.split())
        rows.append(tuple(
            describe_overlap(set(context.split()) & reply_words)
            for context in (row.context_2, row.context_1, row.context_0)
        ))

    # Transpose the list of per-row triples into three per-context arrays.
    return map(np.array, zip(*rows))

In [None]:
# Reply/context word-overlap features, one array per context column.
train_inter_c2, train_inter_c1, train_inter_c0 = get_intersection(train, vectorizer)
final_inter_c2, final_inter_c1, final_inter_c0 = get_intersection(final, vectorizer)

# Concatenate all the features

In [None]:
# Per-utterance inputs: the IDF-weighted fastText sum and, for the contexts,
# the reply-overlap features appended column-wise.
train_r_result = train_fasttext_sum_reply
train_c2_result = np.hstack([train_fasttext_sum_context_2, train_inter_c2])
train_c1_result = np.hstack([train_fasttext_sum_context_1, train_inter_c1])
train_c0_result = np.hstack([train_fasttext_sum_context_0, train_inter_c0])
# All remaining hand-crafted features, side by side in one matrix.
train_features_full = np.hstack([
    train_idf_weight,
    train_fasttext_cos,
    train_grammems,
    train_cos_diff,
    train_scalar_mul,
])

# Same layout for the final (test) set.
final_r_result = final_fasttext_sum_reply
final_c2_result = np.hstack([final_fasttext_sum_context_2, final_inter_c2])
final_c1_result = np.hstack([final_fasttext_sum_context_1, final_inter_c1])
final_c0_result = np.hstack([final_fasttext_sum_context_0, final_inter_c0])
final_features_full = np.hstack([
    final_idf_weight,
    final_fasttext_cos,
    final_grammems,
    final_cos_diff,
    final_scalar_mul,
])

# Validation split

In [None]:
# Re-seed so the hold-out selection does not depend on random draws made earlier.
np.random.seed(0)
# Hold out 3500 distinct context_ids; splitting by context keeps all replies of one
# context on the same side of the split.
validation_contexts = set(np.random.choice(np.unique(train.context_id), 3500, replace=False))
# Boolean row mask: True for rows that belong to a held-out context.
validation_mask = train.context_id.isin(validation_contexts).values

In [None]:
# Training part: rows whose context was not held out.
sub_train_r_result = train_r_result[~validation_mask]
sub_train_c2_result = train_c2_result[~validation_mask]
sub_train_c1_result = train_c1_result[~validation_mask]
sub_train_c0_result = train_c0_result[~validation_mask]
sub_train_features_full = train_features_full[~validation_mask]
sub_train_top_vectors = train_top_vectors[~validation_mask]


# Validation part: rows of the held-out contexts.
validation_r_result = train_r_result[validation_mask]
validation_c2_result = train_c2_result[validation_mask]
validation_c1_result = train_c1_result[validation_mask]
validation_c0_result = train_c0_result[validation_mask]
validation_features_full = train_features_full[validation_mask]
validation_top_vectors = train_top_vectors[validation_mask]

In [None]:
# Each list follows the input order of the model built by get_nn:
# context_2, context_1, context_0, reply, hand-crafted features, top-word vectors.
sub_train_input = [
    sub_train_c2_result, sub_train_c1_result, sub_train_c0_result, 
    sub_train_r_result, sub_train_features_full, sub_train_top_vectors,
]

validation_input = [
    validation_c2_result, validation_c1_result, validation_c0_result, 
    validation_r_result, validation_features_full, validation_top_vectors,
]

final_input = [
    final_c2_result, final_c1_result, final_c0_result, 
    final_r_result, final_features_full, final_top_vectors,
]


# One-hot targets built from the integer labels, split with the same mask as the features.
y = to_categorical(train.label)
sub_train_y = y[~validation_mask]
validation_y = y[validation_mask]

# NN

In [None]:
def _dense_block(inputs, first_units, second_units):
    """Apply Dense(relu) -> Dropout(0.5) -> Dense(relu) -> BatchNormalization to `inputs`."""
    hidden = Dense(first_units, activation='relu')(inputs)
    hidden = Dropout(0.5)(hidden)
    hidden = Dense(second_units, activation='relu')(hidden)
    return BatchNormalization()(hidden)


def _conv_branch(inputs, kernel_size, pool_size):
    """Apply Conv1D(100 filters, relu) -> MaxPooling1D -> Flatten -> Dropout(0.5) to `inputs`."""
    branch = Conv1D(100, kernel_size, activation='relu')(inputs)
    branch = MaxPooling1D(pool_size)(branch)
    branch = Flatten()(branch)
    return Dropout(0.5)(branch)


def get_nn(dim_c2=402, dim_c1=402, dim_c0=402, dim_r=200, features_dim=616,
           top_words=20, embedding_dim=200):
    """Build and compile the six-input, three-class classifier.

    Flat inputs (context_2, context_1, context_0, reply, hand-crafted
    features) each pass through their own dense tower and are merged. The
    sequence of top-word vectors passes through five parallel 1-D
    convolutions with kernel sizes 1 to 5 and is merged. Both merged parts
    feed a final dense block with a 3-way softmax.

    Args:
        dim_c2, dim_c1, dim_c0: width of the three context inputs.
        dim_r: width of the reply input.
        features_dim: width of the hand-crafted feature input.
        top_words: number of word vectors per row in the sequence input.
            The pooling sizes (10 and 7) were chosen for the default of 20;
            much shorter sequences may not survive the pooling step.
        embedding_dim: width of each word vector in the sequence input.

    Returns:
        A compiled Keras `Model` (categorical cross-entropy, Adam, accuracy)
        taking inputs in the order
        [context_2, context_1, context_0, reply, features, top-word vectors].
    """
    input_c2 = Input(shape=(dim_c2,))
    input_c1 = Input(shape=(dim_c1,))
    input_c0 = Input(shape=(dim_c0,))
    input_r = Input(shape=(dim_r,))
    input_features = Input(shape=(features_dim,))

    # One identical tower per flat input. The former `input_dim=1` argument was
    # dropped: in the functional API the layer takes its input width from the
    # tensor it is called on, so the argument only misstated the real width.
    towers = [
        _dense_block(flat_input, 1024, 512)
        for flat_input in (input_c2, input_c1, input_c0, input_r, input_features)
    ]
    x1 = _dense_block(Concatenate()(towers), 512, 1024)

    input_top = Input(shape=(top_words, embedding_dim))

    # (kernel size, pool size) of the parallel convolution branches.
    conv_branches = [
        _conv_branch(input_top, kernel_size, pool_size)
        for kernel_size, pool_size in ((1, 10), (2, 7), (3, 7), (4, 7), (5, 7))
    ]
    x2 = _dense_block(Concatenate()(conv_branches), 512, 1024)

    x = _dense_block(Concatenate()([x1, x2]), 1024, 512)
    x = Dense(3, activation='softmax')(x)

    model = Model(
        inputs=[input_c2, input_c1, input_c0, input_r, input_features, input_top],
        outputs=[x],
    )

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    return model

In [None]:
if REDO_NN:
    nn = get_nn()
    nn.fit(
        sub_train_input, sub_train_y,
        # Weight every training row by its annotation confidence.
        sample_weight=train.confidence.values[~validation_mask],
        validation_data=(validation_input, validation_y),
        batch_size=64,
        # `epochs` is the Keras 2 name of this argument; `nb_epoch` is the
        # Keras 1 spelling.
        epochs=15,
    )
else:
    nn = load_model('data/final_model_weights.hdf5')

## Final predict

In [None]:
# Collapse the three class probabilities into one ranking score. With the encoding used
# for training targets in this notebook (bad=0, neutral=1, good=2) the dot product with
# [-1, 0, 1] equals P(good) - P(bad).
# NOTE(review): a model loaded from disk is presumed to use the same class order - confirm.
final['nn_score'] = nn.predict(final_input, batch_size=512, verbose=1).dot([-1, 0, 1])

In [None]:
# NOTE(review): this overwrites the 'nn_score' column computed from `nn.predict` in the
# previous cell with predictions loaded from a saved .npy file, so a top-to-bottom run
# discards the freshly computed scores. Presumably these are the predictions of the
# submitted model - confirm, and skip this cell when the recomputed scores are wanted.
final['nn_score'] = np.load('ILYA_final_88.npy').dot([-1, 0, 1])

In [None]:
# Save the raw score of every (context, reply) pair as CSV, without the index column.
final[['context_id', 'reply_id', 'nn_score']].to_csv('nn_scores.csv', index=False)

In [None]:
# Within every context, order the reply_ids from highest to lowest nn_score.
# reset_index(level=0) moves the context_id group key out of the index into a column.
sub = final.groupby('context_id').apply(
    lambda x: x.sort_values('nn_score', ascending=False).reply_id
).reset_index(level=0)

# Tab-separated output with neither header nor index column.
sub.to_csv('nn-final-sub.tsv', index=False, header=False, sep='\t')