In [None]:
import csv
from collections import Counter
from functools import partial, lru_cache

import pymorphy2
import numpy as np
import pandas as pd
from scipy import sparse

import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

import fastText
from gensim.models import KeyedVectors

## Reading data

Here we read the data, map the label column to numbers, and normalize the texts.

In [None]:
# Column names applied to the TSV files when they are loaded. The training
# layout is the test layout followed by the label and confidence columns.
TEST_COLUMNS = [
    'context_id',
    'context_2',
    'context_1',
    'context_0',
    'reply_id',
    'reply',
]
TRAIN_COLUMNS = [*TEST_COLUMNS, 'label', 'confidence']

# Integer code assigned to each textual label.
LABELS_MAPPING = dict(good=2, neutral=1, bad=0)

# Shared pymorphy2 analyzer instance; the word-level helpers in this notebook call it.
morph = pymorphy2.MorphAnalyzer()

@lru_cache(maxsize=2**32)
def normalize_word(word):
    """Return the first normal form that ``morph.normal_forms`` gives for ``word``.

    Results are memoised by ``lru_cache``, so a repeated word is analysed once.
    """
    normal_forms = morph.normal_forms(word)
    return normal_forms[0]

def normalize_sent(sent):
    """Normalize every whitespace-separated token of ``sent``.

    The normalized tokens are joined back with single spaces.
    """
    return ' '.join(normalize_word(token) for token in sent.split())


def do_basic_stuff(df):
    """Preprocess a loaded frame in place; nothing is returned.

    Steps, in order:
      1. If a ``label`` column exists, map it through ``LABELS_MAPPING``.
      2. Replace missing values in every column with the empty string.
      3. Add ``context``: ``context_2``, ``context_1`` and ``context_0`` joined
         with single spaces.
      4. Add ``context_normalized`` and ``reply_normalized`` by applying
         ``normalize_sent`` to ``context`` and ``reply``.
    """
    if 'label' in df:
        df['label'] = df['label'].map(LABELS_MAPPING)

    df.fillna('', inplace=True)

    # Oldest utterance first, most recent last.
    joined = df['context_2']
    for newer in ('context_1', 'context_0'):
        joined = joined + ' ' + df[newer]
    df['context'] = joined

    for column in ('context', 'reply'):
        df[column + '_normalized'] = df[column].map(normalize_sent)

In [None]:
def _read_tsv(path, columns):
    """Read a tab-separated file without a header row and with quoting disabled."""
    return pd.read_csv(path, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=columns)


train = _read_tsv('data/train.tsv', TRAIN_COLUMNS)
public = _read_tsv('data/public.tsv', TEST_COLUMNS)
final = _read_tsv('data/final.tsv', TEST_COLUMNS)

# Each call modifies its frame in place.
do_basic_stuff(train)
do_basic_stuff(public)
do_basic_stuff(final)

## Validation split

I fixed 3500 context_ids for validation (~25% of the training data)

In [None]:
# Seed the global NumPy RNG so the same context ids are drawn on every run.
np.random.seed(0)
# Draw 3500 distinct context ids without replacement. Splitting by context id
# (not by row) keeps all replies of one context on the same side of the split.
validation_contexts = set(np.random.choice(np.unique(train.context_id), 3500, replace=False))
validation_mask = train.context_id.isin(validation_contexts)

# NOTE: `train` is rebound to the remaining rows, so re-running this cell
# without reloading the data would split the already-reduced frame again.
validation = train[validation_mask].reset_index(drop=True)
train = train[~validation_mask].reset_index(drop=True)

# Making features

Here is a feature-making function. It takes a `func` and applies it to the train, validation and final datasets to generate all kinds of features.

In [None]:
def make_features(func):
    """Apply ``func`` to the train, validation and final frames.

    The frames are the module-level ``train``, ``validation`` and ``final``
    variables, looked up when this function is called.

    Returns:
        A 3-tuple ``(func(train), func(validation), func(final))``.
    """
    train_part = func(train)
    validation_part = func(validation)
    final_part = func(final)
    return train_part, validation_part, final_part

### Simple features

In [None]:
# Raw texts
# Collection of distinct context strings used for the document frequencies.
# I tried to use final set here instead of public, but it made things worse
# Probably because of the noise in the final dataset
contexts = (
    pd.concat([train.context, validation.context, public.context])
    .drop_duplicates()
    .values
)

# Document frequency: each word counts once per context it occurs in.
counts = Counter()
counts.update(word for sent in contexts for word in set(sent.split()))

# IDF = log(number of contexts / number of contexts containing the word).
IDF = {
    word: np.log(len(contexts) / count)
    for word, count in counts.items()
}


# Normalized texts
# Same document-frequency statistics, computed over the distinct normalized
# context strings of train, validation and public.

contexts_normalized = pd.concat([
    train.context_normalized,
    validation.context_normalized,
    public.context_normalized,
]).drop_duplicates().values

# Document frequency: each word counts once per normalized context it occurs in.
counts_normalized = Counter(word for sent in contexts_normalized for word in set(sent.split()))
# The document count must come from the same collection as the document
# frequencies. Normalization can merge contexts that differ only in word
# forms, so the number of distinct raw contexts is not the right numerator.
IDFn = {
    word: np.log(len(contexts_normalized) / count)
    for word, count in counts_normalized.items()
}


def simple_features(df, idf=None, idfn=None):
    """Build hand-crafted length, containment and IDF features for ``df``.

    Args:
        df: frame with the string columns ``reply``, ``context``,
            ``context_0``, ``reply_normalized`` and ``context_normalized``.
        idf: word -> IDF weight for raw tokens. Defaults to the module-level
            ``IDF`` table.
        idfn: word -> IDF weight for normalized tokens. Defaults to the
            module-level ``IDFn`` table.

    Returns:
        A float32 DataFrame indexed like ``df`` with one column per feature.
        Words missing from a table contribute a weight of 0.
    """
    if idf is None:
        idf = IDF
    if idfn is None:
        idfn = IDFn

    def idf_weight(words, weights):
        # Sum of the weights of the given words; unknown words count as 0.
        return sum(weights.get(word, 0) for word in words)

    def shared_words(left, right):
        # Row-wise set of tokens that occur in both text columns.
        return pd.Series(
            [set(a.split()) & set(b.split()) for a, b in zip(left, right)],
            index=df.index,
            dtype=object,
        )

    raw_weight = partial(idf_weight, weights=idf)
    normalized_weight = partial(idf_weight, weights=idfn)

    res = pd.DataFrame(index=df.index)

    res['reply_word_count'] = df.reply.str.split().map(len)
    res['context_word_count'] = df.context.str.split().map(len)
    res['context_0_word_count'] = df.context_0.str.split().map(len)

    # Exact-match and substring checks on the raw strings.
    res['reply_equals_to_context_0'] = df.reply == df.context_0
    res['reply_is_in_context'] = [r in c for r, c in zip(df.reply, df.context)]
    res['reply_is_in_context_0'] = [r in c for r, c in zip(df.reply, df.context_0)]

    # "*_idfn" scores raw tokens against the normalized table, while
    # "*_idfnn" scores normalized tokens against the normalized table.
    res['reply_idf'] = df.reply.str.split().map(raw_weight)
    res['reply_idfn'] = df.reply.str.split().map(normalized_weight)
    res['reply_idfnn'] = df.reply_normalized.str.split().map(normalized_weight)
    res['context_idf'] = df.context.str.split().map(raw_weight)
    res['context_idfn'] = df.context.str.split().map(normalized_weight)
    res['context_idfnn'] = df.context_normalized.str.split().map(normalized_weight)
    res['context_0_idf'] = df.context_0.str.split().map(raw_weight)

    # Word overlap between the reply and the last context utterance. This is
    # computed from `df` itself; using the global training frame here would
    # give the validation and final sets features of unrelated training rows.
    intersection = shared_words(df.reply, df.context_0)
    res['context_0_intersection_word_count'] = intersection.map(len)
    res['context_0_intersection_idf'] = intersection.map(raw_weight)
    res['context_0_intersection_idfn'] = intersection.map(normalized_weight)

    # Word overlap between the reply and the full three-utterance context.
    intersection = shared_words(df.reply, df.context)
    res['context_intersection_word_count'] = intersection.map(len)
    res['context_intersection_idf'] = intersection.map(raw_weight)

    return res.astype(np.float32)

In [None]:
# Hand-crafted features for each of the three frames.
(
    train_simple_features,
    validation_simple_features,
    final_simple_features,
) = make_features(simple_features)

train_simple_features.head()

In [None]:
# (rows, columns) of the training simple-feature frame.
train_simple_features.shape

### Special tfidf features

In [None]:
# Character 1- to 3-grams, vocabulary capped at 2000 features, fitted on training replies.
vect_1 = TfidfVectorizer(ngram_range=(1, 3), analyzer='char', max_features=2000)
vect_1.fit(train.reply)

# Word uni- and bigrams, vocabulary capped at 2000 features, fitted on training contexts.
vect_2 = TfidfVectorizer(ngram_range=(1, 2), analyzer='word', max_features=2000)
vect_2.fit(train.context)

# Character 1- and 2-grams with IDF weighting switched off and no feature cap,
# fitted on training contexts.
vect_3 = TfidfVectorizer(ngram_range=(1, 2), analyzer='char', use_idf=False)
vect_3.fit(train.context)

def vectorizer_features(df, vectorizers=None):
    """Row-wise dot products between the vectorized reply and context texts.

    For every vectorizer the reply is compared with ``context``,
    ``context_0`` and ``context_1``. Each comparison is the sum of the
    element-wise product of the two transformed rows.

    Args:
        df: frame with the string columns ``reply``, ``context``,
            ``context_0`` and ``context_1``.
        vectorizers: fitted objects exposing ``transform``. Defaults to the
            module-level ``(vect_1, vect_2, vect_3)``.

    Returns:
        A DataFrame with a default integer index and three columns per
        vectorizer, named ``vect_<k>_reply_<context column>_mul`` where
        ``k`` starts at 1.
    """
    if vectorizers is None:
        vectorizers = (vect_1, vect_2, vect_3)

    context_columns = ('context', 'context_0', 'context_1')

    features = []
    columns = []
    for number, vect in enumerate(vectorizers, start=1):
        repl = vect.transform(df.reply)
        for name in context_columns:
            # Each context column is transformed from its own text; reusing
            # context_0 for context_1 would just duplicate that feature.
            cont = vect.transform(df[name])
            features.append(np.ravel(repl.multiply(cont).sum(1)))
            columns.append('vect_%d_reply_%s_mul' % (number, name))

    # I forgot float32 type conversion here :(
    # The dtype is deliberately left as produced by the vectorizers.
    return pd.DataFrame(np.column_stack(features), columns=columns)

In [None]:
# Reply/context similarity features for each of the three frames.
(
    train_vectorizer_features,
    validation_vectorizer_features,
    final_vectorizer_features,
) = make_features(vectorizer_features)

train_vectorizer_features.head()

### pymorphy2 tags

In [None]:
# I fixed this (arbitrary) order by mistake and can't reorder the list now,
# because that would lead to slightly different training results.
# The list defines both the set and the order of the grammeme columns.
KNOWN_GRAMMEMES = [
    'ADVB', 'Subx', 'Dmns', 'Prnt', 'INTJ', 'Refl', 'nomn', 'Name', 'ablt', 'Anum',
    'V-be', 'Dist', 'real', 'ADJF', 'Adjx', 'perf', 'MOod', 'ADJS', 'Abbr', 'Trad', 
    'masc', 'inan', 'voct', 'NUMR', 'CONJ', 'Vpre', 'tran', 'COMP', 'Prdx', 'Ques', 
    'Sgtm', 'Poss', 'Coun', 'intg', 'PREP', '3per', 'impf', '2per', 'VERB', 'futr', 
    'Supr', 'Fixd', 'plur', 'Mult', 'Infr', 'Fimp', 'TRns', 'actv', 'CAse', 'INvl', 
    'NUMB', 'Inmx', 'GRND', 'VOic', 'ANim', 'gent', 'V-ey', 'Orgn', 'Impe', 'Impx', 
    'Coll', 'ROMN', 'loc2', 'V-oy', 'V-bi', 'NOUN', 'gen2', 'Arch', 'Surn', 'loc1', 
    'V-ie', 'V-sh', 'Pltm', 'Erro', 'UNKN', 'Litr', 'Geox', 'intr', 'loct', 'GNdr', 
    'TEns', 'PNCT', '1per', 'Anph', 'acc2', 'Patr', 'ASpc', 'Ms-f', 'gen1', 'excl', 
    'Apro', 'V-ej', 'Slng', 'Cmp2', 'PErs', 'INFN', 'PRTF', 'datv', 'anim', 'impr', 
    'femn', 'Af-p', 'NPRO', 'incl', 'accs', 'sing', 'indc', 'Init', 'PRTS', 'PRED', 
    'pres', 'PRCL', 'NMbr', 'pssv', 'LATN', 'POST', 'past', 'V-en', 'Qual', 'neut',
]

@lru_cache(maxsize=2**32)
def get_tags(word):
    """Count the grammemes of the first tag ``morph.tag`` returns for ``word``.

    The returned ``Counter`` is cached and shared between calls, so callers
    must not modify it.
    """
    first_tag = morph.tag(word)[0]
    return Counter(first_tag.grammemes)


def get_tags_dataframe(series):
    """Return per-sentence grammeme frequencies as a float32 DataFrame.

    For each sentence in ``series`` the grammeme counters of its
    whitespace-separated words are added together. The result has one column
    per entry of ``KNOWN_GRAMMEMES`` (missing grammemes are 0), and every row
    is divided by its own sum. The frame has a default integer index.
    """
    def sentence_tags(sent):
        # Counter addition builds a new object each time, so the cached
        # per-word counters are never modified.
        total = Counter()
        for word in sent.split():
            total = total + get_tags(word)
        return total

    per_sentence = list(series.map(sentence_tags))
    counts = pd.DataFrame(per_sentence, columns=KNOWN_GRAMMEMES).fillna(0)
    row_totals = counts.sum(1)
    shares = counts.div(row_totals, axis='rows')
    return shares.astype(np.float32)

In [None]:
# Grammeme frequency profiles of the reply ...
(
    train_reply_grammems,
    validation_reply_grammems,
    final_reply_grammems,
) = make_features(lambda df: get_tags_dataframe(df.reply))

# ... and of the last context utterance.
(
    train_context_0_grammems,
    validation_context_0_grammems,
    final_context_0_grammems,
) = make_features(lambda df: get_tags_dataframe(df.context_0))

train_reply_grammems.head()

### Fasttext features

In [None]:
# Word vectors loaded from a text-format (binary=False) word2vec file.
fasttextWiki = KeyedVectors.load_word2vec_format('data/ext/wiki.ru.vec', binary=False)

def get_sent_vector(sent):
    """Sum the ``fasttextWiki`` vectors of the known words in ``sent``.

    Words absent from ``fasttextWiki`` are skipped. A sentence without any
    known word gives a 300-dimensional zero vector.
    """
    total = np.zeros(300)
    known_words = (token for token in sent.split() if token in fasttextWiki)
    for token in known_words:
        total += fasttextWiki[token]
    return total


def embed_sentences(df, column):
    """Embed the texts of ``df[column]`` as scaled sums of word vectors.

    Each sentence vector is divided by ``1 + its Euclidean norm``, so an
    all-zero vector stays zero instead of causing a division by zero.

    Returns:
        A float32 DataFrame indexed like ``df`` with 300 columns named
        ``fasttext_wiki_<column>_<i>``.
    """
    sentence_vectors = [get_sent_vector(sent) for sent in df[column]]
    x = np.array(sentence_vectors)
    norms = np.sqrt(np.square(x).sum(1, keepdims=True))
    scaled = x / (1 + norms)
    names = ['fasttext_wiki_%s_%d' % (column, i) for i in range(300)]
    frame = pd.DataFrame(scaled, index=df.index, columns=names)
    return frame.astype(np.float32)


# Wiki-vector embeddings of the last context utterance ...
(
    train_fasttext_wiki_context_0,
    validation_fasttext_wiki_context_0,
    final_fasttext_wiki_context_0,
) = make_features(partial(embed_sentences, column='context_0'))

# ... and of the reply.
(
    train_fasttext_wiki_reply,
    validation_fasttext_wiki_reply,
    final_fasttext_wiki_reply,
) = make_features(partial(embed_sentences, column='reply'))

train_fasttext_wiki_reply.head()

In [None]:
# fastText model loaded from a .bin file; used below for sentence vectors of the replies.
fasttextCC = fastText.load_model('data/ext/cc.ru.300.bin')

In [None]:
def embed_fasttext_reply_cc(df):
    """Embed ``df.reply`` with ``fasttextCC.get_sentence_vector``.

    Returns:
        A float32 DataFrame indexed like ``df`` with 300 columns named
        ``fasttext_cc_reply_<i>``.
    """
    vectors = [fasttextCC.get_sentence_vector(reply) for reply in df.reply]
    names = ['fasttext_cc_reply_%d' % i for i in range(300)]
    frame = pd.DataFrame(np.array(vectors), index=df.index, columns=names)
    return frame.astype(np.float32)

In [None]:
# Sentence embeddings of the reply for each of the three frames.
(
    train_fasttext_cc_reply,
    validation_fasttext_cc_reply,
    final_fasttext_cc_reply,
) = make_features(embed_fasttext_reply_cc)

train_fasttext_cc_reply.head()

### tf-idf features

In [None]:
# Character 1- to 3-gram tf-idf, capped at 2000 features, fitted on training replies only.
tfidf = TfidfVectorizer(ngram_range=(1, 3), analyzer='char', max_features=2000)
tfidf.fit(train.reply)


def _reply_context_tfidf(df):
    """Return the tf-idf blocks derived from ``df.reply`` and ``df.context_0``.

    Returns, in order: the float32 reply matrix, the float32 ``context_0``
    matrix, their element-wise product, and the row sums of that product.
    """
    reply = tfidf.transform(df.reply).astype(np.float32)
    context_0 = tfidf.transform(df.context_0).astype(np.float32)
    product = reply.multiply(context_0)
    return reply, context_0, product, product.sum(1)


(
    train_reply_tfidf,
    train_context_0_tfidf,
    train_mul_tfidf,
    train_mul_sum_tfidf,
) = _reply_context_tfidf(train)

(
    validation_reply_tfidf,
    validation_context_0_tfidf,
    validation_mul_tfidf,
    validation_mul_sum_tfidf,
) = _reply_context_tfidf(validation)

(
    final_reply_tfidf,
    final_context_0_tfidf,
    final_mul_tfidf,
    final_mul_sum_tfidf,
) = _reply_context_tfidf(final)

# SVD decomposition of tf-idf

In [None]:
# svd_1: 10 components of the character tf-idf reply matrix computed earlier.
svd_1 = TruncatedSVD(n_components=10, random_state=0, n_iter=15)
svd_1.fit(train_reply_tfidf)

train_svd_1 = svd_1.transform(train_reply_tfidf)
validation_svd_1 = svd_1.transform(validation_reply_tfidf)
final_svd_1 = svd_1.transform(final_reply_tfidf)

# svd_2: word uni-/bigram tf-idf followed by a 10-component SVD, fed with raw reply text.
svd_2 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), analyzer='word', max_features=2000)),
    ('svd', TruncatedSVD(n_components=10, random_state=0, n_iter=15)),
])
svd_2.fit(train.reply)

train_svd_2 = svd_2.transform(train.reply)
validation_svd_2 = svd_2.transform(validation.reply)
final_svd_2 = svd_2.transform(final.reply)

## Putting all features together

In [None]:
def _stack_features(*blocks):
    """Concatenate the given feature blocks column-wise into one CSR matrix."""
    return sparse.hstack(list(blocks)).tocsr()


# The block order must be identical for all three matrices: it defines the
# column layout the models are trained and applied on.
Xtrain = _stack_features(
    train_reply_tfidf,
    train_context_0_tfidf,
    train_svd_1,
    train_svd_2,
    train_reply_grammems,
    train_context_0_grammems,
    train_fasttext_wiki_reply,
    train_fasttext_wiki_context_0,
    train_fasttext_cc_reply,
    train_simple_features,
    train_mul_sum_tfidf,
    train_vectorizer_features,
    train_mul_tfidf,
)

Xvalidation = _stack_features(
    validation_reply_tfidf,
    validation_context_0_tfidf,
    validation_svd_1,
    validation_svd_2,
    validation_reply_grammems,
    validation_context_0_grammems,
    validation_fasttext_wiki_reply,
    validation_fasttext_wiki_context_0,
    validation_fasttext_cc_reply,
    validation_simple_features,
    validation_mul_sum_tfidf,
    validation_vectorizer_features,
    validation_mul_tfidf,
)

Xfinal = _stack_features(
    final_reply_tfidf,
    final_context_0_tfidf,
    final_svd_1,
    final_svd_2,
    final_reply_grammems,
    final_context_0_grammems,
    final_fasttext_wiki_reply,
    final_fasttext_wiki_context_0,
    final_fasttext_cc_reply,
    final_simple_features,
    final_mul_sum_tfidf,
    final_vectorizer_features,
    final_mul_tfidf,
)

# Targets as plain NumPy arrays.
ytrain = train.label.values
yvalidation = validation.label.values

# LightGBM models

We will use two estimators: one to estimate the probability of the "good" class, and the other one for the "bad" class. I'm too lazy to really optimize hyperparameters, so they are pretty much the defaults, except for `colsample_bytree`, which is low because we have lots of features (`colsample_bytree=0.5` overfits a lot). Since the features are exactly the same for both models, I used different random states to at least select features in a different order.

In [None]:
# Hyperparameters shared by both binary models; only the random seed differs.
_COMMON_LGB_PARAMS = dict(n_estimators=2000, learning_rate=0.025, colsample_bytree=0.3)

bst_good = lgb.LGBMClassifier(random_state=0, **_COMMON_LGB_PARAMS)
bst_bad = lgb.LGBMClassifier(random_state=42, **_COMMON_LGB_PARAMS)

n_features = Xtrain.shape[1]
print('Training models on %d features' % n_features)

# Training final models on complete data

In [None]:
# Training and validation rows stacked back together, in that order, for the
# feature matrix, the targets and the per-row weights.
Xall = sparse.vstack((Xtrain, Xvalidation))
yall = np.concatenate((ytrain, yvalidation))
wall = np.concatenate((train.confidence, validation.confidence))

In [None]:
# Two one-vs-rest binary problems: label 2 is "good", label 0 is "bad".
# The confidence column is used as the per-sample weight.
bst_good.fit(Xall, yall == 2, sample_weight=wall)
bst_bad.fit(Xall, yall == 0, sample_weight=wall)

In [None]:
# Column 1 of predict_proba is the probability of the positive class.
p_good = bst_good.predict_proba(Xfinal)[:, 1]
p_bad = bst_bad.predict_proba(Xfinal)[:, 1]
# Ranking score: higher means more likely "good" and less likely "bad".
# NOTE: this adds a column to `final` in place.
final['lgb_score'] = p_good - p_bad

# Save the raw scores together with their identifiers.
final[['context_id', 'reply_id', 'lgb_score']].to_csv('lgb_scores.csv', index=False)

In [None]:
# Within each context, order the reply ids by descending score, then turn the
# context_id index level back into a regular column.
sub = final.groupby('context_id').apply(
    lambda x: x.sort_values('lgb_score', ascending=False).reply_id
).reset_index(level=0)

# Written as a tab-separated file without header or index.
sub.to_csv('lgb-final-sub.tsv', index=False, header=False, sep='\t')

### Taking a look at top features

In [None]:
# Names are assembled in the same order in which the feature blocks were
# stacked, so that they line up with the importance vectors of the models.
_tfidf_terms = sorted(tfidf.vocabulary_)

feature_names = []
feature_names += ['reply_' + s for s in _tfidf_terms]
feature_names += ['context_0_' + s for s in _tfidf_terms]
feature_names += ['svd_1_%d' % d for d in range(10)]
feature_names += ['svd_2_%d' % d for d in range(10)]
feature_names += list(validation_reply_grammems.columns)
feature_names += list(validation_context_0_grammems.columns)
feature_names += list(validation_fasttext_wiki_reply.columns)
feature_names += list(validation_fasttext_wiki_context_0.columns)
feature_names += list(validation_fasttext_cc_reply.columns)
feature_names += list(validation_simple_features.columns)
feature_names += ['tfidf_dist']
feature_names += list(validation_vectorizer_features.columns)
feature_names += ['multiply_' + s for s in _tfidf_terms]

# One row per feature, one importance column per model.
imp = pd.DataFrame(
    {
        'imp_bad': bst_bad.feature_importances_,
        'imp_good': bst_good.feature_importances_,
    },
    index=feature_names,
)

In [None]:
# The 20 features with the highest importance in the "bad" model.
imp.sort_values('imp_bad', ascending=False).head(20)

In [None]:
# The 20 features with the highest importance in the "good" model.
imp.sort_values('imp_good', ascending=False).head(20)