In [1]:
from zipfile import ZipFile
import pandas as pd
import numpy as np

# Загрузка данных

In [2]:
def gen_parcer(f):
    for line in f.readlines():
        yield line.decode().strip().split("\t")

In [3]:
with ZipFile('data.zip') as datazip:
    with datazip.open('train.tsv') as f:
        pd_train = pd.DataFrame(gen_parcer(f),
                                columns=["context_id", "context_2", "context_1", "context_0", "reply_id", "reply",
                                         "label", "confidence"])
    with datazip.open('public.tsv') as f:
         pd_public = pd.DataFrame(gen_parcer(f),
                                  columns=["context_id", "context_2", "context_1", "context_0", "reply_id", "reply"])

In [4]:
pd_train.head()

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply,label,confidence
0,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу .","ладно , повесь трубку .",0,не могу .,good,0.8753516175
1,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу .","ладно , повесь трубку .",1,"нет , звонить буду я .",neutral,0.9009682113
2,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу .","ладно , повесь трубку .",2,"слушай , я не мог уйти .",bad,0.8843202145
3,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу .","ладно , повесь трубку .",3,я не прекращу звонить .,good,0.9825304673
4,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу .","ладно , повесь трубку .",4,я звоню им .,good,0.8380535096


In [5]:
pd_public.head()

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply
0,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,0,неа .
1,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,1,"нет , не хочу ."
2,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,2,нет .
3,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,3,"конечно , нет ."
4,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,4,"разумеется , нет ."


In [6]:
pd_train.count()

context_id    97533
context_2     97533
context_1     97533
context_0     97533
reply_id      97533
reply         97533
label         97533
confidence    97533
dtype: int64

In [7]:
pd_public.count()

context_id    9968
context_2     9968
context_1     9968
context_0     9968
reply_id      9968
reply         9968
dtype: int64

In [8]:
target_encoder = {'bad': 0, 'good': 2, 'neutral': 1}  
pd_train.label = pd_train.label.map(target_encoder)

In [9]:
pd_data = pd.concat([pd_train, pd_public], axis=0).reset_index(drop=True)
pd_data.label.fillna(-1, inplace=True)
pd_data.confidence.fillna(0.0, inplace=True)

# Что за данные

In [10]:
pd_train.shape, pd_public.shape, pd_data.shape

((97533, 8), (9968, 6), (107501, 8))

In [11]:
pd_data.groupby("label")["context_id"].count() / len(pd_data)

label
-1.0    0.092725
 0.0    0.323439
 1.0    0.104687
 2.0    0.479149
Name: context_id, dtype: float64

# Простой бейзлайн

## features

In [12]:
from scipy import sparse

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [14]:
cv_all = CountVectorizer()
s_words = pd_data[["context_2", "context_1", "context_0", "reply"]].apply(lambda x: " ".join(x), axis=1)
cv_all.fit(s_words)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [15]:
tfidf_all = TfidfVectorizer()

In [16]:
def fit_tfidf(**kwargs):
    global tfidf_all
    tfidf_all = TfidfVectorizer(**kwargs)
    s_words = pd_data[["context_2", "context_1", "context_0", "reply"]].apply(lambda x: " ".join(x), axis=1)
    tfidf_all.fit(s_words)

In [17]:
def to_features_cv(df):
    context = cv_all.transform(df[["context_2", "context_1", "context_0"]].apply(lambda x: " ".join(x), axis=1))
    reply = cv_all.transform(df["reply"])
    features = sparse.hstack([context, reply])
    return features

In [18]:
def to_features_tfidf(df):
    context = tfidf_all.transform(df[["context_2", "context_1", "context_0"]].apply(lambda x: " ".join(x), axis=1))
    reply = tfidf_all.transform(df["reply"])
    features = sparse.hstack([context, reply])
    return features

## vectors

- rusvectores
- nlpub (russion distribution t...)

## crossval

In [19]:
from tqdm import tqdm_notebook

In [20]:
from sklearn.model_selection import StratifiedKFold

In [21]:
skf = StratifiedKFold(n_splits=3)

In [32]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor

In [23]:
class RandomMinMaxRegressor(object):
    def __init__(self):
        self.min_y = None
        self.max_y = None
    
    def fit(self, x, y):
        self.min_y = y.min()
        self.max_y = y.max()
    
    def predict(self, x):
        return np.random.random(x.shape[0]) * (self.max_y - self.min_y) + self.min_y        

In [24]:
from sklearn.metrics import mean_absolute_error

In [25]:
def calc_ndcg_score(df, y_predict):
    df = df.copy()
    df["predict"] = y_predict
    #dcg
    df.sort_values(["context_id", "predict"], ascending=[True, False], inplace=True)
    df["row_count"] = df.groupby("context_id").cumcount()+1
    df["dcg"] = df["label"] / np.log2(df["row_count"] + 1)
    #idcg
    df.sort_values(["context_id", "label"], ascending=[True, False], inplace=True)
    df["row_count"] = df.groupby("context_id").cumcount()+1
    df["idcg"] = df["label"] / np.log2(df["row_count"] + 1)
    
    df_context = df.groupby("context_id").sum()
    return (df_context["dcg"] / df_context["idcg"]).mean() * 100000

In [26]:
def do_cross_val(model_reg, to_features, to_target):
    mae_errors = []
    ndcg_scores = []

    for _train, _test in tqdm_notebook(skf.split(pd_train.index, pd_train.label), total=3):
        i_train = pd_train.iloc[_train]
        i_test = pd_train.iloc[_test]

        X_train = to_features(i_train)
        X_test = to_features(i_test)
        Y_train = to_target(i_train)
        Y_test = to_target(i_test)

        model_reg.fit(X_train, Y_train)
        Y_predict = model_reg.predict(X_test)
        mae_errors.append(mean_absolute_error(Y_test, Y_predict))
        ndcg_scores.append(calc_ndcg_score(i_test, Y_predict))

    print(mae_errors)
    print(ndcg_scores)

## target

In [30]:
trg_label = lambda x: x.label.values

In [31]:
trg_label_confidence = lambda x: (x.label.values - 1) * x.confidence.values.astype(np.float)

## Нормальные варианты

In [27]:
do_cross_val(LinearRegression(), to_features_cv, trg_label)

A Jupyter Widget


[1.4865290758926417, 1.4505487927986205, 1.5525054989445051]
[84992.04634993454, 85093.63778087734, 85223.37180947387]


In [28]:
fit_tfidf(max_features=10000)
do_cross_val(LinearRegression(), to_features_tfidf, trg_label)

A Jupyter Widget


[1.1641773562812208, 1.2216055918557263, 1.2445674321405]
[85160.73563781707, 85201.6761086984, 85174.57705347847]


In [29]:
fit_tfidf(max_features=10000)
do_cross_val(LinearRegression(), to_features_tfidf, trg_label_confidence)

A Jupyter Widget


[1.0169426147406317, 1.0708111043790087, 1.0870911686307785]
[85219.75427290962, 85295.92031698779, 85256.01623290053]


In [59]:
from sklearn.linear_model import Ridge
fit_tfidf(max_features=10000)
do_cross_val(Ridge(alpha=5.0), to_features_tfidf, trg_label_confidence)

A Jupyter Widget


[0.71889351101579346, 0.71910615270941403, 0.71888026106213321]
[85663.72899401304, 85414.23694238505, 85611.32763574347]


In [68]:
fit_tfidf(max_features=100000, token_pattern=r'\b\w{1,4}')
do_cross_val(Ridge(alpha=5.0), to_features_tfidf, trg_label_confidence)

A Jupyter Widget


[0.71562804231897126, 0.71673147106507085, 0.71544536344518195]
[86001.93024165586, 85689.85801845901, 85993.59856432414]


In [77]:
fit_tfidf(analyzer="char", ngram_range=(3, 4))
do_cross_val(Ridge(alpha=20.0), to_features_tfidf, trg_label_confidence)

A Jupyter Widget


[0.71069604169646261, 0.71096965804572143, 0.7100319742497585]
[86633.9837454209, 86258.9844140555, 86460.99184915234]


## на проверке

## Неудачные варианты

In [37]:
fit_tfidf(max_features=10000)
do_cross_val(Lasso(alpha=0.01), to_features_tfidf, trg_label_confidence)

A Jupyter Widget


[0.75838986817906762, 0.75749894796205131, 0.75695730065427458]
[82749.59984256358, 82818.22569952125, 82951.3881202625]


In [60]:
from sklearn.linear_model import ElasticNet
fit_tfidf(max_features=10000)
do_cross_val(ElasticNet(alpha=1.0), to_features_tfidf, trg_label_confidence)

A Jupyter Widget


[0.75838986817906762, 0.75749894796205131, 0.75695730065427458]
[82749.59984256358, 82818.22569952125, 82951.3881202625]


In [35]:
fit_tfidf(max_features=20000)
do_cross_val(LinearRegression(), to_features_tfidf)

A Jupyter Widget


[1.1641773562812208, 1.2216055918557263, 1.2445674321405]
[85160.73563781707, 85201.6761086984, 85174.57705347847]


In [36]:
fit_tfidf(max_features=5000)
do_cross_val(LinearRegression(), to_features_tfidf)

A Jupyter Widget


[1.1641773562812208, 1.2216055918557263, 1.2445674321405]
[85160.73563781707, 85201.6761086984, 85174.57705347847]


In [37]:
fit_tfidf(max_features=1000)
do_cross_val(LinearRegression(), to_features_tfidf)

A Jupyter Widget


[1.1641773562812208, 1.2216055918557263, 1.2445674321405]
[85160.73563781707, 85201.6761086984, 85174.57705347847]


In [38]:
fit_tfidf(max_features=200)
do_cross_val(LinearRegression(), to_features_tfidf)

A Jupyter Widget


[1.1641773562812208, 1.2216055918557263, 1.2445674321405]
[85160.73563781707, 85201.6761086984, 85174.57705347847]


In [38]:
do_cross_val(RandomMinMaxRegressor())

A Jupyter Widget


[0.94215100562127641, 0.94450351137377053, 0.94044066978111984]
[82635.87988159526, 82487.80155424896, 82826.7969582632]


In [49]:
def do_worst_ndcg():
    mae_errors = []
    ndcg_scores = []

    for _train, _test in tqdm_notebook(skf.split(pd_train.index, pd_train.label), total=3):
        i_test = pd_train.iloc[_test].copy().sort_values(["context_id", "label"], ascending=[True, False])
        
        Y_test = i_test.sort_values(["context_id", "label"], ascending=[True, False]).label.values
        Y_predict = i_test.sort_values(["context_id", "label"], ascending=[True, True]).label.values
        
        mae_errors.append(mean_absolute_error(Y_test, Y_predict))
        ndcg_scores.append(calc_ndcg_score(i_test, Y_predict))

    print(mae_errors)
    print(ndcg_scores)
do_worst_ndcg()

A Jupyter Widget


[1.1195250984251968, 1.1126695579957553, 1.1291294986158105]
[77475.71260547417, 77562.26032466823, 77472.45724727474]


In [48]:
def do_best_ndcg():
    mae_errors = []
    ndcg_scores = []

    for _train, _test in tqdm_notebook(skf.split(pd_train.index, pd_train.label), total=3):
        i_test = pd_train.iloc[_test].copy().sort_values(["context_id", "label"], ascending=[True, False])
        
        Y_test = i_test.sort_values(["context_id", "label"], ascending=[True, False]).label.values
        Y_predict = i_test.sort_values(["context_id", "label"], ascending=[True, False]).label.values
        
        mae_errors.append(mean_absolute_error(Y_test, Y_predict))
        ndcg_scores.append(calc_ndcg_score(i_test, Y_predict))

    print(mae_errors)
    print(ndcg_scores)
do_best_ndcg()

A Jupyter Widget


[0.0, 0.0, 0.0]
[100000.0, 100000.0, 100000.0]


# Submit

In [81]:
def do_submit(submission_name, model_reg, to_features, to_target):
    i_train = pd_train
    i_test = pd_public.copy()

    X_train = to_features(i_train)
    X_test = to_features(i_test)
    Y_train = to_target(i_train)

    model_reg.fit(X_train, Y_train)
    Y_predict = model_reg.predict(X_test)

    i_test["predict"] = Y_predict
    i_test.sort_values(["context_id", "predict"], ascending=[True, False], inplace=True)

    i_test.head(15)
    i_test[["context_id", "reply_id"]].to_csv(submission_name, index=False, header=False, sep="\t")
    
    return i_test

In [82]:
fit_tfidf(analyzer="char", ngram_range=(3, 4))
do_submit("linear_cv_opsion.tsv", Ridge(alpha=20.0), to_features_tfidf, trg_label_confidence)

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply,predict
3479,100097508986637,,,выключай их,4,"пытаюсь , сэр .",0.345558
3478,100097508986637,,,выключай их,3,я пытаюсь !,0.162357
3476,100097508986637,,,выключай их,1,"ты не выключишь , тогда € сам выключу",-0.021206
3480,100097508986637,,,выключай их,5,выключите,-0.270745
3475,100097508986637,,,выключай их,0,выключить что ?,-0.284118
3477,100097508986637,,,выключай их,2,выключить ?,-0.332019
3483,100149747456986,,как вы здесь оказались ?,не надо меня тянуть !,2,я не могу позволить вам уйти .,0.304185
3482,100149747456986,,как вы здесь оказались ?,не надо меня тянуть !,1,"мне показалось , вы меня зовете .",0.258659
3484,100149747456986,,как вы здесь оказались ?,не надо меня тянуть !,3,мне уже легче .,0.219066
3486,100149747456986,,как вы здесь оказались ?,не надо меня тянуть !,5,"что "" не надо "" ? не надо меня .",0.054430
