In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from pymystem3 import Mystem

lemmatizer = Mystem()
stemmer = SnowballStemmer ('russian')
vectorizer = TfidfVectorizer()
categories_map = {
    'bad':0,
    'neutral': 1,
    'good': 2,
}

In [2]:
def read_tsv(file, columns):
    with open(file) as f:
        lines = f.readlines()
    lines = [x.strip().split('\t') for x in lines]
    lines = [[np.nan if isinstance(x, str) and (x.isspace() or not x) else x for x in line] for line in lines]
    return pd.DataFrame(data=lines, columns=columns)

In [3]:
train = read_tsv('train.tsv', [
    'context_id',
    'context_2',
    'context_1',
    'context_0',
    'reply_id',
    'reply',
    'label',
    'confidence'
])
train['context_id'] = train['context_id'].apply(pd.to_numeric)
train['reply_id'] = train['reply_id'].apply(pd.to_numeric)
train['context_2'] = train['context_2'].astype(str)
train['context_1'] = train['context_1'].astype(str)
train['context_0'] = train['context_0'].astype(str)

In [4]:
def preprocess(text):
    if text is None:
        return None
    
    text = text.strip()
    text = text.lower()
#     text = text.translate(str.maketrans('','',string.punctuation))
    words = lemmatizer.lemmatize(text)
    words = [word.strip('\n').strip() for word in words]
    words = [stemmer.stem(word) for word in words]
    text = ' '.join(words)
    return text

In [5]:
train['context_2p'] = train['context_2'].apply(lambda row: preprocess(row))
train['context_1p'] = train['context_1'].apply(lambda row: preprocess(row))
train['context_0p'] = train['context_0'].apply(lambda row: preprocess(row))
train['replyp'] = train['reply'].apply(lambda row: preprocess(row))
train['contextp'] = train['context_2p'].fillna('') + '\n' + \
                    train['context_1p'].fillna('') + '\n' +\
                    train['context_0p'].fillna('')

In [6]:
split_factor = 0.8
index = len(train)*split_factor
index = int(round(index)) 
ttrain = train[:index]
ttest = train[index:]

In [7]:
def stack_uneven(arrays, fill_value=0.):
    sizes = [a.shape for a in arrays]
    max_sizes = np.max(list(zip(*sizes)), -1)
    # The resultant array has stacked on the first dimension
    result = np.full((len(arrays),) + tuple(max_sizes), fill_value)
    for i, a in enumerate(arrays):
      # The shape of this array `a`, turned into slices
      slices = tuple(slice(0,s) for s in sizes[i])
      # Overwrite a block slice of `result` with this array `a`
      result[i][slices] = a
    return result

In [8]:
def tfidf(context, reply):
    Vcontext = vectorizer.fit_transform([context])
    Vreply = vectorizer.fit_transform([reply])
    result = np.dot(Vreply.T, Vcontext).todense()
#     result = np.dot(Vcontext.T, Vreply).todense()
    result = np.asarray(result).flatten()
    return result

def vectorize(df):
    features = []
    labels = []
    for index, row in df.iterrows():
        try:
            features.append(tfidf(row['contextp'], row['replyp']))
            labels.append(categories_map[row['label']])
        except Exception:
            pass
#             print(Exception, row)
    features = stack_uneven(features)
    return (features, labels)

def pad_zeroes(shape, a):
    result = np.zeros(shape)
    result[:a.shape[0],:a.shape[1]] = a
    return result

In [14]:
def decision_tree():
    from sklearn.tree import DecisionTreeClassifier
    return DecisionTreeClassifier()

def log_reg():
    from sklearn.linear_model import LogisticRegression
    return LogisticRegression(multi_class='multinomial', solver='newton-cg')

X_train, y_train = vectorize(ttrain)
X_test, y_test = vectorize(ttest)

max_features = max(X_train.shape[1], X_test.shape[1])
X_train = pad_zeroes((X_train.shape[0], max_features), X_train)
X_test = pad_zeroes((X_test.shape[0], max_features), X_test)

# Y = decision_tree().fit(X_train, y_train).predict(X_test)
# print(accuracy_score(y_test, Y))

Y = log_reg().fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, Y))

0.537965874529615


In [10]:
public = read_tsv('public.tsv', [
    'context_id',
    'context_2',
    'context_1',
    'context_0',
    'reply_id',
    'reply',
])

In [11]:
df = public.sort_values(by=['context_id', 'reply_id'], ascending=[True, False])
df = df[['context_id', 'reply_id']]
df.to_csv('output.tsv', header = None, index=False, sep = '\t')