In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import joblib

import sklearn.feature_extraction.text as sktext
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import pymorphy2

import nltk
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from string import punctuation

# Text preparation

In [63]:
# Both CSV files are read with the same options: cp1251 encoding, ';' separator.
csv_format = dict(encoding="cp1251", sep=";")

X_train = pd.read_csv('data/train/dump4.csv', **csv_format)
X_test = pd.read_csv('data/test/test3.csv', **csv_format)

# Last expression of the cell: render the training frame for a visual check.
X_train

Unnamed: 0,filename,section,text,light
0,001.rtf,common,Коммерческий агент относится к категории техни...,False
1,001.rtf,common,На должность коммерческого агента назначается ...,False
2,001.rtf,common,Назначение на должность коммерческого агента и...,False
3,001.rtf,common,Коммерческий агент должен знать:,True
4,001.rtf,common,"Нормативные правовые акты, положения, инструкц...",False
...,...,...,...,...
6225,535.rtf,tasks,"Внутренние документы: Положения компании, Поло...",False
6226,535.rtf,tasks,Критерии оценки эффективности труда,False
6227,535.rtf,tasks,Выполнение бюджетных планов,False
6228,535.rtf,respons,"Взаимодействие, обмен информацией",False


In [64]:
# Split the target off the training frame: `light` becomes a plain list of
# 0/1 ints (truthy -> 1, falsy -> 0) and the column is removed from X_train.
y_train = [1 if is_light else 0 for is_light in X_train['light']]
X_train = X_train.drop(columns=['light'])

In [65]:
# Build the model-input frames without the `filename` column. drop() returns
# a new frame, so X_train / X_test themselves keep the column.
X_train_prepared = X_train.drop(columns=['filename'])
X_test_prepared = X_test.drop(columns=['filename'])

In [66]:
class DenseCountVectorizer(sktext.CountVectorizer):
    """CountVectorizer that returns dense pandas DataFrames instead of sparse matrices.

    Every column of a returned frame is named after the vocabulary term
    (n-gram) whose occurrences it counts.
    """

    def _feature_names(self):
        """Return the fitted vocabulary terms used as column labels.

        ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed
        in 1.2 in favour of ``get_feature_names_out``. Prefer the new method
        and fall back to the old one so older installations keep working.
        """
        if hasattr(self, 'get_feature_names_out'):
            return self.get_feature_names_out()
        return self.get_feature_names()

    def _to_frame(self, counts):
        """Densify a sparse count matrix into a DataFrame with named columns."""
        return pd.DataFrame(counts.toarray(), columns=self._feature_names())

    def transform(self, raw_documents, copy=True):
        """Vectorize documents with the already fitted vocabulary.

        Args:
            raw_documents: Iterable of text documents.
            copy: Accepted for signature compatibility only; it is not
                forwarded to the parent implementation.

        Returns:
            DataFrame of term counts, one row per document.
        """
        return self._to_frame(super().transform(raw_documents))

    def fit_transform(self, raw_documents, y=None):
        """Learn the vocabulary from the documents and vectorize them.

        Args:
            raw_documents: Iterable of text documents.
            y: Passed through to the parent implementation.

        Returns:
            DataFrame of term counts, one row per document.
        """
        return self._to_frame(super().fit_transform(raw_documents, y=y))

def prepareSentence(morph, russian_stopwords, sentence):
    """Tokenize, lemmatize and filter one sentence.

    Args:
        morph: Analyzer whose ``parse(word)[0].normal_form`` yields the lemma
            of a word (a ``pymorphy2.MorphAnalyzer`` in this notebook).
        russian_stopwords: Container of lemmas to discard.
        sentence: Raw text. Missing values (``None`` / NaN) are treated as
            empty text; other non-string values are converted with ``str``.

    Returns:
        The kept lemmas joined by single spaces. Stop words, blank tokens and
        tokens made up only of punctuation characters are removed.
    """
    if not isinstance(sentence, str):
        # pandas represents an empty CSV cell as NaN (a float), which the
        # tokenizer cannot process.
        if pd.isna(sentence):
            return ''
        sentence = str(sentence)

    kept = []
    for word in nltk.word_tokenize(sentence):
        lemma = morph.parse(word)[0].normal_form
        stripped = lemma.strip()
        # Test character by character: `stripped in punctuation` would be a
        # substring test against the punctuation string, which lets
        # multi-character tokens such as "..." or "``" slip through.
        if not stripped or all(ch in punctuation for ch in stripped):
            continue
        if lemma in russian_stopwords:
            continue
        kept.append(lemma)
    return ' '.join(kept)

def one_hot_encode(df, col, categories=None):
    """Replace column `col` with one 0/1 indicator column per category.

    Args:
        df: Source frame; it is not modified.
        col: Name of the categorical column to expand.
        categories: Optional ordered list of category values. When given, one
            indicator column is produced for every listed value, including
            values absent from `df`, so that separately encoded frames
            (e.g. train and test) share the same columns. Values of `col`
            that are not listed yield all-zero indicators. When omitted, the
            categories are the values present in `df[col]`.

    Returns:
        A new frame without `col`, followed by columns named
        ``f"{col}_{category}"`` holding uint8 0/1 flags.
    """
    values = df[col]
    if categories is not None:
        values = values.astype(pd.CategoricalDtype(categories=list(categories)))
    # Pin the dtype: newer pandas versions return bool indicators by default,
    # older ones uint8.
    indicators = pd.get_dummies(values, prefix=col, dtype='uint8')
    # drop() already returns a new frame, so no explicit copy of `df` is needed.
    return df.drop(col, axis=1).join(indicators)
    
def prepareData(data):
    """One-hot encode the `section` column and lemmatize the `text` column.

    Args:
        data: Frame with a categorical `section` column and a raw `text` column.

    Returns:
        A new frame in which `section` is replaced by `section_*` indicator
        columns and `text` holds the space-joined lemmas produced by
        prepareSentence.
    """
    morph = pymorphy2.MorphAnalyzer()
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned once per token.
    russian_stopwords = set(stopwords.words("russian"))
    data = one_hot_encode(data, 'section')
    # Map over the text column only instead of materialising every row as a
    # Series with DataFrame.apply(axis=1).
    data['text'] = data['text'].map(
        lambda text: prepareSentence(morph, russian_stopwords, text)
    )
    return data

In [67]:
# Lemmatize `text` and one-hot encode `section` for both frames.
# NOTE: this cell rebinds the names it reads, so it cannot be re-run on its
# own: prepareData's one-hot step removes the `section` column, and a second
# run fails looking it up. Re-run the cell that builds X_*_prepared first.
X_train_prepared = prepareData(X_train_prepared)
X_test_prepared = prepareData(X_test_prepared)
X_train_prepared

Unnamed: 0,text,section_common,section_respons,section_rights,section_tasks
0,коммерческий агент относиться категория технич...,1,0,0,0
1,должность коммерческий агент назначаться лицо ...,1,0,0,0
2,назначение должность коммерческий агент освобо...,1,0,0,0
3,коммерческий агент должный знать,1,0,0,0
4,нормативный правовой акт положение инструкция ...,1,0,0,0
...,...,...,...,...,...
6225,внутренний документ положение компания положен...,0,0,0,1
6226,критерий оценка эффективность труд,0,0,0,1
6227,выполнение бюджетный план,0,0,0,1
6228,взаимодействие обмен информация,0,1,0,0


In [69]:
# Count uni-, bi- and tri-grams. The vocabulary is learned from the training
# texts only; the test texts are then encoded with that same vocabulary.
vectorizer = DenseCountVectorizer(
    input='content',
    binary=False,
    ngram_range=(1, 3),
)
X_train_vectorized = vectorizer.fit_transform(X_train_prepared['text'])
X_test_vectorized = vectorizer.transform(X_test_prepared['text'])

In [70]:
print(f'light: {sum(y_train)}, not light: {len(y_train) - sum(y_train)}')
print(f'y_train: {len(y_train)}, x_train: {len(X_train_vectorized)}')
assert len(y_train) == len(X_train_vectorized)

# Indicator columns appended after the n-gram counts. Train and test must
# expose exactly these names in this order.
SECTION_COLUMNS = ['section_common', 'section_respons', 'section_rights', 'section_tasks']


def add_section_flags(vectorized, prepared, fill_missing=False):
    """Append the section indicator columns of `prepared` to `vectorized` in place.

    Args:
        vectorized: Frame of n-gram counts, one row per row of `prepared`.
        prepared: Frame holding the one-hot encoded `section_*` columns.
        fill_missing: When True, an indicator column absent from `prepared`
            (a section value that never occurs in that data set) is added as
            all zeros instead of raising.

    Raises:
        ValueError: If the two frames do not have the same number of rows.
        KeyError: If a column is missing and `fill_missing` is False.
    """
    if len(vectorized) != len(prepared):
        raise ValueError(
            f'row count mismatch: {len(vectorized)} vectorized vs {len(prepared)} prepared'
        )
    if fill_missing:
        flags = prepared.reindex(columns=SECTION_COLUMNS, fill_value=0)
    else:
        flags = prepared[SECTION_COLUMNS]
    for name in SECTION_COLUMNS:
        # Assign by position: the rows of `vectorized` follow the order of
        # the texts, so label-based alignment must not reshuffle or NaN-fill.
        vectorized[name] = flags[name].values


add_section_flags(X_train_vectorized, X_train_prepared)
# The test set is encoded separately and may lack some section values.
add_section_flags(X_test_vectorized, X_test_prepared, fill_missing=True)

light: 2281, not light: 3949
y_train: 6230, x_train: 6230


In [71]:
# Inspect the final training matrix: n-gram count columns followed by the
# four section_* indicator columns.
X_train_vectorized

Unnamed: 0,access,access outlook,access outlook project,confer,confer использовать,confer использовать техника,excel,excel word,excel word access,excel ежедневно,...,яковлев,ярмарка,ярмарка выставка,ярмарка выставка экскурсия,ярмарка торг,ярмарка торг выставка,section_common,section_respons,section_rights,section_tasks
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6225,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6226,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Machine Learning

In [72]:
from catboost import CatBoostClassifier

In [73]:
# CatBoost classifier with `iterations` set to 100; no other parameter is
# overridden.
clf = CatBoostClassifier(iterations=100)

# fit() is the cell's last expression, so its return value is echoed below
# the training log.
clf.fit(X_train_vectorized, y_train)

Learning rate set to 0.185858
0:	learn: 0.5668904	total: 138ms	remaining: 13.7s
1:	learn: 0.4762299	total: 245ms	remaining: 12s
2:	learn: 0.4315910	total: 362ms	remaining: 11.7s
3:	learn: 0.3960607	total: 473ms	remaining: 11.4s
4:	learn: 0.3728647	total: 597ms	remaining: 11.3s
5:	learn: 0.3604088	total: 708ms	remaining: 11.1s
6:	learn: 0.3474752	total: 829ms	remaining: 11s
7:	learn: 0.3379679	total: 956ms	remaining: 11s
8:	learn: 0.3293419	total: 1.07s	remaining: 10.8s
9:	learn: 0.3246136	total: 1.19s	remaining: 10.7s
10:	learn: 0.3214173	total: 1.3s	remaining: 10.6s
11:	learn: 0.3171512	total: 1.41s	remaining: 10.4s
12:	learn: 0.3105797	total: 1.53s	remaining: 10.2s
13:	learn: 0.3077043	total: 1.63s	remaining: 10s
14:	learn: 0.3049313	total: 1.75s	remaining: 9.9s
15:	learn: 0.3009252	total: 1.86s	remaining: 9.79s
16:	learn: 0.2999533	total: 1.98s	remaining: 9.69s
17:	learn: 0.2984541	total: 2.09s	remaining: 9.54s
18:	learn: 0.2953797	total: 2.22s	remaining: 9.48s
19:	learn: 0.2945534	

<catboost.core.CatBoostClassifier at 0x17f51e60388>

In [79]:
# Take column 1 of predict_proba for every test row (presumably the
# probability of class 1, i.e. "light" — confirm) and round it to 2 decimals.
light = [round(class_probs[1], 2) for class_probs in clf.predict_proba(X_test_vectorized)]

In [75]:
# Attach the rounded scores to the original test frame (this mutates X_test
# in place) and export it with the same cp1251 / ';' format used for reading.
# NOTE(review): index=False is not passed, so to_csv also writes the row
# index as an unnamed first column — confirm that consumers expect it.
X_test['light'] = light
X_test.to_csv('data/result.csv', sep = ';', encoding="cp1251")

# Saving model

In [76]:
import joblib

In [77]:
# Persist the two artifacts produced above: the fitted vectorizer (via joblib)
# and the trained CatBoost model (via its own save_model).
# NOTE(review): DenseCountVectorizer is defined inside this notebook, so
# loading the .pkl presumably requires the same class to be defined or
# importable in the loading process — confirm against the consumer.
joblib.dump(vectorizer, "vectorizer-large.pkl")
clf.save_model('model-large.cbm')