In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import joblib

import sklearn.feature_extraction.text as sktext
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import pymorphy2

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources requested by this notebook, in the original order.
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

from string import punctuation

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Keni0k\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Keni0k\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Keni0k\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Text preparation

In [74]:
# Both files are semicolon-separated and encoded as Windows-1251.
X_train = pd.read_csv(
    'data/train/dump4.csv',
    sep=";",
    encoding="cp1251",
)
X_test = pd.read_csv(
    'data/test/test3.csv',
    sep=";",
    encoding="cp1251",
)
# Last expression of the cell: rich display of the training frame.
X_train

Unnamed: 0,filename,section,text,light,techie
0,001.rtf,common,Коммерческий агент относится к категории техни...,False,0.5
1,001.rtf,common,На должность коммерческого агента назначается ...,False,0.5
2,001.rtf,common,Назначение на должность коммерческого агента и...,False,0.5
3,001.rtf,common,Коммерческий агент должен знать:,True,0.5
4,001.rtf,common,"Нормативные правовые акты, положения, инструкц...",False,0.5
...,...,...,...,...,...
11295,060.rtf,rights,Привлекать специалистов всех (отдельных) струк...,False,0.0
11296,060.rtf,rights,Требовать от руководства предприятия оказания ...,False,0.0
11297,060.rtf,respons,За ненадлежащее исполнение или неисполнение св...,False,0.0
11298,060.rtf,respons,"За правонарушения, совершенные в процессе осущ...",False,0.0


In [75]:
# Split the target column off the training frame.
# The guard keeps the cell idempotent: after the first run `light` is no longer
# in X_train, so re-running the cell must not fail and must not touch the
# y_train / X_train pair that is already consistent.
if 'light' in X_train.columns:
    # Encode the flag as 1 (truthy) / 0 (falsy).
    y_train = [1 if is_light else 0 for is_light in X_train['light']]
    X_train = X_train.drop(columns=['light'])

In [76]:
# `filename` is removed from the frames that go into preprocessing;
# X_train / X_test themselves keep the column.
X_train_prepared = X_train.drop(columns=['filename'])
X_test_prepared = X_test.drop(columns=['filename'])

In [77]:
class DenseCountVectorizer(sktext.CountVectorizer):
    """CountVectorizer whose outputs are dense ``pandas.DataFrame`` objects.

    ``transform`` and ``fit_transform`` convert the sparse matrix returned by
    the parent class into a DataFrame with one column per vocabulary term, so
    additional feature columns can later be attached by name.
    """

    def _feature_names(self):
        """Return the fitted vocabulary terms in column order.

        ``get_feature_names`` was removed in scikit-learn 1.2, so
        ``get_feature_names_out`` is used when available; the old method is
        kept only as a fallback for releases that predate it.
        """
        if hasattr(self, 'get_feature_names_out'):
            return self.get_feature_names_out()
        return self.get_feature_names()

    def _to_frame(self, counts):
        """Convert a sparse document-term matrix into a dense DataFrame."""
        return pd.DataFrame(counts.toarray(), columns=self._feature_names())

    def transform(self, raw_documents, copy=True):
        """Vectorize ``raw_documents`` with the already fitted vocabulary.

        Args:
            raw_documents: iterable of documents, passed to the parent class.
            copy: kept for backward compatibility with existing callers;
                it is not used.

        Returns:
            pandas.DataFrame with one row per document and one column per
            vocabulary term.
        """
        return self._to_frame(super().transform(raw_documents))

    def fit_transform(self, raw_documents, y=None):
        """Learn the vocabulary from ``raw_documents`` and vectorize them.

        Args:
            raw_documents: iterable of documents, passed to the parent class.
            y: forwarded unchanged to the parent implementation.

        Returns:
            pandas.DataFrame with one row per document and one column per
            vocabulary term.
        """
        return self._to_frame(super().fit_transform(raw_documents, y=y))

def prepareSentence(morph, russian_stopwords, sentence):
    """Tokenize, lemmatize and filter one sentence.

    Args:
        morph: analyzer object exposing ``parse(word)``; the ``normal_form``
            of the first parse result is used as the lemma.
        russian_stopwords: container of lemmas to drop (anything supporting
            ``in``; a set is fastest).
        sentence: raw text. A non-string value (for example the NaN pandas
            produces for an empty CSV cell) yields an empty string instead of
            crashing the tokenizer.

    Returns:
        The kept lemmas joined by single spaces.
    """
    if not isinstance(sentence, str):
        return ''

    kept = []
    for word in nltk.word_tokenize(sentence):
        lemma = morph.parse(word)[0].normal_form
        if lemma in russian_stopwords:
            continue
        # Drop tokens made only of punctuation. A per-character check is used
        # because `token in punctuation` is a substring test and lets
        # multi-character tokens such as "..." or "''" through. all() is also
        # True for an empty string, so blank tokens are dropped too.
        if all(char in punctuation for char in lemma.strip()):
            continue
        kept.append(lemma)
    return ' '.join(kept)

def one_hot_encode(df, col, categories=None):
    """Replace column ``col`` with one indicator column per category.

    Args:
        df: input DataFrame; it is not modified.
        col: name of the column to encode.
        categories: optional iterable of category values. When given, exactly
            one ``<col>_<value>`` column is produced per listed value, even if
            the value does not occur in ``df``. Use this to give train and
            test frames identical columns. When omitted, the columns are
            derived from the values present in ``df[col]``.

    Returns:
        A new DataFrame without ``col`` and with the indicator columns
        (integer 0/1) appended.
    """
    values = df[col]
    if categories is not None:
        # A categorical dtype makes get_dummies emit unobserved categories too.
        values = values.astype(pd.CategoricalDtype(categories=list(categories)))
    # dtype=int keeps the 0/1 encoding on pandas versions whose default is bool.
    dummies = pd.get_dummies(values, prefix=col, dtype=int)
    # drop() already returns a new frame, so no explicit copy is needed.
    return df.drop(col, axis=1).join(dummies)
    
def prepareData(data):
    """One-hot encode ``section`` and normalize the ``text`` column.

    Args:
        data: DataFrame with at least the columns ``section`` and ``text``.

    Returns:
        A new DataFrame in which ``section`` is replaced by indicator columns
        and every ``text`` value has been run through ``prepareSentence``.
    """
    morph = pymorphy2.MorphAnalyzer()
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned linearly for every token.
    russian_stopwords = set(stopwords.words("russian"))
    data = one_hot_encode(data, 'section')
    # Apply on the column itself rather than row-wise over the frame: this is
    # faster and still yields a Series when the frame has no rows.
    data['text'] = data['text'].apply(
        lambda text: prepareSentence(morph, russian_stopwords, text)
    )
    return data

In [78]:
# prepareData consumes the `section` column, so running it twice on the same
# frame raises KeyError. The guards make this cell safe to re-run: a frame that
# no longer has `section` has already been prepared and is left as is.
if 'section' in X_train_prepared.columns:
    X_train_prepared = prepareData(X_train_prepared)
if 'section' in X_test_prepared.columns:
    X_test_prepared = prepareData(X_test_prepared)
X_train_prepared

Unnamed: 0,text,techie,section_common,section_respons,section_rights,section_tasks
0,коммерческий агент относиться категория технич...,0.5,1,0,0,0
1,должность коммерческий агент назначаться лицо ...,0.5,1,0,0,0
2,назначение должность коммерческий агент освобо...,0.5,1,0,0,0
3,коммерческий агент должный знать,0.5,1,0,0,0
4,нормативный правовой акт положение инструкция ...,0.5,1,0,0,0
...,...,...,...,...,...,...
11295,привлекать специалист весь отдельный структурн...,0.0,0,0,1,0
11296,требовать руководство предприятие оказание сод...,0.0,0,0,1,0
11297,ненадлежащий исполнение неисполнение свой долж...,0.0,0,1,0,0
11298,правонарушение совершенный процесс осуществлен...,0.0,0,1,0,0


In [79]:
# Unigram counts over the raw strings in the `text` column.
vectorizer = DenseCountVectorizer(
    input='content',
    binary=False,
    ngram_range=(1, 1),
)
# The vocabulary is learned from the training texts only; the test texts are
# projected onto that same vocabulary.
X_train_vectorized = vectorizer.fit_transform(X_train_prepared['text'])
X_test_vectorized = vectorizer.transform(X_test_prepared['text'])

In [80]:
print(f'light: {sum(y_train)}, not light: {len(y_train) - sum(y_train)}')
print(f'y_train: {len(y_train)}, x_train: {len(X_train_vectorized)}')
assert len(y_train) == len(X_train_vectorized)

# Attach the non-text features to both bag-of-words frames, in a fixed order so
# that train and test end up with identical columns.
section_columns = ['section_common', 'section_respons', 'section_rights', 'section_tasks']

for vectorized, prepared, source in (
    (X_train_vectorized, X_train_prepared, X_train),
    (X_test_vectorized, X_test_prepared, X_test),
):
    # A split in which some section never occurs has no indicator column for
    # it; reindex supplies that column filled with zeros instead of failing.
    sections = prepared.reindex(columns=section_columns, fill_value=0)
    for column in section_columns:
        # .values assigns by position: the vectorized rows follow the row
        # order of the prepared frame, whatever its index labels are.
        vectorized[column] = sections[column].values
    vectorized['techie'] = source['techie'].values

light: 4816, not light: 6484
y_train: 11300, x_train: 11300


In [81]:
X_train_vectorized

Unnamed: 0,access,confer,excel,hrменеджер,ii,iii,internet,iso,jтчёт,ms,...,являться,ядовитый,язык,яковлев,ярмарка,section_common,section_respons,section_rights,section_tasks,techie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.5
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.5
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.5
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.5
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
11296,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
11297,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.0
11298,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.0


# Machine Learning

In [82]:
from catboost import CatBoostClassifier

In [83]:
# Classifier limited to 100 boosting iterations; all other settings are defaults.
clf = CatBoostClassifier(iterations=100)

# fit() is the last expression of the cell, so its return value is displayed
# after the training log.
clf.fit(X_train_vectorized, y_train)

Learning rate set to 0.239662
0:	learn: 0.5930739	total: 41.2ms	remaining: 4.08s
1:	learn: 0.5442945	total: 61.8ms	remaining: 3.03s
2:	learn: 0.5059859	total: 95ms	remaining: 3.07s
3:	learn: 0.4823613	total: 113ms	remaining: 2.72s
4:	learn: 0.4734828	total: 131ms	remaining: 2.5s
5:	learn: 0.4689831	total: 151ms	remaining: 2.37s
6:	learn: 0.4633235	total: 168ms	remaining: 2.23s
7:	learn: 0.4565686	total: 188ms	remaining: 2.16s
8:	learn: 0.4477448	total: 206ms	remaining: 2.08s
9:	learn: 0.4455605	total: 226ms	remaining: 2.03s
10:	learn: 0.4430958	total: 244ms	remaining: 1.98s
11:	learn: 0.4382337	total: 263ms	remaining: 1.93s
12:	learn: 0.4371254	total: 281ms	remaining: 1.88s
13:	learn: 0.4342719	total: 299ms	remaining: 1.83s
14:	learn: 0.4335089	total: 317ms	remaining: 1.8s
15:	learn: 0.4327115	total: 334ms	remaining: 1.75s
16:	learn: 0.4317260	total: 353ms	remaining: 1.73s
17:	learn: 0.4310008	total: 372ms	remaining: 1.69s
18:	learn: 0.4300071	total: 389ms	remaining: 1.66s
19:	learn: 0

<catboost.core.CatBoostClassifier at 0x1b056c7d708>

In [84]:
# Keep the second probability column of every prediction, rounded to two
# decimals (presumably the probability of label 1, i.e. "light" — confirm
# against clf.classes_).
light = [round(class_probs[1], 2) for class_probs in clf.predict_proba(X_test_vectorized)]

In [85]:
# Store the scores next to the original test rows and export them in the same
# semicolon-separated cp1251 format as the input files.
X_test['light'] = light
X_test.to_csv(
    'data/result1.csv',
    sep=';',
    encoding="cp1251",
)

# Saving model

In [72]:
import joblib

In [73]:
# Persist the fitted vectorizer.
# NOTE(review): the pickle refers to DenseCountVectorizer, which is defined in
# this notebook; loading it elsewhere presumably requires the same class
# definition to be available — confirm in the consuming code.
joblib.dump(
    vectorizer,
    "vectorizer-small-person.pkl",
)
# Persist the trained CatBoost model.
clf.save_model(
    'model-small-person.cbm',
)