Автор ноутбука: Izvekov Igor (izveigor@gmail.com)

In [51]:
import pandas as pd
import pickle
import re
import json
import os

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('russian'))

from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

import tika
from tika import parser
tika.initVM()

[nltk_data] Downloading package stopwords to /home/igor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/igor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [52]:
classification = {
    "Договоры для акселератора/Договоры поставки": 0,
    "Договоры для акселератора/Договоры оказания услуг": 1,
    "Договоры для акселератора/Договоры подряда": 2,
    "Договоры для акселератора/Договоры аренды": 3,
    "Договоры для акселератора/Договоры купли-продажи": 4,
}

In [53]:
# Загружаем текста в датасет

data = []
with open("classes.json") as json_file:
    json_data = json_file.read()
    try:
        classes = json.loads(json_data)
    except Exception as e:
        raise e

    for index, file in enumerate(os.listdir('./docs'), start=1):
        class_ = classification[classes[file]]
        parsed = parser.from_file(os.path.join('docs', file))
        content = parsed['content']
        data.append([index, content, class_])

dataset = pd.DataFrame(data, columns=['index', 'text', 'target'])
X = dataset['text'].astype('string')
y = dataset.target.astype('int32')

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

In [55]:
one_hot_encoder = OneHotEncoder()
y_train = one_hot_encoder.fit_transform([[target] for target in y_train]).toarray()
y_test = one_hot_encoder.transform([[target] for target in y_test]).toarray()
print(y_train[:5])

[[0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]]


In [56]:
# Очищаем текста от ненужных символов
def cleaning(doc):
    text = ""
    for word in doc:
        token = re.sub("[^А-Яа-я\n ']+", '', str(word)).lower()
        if token:
            token = re.sub("\n", " ", token)
            text += token
    text = ' '.join(text.split())

    cleaned_field = []
    for word in text.split():
        lemma_ = lemmatizer.lemmatize(word)
        if lemma_ not in stop_words:
            cleaned_field.append(lemma_)

    return cleaned_field

X_train = [cleaning(text) for text in X_train]
X_test = [cleaning(text) for text in X_test]

In [57]:
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform([" ".join(o) for o in X_train])
X_test = vectorizer.transform([" ".join(o) for o in X_test])

In [58]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)
acc_random_forest = round(clf.score(X_train, y_train) * 100, 2)
print(acc_random_forest)

99.12


In [59]:
pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(clf, file)

In [60]:
print(y_pred[:5])
print(y_pred_proba[:5])

[[0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0.]]
[array([[0.93, 0.07],
       [0.97, 0.03],
       [0.79, 0.21],
       [0.97, 0.03],
       [0.38, 0.62],
       [0.29, 0.71]]), array([[0.9  , 0.1  ],
       [0.94 , 0.06 ],
       [0.93 , 0.07 ],
       [0.86 , 0.14 ],
       [0.88 , 0.12 ],
       [0.895, 0.105]]), array([[0.96 , 0.04 ],
       [0.97 , 0.03 ],
       [0.92 , 0.08 ],
       [0.23 , 0.77 ],
       [0.97 , 0.03 ],
       [0.975, 0.025]]), array([[0.26, 0.74],
       [0.22, 0.78],
       [0.92, 0.08],
       [1.  , 0.  ],
       [0.98, 0.02],
       [0.92, 0.08]]), array([[0.95, 0.05],
       [0.9 , 0.1 ],
       [0.44, 0.56],
       [0.94, 0.06],
       [0.79, 0.21],
       [0.92, 0.08]])]
