# Дискурсивные формулы
## На данный момент:
+ 10 текстов
+ 34663 псевдоклаузы
+ Доля класса 1 (формулы) - 4.4 %
+ Доля класса 0 (не формулы) - 95.6 %

In [1]:
import numpy as np 
import pandas as pd
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
import time
import pickle

class Profiler():
    """Context manager that prints the wall-clock time spent inside its ``with`` block.

    The elapsed time is printed on exit, whether or not the block raised.
    """

    def __enter__(self):
        """Record the start time and return the profiler itself."""
        self._startTime = time.time()
        # Returning self lets ``with Profiler() as p:`` bind the instance
        # instead of None.
        return self

    def __exit__(self, type, value, traceback):
        """Print the elapsed seconds; an exception from the block is not suppressed."""
        print ("Время обучения: {:.3f} секунд".format(time.time() - self._startTime))

# Shared random seed, passed as random_state to KMeans and the classifiers below.
seed = 42

In [2]:
# Load the feature table (';'-separated, read with the BOM-tolerant utf-8-sig codec).
data = pd.read_csv('train_data.csv', sep=';', encoding='utf-8-sig')
data.shape

(67239, 27)

In [3]:
# Load the labels that go with the rows of `data`.
target = pd.read_csv('train_target.csv', encoding='utf-8-sig', sep=';')
# Force the label to 0 on every row whose 'First' feature is 0.
# `.loc` replaces the `.ix` indexer, which was deprecated in pandas 0.20
# and removed in pandas 1.0.
target.loc[data['First'] == 0, '"Target"'] = 0

In [4]:
# Print the share of each class among all rows of `target`.
class_counts = target['"Target"'].value_counts()
n_rows = len(target)
print('Доля формул в выборке:')
print(class_counts[1] / n_rows)
print('Доля НЕ формул в выборке:')
print(class_counts[0] / n_rows)

Доля формул в выборке:
0.0335370841327
Доля НЕ формул в выборке:
0.966462915867


In [5]:
# Hold out one text, annotated by the best annotator, as the test set.
test_mask = data['"Text_id"'] == 'Puzhaeva_Тургенев. Где тонко там и рвется'
test_indices = data[test_mask].index.tolist()
texts_test = data.loc[test_mask, ['"Text_id"', 'Text']]
# Select the held-out rows by label, exactly the rows that `drop` removes below.
# A first..last slice would also pull in foreign rows if the text were not
# stored contiguously, and those rows would then sit in both train and test.
X_test = data.loc[test_indices]
y_test = np.array(target.loc[test_indices]).reshape(-1,)
X_train = data.drop(test_indices)
y_train = np.array(target.drop(test_indices)).reshape(-1,)
X_train.shape, y_train.shape

((64153, 27), (64153,))

In [6]:
# Two TF-IDF vectorizers: word uni-/bigrams and character 3-/4-grams,
# both with min_df=10.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=10)
tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(3, 4), min_df=10)

In [7]:
# Fit both vectorizers on the text of ALL rows (train and held-out alike),
# then vectorize the train split, the test split and the full table.
all_text = data['Text']
text_parts = (X_train['Text'], X_test['Text'], all_text)

tfidf.fit(all_text)
tfidf_train, tfidf_test, tfidf_texts = (tfidf.transform(part) for part in text_parts)

tfidf_char.fit(all_text)
tfidf_char_train, tfidf_char_test, tfidf_char_texts = (
    tfidf_char.transform(part) for part in text_parts
)

In [8]:
# Drop the raw-text and text-id columns and turn the remaining hand-made
# features into sparse matrices, so they can be stacked with the TF-IDF ones.
non_feature_cols = ['Text', '"Text_id"']
X_train = X_train.drop(non_feature_cols, axis=1)
X_test = X_test.drop(non_feature_cols, axis=1)
alldata_sparse = csr_matrix(data.drop(non_feature_cols, axis=1))
X_train_sparse = csr_matrix(X_train)
X_test_sparse = csr_matrix(X_test)

In [9]:
# Column-wise concatenation: word TF-IDF | char TF-IDF | hand-made features.
X_train_tfidf = hstack([tfidf_train, tfidf_char_train, X_train_sparse])
X_test_tfidf = hstack([tfidf_test, tfidf_char_test, X_test_sparse])
data_tfidf = hstack([tfidf_texts, tfidf_char_texts, alldata_sparse])

X_test_tfidf

<3086x20361 sparse matrix of type '<class 'numpy.float64'>'
	with 101608 stored elements in COOrdinate format>

In [10]:
# Display the stacked training matrix (shape, dtype, sparse format).
X_train_tfidf

<64153x20361 sparse matrix of type '<class 'numpy.float64'>'
	with 2033124 stored elements in COOrdinate format>

## Кластеризация

In [11]:
from sklearn.cluster import KMeans

# Unsupervised split of the full feature matrix into two clusters.
kmeans = KMeans(random_state=seed, n_clusters=2)
kmeans.fit(data_tfidf)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)

In [12]:
# Invert the cluster ids so that they line up with the class labels:
# cluster 0 becomes class 1, every other cluster becomes class 0.
kmeans_new = [int(cluster_id == 0) for cluster_id in kmeans.labels_]

In [13]:
# Score the inverted cluster labels against the true labels on the full table
# (the clustering was fitted on the same rows; no cross-validation is involved).
from sklearn.metrics import classification_report, accuracy_score
kmeans_truth = np.array(target['"Target"']).ravel()
print('Accuracy:')
print(accuracy_score(kmeans_truth, kmeans_new))
print(classification_report(kmeans_truth, kmeans_new))

Accuracy:
0.321450348756
             precision    recall  f1-score   support

          0       1.00      0.30      0.46     64984
          1       0.05      0.98      0.09      2255

avg / total       0.97      0.32      0.45     67239



In [14]:
# Append the raw KMeans cluster ids as one extra feature column.
cluster_column = np.asarray(kmeans.labels_).reshape(-1, 1)
cluster_sparse = csr_matrix(cluster_column)
clust_tfidf = hstack([data_tfidf, cluster_sparse])

In [15]:
# Sanity check: one more column than data_tfidf (the appended cluster id).
clust_tfidf.shape

(67239, 20362)

## Обучение нескольких классификаторов на данных + метках кластеров

### Random Forest

In [16]:
# Random forest; class 1 (formulas) is weighted 30:1 against class 0.
clf = RandomForestClassifier(
    n_estimators=300,
    min_samples_leaf=5,
    criterion='entropy',
    class_weight={1: 30, 0: 1},
    random_state=seed,
)

In [17]:
# 5-fold stratified cross-validation: out-of-fold forest predictions for every row.
# random_state is deliberately not passed: without shuffle=True it never
# influenced the folds, and current scikit-learn raises a ValueError for
# that combination.
skf = StratifiedKFold(n_splits=5)
cv_pred = cross_val_predict(clf, clust_tfidf, np.array(target['"Target"']).reshape(-1,), cv=skf, verbose=5)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.7min finished


In [18]:
# Post-filter: zero out every predicted formula on rows where 'First' is 0.
# The original note gives a theoretical motivation: formulas are not expected
# after the third pseudo-clause (presumably what First == 0 encodes; confirm
# against the feature extraction).
print(np.sum(cv_pred))
# One positional boolean mask instead of a per-row Python loop with
# label-based lookups into data['First'].
cv_pred[data['First'].values == 0] = 0
print(np.sum(cv_pred))

7264
6144


In [19]:
# Cross-validated quality of the random forest after the post-filter.
forest_truth = np.array(target['"Target"']).ravel()
print('Accuracy:')
print(accuracy_score(forest_truth, cv_pred))
print(classification_report(forest_truth, cv_pred))

Accuracy:
0.929430836271
             precision    recall  f1-score   support

          0       0.99      0.93      0.96     64984
          1       0.30      0.81      0.44      2255

avg / total       0.97      0.93      0.94     67239



In [20]:
# Fit the forest on the whole table so that its feature importances can be inspected.
clf.fit(clust_tfidf, np.array(target['"Target"']).ravel())

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 30},
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [21]:
 # топ-20 признаков с весами из решающего леса
def _vocabulary_terms(vectorizer):
    """Return a fitted vectorizer's feature names as a list, on old and new scikit-learn."""
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name when the new one is missing.
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


# clf was fitted on clust_tfidf, whose last column is the KMeans cluster id.
# That column needs a name as well; otherwise zip() silently drops its importance.
fs = (_vocabulary_terms(tfidf) + _vocabulary_terms(tfidf_char)
      + list(X_train.columns) + ['cluster'])
# Pair each feature name with its importance and list the 20 largest.
wfs = sorted(zip(fs, clf.feature_importances_), key=lambda x: x[1], reverse=True)
for w in wfs[:20]:
    print(w[0],w[1],sep=' - ')

First - 0.113551485093
NOUN - 0.0366595635463
Subject - 0.0302320633282
Len - 0.0285895905731
Predicate - 0.0154264065805
ну - 0.0138512424436
NPRO - 0.0121761419333
да - 0.0120932432696
VERB - 0.0119049117129
PRCL - 0.011388894497
нет - 0.0110805051125
PREP - 0.00824441040849
INTJ - 0.00803984234774
нет - 0.00799921380934
PRED - 0.00611395138366
Object - 0.00554422212095
ADJF - 0.00548578082641
ADVB - 0.00461236890275
на  - 0.00388910700349
не  - 0.00354888426573


### Logistic Regression

In [22]:
# Logistic regression with the same 30:1 class weighting,
# evaluated out-of-fold on the same stratified folds.
from sklearn.linear_model import LogisticRegressionCV
logit = LogisticRegressionCV(random_state=seed, class_weight={1: 30, 0: 1})
cv_logit = cross_val_predict(
    logit,
    clust_tfidf,
    np.array(target['"Target"']).ravel(),
    cv=skf,
    verbose=5,
)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   47.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished


In [23]:
# Post-filter: zero out predicted formulas on rows where 'First' is 0,
# printing the number of predicted formulas before and after.
print(np.sum(cv_logit))
# One positional boolean mask instead of a per-row Python loop with
# label-based lookups into data['First'].
cv_logit[data['First'].values == 0] = 0
print(np.sum(cv_logit))

5440
5440


In [24]:
# Cross-validated quality of the logistic regression after the post-filter.
from sklearn.metrics import classification_report, accuracy_score
logit_truth = np.array(target['"Target"']).ravel()
print('Accuracy:')
print(accuracy_score(logit_truth, cv_logit))
print(classification_report(logit_truth, cv_logit))

Accuracy:
0.939335802139
             precision    recall  f1-score   support

          0       0.99      0.94      0.97     64984
          1       0.33      0.80      0.47      2255

avg / total       0.97      0.94      0.95     67239



### Ridge Classifier

In [25]:
# Ridge classifier with 30:1 class weighting; out-of-fold predictions on the same folds.
clf = RidgeClassifier(class_weight={1: 30, 0: 1}, alpha=40, random_state=seed)
ridge_truth = np.array(target['"Target"']).ravel()
cv_ridge = cross_val_predict(clf, clust_tfidf, ridge_truth, cv=skf, verbose=5)
# NOTE: these metrics are printed before the First == 0 post-filter is applied
# to cv_ridge, so they describe the unfiltered predictions.
print('Accuracy:')
print(accuracy_score(ridge_truth, cv_ridge))
print(classification_report(ridge_truth, cv_ridge))

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.7s remaining:    0.0s


Accuracy:
0.900102619016
             precision    recall  f1-score   support

          0       0.99      0.90      0.95     64984
          1       0.23      0.85      0.36      2255

avg / total       0.97      0.90      0.93     67239



[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.1s finished


In [26]:
# Post-filter: zero out predicted formulas on rows where 'First' is 0,
# printing the number of predicted formulas before and after.
print(np.sum(cv_ridge))
# One positional boolean mask instead of a per-row Python loop with
# label-based lookups into data['First'].
cv_ridge[data['First'].values == 0] = 0
print(np.sum(cv_ridge))

8316
6769


### Support Vector Classifier

In [27]:
# Linear SVM with 30:1 class weighting; out-of-fold predictions on the same folds.
clf = LinearSVC(class_weight={1: 30, 0: 1}, C=0.05, random_state=seed)
svc_truth = np.array(target['"Target"']).ravel()
cv_svc = cross_val_predict(clf, clust_tfidf, svc_truth, cv=skf, verbose=5)
# NOTE: these metrics are printed before the First == 0 post-filter is applied
# to cv_svc, so they describe the unfiltered predictions.
print('Accuracy:')
print(accuracy_score(svc_truth, cv_svc))
print(classification_report(svc_truth, cv_svc))

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    7.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.2s remaining:    0.0s


Accuracy:
0.928374901471
             precision    recall  f1-score   support

          0       0.99      0.93      0.96     64984
          1       0.30      0.86      0.45      2255

avg / total       0.97      0.93      0.94     67239



[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.5s finished


In [28]:
# Post-filter: zero out predicted formulas on rows where 'First' is 0,
# printing the number of predicted formulas before and after.
print(np.sum(cv_svc))
# One positional boolean mask instead of a per-row Python loop with
# label-based lookups into data['First'].
cv_svc[data['First'].values == 0] = 0
print(np.sum(cv_svc))

6423
6423


In [29]:
# True labels as a flat 1-D array, reused by the final model fits.
y_true = np.array(target['"Target"']).ravel()

## Взвешенное голосование 

In [30]:
# Equal-weight vote of the five predictors (forest, ridge, inverted KMeans, SVM,
# logit): each contributes 0.2 and the rounded sum is the ensemble label,
# i.e. class 1 needs at least three of the five votes.
cv_cum = [
    int(round(forest_vote*0.2 + ridge_vote*0.2 + kmeans_vote*0.2
              + svc_vote*0.2 + logit_vote*0.2))
    for forest_vote, ridge_vote, kmeans_vote, svc_vote, logit_vote
    in zip(cv_pred, cv_ridge, kmeans_new, cv_svc, cv_logit)
]

In [31]:
# Quality of the voting ensemble on the out-of-fold predictions.
ensemble_truth = np.array(target['"Target"']).ravel()
print('Accuracy:')
print(accuracy_score(ensemble_truth, cv_cum))
print(classification_report(ensemble_truth, cv_cum))

Accuracy:
0.929609304124
             precision    recall  f1-score   support

          0       0.99      0.93      0.96     64984
          1       0.30      0.86      0.45      2255

avg / total       0.97      0.93      0.95     67239



# тест

In [32]:
# Preprocessing: fresh, unfitted vectorizers with the same settings as before
# (word uni-/bigrams and character 3-/4-grams, min_df=10).
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=10)
tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(3, 4), min_df=10)

In [33]:
# Models: fresh instances of the clusterer and the four classifiers,
# with the same hyper-parameters as in the cross-validation section.
forest = RandomForestClassifier(
    n_estimators=300,
    min_samples_leaf=5,
    criterion='entropy',
    class_weight={1: 30, 0: 1},
    random_state=seed,
)

logit = LogisticRegressionCV(random_state=seed, class_weight={1: 30, 0: 1})

kmeans = KMeans(random_state=seed, n_clusters=2)

ridge = RidgeClassifier(class_weight={1: 30, 0: 1}, alpha=40, random_state=seed)

svc = LinearSVC(class_weight={1: 30, 0: 1}, C=0.05, random_state=seed)

In [34]:
# Fit the fresh word- and char-level vectorizers on the text of the whole training table.
tfidf.fit(data['Text'])
tfidf_char.fit(data['Text'])

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(3, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [35]:
# Fit the clusterer and the four classifiers on the whole training table.
# clust_tfidf still carries the cluster column built from the earlier KMeans fit.
kmeans.fit(data_tfidf)
forest.fit(clust_tfidf, y_true)
logit.fit(clust_tfidf, y_true)
svc.fit(clust_tfidf, y_true)
ridge.fit(clust_tfidf, y_true)

RidgeClassifier(alpha=40, class_weight={0: 1, 1: 30}, copy_X=True,
        fit_intercept=True, max_iter=None, normalize=False,
        random_state=42, solver='auto', tol=0.001)

In [37]:
# Load the separate test features and labels (same CSV dialect as the training files).
test_data = pd.read_csv('test_data.csv', sep=';', encoding='utf-8-sig')
test_target = pd.read_csv('test_target.csv', sep=';', encoding='utf-8-sig')

In [38]:
# Number of rows in the test table.
len(test_data)

4631

In [39]:
# Build the test matrix with the same column layout as in training:
# word TF-IDF | char TF-IDF | hand-made features.
test_text = test_data['Text']
tfidf_test = tfidf.transform(test_text)
tfidf_char_test = tfidf_char.transform(test_text)
X_test_data = test_data.drop(['Text', '"Text_id"'], axis=1)
test_data_sparse = csr_matrix(X_test_data)
test_data_tfidf = hstack([tfidf_test, tfidf_char_test, test_data_sparse])

In [40]:
# Assign each test row to one of the two fitted KMeans clusters.
clusters = kmeans.predict(test_data_tfidf)

In [41]:
# Invert the cluster ids into class labels: cluster 0 -> 1, any non-zero cluster -> 0.
true_clusters = [int(not cluster_id) for cluster_id in clusters]

In [42]:
# Append the raw (non-inverted) cluster ids as the last feature column.
cluster_column = clusters.reshape(-1, 1)
test_cluster_sparse = csr_matrix(cluster_column)
test_clust_tfidf = hstack([test_data_tfidf, test_cluster_sparse])

In [43]:
# Predict the test labels with each of the four fitted classifiers.
f_pred, l_pred, svc_pred, r_pred = (
    model.predict(test_clust_tfidf) for model in (forest, logit, svc, ridge)
)

In [44]:
# Post-filter all four classifiers: zero out predicted formulas on rows where
# 'First' is 0. The forest's positive count is printed before and after.
print(sum(f_pred))
# One positional boolean mask, applied to each prediction array, instead of a
# per-row Python loop with label-based lookups into test_data['First'].
not_first = test_data['First'].values == 0
for votes in (f_pred, l_pred, svc_pred, r_pred):
    votes[not_first] = 0
print(sum(f_pred))

494
410


In [45]:
# Equal-weight vote of the five predictors on the test rows: each contributes
# 0.2 and the rounded sum is the ensemble label (class 1 needs three votes).
cv_cum_test = [
    int(round(forest_vote*0.2 + ridge_vote*0.2 + kmeans_vote*0.2
              + svc_vote*0.2 + logit_vote*0.2))
    for forest_vote, ridge_vote, kmeans_vote, svc_vote, logit_vote
    in zip(f_pred, r_pred, true_clusters, svc_pred, l_pred)
]

In [46]:
# Quality of the voting ensemble on the separate test table
# (this is the held-out evaluation, not cross-validation).
from sklearn.metrics import classification_report, accuracy_score
test_truth = np.array(test_target['"Target"']).ravel()
print('Accuracy:')
print(accuracy_score(test_truth, cv_cum_test))
print(classification_report(test_truth, cv_cum_test))


Accuracy:
0.928741092637
             precision    recall  f1-score   support

          0       1.00      0.93      0.96      4518
          1       0.25      0.93      0.39       113

avg / total       0.98      0.93      0.95      4631



In [47]:
# Write the whole training table to a file: all features, the ensemble
# prediction and the true labels.
predicted_col = pd.DataFrame(cv_cum)
texts = pd.concat([data.reset_index(drop=True), predicted_col, pd.DataFrame(target)], axis=1)
# The unnamed prediction column comes out labelled 0; rename it.
texts.columns = ['predicted' if x == 0 else x for x in texts.columns]
texts.to_csv('predicted.csv', index=False, sep=';')

In [48]:
# Write only the test table: features, ensemble prediction and true labels.
test_texts = pd.concat((test_data.reset_index(drop=True), pd.DataFrame(cv_cum_test), test_target), axis=1)
# Rename the unnamed prediction column (labelled 0) using test_texts' OWN
# columns. The previous version copied the column names of the training
# table `texts`, which is only right when both tables share the same layout.
test_texts.columns = [x if x != 0 else 'predicted' for x in test_texts.columns]
test_texts.to_csv('test_predicted.csv',sep=';',index=False)

# Обучить на всех пьесах и записать модели в файлы pickle

In [49]:
# Load the complete corpus (features and labels) for the final fit.
full_data = pd.read_csv('full_data.csv', sep=';', encoding='utf-8-sig')
full_target = pd.read_csv('full_target.csv', sep=';', encoding='utf-8-sig')
len(full_data)

71870

In [58]:
# Refit both vectorizers on the complete corpus and persist them.
tfidf.fit(full_data['Text'])
tfidf_char.fit(full_data['Text'])

# `with` closes each file handle, so the pickles are flushed to disk right away.
for fitted, path in ((tfidf, 'tfidf.pickle'), (tfidf_char, 'tfidf_char.pickle')):
    with open(path, 'wb') as fh:
        pickle.dump(fitted, fh)

In [51]:
# Build the full-corpus matrix: word TF-IDF | char TF-IDF | hand-made features.
full_text = full_data['Text']
tfidf_full = tfidf.transform(full_text)
tfidf_char_full = tfidf_char.transform(full_text)
X_full_data = full_data.drop(['Text', '"Text_id"'], axis=1)
full_data_sparse = csr_matrix(X_full_data)
full_data_tfidf = hstack([tfidf_full, tfidf_char_full, full_data_sparse])

In [52]:
# Refit the clusterer on the complete corpus and persist it.
kmeans.fit(full_data_tfidf)
# `with` closes the file handle, so the pickle is flushed to disk right away.
with open('kmeans.pickle', 'wb') as fh:
    pickle.dump(kmeans, fh)

In [53]:
# Cluster ids of the full corpus, plus their inversion into class labels
# (cluster 0 -> 1, any non-zero cluster -> 0).
clusters = kmeans.labels_
true_clusters = [int(not cluster_id) for cluster_id in clusters]

In [54]:
# Append the raw (non-inverted) cluster ids as the last feature column.
cluster_column = clusters.reshape(-1, 1)
full_cluster_sparse = csr_matrix(cluster_column)
full_clust_tfidf = hstack([full_data_tfidf, full_cluster_sparse])

In [55]:
# Fit the four classifiers on the complete corpus and persist them.
full_y = np.array(full_target['"Target"']).reshape(-1,)
forest.fit(full_clust_tfidf, full_y)
logit.fit(full_clust_tfidf, full_y)
svc.fit(full_clust_tfidf, full_y)
ridge.fit(full_clust_tfidf, full_y)

# `with` closes each file handle, so the pickles are flushed to disk right away.
for fitted, path in ((forest, 'forest.pickle'), (logit, 'logit.pickle'),
                     (svc, 'svc.pickle'), (ridge, 'ridge.pickle')):
    with open(path, 'wb') as fh:
        pickle.dump(fitted, fh)

In [59]:
# Round-trip check: reload the pickled word vectorizer and confirm that it
# produces a matrix of the same shape as the in-memory one.
# NOTE: pickle.load runs arbitrary code from the file; only load pickles
# written by this notebook.
with open('tfidf.pickle', 'rb') as fh:
    tfidf_f = pickle.load(fh)
tfidf_f_full = tfidf_f.transform(full_data['Text'])

tfidf_full,tfidf_f_full

(<71870x3041 sparse matrix of type '<class 'numpy.float64'>'
 	with 154978 stored elements in Compressed Sparse Row format>,
 <71870x3041 sparse matrix of type '<class 'numpy.float64'>'
 	with 154978 stored elements in Compressed Sparse Row format>)