NB: Взболтать но не смешивать, shuffle объектов противопоказан (следующий объект зависит от предыдущего)

In [1]:
import numpy as np 
import pandas as pd
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from itertools import chain
import time
import pickle
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.decomposition import TruncatedSVD


from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

seed = 42
CONTEXT_WINDOW = (-5, 5)



In [2]:
w2v_model = Word2Vec.load('word2vec/araneum_none_fasttextskipgram_300_5_2018.model')
vectors = w2v_model.wv

In [3]:
data = pd.read_csv('train_data.csv', encoding = 'utf-8-sig', sep = ';',index_col=0)
data = data.fillna('')
data['First'] = np.zeros(len(data),dtype=int)
data.loc[data['Word_num'] <= 20,'First'] = 1
data.head()

Unnamed: 0,Text_-5,Lemma_-5,POS_-5,Target_-5,Text_-4,Lemma_-4,POS_-4,Target_-4,Text_-3,Lemma_-3,...,Text_3,Lemma_3,POS_3,Text_4,Lemma_4,POS_4,Text_5,Lemma_5,POS_5,First
0,,,,,,,,,,,...,.,.,PNCT,невольницы,невольница,NOUN,н.,н.,UNKN,1
1,,,,,,,,,,,...,невольницы,невольница,NOUN,н.,н.,UNKN,островский,островский,NOUN,1
2,,,,,,,,,,,...,н.,н.,UNKN,островский,островский,NOUN,.,.,PNCT,1
3,,,,,,,,,островский,островский,...,островский,островский,NOUN,.,.,PNCT,невольницы,невольница,NOUN,1
4,,,,,островский,островский,NOUN,O,александр,александр,...,.,.,PNCT,невольницы,невольница,NOUN,действие,действие,NOUN,1


In [4]:
target = pd.read_csv('train_target.csv',encoding = 'utf-8-sig', sep = ';' )
y = np.array(target['Target']).reshape(-1,)

target_labels = ['']+sorted(target['Target'].unique())
target_labels

['', 'B', 'I', 'L', 'O', 'U']

In [5]:
# в качестве теста - взять несколько текстов
test_texts = {'Puzhaeva_Гоголь. Игроки.txt',
             'Petrushevskaya_uroki_muzyki_ev_prov.txt',
             'Yad_EV_prov.txt'}
criterion = data['Text_id'].map(lambda x: x in test_texts)
test_indices = data[criterion].index.tolist()
X_test = data.loc[test_indices,:]
y_test = np.array(target.loc[test_indices,:]).reshape(-1,)
X_train = data.drop(test_indices)
y_train = np.array(target.drop(test_indices)).reshape(-1,)
train_indices = X_train.index
X_train.shape, X_test.shape

((426803, 41), (57354, 41))

In [6]:
all_vectors = []
cnt = 0
for word in data['Lemma']:
    try:
        all_vectors.append(vectors.word_vec(word))
    except KeyError:
        cnt += 1
        all_vectors.append(np.zeros(shape=(300,)))
print(cnt)
all_vectors_sparse = csr_matrix(all_vectors)
vectors_train = all_vectors_sparse[train_indices,:]
#vectors_test = all_vectors_sparse[test_indices,:]

319803


In [7]:
all_vectors = []
all_vectors_sparse = []
w2v_model = []

In [8]:
def vectorize(data,X_train,X_test,colname):
    print(colname)
    label_enc = LabelEncoder()
    onehot_enc = OneHotEncoder()
    if colname == 'Target':
        label_data = label_enc.fit_transform(target_labels)
        cols_to_transform = [colname+'_'+str(i) for i in range(CONTEXT_WINDOW[0],0)]
    else:
        label_data = label_enc.fit_transform(list(data[colname].unique())+[''])
        cols_to_transform = [colname] + [colname+'_'+str(i) for i in range(CONTEXT_WINDOW[0],CONTEXT_WINDOW[1]+1) if i]
    onehot_enc.fit(label_data.reshape(-1, 1))
    count_train = hstack([onehot_enc.transform(label_enc.transform(X_train[col]).reshape(-1, 1)) for col in cols_to_transform])
    count_test = hstack([onehot_enc.transform(label_enc.transform(X_test[col]).reshape(-1, 1)) for col in cols_to_transform])
    count_texts = hstack([onehot_enc.transform(label_enc.transform(data[col]).reshape(-1, 1)) for col in cols_to_transform])
    return count_texts,count_train,count_test,label_enc,onehot_enc

In [9]:
# to add df col to sparse matrix: np.array(column)[:,None]

cols_to_vect = ['Text','POS','Target'] # target is the last one, it is important
sparse_data = [vectorize(data,X_train,X_test,x) for x in cols_to_vect]
encoders = {cols_to_vect[i]:(x[-2],x[-1]) for i,x in enumerate(sparse_data)}
#data_vect = hstack([x[0] for x in sparse_data])
train_vect = hstack([vectors_train]+[x[1] for x in sparse_data])
#test_vect = hstack([x[2] for x in sparse_data])
train_vect.shape #, train_vect.shape, test_vect.shape

Text
POS
Target


(426803, 482614)

In [10]:
label_onehot = {}
for label in target_labels:
    encoded = encoders['Target'][1].transform(encoders['Target'][0].transform([label]).reshape(-1, 1))
    encoded = np.asarray(encoded.todense()).reshape(-1)
    label_onehot[label] = encoded
label_onehot

{'': array([1., 0., 0., 0., 0., 0.]),
 'B': array([0., 1., 0., 0., 0., 0.]),
 'I': array([0., 0., 1., 0., 0., 0.]),
 'L': array([0., 0., 0., 1., 0., 0.]),
 'O': array([0., 0., 0., 0., 1., 0.]),
 'U': array([0., 0., 0., 0., 0., 1.])}

In [11]:
data = []
sparse_data = []
encoders = []

In [12]:
from sklearn.feature_selection import SelectFromModel, SelectKBest
select_k = SelectKBest(k=100000)
#select_k.fit(data_vect, y)
selected_train = select_k.fit_transform(train_vect, y_train)

  f = msb / msw


In [13]:
selected_train.shape

(426803, 100000)

In [14]:
# кроссвалидация по пяти фолдам
skf = StratifiedKFold(n_splits = 5, random_state = seed)

In [15]:
class CustomLogisticRegression(LogisticRegression):
    def predict(self,X):
        """
        сначала нужно заменить существующие предыдущие таргеты предсказанными моделью
        для этого нужно
            1. векторизировать предыдущие предсказанные (обученные модельки лежат в encoders)
            2. заменить существующие ими
        и предсказать по этому измененному объекту
        """
        predicted = []
        for item in X.tolil():
            prev_pred = predicted[CONTEXT_WINDOW[0]:]
            if len(prev_pred) < -CONTEXT_WINDOW[0]:
                prev_pred = ['']*(-CONTEXT_WINDOW[0]-len(prev_pred)) + prev_pred
            encoded = [y for x in prev_pred for y in label_onehot[x]]
            item[0,-len(encoded):] = encoded
            predicted.append(super().predict(item)[0]) # something like this
        return predicted

In [16]:
logit = CustomLogisticRegression(random_state=seed,class_weight='balanced')
cv_logit = cross_val_predict(logit, selected_train, y_train, cv = skf, verbose = 5)
print('Accuracy:')
print(accuracy_score(y_train, cv_logit))
print(classification_report(y_train, cv_logit))

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  9.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 13.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 18.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 22.0min finished


Accuracy:
0.966865275080072
             precision    recall  f1-score   support

          B       0.28      0.36      0.31      2139
          I       0.13      0.23      0.17      2844
          L       0.22      0.28      0.25      2139
          O       0.99      0.98      0.98    419119
          U       0.30      0.61      0.40       562

avg / total       0.97      0.97      0.97    426803



## Исправление лейблов
Пока не работает нормально

In [17]:
from copy import deepcopy
logit_cv_backup = deepcopy(cv_logit)

In [22]:
from collections import Counter
Counter(cv_logit)

Counter({'B': 2728, 'I': 5247, 'L': 2718, 'O': 415018, 'U': 1092})

In [18]:
# for predictions correction
# features - several previous correct answers, several previous predictions, current prediction
from scipy.sparse import lil_matrix

pred_matrix = []
for i in range(len(cv_logit)):
    prev_target = y_train[max(0,i+CONTEXT_WINDOW[0]):i]
    prev_pred = cv_logit[max(0,i+CONTEXT_WINDOW[0]):i]
    if len(prev_target) < -CONTEXT_WINDOW[0]:
        prev_target = ['']*(-CONTEXT_WINDOW[0]-len(prev_target)) + list(prev_target)
        prev_pred = ['']*(-CONTEXT_WINDOW[0]-len(prev_pred)) + list(prev_pred)
    pred_matrix.append([y for x in prev_pred for y in label_onehot[x]]+list(label_onehot[cv_logit[i]]) +\
                       [y for x in prev_target for y in label_onehot[x]])
pred_matrix = lil_matrix(pred_matrix)

In [19]:
logit2 = CustomLogisticRegression(random_state=seed,class_weight='balanced')
cv_logit2 = cross_val_predict(logit2, pred_matrix, y_train, cv = skf, verbose = 5)
print('Accuracy:')
print(accuracy_score(y_train, cv_logit2))
print(classification_report(y_train, cv_logit2))

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   42.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.4min finished


Accuracy:
0.9598948460999571
             precision    recall  f1-score   support

          B       0.17      0.35      0.23      2139
          I       0.13      0.19      0.15      2844
          L       0.13      0.27      0.18      2139
          O       0.99      0.97      0.98    419119
          U       0.13      0.27      0.18       562

avg / total       0.97      0.96      0.97    426803



In [54]:
cv_logit = deepcopy(logit_cv_backup)

In [55]:
formula = False
for i in range(len(cv_logit)):
    if cv_logit[i] == 'B':
        formula = True
    elif cv_logit[i] == 'O' and formula:
        cv_logit[i] = 'I'
    elif cv_logit[i] == 'L' and formula:
        formula = False


for i in range(len(cv_logit)):
    if i > 0 and i < len(cv_logit)-1:
        if (cv_logit[i-1] == 'I' or cv_logit[i-1] == 'B') and cv_logit[i+1] == 'O':
            cv_logit[i] = 'L'
        elif cv_logit[i-1] == 'O' and (cv_logit[i+1] == 'I' or cv_logit[i-1] == 'B') and cv_logit[i] == 'I':
            cv_logit[i] = 'B'
        elif cv_logit[i-1] == 'O' and cv_logit[i+1] == 'O' and cv_logit[i] != 'O':
            cv_logit[i] = 'U'


In [56]:
print('Accuracy:')
print(accuracy_score(y_train, cv_logit))
print(classification_report(y_train, cv_logit))

Accuracy:
0.9545223440322584
             precision    recall  f1-score   support

          B       0.29      0.37      0.32      2139
          I       0.07      0.28      0.12      2844
          L       0.23      0.30      0.26      2139
          O       0.99      0.97      0.98    419119
          U       0.34      0.73      0.46       562

avg / total       0.97      0.95      0.96    426803



In [88]:
corrected = cv_logit2 != cv_logit

In [90]:
with open('results.csv','w',encoding='utf-8-sig') as f:
    f.write('true;pred;pred_corr;corr\n')
    f.write('\n'.join([';'.join(x) for x in zip(y_train,logit_cv_backup,cv_logit2,[str(x) for x in corrected])]))