NB: взболтать, но не смешивать — shuffle объектов противопоказан (следующий объект зависит от предыдущего).

In [1]:
import pickle
import time
from copy import deepcopy
from itertools import chain

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec
from scipy.sparse import csr_matrix, lil_matrix, hstack, vstack
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import LinearSVC, SVC

# Seed passed as `random_state` to the models below.
seed = 42
# (first, last) relative offsets of the context columns, e.g. Text_-5 ... Text_5;
# the negative side also sets how many previous targets are used as features.
CONTEXT_WINDOW = (-5, 5)

In [2]:
# 5-fold cross-validation. The folds are deliberately left unshuffled (each row
# depends on the previous one), so no random_state is passed: it has no effect
# without shuffle=True and recent scikit-learn releases reject the combination.
skf = StratifiedKFold(n_splits=5)

In [3]:
class CustomLogisticRegression(LogisticRegression):
    """Logistic regression that predicts rows sequentially, feeding back its own labels.

    ``predict`` relies on the notebook-level globals ``CONTEXT_WINDOW``,
    ``label_onehot`` and ``target_in_feats`` being defined before it is called.
    """

    def predict(self,X):
        """
        Predict labels row by row, in the order the rows appear in ``X``.

        Before a row is classified, the previous-target features stored in its
        trailing columns are replaced by the labels this model predicted for
        the preceding rows:
            1. one-hot encode the previous predictions via ``label_onehot``
               (the start of the sequence is padded with the empty label '');
            2. keep only the entries selected by ``target_in_feats`` and write
               them over the last columns of the row;
        then the modified row is classified.

        X must provide ``tolil()`` (a scipy sparse matrix).
        Returns a numpy array with one predicted label per row.
        """
        predicted = []
        for item in X.tolil():
            # The last -CONTEXT_WINDOW[0] predictions, left-padded with '' when
            # fewer than that have been made so far.
            prev_pred = predicted[CONTEXT_WINDOW[0]:]
            if len(prev_pred) < -CONTEXT_WINDOW[0]:
                prev_pred = ['']*(-CONTEXT_WINDOW[0]-len(prev_pred)) + prev_pred
            encoded = np.fromiter((y for x in prev_pred for y in label_onehot[x]),dtype=int)[target_in_feats]
            # Guard: when no target feature is selected, -len(encoded) is 0 and
            # the slice [0, -0:] would address the whole row instead of nothing.
            if len(encoded):
                item[0,-len(encoded):] = encoded
            predicted.append(super().predict(item)[0])
        return np.array(predicted)

## Исправление лейблов

In [4]:
def rule_correction(test_logit, max_gap=5, max_formula_len=10):
    """Apply rule-based post-processing to a predicted tag sequence, in place.

    The sequence uses the tags 'B', 'I', 'L', 'O' and 'U'. Three passes run in
    order, each seeing the changes made by the previous one:

    1. Between a 'B' and the next 'L': if fewer than ``max_gap`` 'O' tags were
       seen in between, they are rewritten to 'I'; otherwise both the 'B' and
       the 'L' are rewritten to 'U'.
    2. For every interior position, using its immediate neighbours:
       an 'I' preceded by 'I'/'B' and followed by 'O' becomes 'L';
       an 'I' preceded by 'O'/'L' and followed by 'I'/'L' becomes 'B';
       any non-'O' tag with 'O' on both sides becomes 'U'.
    3. Every maximal run of non-'O' tags longer than ``max_formula_len`` is
       rewritten to 'O', including a run that reaches the end of the sequence.

    Parameters
    ----------
    test_logit : numpy.ndarray of str
        Predicted tags. Must support index-list and slice assignment of a
        scalar (a plain list does not). Modified in place.
    max_gap : int, default 5
        Number of 'O' tags between 'B' and 'L' from which the span is split.
    max_formula_len : int, default 10
        Longest run of non-'O' tags that is kept.

    Returns
    -------
    numpy.ndarray
        The same object that was passed in.
    """
    n = len(test_logit)

    # Pass 1: repair or split B ... L spans.
    formula = False
    b_idx = None
    cur_idxs = []
    for i in range(n):
        if test_logit[i] == 'B':
            formula = True
            b_idx = i
            cur_idxs = []
        elif test_logit[i] == 'O' and formula:
            cur_idxs.append(i)
        elif test_logit[i] == 'L' and formula:
            if len(cur_idxs) < max_gap:
                test_logit[cur_idxs] = 'I'
            else:
                test_logit[b_idx] = 'U'
                test_logit[i] = 'U'
            cur_idxs = []
            formula = False

    # Pass 2: local fixes; neighbours are read at iteration time, so a change
    # at position i-1 is visible when position i is examined.
    for i in range(1, n - 1):
        prev_tag, tag, next_tag = test_logit[i-1], test_logit[i], test_logit[i+1]
        if prev_tag in ('I', 'B') and next_tag == 'O' and tag == 'I':
            test_logit[i] = 'L'
        elif prev_tag in ('O', 'L') and next_tag in ('I', 'L') and tag == 'I':
            test_logit[i] = 'B'
        elif prev_tag == 'O' and next_tag == 'O' and tag != 'O':
            test_logit[i] = 'U'

    # Pass 3: erase runs of non-'O' tags that are too long.
    flen = 0
    for i in range(n):
        if test_logit[i] != 'O':
            flen += 1
        else:
            if flen > max_formula_len:
                test_logit[i-flen:i] = 'O'
            flen = 0
    # A run that reaches the end has no closing 'O' to trigger the check above.
    if flen > max_formula_len:
        test_logit[n-flen:] = 'O'
    return test_logit

# Препроцессинг

In [5]:
# Load the pretrained embedding model from disk; later cells only use its
# `.wv` lookup object, bound to `vectors`.
w2v_model = Word2Vec.load('word2vec/araneum_none_fasttextskipgram_300_5_2018.model')
vectors = w2v_model.wv

In [6]:
# Read the feature table (first CSV column is the index); missing values
# become empty strings.
data = pd.read_csv('train_data.csv', sep=';', encoding='utf-8-sig', index_col=0).fillna('')
# 'First' is 1 for rows whose Word_num is at most 20 and 0 otherwise.
data['First'] = (data['Word_num'] <= 20).astype(int)
data.head()

Unnamed: 0,Text_-5,Lemma_-5,POS_-5,Target_-5,Text_-4,Lemma_-4,POS_-4,Target_-4,Text_-3,Lemma_-3,...,Text_3,Lemma_3,POS_3,Text_4,Lemma_4,POS_4,Text_5,Lemma_5,POS_5,First
0,,,,,,,,,,,...,.,.,PNCT,невольницы,невольница,NOUN,н.,н.,UNKN,1
1,,,,,,,,,,,...,невольницы,невольница,NOUN,н.,н.,UNKN,островский,островский,NOUN,1
2,,,,,,,,,,,...,н.,н.,UNKN,островский,островский,NOUN,.,.,PNCT,1
3,,,,,,,,,островский,островский,...,островский,островский,NOUN,.,.,PNCT,невольницы,невольница,NOUN,1
4,,,,,островский,островский,NOUN,O,александр,александр,...,.,.,PNCT,невольницы,невольница,NOUN,действие,действие,NOUN,1


In [7]:
# Read the gold labels; `y` is the 'Target' column as a flat array.
target = pd.read_csv('train_target.csv',encoding = 'utf-8-sig', sep = ';' )
y = np.array(target['Target']).reshape(-1,)

# All label values in sorted order, preceded by '' — the placeholder used
# when a previous target does not exist.
target_labels = ['']+sorted(target['Target'].unique())
target_labels

['', 'B', 'I', 'L', 'O', 'U']

In [8]:
# Hold out a few whole texts as the test set.
test_texts = {'Puzhaeva_Гоголь. Игроки.txt',
             'Petrushevskaya_uroki_muzyki_ev_prov.txt',
             'Yad_EV_prov.txt'}
# Boolean mask of the rows that belong to one of the held-out texts.
criterion = data['Text_id'].isin(test_texts)
test_indices = data.index[criterion.values].tolist()

# Features: held-out rows vs. everything else.
X_test = data.loc[test_indices, :]
X_train = data.drop(test_indices)
train_indices = X_train.index

# Labels, flattened to 1-D and split by the same index labels.
y_test = np.array(target.loc[test_indices, :]).reshape(-1,)
y_train = np.array(target.drop(test_indices)).reshape(-1,)
X_train.shape, X_test.shape

((426803, 41), (57354, 41))

In [9]:
# Look up the embedding of every central lemma; lemmas missing from the model
# get an all-zero 300-dimensional vector and are counted in `cnt`.
all_vectors = []
cnt = 0
for word in data['Lemma']:
    try:
        all_vectors.append(vectors.word_vec(word))
    except KeyError:
        cnt += 1
        all_vectors.append(np.zeros(shape=(300,)))
print(cnt)
all_vectors_sparse = csr_matrix(all_vectors)
# NOTE(review): index labels are used as row positions here — assumes the
# index of `data` is 0..n-1; confirm.
vectors_train = all_vectors_sparse[train_indices,:]
vectors_test = all_vectors_sparse[test_indices,:]

319803


In [10]:
# Rebind the large intermediates to empty lists so their memory can be
# reclaimed. `vectors` (taken from `w2v_model.wv` earlier) is not rebound here.
all_vectors = []
all_vectors_sparse = []
w2v_model = []

In [11]:
def vectorize(data,X_train,X_test,colname):
    """One-hot encode `colname` and its context columns for three frames.

    For 'Target' the encoders are fitted on the global `target_labels` and
    only the columns at negative offsets (`Target_-k`) are encoded. For any
    other column the encoders are fitted on the unique values of
    `data[colname]` plus '' and the central column together with every
    non-zero offset in `CONTEXT_WINDOW` is encoded.

    Prints `colname` and returns
    `(count_texts, count_train, count_test, label_enc, onehot_enc)`: the
    horizontally stacked encodings of `data`, `X_train` and `X_test`, followed
    by the fitted LabelEncoder and OneHotEncoder.
    """
    print(colname)
    first_offset, last_offset = CONTEXT_WINDOW
    label_enc, onehot_enc = LabelEncoder(), OneHotEncoder()

    if colname == 'Target':
        vocabulary = target_labels
        offsets = range(first_offset, 0)
        cols_to_transform = []
    else:
        vocabulary = list(data[colname].unique()) + ['']
        offsets = (i for i in range(first_offset, last_offset + 1) if i)
        cols_to_transform = [colname]
    cols_to_transform += [colname + '_' + str(i) for i in offsets]

    onehot_enc.fit(label_enc.fit_transform(vocabulary).reshape(-1, 1))

    def encode(frame):
        # One one-hot block per column, stacked side by side in column order.
        return hstack([
            onehot_enc.transform(label_enc.transform(frame[col]).reshape(-1, 1))
            for col in cols_to_transform
        ])

    count_train = encode(X_train)
    count_test = encode(X_test)
    count_texts = encode(data)
    return count_texts, count_train, count_test, label_enc, onehot_enc

In [12]:
# Tip: to append a DataFrame column to a sparse matrix use np.array(column)[:,None]

# 'Target' must stay last so that its one-hot block ends up in the trailing
# feature columns.
cols_to_vect = ['Text','POS','Target']
sparse_data = [vectorize(data, X_train, X_test, col) for col in cols_to_vect]
# Per column: (fitted LabelEncoder, fitted OneHotEncoder).
encoders = {col: (parts[-2], parts[-1]) for col, parts in zip(cols_to_vect, sparse_data)}
# Final matrices: central-lemma embeddings followed by the one-hot blocks.
train_vect = hstack([vectors_train] + [parts[1] for parts in sparse_data])
test_vect = hstack([vectors_test] + [parts[2] for parts in sparse_data])
train_vect.shape, test_vect.shape

Text
POS
Target


((426803, 482614), (57354, 482614))

In [13]:
# Map every target label (including '') to its dense one-hot vector, using the
# encoders that were fitted for the 'Target' column.
target_label_enc, target_onehot_enc = encoders['Target']
label_onehot = {}
for label in target_labels:
    label_id = target_label_enc.transform([label]).reshape(-1, 1)
    encoded = np.asarray(target_onehot_enc.transform(label_id).todense()).reshape(-1)
    label_onehot[label] = encoded
label_onehot

{'': array([1., 0., 0., 0., 0., 0.]),
 'B': array([0., 1., 0., 0., 0., 0.]),
 'I': array([0., 0., 1., 0., 0., 0.]),
 'L': array([0., 0., 0., 1., 0., 0.]),
 'O': array([0., 0., 0., 0., 1., 0.]),
 'U': array([0., 0., 0., 0., 0., 1.])}

In [14]:
# Free memory that later cells no longer need, without breaking them:
# - `sparse_data`: keep only the train/test matrices (positions 1 and 2), which
#   the "All feats" cells below stack; drop the full-corpus matrix and the
#   encoders stored in each tuple.
# - `encoders`: only needed to build `label_onehot` above.
# - `data` is intentionally NOT cleared: the context-w2v cell below reads
#   `data.shape` and the `Lemma*` columns.
sparse_data = [(None, x[1], x[2]) for x in sparse_data]
encoders = []

# All feats without w2v and selection

In [13]:
# Mask over the one-hot previous-target features (history length x number of
# labels); all True because no feature selection is applied in this section.
target_in_feats = [True]*(-CONTEXT_WINDOW[0]*len(target_labels))

In [17]:
# Fit on the one-hot blocks only (no embedding block) and evaluate on the
# held-out texts.
logit = CustomLogisticRegression(random_state=seed,class_weight='balanced')
logit.fit(hstack([x[1] for x in sparse_data]), y_train)
test_logit = logit.predict(hstack([x[2] for x in sparse_data]))
print('Accuracy:')
print(accuracy_score(y_test, test_logit))
print(classification_report(y_test, test_logit))

# Same metrics after the rule-based correction (which edits test_logit in place).
test_logit = rule_correction(test_logit)
print('Accuracy:')
print(accuracy_score(y_test, test_logit))
print(classification_report(y_test, test_logit))

Accuracy:
0.964361683579175
             precision    recall  f1-score   support

          B       0.37      0.33      0.35       356
          I       0.24      0.22      0.23       792
          L       0.35      0.31      0.33       356
          O       0.98      0.98      0.98     55845
          U       0.07      0.40      0.11         5

avg / total       0.96      0.96      0.96     57354

Accuracy:
0.9695923562436796
             precision    recall  f1-score   support

          B       0.38      0.31      0.34       356
          I       0.38      0.18      0.24       792
          L       0.35      0.28      0.31       356
          O       0.98      0.99      0.98     55845
          U       0.07      0.40      0.11         5

avg / total       0.96      0.97      0.97     57354



# All feats with central w2v

In [15]:
# Mask over the one-hot previous-target features; all True (no feature selection).
target_in_feats = [True]*(-CONTEXT_WINDOW[0]*len(target_labels))

In [18]:
# Fit on train_vect (embedding block + one-hot blocks) and evaluate on the
# held-out texts.
logit = CustomLogisticRegression(random_state=seed,class_weight='balanced')
logit.fit(train_vect, y_train)
test_logit = logit.predict(test_vect)
print('Accuracy:')
print(accuracy_score(y_test, test_logit))
print(classification_report(y_test, test_logit))

# Same metrics after the rule-based correction (which edits test_logit in place).
test_logit = rule_correction(test_logit)
print('Accuracy:')
print(accuracy_score(y_test, test_logit))
print(classification_report(y_test, test_logit))

Accuracy:
0.9633329846218224
             precision    recall  f1-score   support

          B       0.37      0.36      0.36       356
          I       0.23      0.22      0.23       792
          L       0.35      0.33      0.34       356
          O       0.98      0.98      0.98     55845
          U       0.03      0.20      0.06         5

avg / total       0.96      0.96      0.96     57354

Accuracy:
0.969313387034906
             precision    recall  f1-score   support

          B       0.38      0.33      0.35       356
          I       0.38      0.19      0.25       792
          L       0.35      0.30      0.32       356
          O       0.98      0.99      0.98     55845
          U       0.03      0.20      0.06         5

avg / total       0.96      0.97      0.97     57354



# All feats with all w2v including context
это если только арендовать сервера Пентагона

In [None]:
# This needs A LOT of RAM (about 30 GB, if not more).
# One 300-wide slot per window offset; row j, slot idx holds the embedding of
# the lemma at that offset. Lemmas missing from the model leave the slot at
# zero and are counted in `cnt`.
all_vectors = lil_matrix((data.shape[0],(CONTEXT_WINDOW[1]-CONTEXT_WINDOW[0]+1)*300))
cnt = 0
for idx,i in enumerate(range(CONTEXT_WINDOW[0],CONTEXT_WINDOW[1]+1)):
    # Offset 0 is the central column, which has no numeric suffix.
    col = 'Lemma_'+str(i) if i else 'Lemma'
    for j,word in enumerate(data[col]):
        try:
            all_vectors[j,idx*300:idx*300+300] = vectors.word_vec(word)
        except KeyError:
            cnt += 1
print(cnt)
all_vectors_sparse = csr_matrix(all_vectors)
vectors_train = all_vectors_sparse[train_indices,:]
vectors_test = all_vectors_sparse[test_indices,:]

In [None]:
# Rebuild the feature matrices with the context-wide embedding block in front
# of the one-hot blocks.
train_vect = hstack([vectors_train]+[x[1] for x in sparse_data])
test_vect = hstack([vectors_test]+[x[2] for x in sparse_data])
train_vect.shape, test_vect.shape

In [None]:
# Mask over the one-hot previous-target features; all True (no feature selection).
target_in_feats = [True]*(-CONTEXT_WINDOW[0]*len(target_labels))

In [None]:
# Fit on the current train_vect and evaluate on the held-out texts.
logit = CustomLogisticRegression(random_state=seed,class_weight='balanced')
logit.fit(train_vect, y_train)
test_logit = logit.predict(test_vect)
print('Accuracy:')
print(accuracy_score(y_test, test_logit))
print(classification_report(y_test, test_logit))

# Same metrics after the rule-based correction (which edits test_logit in place).
test_logit = rule_correction(test_logit)
print('Accuracy:')
print(accuracy_score(y_test, test_logit))
print(classification_report(y_test, test_logit))

# Feature selection

In [None]:
# Works well.
# A plain (non-custom) model is fine here because only fit is performed, never predict.
# Very slow — prefer loading the fitted selector from the pickle in the next cell.
# solver='liblinear' is set explicitly: the l1 penalty is not supported by the
# lbfgs solver that newer scikit-learn releases use by default.
select_model = SelectFromModel(LogisticRegression(penalty='l1',solver='liblinear',random_state=seed,class_weight='balanced'))
selected_train = select_model.fit_transform(train_vect, y_train)

# Cache the fitted selector so the fit does not have to be repeated.
with open('select_model.pickle','wb') as f:
    pickle.dump(select_model,f)

In [15]:
# Load the cached selector instead of refitting it.
# NOTE: pickle.load can execute arbitrary code — only load a file produced by
# this notebook.
with open('select_model.pickle','rb') as f:
    select_model = pickle.load(f)
selected_train = select_model.transform(train_vect)

In [16]:
# Apply the same feature selection to the test matrix.
selected_test = select_model.transform(test_vect)

In [17]:
# Both matrices must end up with the same number of columns.
selected_train.shape, selected_test.shape

((426803, 5774), (57354, 5774))

In [18]:
# IMPORTANT: the previous-target features must be taken into account when
# features are filtered. Take the selector's mask for the trailing
# (history length x number of labels) columns, so that predict() only writes
# the target features that survived the selection.
target_in_feats = select_model.get_support()[CONTEXT_WINDOW[0]*len(target_labels):]

In [99]:
# Out-of-fold predictions on the training set with the selected features,
# using the unshuffled 5-fold splitter `skf`.
logit = CustomLogisticRegression(random_state=seed,class_weight='balanced')
cv_logit = cross_val_predict(logit, selected_train, y_train, cv = skf, verbose = 5, n_jobs=4)
print('Accuracy:')
print(accuracy_score(y_train, cv_logit))
print(classification_report(y_train, cv_logit))

[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:  4.4min remaining:  6.6min
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:  8.0min finished


Accuracy:
0.9705601881898674
             precision    recall  f1-score   support

          B       0.31      0.44      0.36      2139
          I       0.20      0.26      0.22      2844
          L       0.25      0.36      0.29      2139
          O       0.99      0.98      0.99    419119
          U       0.31      0.69      0.43       562

avg / total       0.98      0.97      0.97    426803



In [20]:
# Fit on the selected features and evaluate on the held-out texts.
logit = CustomLogisticRegression(random_state=seed,class_weight='balanced')
logit.fit(selected_train, y_train)
test_logit = logit.predict(selected_test)
print('Accuracy:')
print(accuracy_score(y_test, test_logit))
print(classification_report(y_test, test_logit))

# Same metrics after the rule-based correction (which edits test_logit in place).
test_logit = rule_correction(test_logit)
print('Accuracy:')
print(accuracy_score(y_test, test_logit))
print(classification_report(y_test, test_logit))

Accuracy:
0.9592356243679604
             precision    recall  f1-score   support

          B       0.35      0.37      0.36       356
          I       0.19      0.22      0.20       792
          L       0.32      0.35      0.33       356
          O       0.98      0.98      0.98     55845
          U       0.07      0.60      0.13         5

avg / total       0.96      0.96      0.96     57354

Accuracy:
0.9677790563866513
             precision    recall  f1-score   support

          B       0.35      0.36      0.36       356
          I       0.36      0.19      0.25       792
          L       0.32      0.33      0.32       356
          O       0.98      0.99      0.98     55845
          U       0.07      0.60      0.13         5

avg / total       0.96      0.97      0.97     57354



# Записать результаты в файл

In [63]:
# Write one "true;predicted" line per test row under a header line; both
# sequences must contain strings for ';'.join to work.
with open('results.csv','w',encoding='utf-8-sig') as f:
    f.write('true;pred_corr\n')
    f.write('\n'.join([';'.join(x) for x in zip(y_test,test_logit)]))

In [60]:
# Save the test features next to the results, in the same ';'-separated format.
X_test.to_csv('X_test.csv',encoding='utf-8-sig',sep=';')

## сделать лейблы бинарными и посмотреть качество 

In [42]:
# Keep untouched copies: the binarisation below overwrites y_test and test_logit.
y_test_backup = deepcopy(y_test)
logit_test_backup = deepcopy(test_logit)

In [None]:
# Restore the multi-class labels from the backups (run before re-binarising).
test_logit = deepcopy(logit_test_backup)
y_test = deepcopy(y_test_backup)

In [43]:
# Collapse the labels to binary: 1 for any tag other than 'O', 0 for 'O'.
# Not idempotent: on a second run nothing equals 'O' any more, so everything
# would become 1 — restore from the backups first.
test_logit = [1 if i != 'O' else 0 for i in test_logit]
# Order matters: the non-'O' entries are replaced first, so only the original
# 'O' entries match the second assignment.
y_test[y_test != 'O'] = 1
y_test[y_test == 'O'] = 0

In [44]:
# Metrics for the binary task; y_test is converted to a list before scoring.
print('Accuracy:')
print(accuracy_score(list(y_test), test_logit))
print(classification_report(list(y_test), test_logit))

Accuracy:
0.9684590438330369
             precision    recall  f1-score   support

          0       0.98      0.99      0.98     55845
          1       0.37      0.29      0.33      1509

avg / total       0.96      0.97      0.97     57354

