NB: Взболтать, но не смешивать — shuffle объектов противопоказан (следующий объект зависит от предыдущего).

In [1]:
import numpy as np 
import pandas as pd
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
import time
import pickle
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

# Random seed, passed as `random_state` wherever a model needs one.
seed = 42
# (first, last) token offsets of the context window, both inclusive; used to
# build the `<column>_<offset>` feature column names.
CONTEXT_WINDOW = (-5, 5)

In [2]:
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

In [3]:
# Load the pretrained embedding model from a path relative to the notebook.
w2v_model = Word2Vec.load('word2vec/araneum_none_fasttextskipgram_300_5_2018.model')
# Alternative loader kept for reference (binary word2vec format):
# model = KeyedVectors.load_word2vec_format('araneum_none_fasttextskipgram_300_5_2018.tgz', binary=True)

In [4]:
# Keep a handle on the model's keyed vectors; all word lookups below go through it.
vectors = w2v_model.wv

In [5]:
# Read the ';'-separated feature table (first column becomes the index) and
# replace every missing value with an empty string.
data = pd.read_csv(
    'train_data.csv',
    sep=';',
    encoding='utf-8-sig',
    index_col=0,
).fillna('')
data.head()

Unnamed: 0,Text_-5,Lemma_-5,POS_-5,Target_-5,Text_-4,Lemma_-4,POS_-4,Target_-4,Text_-3,Lemma_-3,...,POS_2,Text_3,Lemma_3,POS_3,Text_4,Lemma_4,POS_4,Text_5,Lemma_5,POS_5
0,,,,,,,,,,,...,NOUN,.,.,PNCT,невольницы,невольница,NOUN,н.,н.,UNKN
1,,,,,,,,,,,...,PNCT,невольницы,невольница,NOUN,н.,н.,UNKN,островский,островский,NOUN
2,,,,,,,,,,,...,NOUN,н.,н.,UNKN,островский,островский,NOUN,.,.,PNCT
3,,,,,,,,,островский,островский,...,UNKN,островский,островский,NOUN,.,.,PNCT,невольницы,невольница,NOUN
4,,,,,островский,островский,NOUN,O,александр,александр,...,NOUN,.,.,PNCT,невольницы,невольница,NOUN,действие,действие,NOUN


In [6]:
# (rows, columns) of the loaded feature table.
data.shape

(484157, 40)

In [7]:
# Sanity check: look up a single word and inspect the shape of its vector.
vectors.word_vec('очень').shape

(300,)

In [None]:
# Look up an embedding for every entry of data["Lemma"]. Lemmas the model
# cannot embed (KeyError) fall back to an all-zero vector and are counted.
# The fallback is sized from the model rather than a hard-coded 300, and is
# built once: it is only read afterwards, so sharing one array is safe.
zero_vector = np.zeros(shape=(vectors.vector_size,))
all_vectors = []
cnt = 0  # number of lemmas without an embedding
for lemma in data["Lemma"]:
    try:
        all_vectors.append(vectors.word_vec(lemma))
    except KeyError:
        cnt += 1
        all_vectors.append(zero_vector)
print(cnt)

In [9]:
# Stack the collected vectors into one array (one row per entry of data["Lemma"]).
all_vectors = np.array(all_vectors)

In [10]:
# Wrap the embeddings in a DataFrame that carries the same index as `data`.
# The vectors were produced in `data` row order, and the column-wise concat
# that follows aligns on index labels, so reusing the index keeps every
# embedding attached to its own row even if `data` is not indexed 0..n-1.
all_vectors = pd.DataFrame(all_vectors, index=data.index)

In [11]:
# Append the embedding columns to the feature table (column-wise concat).
# NOTE(review): `data` is rebound to the widened frame, so re-running this
# cell appends the embedding columns a second time.
data = pd.concat((data, all_vectors), axis=1)

In [12]:
# (rows, columns) after the embedding columns were appended.
data.shape

(484157, 340)

In [13]:
# Label table read from train_target.csv; `y` is its 'Target' column as a flat array.
target = pd.read_csv('train_target.csv', sep=';', encoding='utf-8-sig')
y = np.array(target['Target']).ravel()

In [14]:
# Hold out a few whole texts as the test set. The split is by text id, not by
# random rows, so the order of the remaining rows is untouched.
test_texts = {'Puzhaeva_Гоголь. Игроки.txt',
             'Petrushevskaya_uroki_muzyki_ev_prov.txt',
             'Yad_EV_prov.txt'}
criterion = data['Text_id'].isin(test_texts)
# Read the labels straight off the index instead of copying the filtered frame.
test_indices = data.index[criterion.values].tolist()
X_test = data.loc[test_indices,:]
X_train = data.drop(test_indices)
# Select the 'Target' column explicitly: flattening the whole `target` frame
# would silently produce wrong-length label arrays if it ever had extra columns.
y_test = np.array(target.loc[test_indices, 'Target'])
y_train = np.array(target['Target'].drop(test_indices))
X_train.shape, X_test.shape

((426803, 340), (57354, 340))

In [15]:
def vectorize(data,X_train,X_test,colname):
    """One-hot encode a column together with its context-window copies.

    For 'Target' the label set is fixed ('B', 'I', 'L', 'O', 'U' and '') and only
    the columns at negative offsets (``Target_<first offset>`` .. ``Target_-1``)
    are encoded. For any other column the label set is the unique values of
    ``data[colname]`` plus '', and the column itself is encoded along with all
    its non-zero offsets inside ``CONTEXT_WINDOW``.

    Returns a tuple ``(full, train, test, label_enc, onehot_enc)``: the
    horizontally stacked sparse encodings of ``data``, ``X_train`` and
    ``X_test``, followed by the two fitted encoders.
    """
    print(colname)
    label_enc = LabelEncoder()
    onehot_enc = OneHotEncoder()
    first_offset, last_offset = CONTEXT_WINDOW[0], CONTEXT_WINDOW[1]

    if colname == 'Target':
        known_labels = ['B','I','L','O','U','']
        offsets = range(first_offset, 0)
        cols_to_transform = ['{}_{}'.format(colname, k) for k in offsets]
    else:
        known_labels = list(data[colname].unique())+['']
        offsets = [k for k in range(first_offset, last_offset+1) if k != 0]
        cols_to_transform = [colname] + ['{}_{}'.format(colname, k) for k in offsets]

    # Fit label -> integer id, then the one-hot encoder on those ids.
    label_ids = label_enc.fit_transform(known_labels)
    onehot_enc.fit(label_ids.reshape(-1, 1))

    def encode(frame):
        # One sparse one-hot block per column, stacked side by side.
        blocks = []
        for col in cols_to_transform:
            ids = label_enc.transform(frame[col])
            blocks.append(onehot_enc.transform(ids.reshape(-1, 1)))
        return hstack(blocks)

    count_train = encode(X_train)
    count_test = encode(X_test)
    count_texts = encode(data)
    return count_texts,count_train,count_test,label_enc,onehot_enc

In [16]:
# Columns to encode. 'Target' has to stay last: its block must end up as the
# trailing columns of the stacked feature matrices.
cols_to_vect = ['Text','Lemma','POS','Target']
sparse_data = [vectorize(data,X_train,X_test,name) for name in cols_to_vect]
# Fitted (label encoder, one-hot encoder) pair for every encoded column.
encoders = {name: (parts[-2], parts[-1]) for name, parts in zip(cols_to_vect, sparse_data)}
# Stack the per-column blocks side by side for the full, train and test sets.
data_vect, train_vect, test_vect = (
    hstack([parts[k] for parts in sparse_data]) for k in range(3)
)
data_vect.shape, train_vect.shape, test_vect.shape

Text
Lemma
POS
Target


((484157, 729418), (426803, 729418), (57354, 729418))

In [17]:
# 5-fold stratified cross-validation. Shuffling stays off (the default), so
# `random_state` is omitted: it has no effect without shuffling, and recent
# scikit-learn versions raise a ValueError when it is set with shuffle=False.
skf = StratifiedKFold(n_splits = 5)

In [18]:
class CustomLogisticRegression(LogisticRegression):
    """Logistic regression that feeds its own predictions into the next sample.

    The trailing feature columns of every sample are treated as the encoded
    labels of the preceding samples. At prediction time they are overwritten
    with the model's own previous predictions, so samples are predicted one by
    one in the order given.

    Relies on the module-level ``encoders['Target']`` pair of fitted
    (label encoder, one-hot encoder) and on ``CONTEXT_WINDOW``.
    """

    def predict(self,X):
        """Predict labels sequentially, replacing the previous-target features.

        For every row of ``X``:
            1. take the most recent predictions, left-padded with '' while
               fewer than the window size are available;
            2. encode them with the fitted 'Target' encoders;
            3. substitute the result for the row's trailing columns;
            4. classify the patched row with ``LogisticRegression.predict``.

        Rows stay sparse throughout: only the trailing block is swapped, so the
        full feature vector is never densified. This matches patching a dense
        copy of the row, up to floating-point summation order.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)

        Returns
        -------
        list
            One predicted label per row of ``X``, in row order.
        """
        label_enc, onehot_enc = encoders['Target']
        window = -CONTEXT_WINDOW[0]
        # Accept any sparse format as well as dense input.
        rows = csr_matrix(X)
        n_features = rows.shape[1]
        predicted = []
        for row in rows:
            prev_pred = predicted[CONTEXT_WINDOW[0]:]
            if len(prev_pred) < window:
                prev_pred = ['']*(window-len(prev_pred)) + prev_pred
            encoded = onehot_enc.transform(label_enc.transform(prev_pred).reshape(-1, 1))
            # Flatten the (window, n_labels) one-hot block row-major, oldest
            # prediction first, into a single 1 x k sparse row.
            context = csr_matrix(encoded.toarray().reshape(1, -1))
            static_part = row[:, :n_features - context.shape[1]]
            patched = hstack([static_part, context], format='csr')
            predicted.append(super().predict(patched)[0])
        return predicted

In [19]:
# Out-of-fold predictions on the training set; each fold is predicted through
# the estimator's predict(), then scored against the gold labels.
logit = CustomLogisticRegression(random_state=seed)
cv_logit = cross_val_predict(logit, train_vect, y_train, cv = skf, verbose = 5)
print('Accuracy:')
print(accuracy_score(y_train, cv_logit))
print(classification_report(y_train, cv_logit))

# this is a lie and a provocation: the hand-annotated targets must first be removed from the test data

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 20.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 40.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 60.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 80.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 101.6min finished


Accuracy:
0.979796299464
             precision    recall  f1-score   support

          B       0.38      0.11      0.16      2139
          I       0.13      0.05      0.07      2844
          L       0.29      0.08      0.13      2139
          O       0.98      1.00      0.99    419119
          U       0.44      0.39      0.42       562

avg / total       0.97      0.98      0.97    426803



In [20]:
# Dump gold vs. cross-validated labels as a ';'-separated file, one pair per line.
with open('results.csv', 'w', encoding='utf-8-sig') as out:
    out.write('true;pred\n')
    out.write('\n'.join(';'.join(pair) for pair in zip(y_train, cv_logit)))