# Дискурсивные формулы
## На данный момент:
+ 37 текстов
+ 113933 псевдоклаузы
+ Доля класса 1 (формулы) - 2.5 %
+ Доля класса 0 (не формулы) - 97.5 %

In [1]:
import numpy as np 
import pandas as pd
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
import time
import pickle
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

class Profiler():
    """Context manager that prints how long the enclosed block took to run.

    Usage::

        with Profiler():
            model.fit(X, y)

    On exit it prints the elapsed time in seconds. Exceptions raised inside
    the block are never suppressed.
    """

    def __enter__(self):
        # perf_counter is monotonic, so the measurement cannot be distorted
        # by system clock adjustments the way time.time() can.
        self._startTime = time.perf_counter()
        # Return the instance so that ``with Profiler() as p`` binds a usable object.
        return self

    def __exit__(self, exc_type, value, traceback):
        print ("Время обучения: {:.3f} секунд".format(time.perf_counter() - self._startTime))

# Value passed as random_state to the estimators in this notebook.
seed = 42

In [2]:
# Load the feature table from a ';'-separated CSV ('utf-8-sig' strips a leading BOM if present).
data = pd.read_csv('train_data.csv', encoding = 'utf-8-sig', sep = ';')
# Last expression of the cell: displays (rows, columns).
data.shape

(113933, 27)

In [3]:
# Load the labels; they are expected to be row-aligned with `data`.
target = pd.read_csv('train_target.csv',encoding = 'utf-8-sig', sep = ';' )
# Force the label to 0 wherever the 'First' flag is 0.
# `.loc` replaces the deprecated `.ix` indexer (removed in pandas 1.0); with a
# boolean mask plus a column label the two behave identically.
target.loc[data['First'] == 0, '"Target"'] = 0
# Flat 1-D label vector for scikit-learn.
y = np.array(target['"Target"']).reshape(-1,)

In [4]:
# Report the share of each class; count the labels once and reuse the result.
class_counts = target['"Target"'].value_counts()
n_rows = len(target)
print('Доля формул в выборке:')
print(class_counts[1]/n_rows)
print('Доля НЕ формул в выборке:')
print(class_counts[0]/n_rows)

Доля формул в выборке:
0.0250322557995
Доля НЕ формул в выборке:
0.974967744201


In [5]:
# Hold out several whole texts as the test set (split by text, not by row).
test_texts = {'Puzhaeva_Гоголь. Игроки',
             'Petrushevskaya_uroki_muzyki_ev_prov',
             'Yad_EV_prov'}
# Boolean mask: True for rows that belong to one of the held-out texts.
criterion = data['"Text_id"'].isin(test_texts)
test_indices = data.index[criterion].tolist()
texts_test = data.loc[criterion, ['"Text_id"','Text']]

# Test part: the held-out rows; train part: everything else.
X_test = data.loc[test_indices]
y_test = target.loc[test_indices].values.reshape(-1,)
X_train = data.drop(test_indices)
y_train = target.drop(test_indices).values.reshape(-1,)
X_train.shape, X_test.shape

((100002, 27), (13931, 27))

In [6]:
# Bag-of-n-grams vectorizers: word uni/bigrams and character 3-4-grams.
# min_df = 10 drops n-grams that occur in fewer than 10 rows.
count = CountVectorizer(min_df = 10, ngram_range = (1,2))
count_char = CountVectorizer(min_df = 10, ngram_range = (3,4),analyzer='char')

In [7]:
# Fit both vocabularies on ALL rows, then vectorize the clause text of the
# training split, the test split and the complete data set.
# NOTE(review): fitting on `data` lets the vocabulary see the held-out texts;
# this appears intentional, but confirm it is acceptable for the evaluation.
count.fit(data['Text'])
count_char.fit(data['Text'])

text_parts = (X_train['Text'], X_test['Text'], data['Text'])

# Word n-gram counts.
count_train, count_test, count_texts = (count.transform(part) for part in text_parts)

# Character n-gram counts.
count_char_train, count_char_test, count_char_texts = (
    count_char.transform(part) for part in text_parts)

In [8]:
# Drop the raw-text columns and convert the remaining hand-crafted features to
# sparse matrices so they can be stacked next to the vectorized text features.
text_columns = ['Text','"Text_id"']
X_train = X_train.drop(text_columns, axis=1)
X_test = X_test.drop(text_columns, axis=1)
X_train_sparse, X_test_sparse = csr_matrix(X_train), csr_matrix(X_test)
alldata_sparse = csr_matrix(data.drop(text_columns, axis=1))

In [9]:
# Column-wise concatenation: [word n-grams | char n-grams | hand-crafted features].
X_train_count = hstack([count_train, count_char_train, X_train_sparse])
X_test_count = hstack([count_test, count_char_test, X_test_sparse])
data_count = hstack([count_texts, count_char_texts, alldata_sparse])

# Last expression of the cell: displays both matrices.
X_train_count,X_test_count

(<100002x28696 sparse matrix of type '<class 'numpy.int64'>'
 	with 3530359 stored elements in COOrdinate format>,
 <13931x28696 sparse matrix of type '<class 'numpy.int64'>'
 	with 473828 stored elements in COOrdinate format>)

## Feature selection

In [10]:
# Feature names in the same column order as the stacked matrices:
# [word n-grams | char n-grams | hand-crafted features].
# The hand-crafted names must come from X_train (text columns already dropped).
# `data.columns` still contains '"Text_id"' and 'Text', which would shift every
# hand-crafted feature name by two positions relative to the matrix columns.
# NOTE(review): newer scikit-learn releases expose this as get_feature_names_out().
fs = count.get_feature_names() + count_char.get_feature_names() + list(X_train.columns)

# Find uninformative features: L1 regularisation drives their coefficients to exactly zero.
logit_preproc = LogisticRegression(class_weight={1:30,0:1},random_state=seed,penalty='l1',solver='liblinear')
logit_preproc.fit(X_train_count,y_train)

# Column positions with a non-zero coefficient, computed once and reused below.
selected = np.nonzero(logit_preproc.coef_[0])[0]

# Keep only the selected columns (COO matrices must be converted to CSR to be sliced).
X_train_imp = X_train_count.tocsr()[:,selected]
X_test_imp = X_test_count.tocsr()[:,selected]
data_imp = data_count.tocsr()[:,selected]
fs_clean = [fs[i] for i in selected]
X_train_imp.shape, X_test_imp.shape

((100002, 2142), (13931, 2142))

## Бейзлайн

In [11]:
# Display the column names of the full feature table.
data.columns

Index(['"Text_id"', 'Text', 'Len', 'Subject', 'Object', 'Predicate',
       'Emotions', 'Imperative', 'Question', 'First', 'NOUN', 'ADJF', 'ADJS',
       'COMP', 'VERB', 'INFN', 'PRTF', 'PRTS', 'GRND', 'NUMR', 'ADVB', 'NPRO',
       'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ'],
      dtype='object')

In [12]:
# Rule-based baseline (author's description: clauses that come first in an
# utterance, before an exclamation or question mark, and contain a conjunction
# or particle are predicted to be formulas).
from copy import deepcopy

is_first = X_test['First'] == 1
is_exclaim_or_question = (X_test['Emotions'] == 1) | (X_test['Question'] == 1)
has_prcl_or_conj = (X_test['PRCL'] != 0) | (X_test['CONJ'] != 0)
rule_hits = X_test.loc[is_first & is_exclaim_or_question & has_prcl_or_conj].index

# Start from an all-zero copy of the label frame, flag the rows matched by the
# rule, then keep only the test rows as a flat array.
pred_base = deepcopy(target)
pred_base['"Target"'] = 0
pred_base.loc[rule_hits,'"Target"'] = 1
pred_base = np.array(pred_base.loc[X_test.index,'"Target"']).reshape(-1,)

In [13]:
# Per-class precision / recall / F1 of the rule-based baseline on the test split.
print(classification_report(y_test,pred_base))

             precision    recall  f1-score   support

          0       0.98      0.97      0.98     13594
          1       0.18      0.25      0.21       337

avg / total       0.96      0.95      0.96     13931



## Выбросы

In [16]:
# Outlier-detection experiment: fit an isolation forest on the training matrix
# and label the test rows (y_train is passed for API symmetry only).
from sklearn.ensemble import IsolationForest
isolforest = IsolationForest(random_state=seed)
out_pred = isolforest.fit(X_train_count,y_train).predict(X_test_count)

In [17]:
# Map IsolationForest output (-1 = outlier, 1 = inlier) to class labels:
# outlier -> 1 (formula), inlier -> 0.
out_pred = np.where(out_pred == -1, 1, 0)
print(classification_report(y_test,out_pred))

             precision    recall  f1-score   support

          0       0.97      0.92      0.95     13594
          1       0.00      0.00      0.00       337

avg / total       0.95      0.90      0.92     13931



## Обучение нескольких классификаторов на данных

In [26]:
# Stratified 5-fold cross-validation.
# No shuffling is requested, so the folds are already deterministic.
# `random_state` is omitted because newer scikit-learn rejects it with a
# ValueError unless shuffle=True, and it has no effect without shuffling.
skf = StratifiedKFold(n_splits = 5)

### Random Forest

In [30]:
# Random forest; class_weight makes class 1 count 30x as much as class 0
# to compensate for the heavy class imbalance.
forest = RandomForestClassifier(n_estimators=300,
                             criterion='entropy',
                             min_samples_leaf=5,
                             class_weight = {1:30,0:1},
                             random_state=seed)

In [31]:
# Out-of-fold predictions of the forest for every training row (selected features only).
cv_pred = cross_val_predict(forest, X_train_imp, y_train, cv = skf, verbose = 5)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   48.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.0min finished


In [32]:
# Post-filter (author's note: formulas predicted after the third pseudo-clause
# are removed on theoretical grounds). Concretely, every prediction whose row
# has First == 0 is reset to 0. The positive count is printed before and after.
print(np.sum(cv_pred))
not_first = X_train['First'].values == 0
cv_pred[not_first] = 0
print(np.sum(cv_pred))

8388
8076


In [33]:
# Cross-validation metrics for the random forest.
print('Accuracy:')
print(accuracy_score(y_train, cv_pred))
print(classification_report(y_train, cv_pred))

Accuracy:
0.934211315774
             precision    recall  f1-score   support

          0       0.99      0.94      0.97     97487
          1       0.25      0.80      0.38      2515

avg / total       0.98      0.93      0.95    100002



In [None]:
# NOTE(review): this cell was never executed ("In [None]"); `clust_count` is not
# defined anywhere in the visible notebook and `clf` is only assigned in later
# cells — confirm whether this cell is still needed.
clf.fit(clust_count,np.array(target['"Target"']).reshape(-1,))

In [None]:
 # Top-20 features with their importances from the decision forest.
# NOTE(review): this cell was never executed ("In [None]"); `tfidf` is not
# defined in the visible notebook (the word vectorizer here is `count`), and
# `clf` must be a fitted estimator exposing feature_importances_ — confirm
# before running.
fs = tfidf.get_feature_names() + count_char.get_feature_names() + list(X_train.columns)
# Pair each feature name with its importance and sort in descending order.
wfs = list(zip(fs,clf.feature_importances_))
wfs = sorted(wfs,key=lambda x: x[1],reverse = True)
for w in wfs[:20]:
    print(w[0],w[1],sep=' - ')

### Logistic Regression

In [34]:
# Logistic regression with the same 30:1 class weighting; out-of-fold
# predictions on the selected features.
# (LogisticRegression is already imported at the top; the re-import is harmless.)
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(class_weight={1:30,0:1},random_state=seed)
cv_logit = cross_val_predict(logit, X_train_imp, y_train, cv = skf, verbose = 5)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.9s finished


In [35]:
# Reset to 0 every logistic-regression prediction whose row has First == 0;
# print the number of positive predictions before and after the filter.
print(np.sum(cv_logit))
not_first = X_train['First'].values == 0
cv_logit[not_first] = 0
print(np.sum(cv_logit))

7116
7114


In [36]:
# Cross-validation metrics for the logistic regression.
# (These names are already imported at the top; the re-import is harmless.)
from sklearn.metrics import classification_report, accuracy_score
print('Accuracy:')
print(accuracy_score(y_train, cv_logit))
print(classification_report(y_train, cv_logit))

Accuracy:
0.945591088178
             precision    recall  f1-score   support

          0       1.00      0.95      0.97     97487
          1       0.29      0.83      0.43      2515

avg / total       0.98      0.95      0.96    100002



### Ridge Classifier

In [27]:
# Ridge classifier (alpha=40) with the 30:1 class weighting; out-of-fold predictions.
clf = RidgeClassifier(alpha=40,class_weight={1:30,0:1},random_state=seed)
cv_ridge = cross_val_predict(clf, X_train_imp, y_train, cv = skf, verbose = 5)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.7s finished


In [28]:
# Reset to 0 every ridge prediction whose row has First == 0;
# print the number of positive predictions before and after the filter.
print(np.sum(cv_ridge))
not_first = X_train['First'].values == 0
cv_ridge[not_first] = 0
print(np.sum(cv_ridge))

10934
9056


In [29]:
# Cross-validation metrics for the ridge classifier.
print('Accuracy:')
print(accuracy_score(y_train, cv_ridge))
print(classification_report(y_train, cv_ridge))

Accuracy:
0.926871462571
             precision    recall  f1-score   support

          0       1.00      0.93      0.96     97487
          1       0.24      0.85      0.37      2515

avg / total       0.98      0.93      0.95    100002



### Support Vector Classifier

In [37]:
# Linear SVM (C=0.05) with the 30:1 class weighting; out-of-fold predictions.
clf = LinearSVC(C=0.05,
                class_weight={1:30,0:1},
                random_state=seed)
cv_svc = cross_val_predict(clf, X_train_imp, y_train, cv = skf, verbose = 5)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    9.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   12.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   16.0s finished


In [38]:
# Reset to 0 every SVM prediction whose row has First == 0;
# print the number of positive predictions before and after the filter.
print(np.sum(cv_svc))
not_first = X_train['First'].values == 0
cv_svc[not_first] = 0
print(np.sum(cv_svc))

7360
7354


In [39]:
# Cross-validation metrics for the linear SVM.
print('Accuracy:')
print(accuracy_score(y_train, cv_svc))
print(classification_report(y_train, cv_svc))

Accuracy:
0.943691126177
             precision    recall  f1-score   support

          0       1.00      0.95      0.97     97487
          1       0.29      0.84      0.43      2515

avg / total       0.98      0.94      0.96    100002



### Эксперименты с вероятностями

In [31]:
from sklearn.metrics import roc_auc_score, f1_score, recall_score

# Grid search over a blend weight `alpha` (forest vs. logistic regression) and
# a decision threshold, maximising F1 * recall on the out-of-fold predictions.
# NOTE(review): cv_pred / cv_logit as produced above are hard 0/1 labels, not
# probabilities — confirm that this is what the blend is meant to combine.
max_score = 0
max_threshold = 0
max_alpha = 0
for alpha in np.linspace(0.0, 1.0, 51): # 51 evenly spaced weights from 0.0 to 1.0 inclusive
    print(alpha)
    # The blend depends only on alpha, so compute it once per alpha rather than
    # once per (alpha, threshold) pair.
    blend = cv_pred * alpha + cv_logit * (1 - alpha)
    for threshold in np.linspace(0.4,1.0,31):
        w = np.where(blend > threshold, 1, 0)
        score = f1_score(y_train,w) * recall_score(y_train,w)
        if score > max_score:
            max_threshold = threshold
            max_alpha = alpha
            max_score = score


# Re-apply the best weight and threshold found above and report the result.
cv_cum = cv_pred * max_alpha + cv_logit * (1 - max_alpha)
cv_cum = np.where(cv_cum > max_threshold, 1, 0)
print(classification_report(y_train,cv_cum))

0.0
0.02
0.04
0.06
0.08
0.1
0.12
0.14
0.16
0.18
0.2
0.22
0.24
0.26
0.28
0.3
0.32
0.34
0.36
0.38
0.4
0.42
0.44
0.46
0.48
0.5
0.52
0.54
0.56
0.58
0.6
0.62
0.64
0.66
0.68
0.7
0.72
0.74
0.76
0.78
0.8
0.82
0.84
0.86
0.88
0.9
0.92
0.94
0.96
0.98
1.0
             precision    recall  f1-score   support

          0       1.00      0.96      0.98    107488
          1       0.33      0.80      0.47      2576

avg / total       0.98      0.96      0.97    110064



  'precision', 'predicted', average, warn_for)


In [32]:
# Display the best blend weight and threshold found by the grid search.
max_alpha, max_threshold

(0.0, 0.68000000000000005)

## Взвешенное голосование 

In [40]:
# Equal-weight vote of the four models: the mean of the four 0/1 predictions is
# rounded to the nearest integer. Rounding is half-to-even, so a 2-2 tie
# (mean 0.5) yields 0, and at least three votes are needed for class 1.
vote_mean = 0.25 * (cv_pred + cv_ridge + cv_svc + cv_logit)
cv_cum = np.rint(vote_mean).astype(int).tolist()

In [41]:
# Cross-validation metrics for the voting ensemble.
print('Accuracy:')
print(accuracy_score(y_train, cv_cum))
print(classification_report(y_train, cv_cum)) 
# Author's note: "previously 30 86" (presumably an earlier precision/recall for class 1 — confirm).

Accuracy:
0.947541049179
             precision    recall  f1-score   support

          0       1.00      0.95      0.97     97487
          1       0.30      0.82      0.44      2515

avg / total       0.98      0.95      0.96    100002



# тест

In [18]:
# Preprocessing: fresh, unfitted vectorizers with the same settings as above.
count = CountVectorizer(min_df = 10, ngram_range = (1,2))
count_char = CountVectorizer(min_df = 10, ngram_range = (3,4),analyzer='char')

In [19]:
# Models: the four classifiers evaluated on the held-out texts, each with the
# 30:1 class weighting and the shared seed.
forest = RandomForestClassifier(n_estimators=300,
                                criterion='entropy',
                                min_samples_leaf=5,
                                class_weight = {1:30,0:1},
                                random_state=seed)

logit = LogisticRegression(class_weight={1:30,0:1},
                             random_state=seed)


ridge = RidgeClassifier(alpha=40,
                        class_weight={1:30,0:1},
                        random_state=seed)

svc = LinearSVC(C=0.05,
                class_weight={1:30,0:1},
                random_state=seed)

In [20]:
# Refit the fresh vectorizers on all clause texts.
# NOTE(review): the cells below reuse the matrices built earlier
# (X_train_count, X_test_imp, ...), so this refit does not feed into them.
count.fit(data['Text'])
# Last expression of the cell: its repr is what the notebook displays.
count_char.fit(data['Text'])

CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(3, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

### без текстовых признаков

In [21]:
# Held-out evaluation using ONLY the hand-crafted (non-text) features.
for model in (forest, logit, svc, ridge):
    model.fit(X_train, y_train)

f_pred, l_pred, s_pred, r_pred = (
    model.predict(X_test) for model in (forest, logit, svc, ridge))

# Post-filter: reset to 0 every prediction whose row has First == 0.
not_first = X_test['First'].values == 0
for pred in (f_pred, l_pred, s_pred, r_pred):
    pred[not_first] = 0

for label, pred in (('forest', f_pred), ('logit', l_pred),
                    ('svc', s_pred), ('ridge', r_pred)):
    print(label,classification_report(y_test,pred),sep='\n')

# Equal-weight vote; half-to-even rounding means a 2-2 tie yields 0.
cv_cum_test = np.rint(0.25 * (f_pred + r_pred + s_pred + l_pred)).astype(int).tolist()

# Metrics of the voting ensemble on the held-out test texts.
from sklearn.metrics import classification_report, accuracy_score
print('Accuracy:')
print(accuracy_score(y_test, cv_cum_test))
print(classification_report(y_test, cv_cum_test))

forest
             precision    recall  f1-score   support

          0       0.99      0.88      0.94     13594
          1       0.14      0.78      0.24       337

avg / total       0.97      0.88      0.92     13931

logit
             precision    recall  f1-score   support

          0       1.00      0.83      0.91     13594
          1       0.12      0.87      0.20       337

avg / total       0.97      0.84      0.89     13931

svc
             precision    recall  f1-score   support

          0       1.00      0.83      0.91     13594
          1       0.11      0.86      0.20       337

avg / total       0.97      0.83      0.89     13931

ridge
             precision    recall  f1-score   support

          0       1.00      0.80      0.89     13594
          1       0.10      0.90      0.18       337

avg / total       0.98      0.80      0.87     13931

Accuracy:
0.837915440385
             precision    recall  f1-score   support

          0       1.00      0.84      

### все признаки

In [22]:
# Held-out evaluation using ALL features (word n-grams, char n-grams, hand-crafted).
for model in (forest, logit, svc, ridge):
    model.fit(X_train_count, y_train)

f_pred, l_pred, s_pred, r_pred = (
    model.predict(X_test_count) for model in (forest, logit, svc, ridge))

# Post-filter: reset to 0 every prediction whose row has First == 0.
not_first = X_test['First'].values == 0
for pred in (f_pred, l_pred, s_pred, r_pred):
    pred[not_first] = 0

for label, pred in (('forest', f_pred), ('logit', l_pred),
                    ('svc', s_pred), ('ridge', r_pred)):
    print(label,classification_report(y_test,pred),sep='\n')

# Equal-weight vote; half-to-even rounding means a 2-2 tie yields 0.
cv_cum_test = np.rint(0.25 * (f_pred + r_pred + s_pred + l_pred)).astype(int).tolist()


# Metrics of the voting ensemble on the held-out test texts.
from sklearn.metrics import classification_report, accuracy_score
print('Accuracy:')
print(accuracy_score(y_test, cv_cum_test))
print(classification_report(y_test, cv_cum_test))

forest
             precision    recall  f1-score   support

          0       0.99      0.95      0.97     13594
          1       0.27      0.73      0.39       337

avg / total       0.98      0.95      0.96     13931

logit
             precision    recall  f1-score   support

          0       0.99      0.95      0.97     13594
          1       0.28      0.73      0.40       337

avg / total       0.98      0.95      0.96     13931

svc
             precision    recall  f1-score   support

          0       0.99      0.95      0.97     13594
          1       0.28      0.75      0.41       337

avg / total       0.98      0.95      0.96     13931

ridge
             precision    recall  f1-score   support

          0       0.99      0.94      0.97     13594
          1       0.26      0.80      0.39       337

avg / total       0.98      0.94      0.95     13931

Accuracy:
0.952192950973
             precision    recall  f1-score   support

          0       0.99      0.96      

### все признаки + feature selection

In [23]:
# Held-out evaluation using the features kept by the L1 feature selection.
for model in (forest, logit, svc, ridge):
    model.fit(X_train_imp, y_train)

f_pred, l_pred, s_pred, r_pred = (
    model.predict(X_test_imp) for model in (forest, logit, svc, ridge))

# Post-filter: reset to 0 every prediction whose row has First == 0.
not_first = X_test['First'].values == 0
for pred in (f_pred, l_pred, s_pred, r_pred):
    pred[not_first] = 0

for label, pred in (('forest', f_pred), ('logit', l_pred),
                    ('svc', s_pred), ('ridge', r_pred)):
    print(label,classification_report(y_test,pred),sep='\n')

# Equal-weight vote; half-to-even rounding means a 2-2 tie yields 0.
cv_cum_test = np.rint(0.25 * (f_pred + r_pred + s_pred + l_pred)).astype(int).tolist()

# Metrics of the voting ensemble on the held-out test texts.
from sklearn.metrics import classification_report, accuracy_score
print('Accuracy:')
print(accuracy_score(y_test, cv_cum_test))
print(classification_report(y_test, cv_cum_test))

forest
             precision    recall  f1-score   support

          0       0.99      0.95      0.97     13594
          1       0.26      0.74      0.38       337

avg / total       0.98      0.94      0.96     13931

logit
             precision    recall  f1-score   support

          0       0.99      0.95      0.97     13594
          1       0.27      0.74      0.40       337

avg / total       0.98      0.95      0.96     13931

svc
             precision    recall  f1-score   support

          0       0.99      0.95      0.97     13594
          1       0.27      0.75      0.40       337

avg / total       0.98      0.95      0.96     13931

ridge
             precision    recall  f1-score   support

          0       1.00      0.94      0.97     13594
          1       0.24      0.81      0.38       337

avg / total       0.98      0.93      0.95     13931

Accuracy:
0.949321656737
             precision    recall  f1-score   support

          0       0.99      0.95      

In [47]:
# Write the whole table to a file: all features, the ensemble prediction and the true labels.
# cv_cum holds one out-of-fold prediction per TRAINING row, in X_train row order,
# so it is indexed by X_train.index before concatenating. A positional concat
# against the full `data` would attach predictions to the wrong rows whenever
# the held-out texts are not the last rows of the file. Held-out rows have no
# cross-validation prediction and stay empty in the 'predicted' column.
predicted = pd.Series(cv_cum, index=X_train.index, name='predicted')
texts = pd.concat((data, predicted, target), axis = 1)
texts.to_csv('predicted.csv',sep=';',index=False)

In [48]:
# Write only the held-out test table: features, ensemble prediction and true labels.
# `test_data` / `test_target` are not defined anywhere in the visible notebook,
# so the test rows are taken from `data` / `target` via `test_indices` — the same
# rows, in the same order, that produced cv_cum_test. All three parts are reset
# to a 0..n-1 index so the column-wise concat lines them up row by row.
# The prediction column is named directly instead of borrowing column names
# from the unrelated `texts` frame.
test_texts = pd.concat((data.loc[test_indices].reset_index(drop=True),
                        pd.Series(cv_cum_test, name='predicted'),
                        target.loc[test_indices].reset_index(drop=True)), axis = 1)
test_texts.to_csv('test_predicted.csv',sep=';',index=False)

# Обучить на всех пьесах и записать модели в файлы pickle

In [42]:
# Preprocessing: fresh, unfitted vectorizers for the final models.
count = CountVectorizer(min_df = 10, ngram_range = (1,2))
count_char = CountVectorizer(min_df = 10, ngram_range = (3,4),analyzer='char')

# Fresh, unfitted classifiers with the same hyper-parameters as in the experiments above.
forest = RandomForestClassifier(n_estimators=300,
                                criterion='entropy',
                                min_samples_leaf=5,
                                class_weight = {1:30,0:1},
                                random_state=seed)

logit = LogisticRegression(class_weight={1:30,0:1},
                             random_state=seed)

ridge = RidgeClassifier(alpha=40,
                        class_weight={1:30,0:1},
                        random_state=seed)

svc = LinearSVC(C=0.05,
                class_weight={1:30,0:1},
                random_state=seed)


In [43]:
# Reload the complete feature table and labels from disk for the final fit.
full_data = pd.read_csv('train_data.csv', encoding = 'utf-8-sig', sep = ';')
full_target = pd.read_csv('train_target.csv',encoding = 'utf-8-sig', sep = ';' )
# Last expression of the cell: displays the number of rows.
len(full_data)

113933

In [44]:
# Fit both vectorizers on every clause text and persist them for later reuse.
count.fit(full_data['Text'])
count_char.fit(full_data['Text'])

# Use context managers so each file is flushed and closed deterministically.
# A bare open() passed to pickle.dump is never closed explicitly.
with open('count.pickle','wb') as f:
    pickle.dump(count, f)
with open('count_char.pickle','wb') as f:
    pickle.dump(count_char, f)

In [45]:
# Build the final training matrix: [word n-grams | char n-grams | hand-crafted features].
full_text = full_data['Text']
count_full = count.transform(full_text)
count_char_full = count_char.transform(full_text)

# Hand-crafted features: everything except the raw-text columns, as a sparse matrix.
X_full_data = full_data.drop(['Text','"Text_id"'],axis=1)
full_data_sparse = csr_matrix(X_full_data)

full_data_count = hstack([count_full, count_char_full, full_data_sparse])

In [46]:
# Label vector for the final fit, built once instead of once per model.
y_full = np.array(full_target['"Target"']).reshape(-1,)
# Apply the same label cleaning used for the experiments above (rows with
# First == 0 are never formulas), so the pickled models are trained on the same
# target definition as the evaluated ones. np.array() made a copy, so
# full_target itself is left untouched.
# NOTE(review): confirm this cleaning is wanted for the shipped models.
y_full[full_data['First'].values == 0] = 0

# Fit every model on all texts with all features.
for model in (forest, logit, svc, ridge):
    model.fit(full_data_count, y_full)

# Persist the fitted models. Context managers guarantee the files are closed;
# a bare open() passed to pickle.dump is never closed explicitly.
for model, path in ((forest, 'forest.pickle'), (logit, 'logit.pickle'),
                    (svc, 'svc.pickle'), (ridge, 'ridge.pickle')):
    with open(path,'wb') as f:
        pickle.dump(model, f)

In [47]:
# Sanity check: reload the pickled word vectorizer and confirm it produces a
# matrix of the same shape as the in-memory one.
# The file was written by this notebook; never unpickle files from untrusted sources.
with open('count.pickle','rb') as f:
    count_f = pickle.load(f)
count_f_full = count_f.transform(full_data['Text'])

count_full,count_f_full

(<113933x4929 sparse matrix of type '<class 'numpy.int64'>'
 	with 277970 stored elements in Compressed Sparse Row format>,
 <113933x4929 sparse matrix of type '<class 'numpy.int64'>'
 	with 277970 stored elements in Compressed Sparse Row format>)