In [1]:
import numpy as np
import pandas as pd
import sklearn 

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
heavy_water_data = pd.read_csv('./shuffled-full-set-hashed.csv', names=["label", "words"], header=None)

In [4]:
heavy_water_data.size

124408

In [5]:
heavy_water_data.head()

Unnamed: 0,label,words
0,DELETION OF INTEREST,e04a09c87692 d6b72e591b91 5d066f0246f1 ed41171...
1,RETURNED CHECK,a3b334c6eefd be95012ebf2b 41d67080e078 ff1c26e...
2,BILL,586242498a88 9ccf259ca087 54709b24b45f 6bf9c0c...
3,BILL,cd50e861f48b 6ca2dd348663 d38820625542 f077614...
4,BILL,9db5536263d8 1c303d15eb65 3f89b4673455 b73e657...


In [6]:
y = heavy_water_data['label']
x = heavy_water_data['words']

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x = x.astype(str)
y = y.astype(str)

In [9]:
# Split train and test data sets, 7:3
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [10]:
x_train_raw = x_train.copy()
x_test_raw = x_test.copy()
y_train_raw = y_train.copy()
y_test_raw = y_test.copy()

In [11]:
# Vectorize the words
vectorizer = TfidfVectorizer(sublinear_tf=True)
# vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=200)
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)


In [12]:
# Target names
target_names = y.unique().tolist()

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import pickle

In [14]:
# Benchmark of classifier
def benchmark(clf, name):
    print('Training: {0}'.format(name))
    print(clf)
    
    model = clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    
#     accuracy
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy: %.08f'%acc)
    
    f1 = f1_score(y_test, y_pred, average='macro')
    
#     report  
    print(classification_report(y_test, y_pred, target_names=target_names))
    
#     confusion matrix
    print(confusion_matrix(y_test, y_pred))
        
    return model, acc, f1

In [15]:
# Add model to result
def add_model(result, name, model, acc, f1):
    if name not in result:
        result[name] = {}
    result[name]['model'] = model
    result[name]['accuracy'] = acc
    result[name]['f1'] = f1

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

In [17]:
result = {}

In [20]:
# Logistic Regression
name = 'Logistic Regression'
model, acc, f1 = benchmark(LogisticRegression(), name)
add_model(result, name, model, acc, f1)

Training: Logistic Regression
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Accuracy: 0.85280249
                         precision    recall  f1-score   support

   DELETION OF INTEREST       0.83      0.25      0.38        60
         RETURNED CHECK       0.88      0.91      0.89      5717
                   BILL       0.00      0.00      0.00        82
          POLICY CHANGE       0.80      0.90      0.85      2652
    CANCELLATION NOTICE       0.80      0.85      0.82      2948
            DECLARATION       0.94      0.80      0.86       257
     CHANGE ENDORSEMENT       0.69      0.11      0.19       282
     NON-RENEWAL NOTICE       0.91      0.86      0.89      1433
                 BINDER       0.92      0.57      0.71       237
   REINSTATEMENT NOTICE       0.9

In [19]:
# Ridge Regression
name = 'Ridge Classifier'
model, acc, f1 = benchmark(RidgeClassifier(tol=1e-3), name)
add_model(result, name, model, acc, f1)

Training: Ridge Classifier
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001)
Accuracy: 0.87337906
                         precision    recall  f1-score   support

   DELETION OF INTEREST       0.81      0.73      0.77        60
         RETURNED CHECK       0.90      0.91      0.90      5717
                   BILL       0.29      0.07      0.12        82
          POLICY CHANGE       0.82      0.91      0.87      2652
    CANCELLATION NOTICE       0.84      0.88      0.86      2948
            DECLARATION       0.92      0.81      0.86       257
     CHANGE ENDORSEMENT       0.60      0.15      0.23       282
     NON-RENEWAL NOTICE       0.93      0.89      0.91      1433
                 BINDER       0.88      0.71      0.79       237
   REINSTATEMENT NOTICE       0.61      0.27      0.37        71
      EXPIRATION NOTICE       0.95      0.85      0.90       202

In [21]:
# Perceptron
name = 'Perceptron'
model, acc, f1 = benchmark(Perceptron(n_iter=100), name)
# add_model(result, name, model, acc, f1)

Training: Perceptron
Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      max_iter=None, n_iter=100, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=0, warm_start=False)




Accuracy: 0.86041153
                         precision    recall  f1-score   support

   DELETION OF INTEREST       0.63      0.77      0.69        60
         RETURNED CHECK       0.87      0.91      0.89      5717
                   BILL       0.19      0.06      0.09        82
          POLICY CHANGE       0.88      0.85      0.87      2652
    CANCELLATION NOTICE       0.86      0.82      0.84      2948
            DECLARATION       0.88      0.83      0.85       257
     CHANGE ENDORSEMENT       0.36      0.28      0.31       282
     NON-RENEWAL NOTICE       0.91      0.90      0.91      1433
                 BINDER       0.88      0.70      0.78       237
   REINSTATEMENT NOTICE       0.48      0.46      0.47        71
      EXPIRATION NOTICE       0.93      0.88      0.90       202
INTENT TO CANCEL NOTICE       0.82      0.85      0.83      3173
            APPLICATION       0.92      0.94      0.93      1337
            BILL BINDER       0.83      0.92      0.87       211

  

In [22]:
# Passive Aggressive 
name = "Passive-Aggressive"
model, acc, f1 = benchmark(PassiveAggressiveClassifier(n_iter=100), name)
# add_model(result, name, model, acc, f1)

Training: Passive-Aggressive
PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              fit_intercept=True, loss='hinge', max_iter=None, n_iter=100,
              n_jobs=1, random_state=None, shuffle=True, tol=None,
              verbose=0, warm_start=False)




Accuracy: 0.86952095
                         precision    recall  f1-score   support

   DELETION OF INTEREST       0.78      0.77      0.77        60
         RETURNED CHECK       0.89      0.90      0.90      5717
                   BILL       0.31      0.12      0.18        82
          POLICY CHANGE       0.83      0.91      0.87      2652
    CANCELLATION NOTICE       0.85      0.86      0.86      2948
            DECLARATION       0.90      0.83      0.87       257
     CHANGE ENDORSEMENT       0.40      0.31      0.35       282
     NON-RENEWAL NOTICE       0.92      0.90      0.91      1433
                 BINDER       0.83      0.76      0.79       237
   REINSTATEMENT NOTICE       0.65      0.46      0.54        71
      EXPIRATION NOTICE       0.93      0.90      0.91       202
INTENT TO CANCEL NOTICE       0.85      0.84      0.84      3173
            APPLICATION       0.94      0.94      0.94      1337
            BILL BINDER       0.96      0.91      0.93       211

  

In [23]:
# Random Forest
name = "Random forest"
model, acc, f1 = benchmark(RandomForestClassifier(n_estimators=5), name)
# add_model(result, name, model, acc, f1)

Training: Random forest
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Accuracy: 0.78153467
                         precision    recall  f1-score   support

   DELETION OF INTEREST       0.66      0.38      0.48        60
         RETURNED CHECK       0.80      0.89      0.84      5717
                   BILL       0.26      0.10      0.14        82
          POLICY CHANGE       0.76      0.82      0.79      2652
    CANCELLATION NOTICE       0.71      0.76      0.74      2948
            DECLARATION       0.79      0.79      0.79       257
     CHANGE ENDORSEMENT       0.26      0.09      0.13       282
     NON-RENEW

In [24]:
# SGD Elastic Net
name = "SGD elastic net"
model, acc, f1 = benchmark(SGDClassifier(alpha=.0001, n_iter=3, penalty="elasticnet"), name)
# add_model(result, name, model, acc, f1)

Training: SGD elastic net
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=3,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)




Accuracy: 0.85869682
                         precision    recall  f1-score   support

   DELETION OF INTEREST       0.84      0.36      0.50        72
         RETURNED CHECK       0.88      0.92      0.90      5729
                   BILL       0.00      0.00      0.00        88
          POLICY CHANGE       0.82      0.88      0.85      2707
    CANCELLATION NOTICE       0.84      0.84      0.84      2885
            DECLARATION       0.93      0.82      0.87       260
     CHANGE ENDORSEMENT       0.67      0.02      0.04       282
     NON-RENEWAL NOTICE       0.91      0.87      0.89      1417
                 BINDER       0.89      0.65      0.75       216
   REINSTATEMENT NOTICE       1.00      0.21      0.34        78
      EXPIRATION NOTICE       0.88      0.84      0.86       180
INTENT TO CANCEL NOTICE       0.81      0.85      0.83      3191
            APPLICATION       0.93      0.94      0.94      1318
            BILL BINDER       0.95      0.85      0.90       239

  

In [18]:
# SVM classifier
name = "SVM classifier"
model, acc, f1 = benchmark(LinearSVC(tol=1e-3), name)
add_model(result, name, model, acc, f1)

Training: SVM classifier
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)
Accuracy: 0.87568321
                         precision    recall  f1-score   support

   DELETION OF INTEREST       0.78      0.75      0.76        60
         RETURNED CHECK       0.90      0.90      0.90      5717
                   BILL       0.29      0.10      0.15        82
          POLICY CHANGE       0.83      0.91      0.87      2652
    CANCELLATION NOTICE       0.84      0.88      0.86      2948
            DECLARATION       0.91      0.84      0.88       257
     CHANGE ENDORSEMENT       0.53      0.23      0.32       282
     NON-RENEWAL NOTICE       0.93      0.90      0.92      1433
                 BINDER       0.86      0.73      0.79       237
   REINSTATEMENT NOTICE       0.68      0.37      0.48        71
      EXPIRATION NOTICE 

In [28]:
# Naive Bayes
name = "Naive Bayes"
model, acc, f1 = benchmark(MultinomialNB(), name)
# add_model(result, name, model, acc, f1)

Training: Naive Bayes
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy: 0.76010074
                         precision    recall  f1-score   support

   DELETION OF INTEREST       0.00      0.00      0.00        72
         RETURNED CHECK       0.84      0.85      0.85      5729
                   BILL       0.00      0.00      0.00        88
          POLICY CHANGE       0.68      0.89      0.77      2707
    CANCELLATION NOTICE       0.65      0.82      0.72      2885
            DECLARATION       0.60      0.81      0.69       260
     CHANGE ENDORSEMENT       0.33      0.02      0.04       282
     NON-RENEWAL NOTICE       0.90      0.75      0.82      1417
                 BINDER       0.53      0.28      0.37       216
   REINSTATEMENT NOTICE       0.00      0.00      0.00        78
      EXPIRATION NOTICE       1.00      0.03      0.06       180
INTENT TO CANCEL NOTICE       0.74      0.61      0.67      3191
            APPLICATION       0.90      0.80      0.

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [30]:
# Bernouli
name = "Bernoulli Naive Bayes"
model, acc, f1 = benchmark(BernoulliNB(), name)
# add_model(result, name, model, acc, f1)

Training: Bernoulli Naive Bayes
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
Accuracy: 0.68781481
                         precision    recall  f1-score   support

   DELETION OF INTEREST       0.11      0.47      0.18        72
         RETURNED CHECK       0.94      0.74      0.83      5729
                   BILL       0.10      0.53      0.17        88
          POLICY CHANGE       0.86      0.66      0.75      2707
    CANCELLATION NOTICE       0.72      0.61      0.66      2885
            DECLARATION       0.61      0.82      0.70       260
     CHANGE ENDORSEMENT       0.15      0.48      0.23       282
     NON-RENEWAL NOTICE       0.88      0.75      0.81      1417
                 BINDER       0.42      0.69      0.52       216
   REINSTATEMENT NOTICE       0.11      0.56      0.19        78
      EXPIRATION NOTICE       0.54      0.79      0.64       180
INTENT TO CANCEL NOTICE       0.77      0.58      0.66      3191
            APPLICATION       

In [31]:
# KNN 
name = "kNN"
model, acc, f1 = benchmark(KNeighborsClassifier(n_neighbors=10), name)
# add_model(result, name, model, acc, f1)

Training: kNN
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')
Accuracy: 0.72880720
                         precision    recall  f1-score   support

   DELETION OF INTEREST       0.74      0.35      0.47        72
         RETURNED CHECK       0.84      0.81      0.82      5729
                   BILL       0.42      0.06      0.10        88
          POLICY CHANGE       0.48      0.83      0.61      2707
    CANCELLATION NOTICE       0.83      0.64      0.72      2885
            DECLARATION       0.89      0.82      0.86       260
     CHANGE ENDORSEMENT       0.46      0.06      0.10       282
     NON-RENEWAL NOTICE       0.89      0.86      0.88      1417
                 BINDER       0.86      0.56      0.68       216
   REINSTATEMENT NOTICE       0.77      0.29      0.43        78
      EXPIRATION NOTICE       0.90      0.50      0.64       180
INTENT TO CANCEL NO

In [24]:
result

{'Logistic Regression': {'accuracy': 0.85280248633586964,
  'f1': 0.66120911770769764,
  'model': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False)},
 'Ridge Classifier': {'accuracy': 0.87337905905047686,
  'f1': 0.73661431176238257,
  'model': RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
          max_iter=None, normalize=False, random_state=None, solver='auto',
          tol=0.001)},
 'SVM classifier': {'accuracy': 0.87568320651591469,
  'f1': 0.75454958827038499,
  'model': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
       intercept_scaling=1, loss='squared_hinge', max_iter=1000,
       multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
       verbose=0)}}

In [27]:
# Ensemble models
def filter_cols(a, b, c):
    
    if c == b:
        return c
    elif c == a:
        return c
    elif b == a:
        return b
    else:
        return c
    
def ensemble_models(result, x_test, y_test):
    
    y_pred = {}
    
    keys = list(result.keys())
    
    for name in keys:
        
        model = result[name]['model']
        y_pred[name] = model.predict(x_test)
        
    y_pred_df = pd.DataFrame(y_pred)
    y_pred_ens = y_pred_df.apply(lambda row: filter_cols(row[keys[0]], row[keys[1]], row[keys[2]]), axis=1)
    y_pred_ens = np.array(y_pred_ens)
    
    print('Training: {0}'.format('Ensemble Method'))
    
#     accuracy
    acc = accuracy_score(y_test, y_pred_ens)
    print('Accuracy: %.08f'%acc)
    
    f1 = f1_score(y_test, y_pred_ens, average='macro')
    print('F1: %.08f'%f1)
    
#     report  
    print(classification_report(y_test, y_pred_ens, target_names=target_names))
    
#     confusion matrix
    print(confusion_matrix(y_test, y_pred_ens))
    
    return y_pred_ens, acc, f1

In [28]:
# Get all result
y_pred, acc, f1 = ensemble_models(result, x_test, y_test)

Training: Ensemble Method
Accuracy: 0.87311113
F1: 0.73770658
                         precision    recall  f1-score   support

   DELETION OF INTEREST       0.83      0.73      0.78        60
         RETURNED CHECK       0.90      0.91      0.90      5717
                   BILL       0.30      0.07      0.12        82
          POLICY CHANGE       0.82      0.91      0.86      2652
    CANCELLATION NOTICE       0.84      0.88      0.86      2948
            DECLARATION       0.92      0.81      0.87       257
     CHANGE ENDORSEMENT       0.60      0.15      0.24       282
     NON-RENEWAL NOTICE       0.93      0.89      0.91      1433
                 BINDER       0.88      0.71      0.79       237
   REINSTATEMENT NOTICE       0.63      0.27      0.38        71
      EXPIRATION NOTICE       0.94      0.84      0.89       202
INTENT TO CANCEL NOTICE       0.85      0.85      0.85      3173
            APPLICATION       0.95      0.95      0.95      1337
            BILL BINDER    

In [29]:
# Handle special cases:
# BILL, CHANGE ENDORSEMENT -misclassified as-> RETURNED CHECK, POLICY CHANGE

In [32]:
# use keras and lstm for classification
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb
from keras.models import load_model
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [38]:
from keras.utils.np_utils import to_categorical

In [39]:
targets = ['BILL', 'CHANGE ENDORSEMENT', 'RETURNED CHECK', 'POLICY CHANGE']

In [40]:
y_target_col = y.map(lambda x: x in targets)
# Only consider 4 types
x_target = x[y_target_col]
y_target = y[y_target_col]

In [41]:
maxlen = 150
batch_size = 32

print('Loading data...')

# Load data from x and y
x_target_train, x_target_test, y_target_train, y_target_test = train_test_split(x_target, y_target, test_size=0.3)
print(x_target_train.shape[0], 'train sequences')
print(x_target_test.shape[0], 'test sequences')

# Tokenizer
print ('Tokenize data...')
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(x_target_train)
x_target_train = tokenizer.texts_to_sequences(x_target_train)
x_target_test = tokenizer.texts_to_sequences(x_target_test)

# Encoding categorical target
print ('Encoding categorical target...')
lb_encoder = LabelEncoder()
y_target_train = lb_encoder.fit_transform(y_target_train)
y_target_test = lb_encoder.transform(y_target_test)

# Pad sequences
print ('Pad sequences...')
max_features = 795980
print('Pad sequences (samples x time)')
x_target_train = sequence.pad_sequences(x_target_train, maxlen=maxlen)
x_target_test = sequence.pad_sequences(x_target_test, maxlen=maxlen)
print('x_train shape:', x_target_train.shape)
print('x_test shape:', x_target_test.shape)
y_target_train = np.array(y_target_train)
y_target_test = np.array(y_target_test)

# Change target into matrix
y_target_train_cat = to_categorical(y_target_train)
y_target_test_cat = to_categorical(y_target_test)

model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.7))
model.add(Dense(4, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss = 'categorical_crossentropy', optimizer = 'rmsprop',  metrics=['accuracy'])

print('Train...')
model.fit(x_target_train, 
          y_target_train_cat,
          batch_size=batch_size,
          epochs=4,
          validation_data=[x_target_test, y_target_test_cat])

Loading data...
21863 train sequences
9370 test sequences
Tokenize data...
Encoding categorical target...
Pad sequences...
Pad sequences (samples x time)
x_train shape: (21863, 150)
x_test shape: (9370, 150)
Train...


  num_elements)


Train on 21863 samples, validate on 9370 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x10aa62b00>

### Ensemble model + Keras LSTM model

In [34]:
with open('vectorizer1.pkl', 'rb') as file:
    vectorizer1 = pickle.load(file)
# vectorizer1 = vectorizer

with open('vectorizer2.pkl', 'rb') as file:
    vectorizer2 = pickle.load(file)

with open('label_encoder.pkl', 'rb') as file:
    label_encoder = pickle.load(file)

# with open('ensemble_model.pkl', 'rb') as file:
#     ensemble_model = pickle.load(file)

ensemble_model = {}
for name in result.keys():
    ensemble_model[name] = result[name]['model']

# keras_model = load_model('keras_model.h5')

candidates = ['RETURNED CHECK', 'POLICY CHANGE']
targets = ['DELETION OF INTEREST', 'BILL',
           'CANCELLATION NOTICE', 'DECLARATION', 'CHANGE ENDORSEMENT',
           'NON-RENEWAL NOTICE', 'BINDER', 'REINSTATEMENT NOTICE',
           'EXPIRATION NOTICE', 'INTENT TO CANCEL NOTICE', 'APPLICATION',
           'BILL BINDER']

In [35]:
ensemble_model

{'Logistic Regression': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'Ridge Classifier': RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
         max_iter=None, normalize=False, random_state=None, solver='auto',
         tol=0.001),
 'SVM classifier': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='squared_hinge', max_iter=1000,
      multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
      verbose=0)}

In [44]:
""""
Get final result

"""


def get_result(words):
    x = pd.Series(words)
    x_train = vectorizer1.transform(x)
    y = get_result_ensemble(x_train)
    if y in candidates:
        try:
            y_lstm = get_result_lstm(words)
            if y_lstm in targets:
                y = y_lstm
        except:
            pass
    return y


"""
Get ensemble methods result
"""


def get_result_ensemble(x):
    a = ensemble_model['Logistic Regression']
    b = ensemble_model['Ridge Classifier']
    c = ensemble_model['SVM classifier']

    a_res = a.predict(x).tolist()[0]
    b_res = b.predict(x).tolist()[0]
    c_res = c.predict(x).tolist()[0]

    res = filter_cols(a_res, b_res, c_res)

    return res


"""
Filter result from 3 models
"""


def filter_cols(a, b, c):
    if c == b:
        return c
    elif c == a:
        return c
    elif b == a:
        return b
    else:
        return c


"""
For specific 4 types using lstm
"""


def get_result_lstm(x):
    x_train = vectorizer2.texts_to_sequences([x])
    x_train = sequence.pad_sequences(x_train, maxlen=150)

    x_train = np.array(x_train)
    x_train = x_train.reshape((1, 150))
    y_pred_prob = keras_model.predict(x_train)

    index = np.argmax(y_pred_prob)
    y_pred = label_encoder.inverse_transform(index)
    return y_pred

In [None]:
y_test_pred = x_test_raw.map(lambda x: get_result(x))

In [None]:
acc = accuracy_score(y_test, y_test_pred)
acc 
print ('Accuracy: %.08f'%acc)

In [None]:
# Building a pipeline

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

estimators = [("tfidf", TfidfVectorizer()), ("lr", LogisticRegression())]
model = Pipeline(estimators)


# Grid search
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
params = {"lr__C":[0.01, 0.1, 1, 10], #regularization param of logistic regression
          "tfidf__min_df": [200, 300], #min count of words 
          "tfidf__ngram_range": [(1,1), (1,2)], #1-grams or 2-grams
          }

grid = GridSearchCV(estimator=model, param_grid=params, scoring="roc_auc")  # use the default 3-fold cross validation
grid.fit(raw_train.values.astype('U'), labels)
print("The best paramenter set is : \n", grid.best_params_)