## Import Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
df = pd.read_pickle('df_train_api.pk')

In [3]:
df.head()

Unnamed: 0,groups,type,data,label,coords
270,ACME VISUAL SYSTEMS,"(acm visual system, text,text,text, 2337, 1653)",14754,0,"[511.0, 143.0, 1217.0, 189.0, 864.0, 166.0]"
271,TAX INVOICE,"(tax invoic, text,text, 2337, 1653)",14754,0,"[717.0, 264.0, 877.0, 284.0, 797.0, 274.0]"
272,ACME VISUAL SYSTEMS,"(acm visual system, text,text,text, 2337, 1653)",14754,0,"[166.0, 351.0, 435.0, 369.0, 300.5, 360.0]"
273,"808, JANTA FLAT, GTB ENCLAVE, NANAD NAGARI","(number janta flat gtb enclav nanad nagari, nu...",14754,0,"[167.0, 384.0, 712.0, 403.0, 439.5, 393.5]"
274,DELHI-110093,"(delhi-110093, text, 2337, 1653)",14754,0,"[168.0, 417.0, 323.0, 436.0, 245.5, 426.5]"


In [4]:
X = df[['groups']]
y = df['label']

In [5]:
X.head()

Unnamed: 0,groups
270,ACME VISUAL SYSTEMS
271,TAX INVOICE
272,ACME VISUAL SYSTEMS
273,"808, JANTA FLAT, GTB ENCLAVE, NANAD NAGARI"
274,DELHI-110093


## Preprocessing data (word to vector)

In [6]:
from keras.preprocessing.text import Tokenizer
max_features = 4000
tokenizer = Tokenizer(num_words=max_features, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', split=' ', lower=True, )
tokenizer.fit_on_texts(X['groups'].values)

X = tokenizer.texts_to_sequences(X['groups'].values)

# add padding
from keras.preprocessing.sequence import pad_sequences
X = pad_sequences(X, maxlen=40)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Test Train Split

In [7]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Evaluation Function

In [11]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [12]:
def print_score(clf, X_train, X_test, y_train, y_test, train = True):
#     print accuracy score, classification report, confusion metrics
    if train:
#         training performance
        print('Train Result : \n')
        print('Accuracy Score {0:.4f}\n'.format(accuracy_score(y_train, clf.predict(X_train))))
        print('Classification Report : \n {} \n'.format(classification_report(y_train, clf.predict(X_train))))
        print('Confusion Metrics : \n {} \n'.format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv = 10, scoring='accuracy')
        print('Average Accuracy : {0:.4f}\n'.format(np.mean(res)))
        print('Accuracy SD : {0:.4f}\n'.format(np.std(res)))
        
    elif train == False:
#         test performance
        print('Test Result : \n')
        print('Accuracy Score {0:.4f}\n'.format(accuracy_score(y_test, clf.predict(X_test))))
        print('Classification Report : \n {}\n'.format(classification_report(y_test, clf.predict(X_test))))
        print('Confusion Metrics : \n {} \n'.format(confusion_matrix(y_test, clf.predict(X_test))))


## Using Bagging Clasifier with Decision Tree

In [13]:
from sklearn.ensemble import BaggingClassifier

In [14]:
bag_clf = BaggingClassifier(n_estimators=300)

In [15]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=300, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [16]:
print_score(bag_clf, X_train, X_test, y_train, y_test, train = True)
print_score(bag_clf, X_train, X_test, y_train, y_test, train = False)

Train Result : 

Accuracy Score 0.9932

Classification Report : 
              precision    recall  f1-score   support

          0       0.99      1.00      1.00      3591
          1       0.97      0.94      0.95        62
          2       1.00      0.95      0.97        58
          8       0.93      0.87      0.90        15
         14       0.94      0.85      0.89        20
         18       0.98      0.82      0.90        57

avg / total       0.99      0.99      0.99      3803
 

Confusion Metrics : 
 [[3587    2    0    0    1    1]
 [   4   58    0    0    0    0]
 [   3    0   55    0    0    0]
 [   2    0    0   13    0    0]
 [   2    0    0    1   17    0]
 [  10    0    0    0    0   47]] 

Average Accuracy : 0.9435

Accuracy SD : 0.0084

Test Result : 

Accuracy Score 0.9443

Classification Report : 
              precision    recall  f1-score   support

          0       0.96      0.98      0.97       902
          1       0.50      0.57      0.53         7
        