## Importing Libraries

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
import pickle as pkl

In [34]:
df = pd.read_pickle('df_train_api.pk')

In [35]:
df.head()

Unnamed: 0,groups,type,data,label,coords
270,ACME VISUAL SYSTEMS,"(acm visual system, text,text,text, 2337, 1653)",14754,0,"[511.0, 143.0, 1217.0, 189.0, 864.0, 166.0]"
271,TAX INVOICE,"(tax invoic, text,text, 2337, 1653)",14754,0,"[717.0, 264.0, 877.0, 284.0, 797.0, 274.0]"
272,ACME VISUAL SYSTEMS,"(acm visual system, text,text,text, 2337, 1653)",14754,0,"[166.0, 351.0, 435.0, 369.0, 300.5, 360.0]"
273,"808, JANTA FLAT, GTB ENCLAVE, NANAD NAGARI","(number janta flat gtb enclav nanad nagari, nu...",14754,0,"[167.0, 384.0, 712.0, 403.0, 439.5, 393.5]"
274,DELHI-110093,"(delhi-110093, text, 2337, 1653)",14754,0,"[168.0, 417.0, 323.0, 436.0, 245.5, 426.5]"


In [36]:
df = df[df.label != 0]

In [37]:
df.shape

(261, 5)

In [38]:
# df[df['label'] == 18]

In [39]:
X = df[['groups']]
X.head()

Unnamed: 0,groups
278,GS-032
281,22/08/2017
310,"Rs16,640"
345,N-10-01/2016-17
346,10-11-2017


In [40]:
y = df['label']

## Word tokenizer

In [41]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

In [42]:
c_vectorizer = CountVectorizer()
h_vectorizer = HashingVectorizer()
f_vectorizer = TfidfVectorizer()

In [43]:
h_vectorizer.fit(df['groups'].values)

HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 1), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None, strip_accents=None,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None)

In [44]:
X = h_vectorizer.transform(df['groups'])
X = X.toarray()
X.shape

(261, 1048576)

## Import PCA

In [45]:
from sklearn.decomposition import PCA

In [46]:
pca = PCA()

In [None]:
pca.fit(X)

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_)*100)
plt.xlabel("No. of components")
plt.ylabel("cummulative explained Variance");

In [18]:
X_pca = pca.transform(X)

In [19]:
X_pca.shape

(4754, 300)

## Train test split and scaling

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size = 0.2, random_state = 42)

In [22]:
# sc = StandardScaler()
# sc.fit(X_train)
# X_train_sc = sc.transform(X_train)
# X_test_sc = sc.transform(X_test)

## Evaluation function

In [23]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [24]:
def print_score(clf, X_train, X_test, y_train, y_test, train = True):
#     print accuracy score, classification report, confusion metrics
    if train:
#         training performance
        print('Train Result : \n')
        print('Accuracy Score {0:.4f}\n'.format(accuracy_score(y_train, clf.predict(X_train))))
        print('Classification Report : \n {} \n'.format(classification_report(y_train, clf.predict(X_train))))
        print('Confusion Metrics : \n {} \n'.format(confusion_matrix(y_train, clf.predict(X_train))))
        
#         res = cross_val_score(clf, X_train, y_train, cv = 10, scoring='accuracy')
#         print('Average Accuracy : {0:.4f}\n'.format(np.mean(res)))
#         print('Accuracy SD : {0:.4f}\n'.format(np.std(res)))
        
    elif train == False:
#         test performance
        print('Test Result : \n')
        print('Accuracy Score {0:.4f}\n'.format(accuracy_score(y_test, clf.predict(X_test))))
        print('Classification Report : \n {}\n'.format(classification_report(y_test, clf.predict(X_test))))
        print('Confusion Metrics : \n {} \n'.format(confusion_matrix(y_test, clf.predict(X_test))))


## Model training

In [25]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [26]:
clf = BaggingClassifier(n_estimators=1000, base_estimator=RandomForestClassifier())

In [27]:
# clf.fit(X_train, y_train)