In [76]:
import logging
from numpy import random
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import warnings 
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt  
from wordcloud import WordCloud # data visualization library
#text features can be constructed using assorted techniques – Bag of Words, TF-IDF, and Word Embeddings.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer #tfidf and Bag-of-Words Features.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.corpus import stopwords
import gensim # library for word2vec
%matplotlib inline
pd.set_option("display.max_colwidth", 200) 
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [77]:
train = pd.read_csv('train_data_processed.csv')

In [78]:
train

Unnamed: 0,txn_id,description,payee_merchant,Category,expense_description
0,HPCTU2018224161563211041797HP,airport to residence,Meru Cabs Head Office,Travelling Expenses,airport residence meru cabs head office
1,HPCTU20183141727182589567887HP,Super tile emporium Camelia sales Tirupati impex,local Delhi,Daily Field Allowance,super tile emporium camelia sales tirupati impex local delhi
2,HPCTU201822725998634726214HP,Avni Tiles Tile Art Office Sunny sanitation,Local Delhi,Daily Field Allowance,avni tiles tile art office sunny sanitation local delhi
3,HPCTU2018312017296722783314HP,Avni tiles Aggrawal Kamakshi mkt Vijay Laxmi,local Delhi,Daily Field Allowance,avni tiles aggrawal kamakshi mkt vijay laxmi local delhi
4,HPCTU20183181615331670677816HP,Avni tiles and sanitary Aggrawal Marble Bhatia and associates Vijay bath,Local Delhi,Daily Field Allowance,avni tiles sanitary aggrawal marble bhatia associates vijay bath local delhi
5,HPCTU2018312019113386801213HP,Office Resonanance Bath Hut Taneja paints,local Delhi,Daily Field Allowance,office resoance bath hut taneja paints local delhi
6,HPCTU20183181613542483272675HP,Le season Bindal ceramics Sign Marble Royal saffron,Local Delhi,Daily Field Allowance,season bindal ceramics sign marble royal saffron local delhi
7,HPCTU201822735101364413826HP,Shekhar Dayal Bhatia and associates Avni tiles,Local Delhi,Daily Field Allowance,shekhar dayal bhatia associates avni tiles local delhi
8,HPCTU201822737474227482363HP,Prakash Marble Super tiles emporium Camelia sales Tirupati impex,Local Delhi,Daily Field Allowance,prakash marble super tiles emporium camelia sales tirupati impex local delhi
9,HPCTU201822730545421584209HP,Vijay Enterprise Ashiana GBMS,Local Delhi,Daily Field Allowance,vijay enterprise ashiana gbms local delhi


In [79]:
train.Category.value_counts()

Daily Field Allowance       28715
Travelling Expenses         15141
Boarding                     5983
Lodging                      1470
Staff Welfare Expenses        844
Courier                       670
Meal Allowances               349
Sales Promotion Expenses       83
Name: Category, dtype: int64

In [80]:
train['expense_description'].astype(str).apply(lambda x: len(x.split(' '))).sum()

586212

In [81]:
train.isnull().sum()

txn_id                    0
description            1056
payee_merchant            7
Category                  0
expense_description       0
dtype: int64

In [35]:
# train = train[~train.expense_description.isnull()]
# train.to_csv('train_data_processed.csv', encoding='utf-8', index=False)

In [36]:
X = train.expense_description
y = train.Category
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42, stratify=y)

In [42]:
y_train.value_counts()

Daily Field Allowance       21536
Travelling Expenses         11356
Boarding                     4487
Lodging                      1102
Staff Welfare Expenses        633
Courier                       503
Meal Allowances               262
Sales Promotion Expenses       62
Name: Category, dtype: int64

# Naive Bayes Classifier for Multinomial Models

In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [38]:
my_categories = ['Daily Field Allowance', 'Travelling Expenses', 'Boarding', 'Lodging', 'Staff Welfare Expenses', 'Courier', 'Meal Allowances', 'Sales Promotion Expenses']

In [39]:
%%time
from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_categories))

accuracy 0.869986480396575
                          precision    recall  f1-score   support

   Daily Field Allowance       0.85      0.74      0.79      1496
     Travelling Expenses       0.99      0.53      0.69       167
                Boarding       0.89      0.99      0.93      7179
                 Lodging       1.00      0.11      0.20       368
  Staff Welfare Expenses       1.00      0.08      0.15        87
                 Courier       0.00      0.00      0.00        21
         Meal Allowances       1.00      0.59      0.74       211
Sales Promotion Expenses       0.83      0.83      0.83      3785

               micro avg       0.87      0.87      0.87     13314
               macro avg       0.82      0.48      0.54     13314
            weighted avg       0.87      0.87      0.86     13314

Wall time: 343 ms


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Linear Support Vector Machine

In [40]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [41]:
%%time
y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_categories))

accuracy 0.8444494517049722
                          precision    recall  f1-score   support

   Daily Field Allowance       0.90      0.80      0.84      1496
     Travelling Expenses       0.98      0.76      0.86       167
                Boarding       0.80      0.99      0.89      7179
                 Lodging       0.97      0.51      0.67       368
  Staff Welfare Expenses       1.00      0.08      0.15        87
                 Courier       0.00      0.00      0.00        21
         Meal Allowances       0.99      0.69      0.82       211
Sales Promotion Expenses       0.93      0.65      0.77      3785

               micro avg       0.84      0.84      0.84     13314
               macro avg       0.82      0.56      0.62     13314
            weighted avg       0.86      0.84      0.83     13314

Wall time: 304 ms


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=-1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])

In [48]:
%%time
y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_categories))

accuracy 0.9029592909719093
                          precision    recall  f1-score   support

   Daily Field Allowance       0.83      0.82      0.83      1496
     Travelling Expenses       0.96      0.92      0.94       167
                Boarding       0.94      0.95      0.95      7179
                 Lodging       0.72      0.66      0.69       368
  Staff Welfare Expenses       0.46      0.43      0.44        87
                 Courier       0.80      0.38      0.52        21
         Meal Allowances       0.97      0.91      0.94       211
Sales Promotion Expenses       0.87      0.88      0.88      3785

               micro avg       0.90      0.90      0.90     13314
               macro avg       0.82      0.74      0.77     13314
            weighted avg       0.90      0.90      0.90     13314

Wall time: 305 ms


# Doc2vec and Logistic Regression

In [49]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re

In [50]:
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the expense.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(gensim.models.doc2vec.TaggedDocument(v.split(), [label]))
    return labeled

In [51]:
X_train, X_test, y_train, y_test = train_test_split(train.expense_description, train.Category, random_state=0, test_size=0.25)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [53]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

100%|███████████████████████████████████████████████████████████████████████| 53255/53255 [00:00<00:00, 3814338.45it/s]


In [54]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=5)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|███████████████████████████████████████████████████████████████████████| 53255/53255 [00:00<00:00, 3807641.26it/s]
100%|███████████████████████████████████████████████████████████████████████| 53255/53255 [00:00<00:00, 3562198.54it/s]
100%|███████████████████████████████████████████████████████████████████████| 53255/53255 [00:00<00:00, 3559870.90it/s]
100%|███████████████████████████████████████████████████████████████████████| 53255/53255 [00:00<00:00, 3559984.37it/s]
100%|███████████████████████████████████████████████████████████████████████| 53255/53255 [00:00<00:00, 4106853.58it/s]
100%|███████████████████████████████████████████████████████████████████████| 53255/53255 [00:00<00:00, 4111994.62it/s]
100%|███████████████████████████████████████████████████████████████████████| 53255/53255 [00:00<00:00, 3345479.95it/s]
100%|███████████████████████████████████████████████████████████████████████| 53255/53255 [00:00<00:00, 1841463.32it/s]
100%|███████████████████████████████████

Wall time: 4min 2s


In [55]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors

In [56]:
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [57]:
%%time
logreg = LogisticRegression(n_jobs=-1, C=1e5)
logreg.fit(train_vectors_dbow, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=-1, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [58]:
%%time
logreg = logreg.fit(train_vectors_dbow, y_train)
y_pred = logreg.predict(test_vectors_dbow)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


In [59]:
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_categories))

accuracy 0.8765209553853087
                          precision    recall  f1-score   support

   Daily Field Allowance       0.86      0.83      0.84      1461
     Travelling Expenses       0.84      0.91      0.87       180
                Boarding       0.90      0.94      0.92      7161
                 Lodging       0.85      0.66      0.74       383
  Staff Welfare Expenses       0.53      0.17      0.26        94
                 Courier       0.12      0.31      0.17        16
         Meal Allowances       0.89      0.86      0.88       224
Sales Promotion Expenses       0.85      0.81      0.83      3795

               micro avg       0.88      0.88      0.88     13314
               macro avg       0.73      0.69      0.69     13314
            weighted avg       0.88      0.88      0.87     13314



# Word2vec and Logistic Regression

In [60]:
# Let’s train a Word2Vec model on our corpus.

tokenized_expense = train['expense_description'].apply(lambda x: x.split()) #tokenizing

model_w2v = gensim.models.Word2Vec(
            tokenized_expense,
            size=200, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2,
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers = 8, # no. of cores
            seed = 34)

model_w2v.train(tokenized_expense, total_examples= len(train['expense_description']), epochs = 20)

(9373400, 11724240)

In [61]:
model_w2v.wv.most_similar(positive='dinner')

[('restaurants', 0.5369762182235718),
 ('syeed', 0.530868411064148),
 ('lunch', 0.5235869288444519),
 ('sradha', 0.5230610966682434),
 ('vimrash', 0.522161602973938),
 ('mandakinee', 0.5182027816772461),
 ('breakfast', 0.514902651309967),
 ('sarovar', 0.5142581462860107),
 ('aahar', 0.5075251460075378),
 ('snacks', 0.5055705308914185)]

In [62]:
wv = model_w2v

In [63]:
wv.init_sims(replace=True)

In [65]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1,size))
    count = 0.
    for word in tokens:
        try:
            vec +=model_w2v[word].reshape((1,size))
            count +=1.
        except KeyError: # handling the case where the token is not in vocabulary 
                        continue
    if count !=0:
        vec /=count
    return vec

In [67]:
# Preparing word2vec feature set

wordvec_arrays = np.zeros((len(tokenized_expense), 200))
for i in range (len(tokenized_expense)):
    wordvec_arrays[i,:] = word_vector(tokenized_expense[i], 200)
    wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

(53255, 200)

In [68]:
%%time
from sklearn.linear_model import LogisticRegression

xtrain_w2v, xvalid_w2v, ytrain, yvalid = train_test_split(wordvec_df, train.Category, random_state=42, test_size=0.25, stratify=train.Category)

logreg = LogisticRegression(n_jobs=-1, C=1e5)
logreg = logreg.fit(xtrain_w2v, ytrain)
y_pred = logreg.predict(xvalid_w2v)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


Wall time: 30.8 s


In [69]:
print('accuracy %s' % accuracy_score(y_pred, yvalid))
print(classification_report(yvalid, y_pred,target_names=my_categories))

accuracy 0.9010815682739973
                          precision    recall  f1-score   support

   Daily Field Allowance       0.87      0.86      0.87      1496
     Travelling Expenses       0.96      0.90      0.93       167
                Boarding       0.92      0.97      0.94      7179
                 Lodging       0.92      0.65      0.76       368
  Staff Welfare Expenses       0.57      0.20      0.29        87
                 Courier       0.50      0.19      0.28        21
         Meal Allowances       0.96      0.90      0.93       211
Sales Promotion Expenses       0.88      0.84      0.86      3785

               micro avg       0.90      0.90      0.90     13314
               macro avg       0.82      0.69      0.73     13314
            weighted avg       0.90      0.90      0.90     13314



In [70]:
test = pd.read_csv('test_data.csv')

In [72]:
test.head()

Unnamed: 0,txn_id,description,payee_merchant,Category,expense_description
0,HPVCL2018125133122718952586HP,PROCESS AS PER REQUEST PLS SUBMIT UR TRAVEL CLAIM FOR EARLIER PERIOD TO PROCESS NEW MONEY REQUEST,Wallet load,Category not found,process per request pls submit travel claim earlier period process new money request wallet load
1,HPVCT201812610509887694821HP,,Fuel Daya Petroleum Calicut IN,Category not found,fuel daya petroleum calicut
2,HPVCT201812610513668632314HP,,Surcharge Fuel Daya Petroleum Calicut IN,Category not found,surcharge fuel daya petroleum calicut
3,HPVCT2018225641309134132757HP,,Fuel SHANKAR AGENCIES TIRUNELVELI IN,Category not found,fuel shankar agencies tirunelveli
4,HPVCT2018224102717887296487HP,,HOTEL JAYA KOZHIKODE IN,Category not found,hotel jaya kozhikode


In [75]:
# pred_w2v_lr = logreg.predict(test.expense_description)