In [1]:
"""Module to load data.
Consists of functions to load data from a CSV do the following:
    - Read the required fields (texts and labels).
    - Do any pre-processing if required. For example, make sure all label
        values are in range [0, num_classes-1].
    - Split the data into training and validation sets.
    - Shuffle the training data.
"""
import os
import pandas as pd
import numpy as np
import sys
from time import gmtime, strftime
import argparse

FLAGS = None

def load_data_from_csv (data_path,
                        validation_split=0.2,
                        seed=123):
    """Loads the mortgage customer complaint dataset.
        # Arguments
            data_path: string, path to the data directory.
            validation_split: float, percentage of data to use for validation.
            seed: int, seed for randomizer.
        # Returns
            A tuple of training and validation data.
            Number of training samples: 
            Number of test samples: 
            Number of categories:  
        # References
            https://www.kaggle.com/tmorrison/mortgage-complaints/data
            Download and uncompress archive from:
            https://www.kaggle.com/cfpb/us-consumer-finance-complaints/downloads/consumer_complaints.csv/1
        """
    print('load_data_from_csv is called: ',data_path)
    columns = (1, 5)  # 1 - product, 5 - consumer_complaint_narrative.
    data = _load_and_shuffle_data(data_path,'consumer_complaints.csv', columns, seed)
    print('data is loaded')
    print(len(data[0][0]),len(data[1][0]))

    return data

def _load_and_shuffle_data(data_path,
                           file_name,
                           cols,
                           seed,
                           separator=',',
                           header=0):
    """Loads and shuffles the dataset using pandas.
    # Arguments
        data_path: string, path to the data directory.
        file_name: string, name of the data file.
        cols: list, columns to load from the data file.
        seed: int, seed for randomizer.
        separator: string, separator to use for splitting data.
        header: int, row to use as data header.
    """
    print('_load_and_shuffle_data is called: file_name: step 1 ', data_path, file_name)
    np.random.seed(seed)
    #data_path = os.path.join(data_path, file_name)
    data_path = './data/consumer_complaints.csv.zip'
    print('_load_and_shuffle_data is called: data_path:', data_path)
    data = pd.read_csv(data_path, compression='zip',usecols=cols, sep=separator,dtype={'consumer_complaint_narrative': object})
    data = data.dropna(axis=0, how='any')
    print('column names', data.columns)
    #data = pd.read_csv(data_path, compression='zip', dtype={'consumer_complaint_narrative': object})
    data = data.reindex(np.random.permutation(data.index))
    texts = list(data['consumer_complaint_narrative'])
    labels = np.array(data['product'])
    print('length of texts', len(texts))
    print('length of ccn',len(labels))
    return _split_training_and_validation_sets(texts, labels, .2)


def _split_training_and_validation_sets(texts, labels, validation_split):
    """Splits the texts and labels into training and validation sets.
    # Arguments
        texts: list, text data.
        labels: list, label data.
        validation_split: float, percentage of data to use for validation.
    # Returns
        A tuple of training and validation data.
    """
    num_training_samples = int((1 - validation_split) * len(texts))
    return ((texts[:num_training_samples], labels[:num_training_samples]),
            (texts[num_training_samples:], labels[num_training_samples:]))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='./data',
                        help='input data directory')
    FLAGS, unparsed = parser.parse_known_args()

    # Using the IMDb movie reviews dataset to demonstrate training n-gram model
    data = load_data_from_csv(FLAGS.data_dir)

load_data_from_csv is called:  ./data
_load_and_shuffle_data is called: file_name: step 1  ./data consumer_complaints.csv
_load_and_shuffle_data is called: data_path: ./data/consumer_complaints.csv.zip
column names Index(['product', 'consumer_complaint_narrative'], dtype='object')
length of texts 66806
length of ccn 66806
data is loaded
53444 13362


In [2]:
"""Module to explore data.
Contains functions to help study, visualize and understand datasets.
"""

import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

def get_num_classes(labels):
    """Gets the total number of classes.
    # Arguments
        labels: list, label values.
            There should be at lease one sample for values in the
            range (0, num_classes -1)
    # Returns
        int, total number of classes.
    # Raises
        ValueError: if any label value in the range(0, num_classes - 1)
            is missing or if number of classes is <= 1.
    """
    num_classes = max(labels) + 1
    missing_classes = [i for i in range(num_classes) if i not in labels]
    if len(missing_classes):
        raise ValueError('Missing samples with label value(s) '
                         '{missing_classes}. Please make sure you have '
                         'at least one sample for every label value '
                         'in the range(0, {max_class})'.format(
                            missing_classes=missing_classes,
                            max_class=num_classes - 1))

    if num_classes <= 1:
        raise ValueError('Invalid number of labels: {num_classes}.'
                         'Please make sure there are at least two classes '
                         'of samples'.format(num_classes=num_classes))
    return num_classes


def get_num_words_per_sample(sample_texts):
    """Gets the median number of words per sample given corpus.
    # Arguments
        sample_texts: list, sample texts.
    # Returns
        int, median number of words per sample.
    """
    print('inside get_num_words_per_sample_:')
    num_words = [len(s.split()) for s in sample_texts]
    return np.median(num_words)


def plot_frequency_distribution_of_ngrams(sample_texts,
                                          ngram_range=(1, 2),
                                          num_ngrams=50):
    """Plots the frequency distribution of n-grams.
    # Arguments
        samples_texts: list, sample texts.
        ngram_range: tuple (min, mplt), The range of n-gram values to consider.
            Min and mplt are the lower and upper bound values for the range.
        num_ngrams: int, number of n-grams to plot.
            Top `num_ngrams` frequent n-grams will be plotted.
    """
    # Create args required for vectorizing.
    kwargs = {
            'ngram_range': (1, 1),
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': 'word',  # Split text into word tokens.
    }
    vectorizer = CountVectorizer(**kwargs)
    print('vectorizer: ',vectorizer)
    # This creates a vocabulary (dict, where keys are n-grams and values are
    # idxices). This also converts every text to an array the length of
    # vocabulary, where every element idxicates the count of the n-gram
    # corresponding at that idxex in vocabulary.
    vectorized_texts = vectorizer.fit_transform(sample_texts)

    # This is the list of all n-grams in the index order from the vocabulary.
    all_ngrams = list(vectorizer.get_feature_names())
    num_ngrams = min(num_ngrams, len(all_ngrams))
    # ngrams = all_ngrams[:num_ngrams]

    # Add up the counts per n-gram ie. column-wise
    all_counts = vectorized_texts.sum(axis=0).tolist()[0]

    # Sort n-grams and counts by frequency and get top `num_ngrams` ngrams.
    all_counts, all_ngrams = zip(*[(c, n) for c, n in sorted(
        zip(all_counts, all_ngrams), reverse=True)])
    ngrams = list(all_ngrams)[:num_ngrams]
    counts = list(all_counts)[:num_ngrams]

    idx = np.arange(num_ngrams)
    plt.bar(idx, counts, width=0.8, color='b')
    plt.xlabel('N-grams')
    plt.ylabel('Frequencies')
    plt.title('Frequency distribution of n-grams')
    plt.xticks(idx, ngrams, rotation=45)
    plt.show()


def plot_sample_length_distribution(sample_texts):
    """Plots the sample length distribution.
    # Arguments
        samples_texts: list, sample texts.
    """
    plt.hist([len(s) for s in sample_texts], 50)
    plt.xlabel('Length of a sample')
    plt.ylabel('Number of samples')
    plt.title('Sample length distribution')
    plt.show()


def plot_class_distribution(labels):
    """Plots the class distribution.
    # Arguments
        labels: list, label values.
            There should be at lease one sample for values in the
            range (0, num_classes -1)
    """
    num_classes = get_num_classes(labels)
    count_map = Counter(labels)
    counts = [count_map[i] for i in range(num_classes)]
    idx = np.arange(num_classes)
    plt.bar(idx, counts, width=0.8, color='b')
    plt.xlabel('Class')
    plt.ylabel('Number of samples')
    plt.title('Class distribution')
    plt.xticks(idx, idx)
    plt.show()

In [3]:
"""Module to vectorize data.
Converts the given training and validation texts into numerical tensors.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

import numpy as np
import sys
# Vectorization parameters

# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

# Limit on the length of text sequences. Sequences longer than this
# will be truncated.
MAX_SEQUENCE_LENGTH = 500


def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as ngram vectors.
    1 text = 1 tf-idf vector the length of vocabulary of uni-grams + bi-grams.
    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.
    # Returns
        x_train, x_val: vectorized training and validation texts
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train)
    x_val = selector.transform(x_val)

    x_train = x_train.astype('float32')
    x_val = x_val.astype('float32')
    return x_train, x_val

def vectorize_labels(labels):

    unique_labels = np.unique(labels)

    x= dict(zip(unique_labels,range(0,len(unique_labels))))
    print('x is ',x)
    print('label 0',labels[0])
    print('label 1',labels[1])
    print(x[labels[0]])
    print(x[labels[1]])
    y_labels = np.array([])
    for i in range(len(labels)):
      y_labels=np.append(y_labels,x[labels[i]])

    print(len(labels))
    print(len(y_labels))

    return  y_labels

In [4]:
"""Module to create model.
Helper functions to create a multi-layer perceptron model and a separable CNN
model. These functions take the model hyper-parameters as input. This will
allow us to create model instances with slightly varying architectures.
"""
from tensorflow.python.keras import models

from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout

import numpy as np

def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    """Creates an instance of a multi-layer perceptron model.
    # Arguments
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of the layers.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.
    # Returns
        An MLP model instance.
    """
    print('==================== BUILDING THE MLP MODEL ================================ ')

    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
    model = models.Sequential()
    model.add(Dropout(rate=dropout_rate, input_shape=input_shape))

    print('============================ UNITS: ', units)
    print('============================ DROPOUT RATE: ', dropout_rate)
    print('============================ LAYERS: ', layers)
    print('============================ INPUT SHAPE: ', input_shape)
    print('============================ NUM CLASSES: ', num_classes)
    for _ in range(layers-1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))
        print('==================IN LOOP ========== DROPOUT RATE: ', dropout_rate)

    #model.add(Flatten())
    print('============================ OP UNITS: ', op_units)
    print('============================ OP ACTIVATION: ', op_activation)
    model.add(Dense(units=op_units, activation=op_activation))
    print(model)
    return model


def _get_last_layer_units_and_activation(num_classes):
    """Gets the # units and activation function for the last network layer.
    # Arguments
        num_classes: int, number of classes.
    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation

In [5]:
"""Module to train n-gram model.
Vectorizes training and validation texts into n-grams and uses that for
training a n-gram model - a simple multi-layer perceptron model. We use n-gram
model for text classification when the ratio of number of samples to number of
words per sample for the given dataset is very small (<~1500).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys
#import time
from time import gmtime, strftime
import tensorflow as tf
import numpy as np

#import build_model_kj
#import load_data_ratna
#import load_data_kj

#import vectorize_data_kj
#import explore_data_kj

FLAGS = None


def train_ngram_model(data,
                      learning_rate=1e-3,
                      epochs=5,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.2):
    """Trains n-gram model on the given dataset.
    # Arguments
        data: tuples of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of Dense layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.
    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    print('==================== BEGINNING THE TRAINING ================================ ')
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    #num_classes = explore_data_kj.get_num_classes(train_labels)
    #print('printing num clsses: ',num_classes)
    num_classes = 11
   # unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    #if len(unexpected_labels):
   #     raise ValueError('Unexpected label values found in the validation set:'
    #                     ' {unexpected_labels}. Please make sure that the '
    #                     'labels in the validation set are in the same range '
    #                     'as training labels.'.format(
   #                          unexpected_labels=unexpected_labels))
    print('==================== VECTORIZING THE TEXTS ================================ ')

    # Vectorize texts.
    train_labels=vectorize_labels(train_labels)
    val_labels = vectorize_labels(val_labels)

    x_train, x_val = ngram_vectorize(
        train_texts, train_labels, val_texts)

    print('==================== CREATING A MODEL INSTANCE ================================ ')

    # Create model instance.
    model = mlp_model(layers=layers,
                                  units=units,
                                  dropout_rate=dropout_rate,
                                  input_shape=x_train.shape[1:],
                                  num_classes=num_classes)

    # Compile model with learning parameters.
    print('==================== COMPILING THE MODEL WITH LEARNING PARAMS ================================ ')

    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate,amsgrad=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

    # Train and validate model.
    print('==================== TRAINING AND VALIDATING THE MODEL ================================ ')
    print('x_train',x_train[0])
    print('train labels',train_labels[0])

    history = model.fit(
            x_train,
            train_labels,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(x_val, val_labels),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # evaluate the model
   # scores = model.evaluate(X, Y, verbose=0)
   # print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

    # serialize model to JSON
    model_json = model.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights("model.h5")

    # Save model.
    model_name='./models/compliants-mgmt-mlp_' +strftime("%d_%b_%Y_%H_%M_%S", gmtime())+'.h5'
    #model_name.append(model_name,str(strftime("%d_%b_%Y_%H_%M_%S", gmtime())))
    #model_name.append(model_name,'.h5')
    print(model_name)
    model.save(model_name)


    return history['val_acc'][-1], history['val_loss'][-1]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='./data',
                        help='input data directory')
    FLAGS, unparsed = parser.parse_known_args()

    # Using the IMDb movie reviews dataset to demonstrate training n-gram model
    #data = load_data_ratna.load_data_from_csv(FLAGS.data_dir)
    starttime = gmtime()
    print('start time ',strftime("%d_%b_%Y_%H_%M_%S",starttime ))
    data = load_data_from_csv(123)
    print('data loaded and back to train ===============')
    train_ngram_model(data)
    endtime = gmtime()
    print('end time ',strftime("%d_%b_%Y_%H_%M_%S",endtime ))
   # print('total time taken in mins ',(strftime("%M_%S",endtime-starttime)) )

start time  13_Aug_2018_18_18_39
load_data_from_csv is called:  123
_load_and_shuffle_data is called: file_name: step 1  123 consumer_complaints.csv
_load_and_shuffle_data is called: data_path: ./data/consumer_complaints.csv.zip
column names Index(['product', 'consumer_complaint_narrative'], dtype='object')
length of texts 66806
length of ccn 66806
data is loaded
53444 13362
x is  {'Bank account or service': 0, 'Consumer Loan': 1, 'Credit card': 2, 'Credit reporting': 3, 'Debt collection': 4, 'Money transfers': 5, 'Mortgage': 6, 'Other financial service': 7, 'Payday loan': 8, 'Prepaid card': 9, 'Student loan': 10}
label 0 Debt collection
label 1 Student loan
4
10
53444
53444
x is  {'Bank account or service': 0, 'Consumer Loan': 1, 'Credit card': 2, 'Credit reporting': 3, 'Debt collection': 4, 'Money transfers': 5, 'Mortgage': 6, 'Other financial service': 7, 'Payday loan': 8, 'Prepaid card': 9, 'Student loan': 10}
label 0 Consumer Loan
label 1 Student loan
1
10
13362
13362
<tensorflow.

In [6]:
from tensorflow.python.keras import models
import numpy as np
from keras.models import model_from_json
from keras.models import load_model
from sklearn.feature_extraction.text import TfidfVectorizer
#import build_model_kj

def predict_kj(input=None):
    #model = models.Sequential()
    model = load_model('./models/compliants-mgmt-mlp_13_Aug_2018_17_51_16.h5')
    print('==================== PREPARTING A PREDICTION KJ ================================ ')
    #x = np.array([])
    #x = np.append(x,'SOMEONE USE MY INFORMATION. THIS ISNT MY ACCOUNT.')
    print(model.predict_classes(input))


### Feed in test data
test = ['SOMEONE USE MY INFORMATION. THIS ISNT MY ACCOUNT.','I am not sure why I received an OTP']
test_df = pd.DataFrame(test)
test_data = test_df[0]

if __name__ == '__main__':
    print('Calling predict')
    (train_texts, train_labels), (val_texts, val_labels) = data
    x_train, x_test = ngram_vectorize(
        train_texts, train_labels, test_data)
    print('Prediction Successful')
    predict_kj(x_test)

    # try this https://www.opencodez.com/python/text-classification-using-keras.htm


Using TensorFlow backend.


Calling predict
Prediction Successful
[3 4]
