In [11]:
import pandas as pd
import numpy as np
import os
import urllib
import scipy as sp
import scipy.io
from collections import Counter
import itertools
import re

from keras.models import Sequential, Model
from keras.layers import Input, Dense, Embedding, merge, Convolution2D, MaxPooling2D, Dropout, Merge
from keras.layers.core import Reshape, Flatten
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline

# Load the raw dump; later cells read its 'data.description' and 'target' columns.
data_us_o=pd.read_csv('dump.csv')

In [12]:
def one_hot_encoding(idx, num_classes=None):
    """
    Encode a 1-D sequence of integer category indices as a one-hot matrix.

    Parameters
    ----------
    idx : sequence of int
        Non-negative category indices, one per sample.
    num_classes : int, optional
        Number of columns of the result. When omitted it is inferred as
        ``max(idx) + 1`` (0 for empty input), which matches the historical
        behaviour. Pass it explicitly when ``idx`` is only a subset of the
        data and might not contain the highest category.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(idx), num_classes)`` with a single 1 per row.

    Raises
    ------
    ValueError
        If ``idx`` contains a negative index. NumPy would otherwise wrap it
        around and silently mark the last column (pandas category codes use
        -1 for missing values).
    """
    # Work on a platform-int array so that `max + 1` cannot overflow small
    # integer dtypes such as the int8 codes produced by pandas categoricals.
    idx = np.asarray(idx, dtype=np.intp)
    if idx.size and idx.min() < 0:
        raise ValueError("one_hot_encoding expects non-negative category indices")
    if num_classes is None:
        num_classes = int(idx.max()) + 1 if idx.size else 0
    y = np.zeros((idx.shape[0], num_classes))
    y[np.arange(idx.shape[0]), idx] = 1
    return y


def one_hot_decoding(mtx):
    """
    Turn a one-hot matrix back into category indices.

    Returns the column index of every non-zero entry of ``mtx`` in row-major
    order, i.e. one index per row when each row holds exactly one non-zero.
    """
    return np.asarray(mtx).nonzero()[1]


def clean_str(string):
    """
    Normalise a text: collapse each run of whitespace into a single space,
    trim both ends and lower-case the result.
    """
    collapsed = re.sub(r"\s+", " ", string)
    trimmed = collapsed.strip()
    return trimmed.lower()

def text2ngrams(texts, n=2):
    """
    Split every item of ``texts`` into its overlapping slices of length ``n``.

    Returns one list of n-grams per input item; an item shorter than ``n``
    yields an empty list.
    """
    return [
        [sentence[start:start + n] for start in range(len(sentence) - n + 1)]
        for sentence in texts
    ]

def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pad all sentences to the same length, defined by the longest sentence.

    Parameters
    ----------
    sentences : list of list
        Tokenised sentences.
    padding_word : str
        Token appended to the shorter sentences.

    Returns
    -------
    list of list
        New lists (the inputs are not modified), all of equal length.
        An empty input returns an empty list instead of raising.
    """
    # max() of an empty sequence raises ValueError, so handle "no data" up front.
    if len(sentences) == 0:
        return []
    sequence_length = max(len(sentence) for sentence in sentences)
    return [
        sentence + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]


def build_vocab(sentences):
    """
    Build the token <-> index mappings for a collection of tokenised sentences.

    Returns ``[vocabulary, vocabulary_inv]`` where ``vocabulary_inv`` is the
    sorted list of distinct tokens and ``vocabulary`` maps each token to its
    position in that list.
    """
    distinct_tokens = set(itertools.chain.from_iterable(sentences))
    # index -> token
    vocabulary_inv = sorted(distinct_tokens)
    # token -> index
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary):
    """
    Convert tokenised sentences to arrays of vocabulary indices.

    Returns ``[x, y]``: ``x`` holds, for every sentence, the index of each of
    its tokens in ``vocabulary``; ``y`` is ``labels`` as a NumPy array.
    """
    index_rows = []
    for sentence in sentences:
        index_rows.append([vocabulary[token] for token in sentence])
    return [np.array(index_rows), np.array(labels)]


def load_data():
    """
    Run the full preprocessing pipeline: load, pad, build the vocabulary and
    index the sentences.

    Returns ``[x, y, vocabulary, vocabulary_inv]``.
    """
    # NOTE(review): load_data_and_labels is not defined in the visible part of
    # this notebook; unless it exists elsewhere, calling load_data raises NameError.
    raw_sentences, raw_labels = load_data_and_labels()
    padded = pad_sentences(raw_sentences)
    vocabulary, vocabulary_inv = build_vocab(padded)
    x, y = build_input_data(padded, raw_labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]

In [13]:
# Work on a renamed copy so the description column can be accessed as an attribute (data_us.DATA_DESCRIPTION).
data_us = data_us_o.rename(columns={'data.description':'DATA_DESCRIPTION'})

# Data Insight & Feature Extraction

In [14]:
# DATA_DESCRIPTION: turn the first 10000 descriptions into character bigrams

d_desc = [clean_str(s) for s in data_us.DATA_DESCRIPTION.values[:10000]] # collapse whitespace, trim and lower-case (special characters are kept)
d_desc = text2ngrams(d_desc, n=2)

In [15]:
# pad every n-gram sequence to the length of the longest one
d_desc_padded = pad_sentences(d_desc)
# the vocabulary is built from the padded data, so the padding token gets an index too
vocabulary, vocabulary_inv = build_vocab(d_desc_padded)

# replace every n-gram by its vocabulary index
data_feature = np.array([[vocabulary[word] for word in sentence] for sentence in d_desc_padded])
len(vocabulary),len(vocabulary_inv)

(1955, 1955)

In [16]:
# target/category/label
data_us_o.target.value_counts() # categorical data; NOTE(review): not the last expression of the cell, so these counts are never displayed
# convert the labels to a categorical dtype, then replace them by their integer category codes
data_us["target"] = data_us["target"].astype('category')
data_us["target"] = data_us["target"].cat.codes

In [17]:
# one-hot encode the labels of the same first 10000 rows that were used for the features
y = one_hot_encoding(data_us.target.values[:10000])
data_feature.shape, y.shape

((10000, 49), (10000, 22))

In [18]:
# hold out 10% of the samples as a test set; the fixed random_state keeps the split repeatable
train_x, test_x, train_y, test_y = train_test_split(
    data_feature, y, test_size=0.1, random_state=1
)

In [19]:
# sanity check: shapes of the train/test features and labels
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((9000, 49), (1000, 49), (9000, 22), (1000, 22))

In [20]:
# --- hyper-parameters -------------------------------------------------------
sequence_length = data_feature.shape[1]
vocabulary_size = len(vocabulary_inv)
num_classes = train_y.shape[1]  # width of the one-hot labels (was a hard-coded 22)
embedding_dim = 256
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

# NOTE(review): nb_epoch and batch_size are declared here but the fit cell in
# the visible notebook does not pass them.
nb_epoch = 100
batch_size = 30

# --- network ----------------------------------------------------------------
# integer n-gram indices -> embeddings, plus a trailing channel axis so the
# embedded sequence can be fed to 2-D convolutions
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(output_dim=embedding_dim, input_dim=vocabulary_size, input_length=sequence_length)(inputs)
reshape = Reshape((sequence_length,embedding_dim,1))(embedding)

# one convolution branch per entry of filter_sizes; every kernel spans the
# whole embedding width, so it slides along the sequence axis only
conv_layers = [
    Convolution2D(num_filters, filter_size, embedding_dim, border_mode='valid', init='normal', activation='relu', dim_ordering='tf')(reshape)
    for filter_size in filter_sizes
]

# the pooling window covers the full length of each convolution output,
# leaving a single value per filter
pooled_layers = [
    MaxPooling2D(pool_size=(sequence_length - filter_size + 1, 1), strides=(1,1), border_mode='valid', dim_ordering='tf')(conv)
    for filter_size, conv in zip(filter_sizes, conv_layers)
]

merged_tensor = merge(pooled_layers, mode='concat', concat_axis=1)
flatten = Flatten()(merged_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(output_dim=num_classes, activation='softmax')(dropout)

# this creates a model that maps the index sequences to class probabilities
model = Model(input=inputs, output=output)

# NOTE(review): checkpoint monitors 'val_acc', but the fit cell in the visible
# notebook passes neither this callback nor any validation data.
checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
# NOTE(review): adam is configured but compile() below uses 'adagrad' - confirm which optimizer is intended.
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

model.compile(optimizer='adagrad', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 49)            0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 49, 256)       500480      input_2[0][0]                    
____________________________________________________________________________________________________
reshape_2 (Reshape)              (None, 49, 256, 1)    0           embedding_2[0][0]                
____________________________________________________________________________________________________
conv2d_1 (Conv2D)                (None, 47, 1, 512)    393728      reshape_2[0][0]                  
___________________________________________________________________________________________

  app.launch_new_instance()
  name=name)


In [None]:
# NOTE(review): nb_epoch, batch_size and checkpoint from the model cell are not passed to fit here.
model.fit(train_x, train_y, epochs=10, verbose=2)  # starts training; batch_size=batch_size is left disabled

In [None]:
# Evaluate on the held-out split; the model was compiled with metrics=['accuracy'],
# so score[0] is the loss and score[1] the accuracy.
score = model.evaluate(test_x, test_y, batch_size=256, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

In [None]:
import seaborn as sn
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# predict_classes is only provided by Sequential models; `model` is built with
# the functional Model API, so take the arg-max of the predicted probabilities.
predict_y = model.predict(test_x)
predict_y_c = np.argmax(predict_y, axis=1)
y_true = one_hot_decoding(test_y)
y_pred = predict_y_c

# micro-averaged scores do not depend on the label values, so no offset is needed
print('Prec/Recall/F1-Score: ', precision_recall_fscore_support(y_true, y_pred, average='micro'))

# Pin the label set so the matrix always has one row/column per class, even
# when a class is missing from this test split or from the predictions.
class_labels = np.arange(test_y.shape[1])
CM = confusion_matrix(y_true, y_pred, labels=class_labels)

df_cm = pd.DataFrame(CM, index=class_labels, columns=class_labels)
sn.set(font_scale=1.6)  # label size
plt.figure(figsize=(30, 30))
# fmt='d' prints the integer counts as-is instead of the default 2-significant-digit format
sn.heatmap(df_cm, cmap="YlGnBu", annot=True, fmt='d', annot_kws={"size": 16})  # annotation font size