In [80]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string
import nltk

import warnings
warnings.filterwarnings("ignore")

## Read Data

Function to take a text file and convert its contents in a dataframe splitting the text by sentences

In [81]:

def createDataframe(file_name):
    f=open(file_name, "r", encoding="utf8")
    if f.mode == 'r':
        contents =f.read()
        tokenized = nltk.sent_tokenize(contents)
        df = pd.DataFrame(tokenized)
        df["Text"] = df
        df = df.drop([0], axis = 1) 
        return df

Importing the files into the respective dataframes

In [82]:
df_dog= createDataframe('dogWiki.txt')
df_cat = createDataframe('catWiki.txt')
df_test = createDataframe('testSentences.txt')

Assigning labels 1 for dog and 0 for cat

In [83]:
df_dog['Label'] = 1
df_cat['Label'] = 0
df = pd.concat([df_dog,df_cat])

Resetting the index so that the sentences and labels are randomized as the two dataframes were concatenated one below the other

In [84]:
df = df.sample(frac=1).reset_index(drop=True)
data=df

In [85]:
dog = []
cat = []
for l in data.Label:
    if l == 0:
        dog.append(0)
        cat.append(1)
    elif l == 1:
        dog.append(1)
        cat.append(0)

In [86]:
data['Dog']= dog
data['Cat']= cat

In [87]:
data.head()

Unnamed: 0,Text,Label,Dog,Cat
0,"[158] In the United States, about 80% of house...",0,0,1
1,Notable exceptions once included:\n\nAborigina...,1,1,0
2,The frequency and size of meals varies between...,0,0,1
3,Representations of dogs became more elaborate ...,1,1,0
4,A cat falling from heights of up to 3 meters c...,0,0,1


## Clean Data

Performed the following steps to clean the data:
<li> Removed punctuations </li>
<li> Removed digits </li>
<li> Tokenized the test on words </li>
<li> Removed stop words from english </li>
<li> Lower cased all words </li>

In [88]:
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
def preprocess(data):
    def remove_punct(text):
        text_nopunct = ''
        text_nopunct = re.sub('['+string.punctuation+']', '', text)
        text_nopunct = ''.join([i for i in text_nopunct if not i.isdigit()])
        return text_nopunct


    data['Text_Clean'] = data['Text'].apply(lambda x: remove_punct(x))
    tokens = [word_tokenize(sen) for sen in data.Text_Clean]
    def lower_token(tokens): 
        return [w.lower() for w in tokens]    
    
    lower_tokens = [lower_token(token) for token in tokens]
    stoplist = stopwords.words('english')
    def remove_stop_words(tokens): 
        return [word for word in tokens if word not in stoplist]
    filtered_words = [remove_stop_words(sen) for sen in lower_tokens]
    result = [' '.join(sen) for sen in filtered_words]
    data['Text_Final'] = result
    data['tokens'] = filtered_words
    #data = data[['Text_Final', 'tokens']]
    return data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joelj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [89]:

data = preprocess(data)

In [90]:
data = data[['Text_Final', 'tokens', 'Label', 'Dog', 'Cat']]

In [91]:
data.head()

Unnamed: 0,Text_Final,tokens,Label,Dog,Cat
0,united states household cats neutered,"[united, states, household, cats, neutered]",0,0,1
1,notable exceptions included aboriginal tasmani...,"[notable, exceptions, included, aboriginal, ta...",1,1,0
2,frequency size meals varies individuals,"[frequency, size, meals, varies, individuals]",0,0,1
3,representations dogs became elaborate individu...,"[representations, dogs, became, elaborate, ind...",1,1,0
4,cat falling heights meters right land paws,"[cat, falling, heights, meters, right, land, p...",0,0,1


## Splitting into train and test

In [92]:
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

### Creating volabulary for training

In [93]:
all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

7808 words total, with a vocabulary size of 3271
Max sentence length is 75


In [94]:
all_test_words = [word for tokens in data_test["tokens"] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test["tokens"]]
TEST_VOCAB = sorted(list(set(all_test_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

903 words total, with a vocabulary size of 611
Max sentence length is 27


## Loading word2vec model

Please fownload the pretrained word2vec model from the link below and save it in the same directory as the notebook:
https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz 

In [95]:
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [96]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

## Get Embeddings

Creating the embeddings for training data

In [97]:
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [98]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

## Tokenize and pad sequences

Creating tokens for the training vocab

In [99]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["Text_Final"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["Text_Final"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 3271 unique tokens.


Final data to feed the CNN model for training

In [100]:
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [101]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(3272, 300)


In [102]:
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

## Define CNN

In [103]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    filter_sizes = [2,3,4,5,6]  #Training the model on various filters to check for properties of N-grams

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)


    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [104]:
label_names = ['Dog', 'Cat']

In [105]:
y_train = data_train[label_names].values

In [106]:
x_train = train_cnn_data
y_tr = y_train

### CNN model created

In [107]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 50, 300)      981600      input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 49, 200)      120200      embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 48, 200)      180200      embedding_2[0][0]                
____________________________________________________________________________________________

## Train CNN

In [108]:
num_epochs = 3
batch_size = 34

### Training model on our data

In [109]:
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Train on 549 samples, validate on 62 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


## Test CNN

In [110]:
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)



In [111]:
labels = [1, 0]

In [112]:
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

In [113]:
sum(data_test.Label==prediction_labels)/len(prediction_labels)

0.9411764705882353

In [114]:
data_test.Label.value_counts()

0    40
1    28
Name: Label, dtype: int64

# Testing for unseen data in test file

In [115]:
def unitTest(test):
    test = createDataframe(test)
    test=preprocess(test)
    test_sequences1 = tokenizer.texts_to_sequences(test["Text_Final"].tolist())
    test_cnn_data1 = pad_sequences(test_sequences1, maxlen=MAX_SEQUENCE_LENGTH)
    predictions1 = model.predict(test_cnn_data1, batch_size=1024, verbose=1)
    prediction1_labels=[]
    for p in predictions1:
        prediction1_labels.append(labels[np.argmax(p)])
    class_label = ['Dog','Cat']
    prediction2_labels=[]
    for p in predictions1:
        prediction2_labels.append(class_label[np.argmax(p)])
    test['Predictions'] = prediction2_labels
    display= test[['Text','tokens', 'Predictions']]
    return display

In [116]:
cat = 'catTest.txt'
dog = 'dogTest.txt'


In [117]:
catDisplay = unitTest(cat)




In [118]:
catDisplay.head()

Unnamed: 0,Text,tokens,Predictions
0,Felix is a cat.,"[felix, cat]",Cat
1,Cats are good.,"[cats, good]",Cat
2,Tiger belongs to the cat family,"[tiger, belongs, cat, family]",Cat


In [122]:
dogDisplay = unitTest(dog)



In [123]:
dogDisplay.head()

Unnamed: 0,Text,tokens,Predictions
0,Tommy is a dog.,"[tommy, dog]",Dog
1,Dogs like bones.,"[dogs, like, bones]",Dog
2,Phillip is a dog man.,"[phillip, dog, man]",Dog


In [119]:
testSentences = 'testSentences.txt'

In [120]:
testDisplay=unitTest(testSentences)



In [121]:
testDisplay.head(10)

Unnamed: 0,Text,tokens,Predictions
0,This animal is similar to the other felid spec...,"[animal, similar, felid, species]",Cat
1,This animal is similar to the wolf and fox.,"[animal, similar, wolf, fox]",Dog
2,This animal can detect a drug when hidden.,"[animal, detect, drug, hidden]",Dog
3,One type of animal acts as a guard of things.,"[one, type, animal, acts, guard, things]",Dog
4,Whiskers coughed up a hairball today.,"[whiskers, coughed, hairball, today]",Cat
5,This animal can understand a hand signal if pr...,"[animal, understand, hand, signal, properly, t...",Dog
6,He has a kitten.,[kitten],Cat
7,This animal will catch a mouse when it seems i...,"[animal, catch, mouse, seems, impossible]",Cat
8,He carried a python across the street.,"[carried, python, across, street]",Cat
9,Python programming with machine learning has n...,"[python, programming, machine, learning, nothi...",Dog
