In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

### imports

In [11]:
print(__doc__)
from itertools import combinations

from keras import backend as K
from keras.layers import Activation
from keras.layers import Input, Lambda, Dense, Dropout, Convolution1D, MaxPooling1D, Flatten
from keras.models import Sequential, Model
from keras.optimizers import RMSprop

import numpy as np
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

Automatically created module for IPython interactive environment


### utilities

In [33]:
def get_data(total_sample_size, train_doc_vectors, train_labels, labels):
    data = np.nan_to_num(train_doc_vectors / train_doc_vectors.sum(0))
    
    #get the new size
    dim1 = data.shape[0]
    dim2 = data.shape[1]

    count = 0
    
    #initialize the numpy array with the shape of [total_sample, no_of_pairs, dim1, dim2]
    X = np.zeros([total_sample_size, 2, dim2, 1])  # 2 is for pairs
    Y = np.zeros([total_sample_size, 1])
    
    size = total_sample_size/ (len(labels) * 2)
    
    for label in labels:
        similar_indices = np.where(train_labels == label)[0]
        disimilar_indices = np.where(train_labels != label)[0]
        
        for comb in combinations(similar_indices, 2):
            if count % size == 0:
                break

            #store the images to the initialized numpy array
            x_geuine_pair[count, 0] = data[comb[0]].reshape(-1, 1)
            x_geuine_pair[count, 1] = data[comb[1]].reshape(-1, 1)

            #as we are drawing images from the same directory we assign label as 1. (genuine pair)
            y_genuine[count] = 1
            count += 1

        for comb in combinations(similar_indices, 2):
            if count % size == 0:
                break

            X[count, 0] = data[comb[0]].reshape(-1, 1)
            X[count, 1] = data[comb[1]].reshape(-1, 1)

            #as we are drawing images from the different directory we assign label as 0. (imposite pair)
            Y[count] = 0
            count += 1

    return X, Y

def build_base_network(input_shape):
    
    seq = Sequential()
    
    nb_filter = [6, 12]
    kernel_size = 1000
    
    
    #convolutional layer 1
    seq.add(Convolution1D(nb_filter[0], kernel_size, input_shape=input_shape))
    seq.add(Activation('relu'))
    seq.add(MaxPooling1D(pool_size=kernel_size))  
    seq.add(Dropout(.25))
    
    #convolutional layer 2
    seq.add(Convolution1D(nb_filter[1], kernel_size))
    seq.add(Activation('relu'))
    seq.add(MaxPooling1D(pool_size=kernel_size)) 
    seq.add(Dropout(.25))

    #flatten 
    seq.add(Flatten())
    seq.add(Dense(128, activation='relu'))
    
    seq.add(Dropout(0.1))
    seq.add(Dense(64, activation='relu'))
    return seq

def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))


def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def contrastive_loss(y_true, y_pred):
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

### load dataset

In [34]:
# total number of samples needed
datasize = 100

# retrieve dataset
docs = fetch_20newsgroups(subset='train', shuffle=False, remove=('headers', 'footers', 'quotes'))
docs, old_labels, classes = docs.data[:datasize], docs.target[:datasize], docs.target_names

### clean dataset

In [35]:
# clean out the new line characters from text in docs
def clean_doc(doc):
    ''' remove unwanter characters line new line '''

    unwanted_chrs = [')', '(', '{', '}', '\t', '\n', '\r', "'", '"', "!"]
    doc = doc.lower()
    for unwanted_chr in unwanted_chrs:
        doc = doc.replace(unwanted_chr, ' ')

    return doc.strip()

In [36]:
labels = []
clean_docs = []

# the new classes
label_classes = ['sci/comp', 'other']
# label_classes = list(set([x.split('.')[0] for x in classes]))

for index, doc in enumerate(docs):
    cd = clean_doc(doc)
    
    if cd and not cd.isspace():
        clean_docs.append(cd)
        
        cl = classes[old_labels[index]]
        if cl in ['sci', 'comp']:
            labels.append(0)
            
        else:
            labels.append(1)
        #labels.append(label_classes.index(classes[old_labels[index]].split('.')[0]))

labels = np.array(labels)
print(clean_docs[0])

morgan and guzman will have era s 1 run higher than last year, and  the cubs will be idiots and not pitch harkey as much as hibbard.  castillo won t be good  i think he s a stud pitcher


In [37]:
print(f"there are {len(clean_docs)} docs and {len(label_classes)} classes: {label_classes}")

there are 96 docs and 2 classes: ['sci/comp', 'other']


### count words

In [38]:
# initialize the count vectorizer
# count_vectorizer = CountVectorizer()
count_vectorizer = TfidfVectorizer()

# fit it to dataset
train_docs, test_docs, train_labels, test_labels = train_test_split(clean_docs, labels, test_size=.33, random_state=42)
count_vectorizer.fit(train_docs)
vocabulary = count_vectorizer.get_feature_names()

print("word_count is", len(vocabulary))

word_count is 4973


### Prepare Datatset

In [39]:
# create doc count vectors
train_doc_vectors = count_vectorizer.transform(train_docs).toarray()
# train_doc_vectors = normalize(train_doc_vectors, norm="l1", axis=1)
# train_doc_vectors = normalize(train_doc_vectors, norm="l1", axis=0)

test_doc_vectors = count_vectorizer.transform(test_docs).toarray()
# test_doc_vectors = normalize(test_doc_vectors, norm="l1", axis=1)
# test_doc_vectors = normalize(test_doc_vectors, norm="l1", axis=0)

print(f"{len(train_labels)} train_docs, {len(test_labels)} test docs")

64 train_docs, 32 test docs


In [40]:
total_sample_size = 10000
X, Y = get_data(total_sample_size, train_doc_vectors, test_doc_vectors, list(range(len(label_classes))))
X.shape, Y.shape

((10000, 2, 4973, 1), (10000, 1))

In [41]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25)

## Model

In [42]:
input_dim = x_train.shape[2:]
img_a = Input(shape=input_dim)
img_b = Input(shape=input_dim)

In [43]:
input_dim

(4973, 1)

In [44]:
base_network = build_base_network(input_dim)
feat_vecs_a = base_network(img_a)
feat_vecs_b = base_network(img_b)

In [45]:
distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([feat_vecs_a, feat_vecs_b])

In [46]:
epochs = 13
rms = RMSprop()

In [47]:
model = Model(inputs=[img_a, img_b], outputs=distance)

In [48]:
model.compile(loss=contrastive_loss, optimizer=rms)

In [49]:
img_1 = x_train[:, 0]
img2 = x_train[:, 1]

In [51]:
model.fit([img_1, img2], y_train, validation_split=.25, batch_size=128, verbose=2, epochs=epochs)

Train on 5625 samples, validate on 1875 samples
Epoch 1/13


KeyboardInterrupt: 

In [None]:
pred = model.predict([x_test[:, 0], x_test[:, 1]])

In [183]:
label_classes

['sci', 'soc', 'misc', 'rec', 'alt', 'comp', 'talk']

In [None]:
len(X)

In [133]:
clf.predict(X[-1:])

array([0])

In [136]:
clf.predict_proba(X[-1:]), y[-1:]

(array([[0.94751738, 0.05248262]]), array([0]))

In [135]:
clf.score(X, y)

0.9701492537313433

In [141]:
XX = np.nan_to_num(test_doc_vectors / test_doc_vectors.sum(0))
yy = (test_labels == 5).astype(int)

clf.score(XX, yy)

  """Entry point for launching an IPython kernel.


0.7878787878787878

In [142]:
len(test_labels), len(yy)

(33, 33)

In [99]:
# word_word_distr.to_clipboard(f"word_word_distr_{datasize}.csv")
# topic_word_distr.to_csv(f"topic_word_distr_{datasize}.csv")
# re_topic_word_distr.to_csv(f"re_topic_word_distr_{datasize}.csv")