In [32]:
from keras.models import Sequential
import keras
from keras.layers import Merge, LSTM, Dense,GRU,SimpleRNN
from keras.preprocessing import sequence
from keras.layers.wrappers import Bidirectional
import numpy as np
import pickle
from sklearn.cluster import KMeans
from sklearn.svm import LinearSVC
from pprint import pprint
from matplotlib import pyplot as plt

In [18]:
pad_size = 15
class Classifier:
    def __init__(self,n_class=3,batch_size=100,pad_size=20):
        encoder_a = Sequential()
        encoder_a.add(SimpleRNN(output_dim=50, batch_input_shape=(None, pad_size, 100), return_sequences=False,dropout_U=0.4))
        #encoder_a.add(GRU(100, input_shape=(timesteps, data_dim)))

        encoder_b = Sequential()
        encoder_b.add(SimpleRNN(output_dim=50, batch_input_shape=(None, pad_size, 100), return_sequences=False,dropout_U=0.4))
        #encoder_b.add(GRU(100, input_shape=(timesteps, data_dim)))

        decoder = Sequential()
        decoder.add(Merge([encoder_a, encoder_b], mode='concat'))
        decoder.add(Dense(n_class, activation='softmax'))

        decoder.compile(loss='categorical_crossentropy',
                        optimizer='rmsprop',
                        metrics=['mean_squared_error'])
        self.decoder = decoder

In [19]:
def clustering(train_user,train_system,valid_user,valid_system,y_train,y_valid,n=3):
    splitPoint = len(train_user)
    user = np.sum(np.r_[train_user,valid_user],axis=1)
    system = np.sum(np.r_[train_system,valid_system],axis=1)
    vec = np.c_[system,user]
    kmeans = KMeans(n_clusters=n)
    kmeans.fit(vec)
    result_train_user,result_valid_user = dataSplit(train_user,valid_user,splitPoint,kmeans,n)
    result_train_system,result_valid_system = dataSplit(train_system,valid_system,splitPoint,kmeans,n)
    result_train_y,result_valid_y = dataSplit(y_train,y_valid,splitPoint,kmeans,n)
    return kmeans,result_train_user,result_train_system,result_valid_user,result_valid_system,result_train_y,result_valid_y

def dataSplit(train,valid,splitPoint,kmeans,n):
    result_train = [[] for i in range(n)]
    result_valid = [[] for i in range(n)]
    labels_train = kmeans.labels_[:splitPoint]
    labels_valid = kmeans.labels_[splitPoint:]
    for i,label in enumerate(labels_train):
        result_train[label].append(train[i])
    for i,label in enumerate(labels_valid):
        result_valid[label].append(valid[i])
    return result_train,result_valid

In [20]:
def binarize(labels):
    return np.array([[0,0,1] if label[2] > 0.5 else [0,1,0] if label[1] > 0.5 else [1,0,0] for label in labels])

In [21]:
def f_measure(model,values=[1]):
    P = 0
    C = 0
    R = 0
    for pred,corr in zip(model.predict([x_val_a,x_val_b]),y_val):
        if pred.argmax() in values:
            P += 1
        if corr.argmax() in values:
            C += 1
        if pred.argmax() in values and corr.argmax() in values:
            R += 1

    recall = R/P if P>0 else 0
    precision = R/C if C>0 else 0
    try:
        f = (2*recall*precision)/(recall+precision)
    except:
        f = 0
    return {"recall":recall,"precision":precision,"f_measure":f}

In [131]:
with open('corpus.pickle',mode='rb') as f:
    corpus = pickle.load(f)
# generate dummy training data

user,system,labels = zip(*corpus['vectorized']['train'])
x_train_a_all = sequence.pad_sequences(user,pad_size,dtype=np.float32)
#x_train_a = sequence.pad_sequences(system,pad_size,dtype=np.float32)
x_train_b_all = sequence.pad_sequences(system,pad_size,dtype=np.float32)
y_train_all = np.array(labels)

# generate dummy validation data
user,system,labels = zip(*corpus['vectorized']['valid'])
x_val_a_all = sequence.pad_sequences(system,pad_size,dtype=np.float32)
x_val_b_all = sequence.pad_sequences(user,pad_size,dtype=np.float32)
y_val_all = np.array(labels)

In [143]:
np.max(np.array([len(corpus["tokenized"]["train"][i][1]) for i in range(len(corpus["tokenized"]["train"]))]))

92

In [23]:
n = 1
kmeans_model,train_user_cluster,train_system_cluster,valid_user_cluster,valid_system_cluster,train_y_cluster,valid_y_cluster \
    = clustering(x_train_a_all,x_train_b_all,x_val_a_all,x_val_b_all,y_train_all,y_val_all,n)

In [64]:
result = {"numOfClusters":n,"kmeansModel":kmeans_model,"classifiers":[],"histories":[]}
for i in range(n):
    x_train_a = np.array(train_user_cluster[i])
    x_train_b = np.array(train_system_cluster[i])
    y_train = np.array(train_y_cluster[i])
    
    x_val_a = np.array(valid_user_cluster[i])
    x_val_b = np.array(valid_system_cluster[i])
    y_val = np.array(valid_y_cluster[i])

    decoder = Classifier(pad_size=pad_size).decoder
    history = decoder.fit([x_train_a, x_train_b], y_train,
                batch_size=512, nb_epoch=4,
                validation_data=([x_val_a, x_val_b], y_val),shuffle=True)
    result["classifiers"].append({
        "O":f_measure(decoder,[0]),
        "T":f_measure(decoder,[1]),
        "X":f_measure(decoder,[2]),
        "T-X":f_measure(decoder,[1,2]),
        "model":decoder,
    })
    result["histories"].append(history)
    x = list(range(len(results.history["loss"])))
    plt.plot(x, results.history['loss'], label='loss')
    plt.plot(x, results.history['val_loss'], label='loss')
    plt.plot(x, results.history['val_mean_squared_error'], label='loss')
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

  """
  if __name__ == '__main__':
  del sys.path[0]


Train on 81312 samples, validate on 1100 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [65]:
x = list(range(len(results.history["loss"])))
plt.figure()
plt.xlabel("number of epoch")
plt.ylabel("value of loss / mse")
plt.title("learning curve of cluster "+str(i))
plt.plot(x, results.history['loss'], label='train_loss')
plt.plot(x, results.history['val_loss'], label='val_loss')
plt.plot(x, results.history['val_mean_squared_error'], label='val_mean_squared_error')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

<matplotlib.legend.Legend at 0x7fbd7a7101d0>

In [67]:
result

{'classifiers': [{'O': {'f_measure': 0.7285886610373944,
    'precision': 0.760705289672544,
    'recall': 0.6990740740740741},
   'T': {'f_measure': 0, 'precision': 0.0, 'recall': 0.0},
   'T-X': {'f_measure': 0.16974169741697417,
    'precision': 0.1503267973856209,
    'recall': 0.19491525423728814},
   'X': {'f_measure': 0.15593220338983052,
    'precision': 0.1402439024390244,
    'recall': 0.17557251908396945},
   'model': <keras.models.Sequential at 0x7fbd7e3c59b0>}],
 'histories': [<keras.callbacks.History at 0x7fbd7ada9eb8>],
 'kmeansModel': KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
     n_clusters=1, n_init=10, n_jobs=1, precompute_distances='auto',
     random_state=None, tol=0.0001, verbose=0),
 'numOfClusters': 1}

In [52]:
with open("test.pickle","wb") as f:
    pickle.dump([,"a"],f)