In [1]:
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam, Adadelta
from keras.models import Model
from sklearn.model_selection import train_test_split
from data_helpers import load_data
from data_helpers import load_data_embeddings_vocab
from data_helpers import get_embeddings
from data_helpers import load_test_data_separate_files
import h5py
from imblearn.over_sampling import SMOTE
import numpy as np
from collections import Counter
import simplejson

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
def doSMOTE(X_train, y_train):
    """Oversample the training set with SMOTE and return it with two-column labels.

    Each row of ``y_train`` is collapsed to a single class id before
    resampling: 1 when ``y[0] < y[1]``, otherwise 0. After resampling the
    class ids are expanded back to ``[1, 0]`` (class 0) / ``[0, 1]`` (class 1).

    Args:
        X_train: feature matrix handed to ``SMOTE``.
        y_train: sequence of two-element label rows.

    Returns:
        ``[X_res, y_res_mod]``: the resampled features cast to an int array,
        and the matching labels as an array of two-element rows.
    """
    print("y_train: " + str(len(y_train)))

    # Collapse each two-element label row to a single 0/1 class id.
    y_train_bin = np.array([1 if y[0] < y[1] else 0 for y in y_train])

    print('Original dataset shape {}'.format(Counter(y_train_bin)))
    sm = SMOTE(random_state=42)
    # imbalanced-learn renamed ``fit_sample`` to ``fit_resample`` and later
    # removed the old name; prefer the new method and fall back for old releases.
    resample = getattr(sm, "fit_resample", None)
    if resample is None:
        resample = sm.fit_sample
    X_res, y_res = resample(X_train, y_train_bin)
    print('Resampled dataset shape {}'.format(Counter(y_res)))

    # NOTE(review): the int cast truncates whatever fractional values SMOTE
    # produced for synthetic rows; presumably the features are integer token
    # ids, so confirm that truncated interpolations are meaningful here.
    X_res = np.array(X_res, dtype='int')

    # Expand the class ids back to two-element rows.
    y_res_mod = np.array([[1, 0] if y == 0 else [0, 1] for y in y_res])
    return [X_res, y_res_mod]

def countF1(predicted, actual):
    """Print the confusion counts and return the F1 score for class 1.

    Args:
        predicted: sequence of predicted labels (0 or 1).
        actual: sequence of true labels (0 or 1), read at the same indices
            as ``predicted``.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or ``0`` when the
        score is undefined (no true positives, including empty input).
    """
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0
    for i, guess in enumerate(predicted):
        truth = actual[i]
        if guess == 1 and truth == 1:
            true_pos += 1
        elif guess == 1 and truth == 0:
            false_pos += 1
        elif guess == 0 and truth == 1:
            false_neg += 1
        elif guess == 0 and truth == 0:
            true_neg += 1

    # Guard the denominators: with no predicted positives (or no actual
    # positives) the ratio is undefined and would raise ZeroDivisionError.
    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    precision = true_pos / predicted_pos if predicted_pos else 0
    recall = true_pos / actual_pos if actual_pos else 0

    print("true positive: " + str(true_pos))
    print("true negative: " + str(true_neg))
    print("false positive: " + str(false_pos))
    print("false negative: " + str(false_neg))

    if precision + recall > 0:
        return (2 * precision * recall) / (precision + recall)
    return 0

In [7]:
def _one_hot_to_labels(rows):
    """Map two-element rows to class ids: 0 when row[0] > row[1], otherwise 1."""
    return [0 if row[0] > row[1] else 1 for row in rows]


def startTraining(pos_data, neg_data, model_path, class_id):
    """Train, save and evaluate a two-class convolutional text model.

    Steps, in order:
      1. Load data through ``load_data_embeddings_vocab`` and hold out 8% of
         it with ``train_test_split``.
      2. Persist the split and the vocabulary under ``model_path``.
      3. Build the network: a frozen embedding layer, one Conv2D + MaxPool2D
         branch per entry in ``filter_sizes``, concatenation, dropout and a
         2-unit softmax.
      4. Fit for ``epochs`` epochs, writing a checkpoint after every epoch.
      5. Save the architecture (JSON) and the final weights, then print the
         F1 score on the held-out split and on the training split.

    Args:
        pos_data: path passed to ``load_data_embeddings_vocab`` as the
            positive-example file.
        neg_data: path passed to ``load_data_embeddings_vocab`` as the
            negative-example file.
        model_path: directory that receives the pickles, checkpoints,
            ``model.json`` and ``model_weights.h5``; created if missing.
        class_id: identifier used only in progress messages.

    Returns:
        None. Results are printed and written to ``model_path``.
    """
    # Local import so this cell does not depend on another cell importing os.
    import os

    print('Loading data - class: ' + str(class_id))
    x, y, vocabulary, vocabulary_inv = load_data_embeddings_vocab(pos_data, neg_data, "../word2vec/wiki.id.vec")
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.08)
    print(len(vocabulary))

    # Every artefact below is written into model_path; without the directory
    # the first open() would fail.
    if model_path and not os.path.isdir(model_path):
        os.makedirs(model_path)

    write_train_test(X_train, X_test, y_train, y_test, model_path)
    write_vocabulary(vocabulary, model_path)

    sequence_length = x.shape[1]
    vocabulary_size = len(vocabulary_inv)
    embedding_dim = 300
    filter_sizes = [3, 4, 5]
    num_filters = 100
    drop = 0.5
    epochs = 10
    batch_size = 50

    print("Creating Model... - class: " + str(class_id))
    inputs = Input(shape=(sequence_length,), dtype='int32')
    print(inputs)
    # Presumably pretrained vectors read from the .vec file; confirm in data_helpers.
    embedding_matrix = get_embeddings("../word2vec/wiki.id.vec", vocabulary_inv, embedding_dim)
    embedding = Embedding(input_dim=vocabulary_size+1, output_dim=embedding_dim, weights=[embedding_matrix], input_length=sequence_length, trainable=False)(inputs)
    # Conv2D needs a trailing channel axis.
    reshape = Reshape((sequence_length, embedding_dim, 1))(embedding)

    # One convolution per filter height; each kernel spans the full embedding width.
    conv_outputs = [
        Conv2D(num_filters, kernel_size=(size, embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
        for size in filter_sizes
    ]
    # The pool window covers the whole convolution output along the sequence
    # axis (sequence_length - size + 1 positions), leaving one value per filter.
    pooled_outputs = [
        MaxPool2D(pool_size=(sequence_length - size + 1, 1), strides=(1, 1), padding='valid')(conv)
        for size, conv in zip(filter_sizes, conv_outputs)
    ]

    concatenated_tensor = Concatenate(axis=1)(pooled_outputs)
    flatten = Flatten()(concatenated_tensor)
    dropout = Dropout(drop)(flatten)
    output = Dense(units=2, activation='softmax')(dropout)

    model = Model(inputs=inputs, outputs=output)

    # save_best_only=False: a checkpoint file is written after every epoch.
    # NOTE(review): the 'val_acc' key depends on the installed Keras version
    # (newer releases report 'val_accuracy'); verify before upgrading.
    checkpoint = ModelCheckpoint(model_path + '/weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=False, mode='auto')
    adam = Adam()

    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    print("Traning Model...")
    # The held-out split doubles as validation data during fitting.
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_data=(X_test, y_test))

    # Serialize the architecture to pretty-printed JSON.
    model_json = model.to_json()
    with open(model_path + "/model.json", "w") as json_file:
        json_file.write(simplejson.dumps(simplejson.loads(model_json), indent=4))

    model.save_weights(model_path + "/model_weights.h5")
    print("Saved model to disk")

    print("Start validating -- #test data: " + str(len(X_test)))
    y_classes = _one_hot_to_labels(model.predict(X_test))
    print(y_classes)
    y_test_classes = _one_hot_to_labels(y_test)
    print(y_test_classes)
    print(countF1(y_classes, y_test_classes))

    print("Start validating -- #train data: " + str(len(X_train)))
    train_pred_classes = _one_hot_to_labels(model.predict(X_train))
    train_true_classes = _one_hot_to_labels(y_train)
    print(countF1(train_pred_classes, train_true_classes))

In [4]:
import pickle
def write_vocabulary(vocabulary, model_path):
    """Pickle ``vocabulary`` to ``<model_path>/vocabulary.pickle``.

    Args:
        vocabulary: any picklable object.
        model_path: existing directory that receives the file.
    """
    # ``with`` closes the file even if pickle.dump raises.
    with open(model_path + "/vocabulary.pickle", "wb") as pickle_out:
        pickle.dump(vocabulary, pickle_out)
    
def load_vocabulary(model_path):
    """Load and return the object stored in ``<model_path>/vocabulary.pickle``.

    Args:
        model_path: directory containing ``vocabulary.pickle``.

    Returns:
        The unpickled object.
    """
    # SECURITY: pickle.load can execute arbitrary code; only point this at
    # files this notebook wrote itself.
    # ``with`` closes the file even if unpickling raises.
    with open(model_path + "/vocabulary.pickle", "rb") as pickle_in:
        return pickle.load(pickle_in)
    
def write_train_test(x_train, x_test, y_train, y_test, model_path):
    """Pickle the four data splits into ``model_path``.

    Each argument is written to its own file, named after the parameter:
    ``x_train.pickle``, ``x_test.pickle``, ``y_train.pickle`` and
    ``y_test.pickle``.

    Args:
        x_train, x_test, y_train, y_test: picklable objects to store.
        model_path: existing directory that receives the files.
    """
    splits = {
        "x_train": x_train,
        "x_test": x_test,
        "y_train": y_train,
        "y_test": y_test,
    }

    for name, split in splits.items():
        # ``with`` closes each file even if pickle.dump raises.
        with open(model_path + '/' + name + ".pickle", "wb") as pickle_out:
            pickle.dump(split, pickle_out)

In [5]:
# Train and evaluate one model per class id, 7 through 10. Each class reads
# its own pos/neg annotation files and writes into its own model directory.
for i in range(7, 11):
    class_tag = str(i)
    annotate_prefix = "../data070218/dataAnnotate-" + class_tag
    positive_data_file = annotate_prefix + "-pos.txt"
    negative_data_file = annotate_prefix + "-neg.txt"
    model_path = "model110218_newsagg/" + class_tag
    startTraining(positive_data_file, negative_data_file, model_path, i)

Loading data - class: 7


KeyboardInterrupt: 