In [1]:
import pandas as pd
import numpy as np
import pickle

from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Dense
from keras.optimizers import RMSprop, Adam, Nadam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.models import Model
from keras.models import load_model
#from keras.utils.training_utils import multi_gpu_model
#from keras.layers import GlobalAveragePooling2D, Dense, Input
#from keras.models import Model, model_from_json, load_model

Using TensorFlow backend.


In [2]:
# Sequence / batching configuration.
MAXWORD = 250               # padded sequence length (pad_sequences maxlen and model input length)
BATCHSIZE = 256             # mini-batch size handed to the data generators
EMBEDDINGS_DIMENSION = 300  # output dimension of the Embedding layer

# Vocabulary size for the Embedding layer: number of selected words plus one.
# NOTE(review): the +1 presumably reserves index 0 for padding -- confirm against
# the step that produced 'selected_word.pkl'.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open('selected_word.pkl', 'rb') as selected_word_file:
    NUMWORD = len(pickle.load(selected_word_file)) + 1

# Optimisation hyper-parameters.
LEARNING_RATE = 0.002  # learning rate given to the RMSprop optimizer
DROPOUT_RATE = 0.3     # rate given to the Dropout layer

# Identity columns; these are thresholded to booleans together with 'target'.
identity_columns = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'black',
    'white',
    'psychiatric_or_mental_illness',
]

In [3]:
# Convert target and identity columns to booleans
def convert_to_bool(df, col_name):
    """Threshold one column of ``df`` at 0.5, in place.

    Entries that compare ``>= 0.5`` become ``True``; every other entry
    (any value for which that comparison is false) becomes ``False``.
    The column is overwritten on ``df`` itself and nothing is returned.
    """
    at_or_above_half = df[col_name] >= 0.5
    df[col_name] = np.where(at_or_above_half, True, False)
    
def convert_dataframe_to_bool(df):
    """Return a copy of ``df`` with 'target' and the identity columns as booleans.

    The input frame is left untouched: the thresholding is applied to a copy,
    one column at a time, via ``convert_to_bool``.
    """
    converted = df.copy()
    columns_to_convert = ['target'] + identity_columns
    for column in columns_to_convert:
        convert_to_bool(converted, column)
    return converted

# Load data

In [4]:
# Read the zipped training CSV, discard the raw comment text, and threshold
# the target / identity columns to booleans in a single expression.
data = convert_dataframe_to_bool(
    pd.read_csv('/data/jigsaw/train.csv.zip', compression='zip')
    .drop(columns='comment_text')
)

In [5]:
# Load the pre-vectorised comments and attach them to the frame as a new column.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open('train_vectorized.pkl', 'rb') as vectorized_file:
    vectorized = pickle.load(vectorized_file)
data.loc[:, 'vectorized'] = vectorized
data.head(3)

Unnamed: 0,id,target,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,...,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,vectorized
0,59848,False,0.0,0.0,0.0,0.0,0.0,,,,...,rejected,0,0,0,0,0,0.0,0,4,"[47611, 24881, 44071, 10130, 24978, 49, 41166,..."
1,59849,False,0.0,0.0,0.0,0.0,0.0,,,,...,rejected,0,0,0,0,0,0.0,0,4,"[47427, 53114, 45, 53446, 47611, 52705, 28553,..."
2,59852,False,0.0,0.0,0.0,0.0,0.0,,,,...,rejected,0,0,0,0,0,0.0,0,4,"[47611, 24881, 45999, 1685, 50441, 12505, 3710..."


In [6]:
# Shuffle the rows before the positional train/validation split.
# A fixed random_state makes the permutation (and therefore the split)
# reproducible after a kernel restart, so validation scores and saved
# checkpoints remain comparable between runs.
data = data.sample(frac=1.0, random_state=42)

In [7]:
# Positional 80/20 split: the leading rows train, the remainder validates.
n_train = int(len(data) * 0.8)
train, valid = data[:n_train], data[n_train:]
train.shape, valid.shape

((1443899, 45), (360975, 45))

# Build model

In [8]:
# Training schedule: number of epochs, and how many whole batches fit in the
# training frame (a trailing partial batch is not counted).
EPOCHS = 70
STEPS = train.shape[0] // BATCHSIZE

In [9]:
def generator(vector, label, batch_size=256):
    """Yield an endless stream of ``(batch_x, batch_y)`` mini-batches.

    ``vector`` and ``label`` are walked in order, ``batch_size`` items at a
    time; once the end has been passed the walk restarts from index 0. The
    last slice of a pass may hold fewer than ``batch_size`` samples.

    Args:
        vector: sliceable collection of token-id sequences; each slice is
            handed to ``pad_sequences`` with ``padding='post'`` and
            ``maxlen=MAXWORD``.
        label: sliceable array of truthy/falsy labels aligned with ``vector``
            (assumed to be a NumPy array, as passed by this notebook --
            TODO confirm for other callers).
        batch_size: number of samples per batch; must be positive.

    Yields:
        ``(batch_x, batch_y)`` where ``batch_x`` is the padded id matrix and
        ``batch_y`` is a ``(len(batch), 2)`` one-hot array built with
        ``np.zeros``: column 1 is set for truthy labels, column 0 otherwise.

    Raises:
        ValueError: when iteration starts, if ``vector`` is empty, if
            ``vector`` and ``label`` differ in length, or if ``batch_size``
            is not positive. Without these checks the loop would yield
            empty or misaligned batches forever.
    """
    n_samples = len(vector)
    if batch_size <= 0:
        raise ValueError('batch_size must be positive, got %r' % (batch_size,))
    if n_samples == 0:
        raise ValueError('generator() needs at least one sample')
    if len(label) != n_samples:
        raise ValueError('vector and label differ in length: %d != %d'
                         % (n_samples, len(label)))

    start = 0
    while True:
        if start >= n_samples:
            # Past the end: begin a new pass. (The previous `start %= batch_size`
            # only produced 0 because start is always a multiple of batch_size.)
            start = 0
        stop = start + batch_size

        batch_x = pad_sequences(vector[start:stop], padding='post', maxlen=MAXWORD)

        # Map truthy/falsy labels to class ids 1/0, then one-hot encode them.
        class_ids = np.where(label[start:stop], 1, 0)
        batch_y = np.zeros((len(class_ids), 2))
        batch_y[np.arange(len(class_ids)), class_ids] = 1

        start = stop
        yield batch_x, batch_y

In [10]:
# One endless batch stream per split, both using the shared batch size.
train_gen = generator(
    train['vectorized'].values,
    train['target'].values,
    batch_size=BATCHSIZE,
)
valid_gen = generator(
    valid['vectorized'].values,
    valid['target'].values,
    batch_size=BATCHSIZE,
)

In [11]:
# Create model layers.
def get_convolutional_neural_net_layers(embedding_matrix=None):
    """Build the 1-D convolutional classifier graph.

    Architecture: Embedding -> three Conv1D(128, relu, 'same') layers with
    kernel sizes 2, 3 and 4, each followed by MaxPooling1D (pool sizes 5, 5
    and 40, 'same' padding) -> Flatten -> Dropout -> Dense(128, relu) ->
    Dense(2, softmax).

    Args:
        embedding_matrix: optional pretrained embedding weights. When given,
            they are loaded into the Embedding layer and frozen. When omitted
            (the default), the layer is randomly initialised and trained with
            the rest of the network. Previously the layer was frozen even
            though no weights were supplied, so the model could only ever see
            fixed random embeddings.
            Assumed shape ``(NUMWORD, EMBEDDINGS_DIMENSION)`` -- TODO confirm
            against the code that builds the matrix.

    Returns:
        (input_layer, output_layer)
    """
    sequence_input = Input(shape=(MAXWORD,), dtype='int32')

    embedding_kwargs = {'input_length': MAXWORD}
    if embedding_matrix is None:
        # No pretrained vectors: the embedding has to learn.
        embedding_kwargs['trainable'] = True
    else:
        embedding_kwargs['weights'] = [embedding_matrix]
        embedding_kwargs['trainable'] = False
    embedding_layer = Embedding(NUMWORD, EMBEDDINGS_DIMENSION, **embedding_kwargs)

    x = embedding_layer(sequence_input)
    x = Conv1D(128, 2, activation='relu', padding='same')(x)
    x = MaxPooling1D(5, padding='same')(x)
    x = Conv1D(128, 3, activation='relu', padding='same')(x)
    x = MaxPooling1D(5, padding='same')(x)
    x = Conv1D(128, 4, activation='relu', padding='same')(x)
    x = MaxPooling1D(40, padding='same')(x)
    x = Flatten()(x)
    x = Dropout(DROPOUT_RATE)(x)
    x = Dense(128, activation='relu')(x)
    preds = Dense(2, activation='softmax')(x)
    return sequence_input, preds

In [12]:
# Assemble the Keras model from the CNN graph and compile it with a
# categorical cross-entropy loss over the two softmax outputs.
print('compiling model')
input_layer, output_layer = get_convolutional_neural_net_layers()
model = Model(input_layer, output_layer)
optimizer = RMSprop(lr=LEARNING_RATE)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['acc'])

compiling model


In [13]:
# File that receives the best weights seen so far.
model_h5_filename = 'tmp_model.h5'

# Both callbacks must watch a metric that is actually logged. The model is
# compiled with metrics=['acc'], so the validation metric is 'val_acc'.
# The previous 'val_top_3_accuracy' is not registered in the compile call, so
# the learning rate was never reduced and no checkpoint was ever written.
monitored_metric = 'val_acc'
callbacks = [
    # Multiply the learning rate by 0.75 after 3 epochs without a >= 0.001 gain.
    ReduceLROnPlateau(monitor=monitored_metric, factor=0.75, patience=3, min_delta=0.001,
                      mode='max', min_lr=1e-5, verbose=1),
    # Keep only the weights of the best-scoring epoch.
    ModelCheckpoint(model_h5_filename, monitor=monitored_metric, mode='max', save_best_only=True,
                    save_weights_only=True),
]

In [None]:
# Validate on one full pass of the validation split per epoch.
# valid_gen is an endless wrap-around generator, so a fixed 100 steps scored a
# different 100-batch window every epoch, and the per-epoch values that
# ReduceLROnPlateau / ModelCheckpoint compare were not measured on the same data.
# ceil() counts the trailing partial batch that the generator also emits.
validation_steps = int(np.ceil(len(valid) / BATCHSIZE))
hist = model.fit_generator(
    train_gen, steps_per_epoch=STEPS, epochs=EPOCHS, verbose=1,
    validation_data=valid_gen, validation_steps=validation_steps,
    callbacks=callbacks
)

Epoch 1/70