In [None]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.layers import Dense, Input, Embedding, SpatialDropout1D, Dropout, Activation, BatchNormalization, \
    concatenate, Bidirectional, Conv1D, GlobalMaxPooling1D, CuDNNGRU
from keras.models import Model, Sequential
from keras.optimizers import Adam
from fastText import load_model
import keras.backend as K
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping, Callback
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras.backend.tensorflow_backend as KTF
from collections import defaultdict

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant
from sklearn.linear_model import LogisticRegression
from readability import Readability
from readability.exceptions import ReadabilityException

# Expose only the GPU with index 3 to CUDA; this is set before the TensorFlow
# session is created further down in this cell.
os.environ["CUDA_VISIBLE_DEVICES"]="3"

def get_session(gpu_fraction=1.0):
    """Create a TensorFlow session that allocates GPU memory on demand.

    Args:
        gpu_fraction: Upper bound on the share of GPU memory the process may
            use. The default of 1.0 leaves the limit untouched; values below
            1.0 are applied as ``per_process_gpu_memory_fraction``.

    Returns:
        A ``tf.Session`` configured with ``allow_growth`` enabled.
    """
    config = tf.ConfigProto()
    # Grow the GPU memory allocation as needed instead of reserving it all upfront.
    config.gpu_options.allow_growth = True
    if gpu_fraction < 1.0:
        config.gpu_options.per_process_gpu_memory_fraction = gpu_fraction
    return tf.Session(config=config)

# Hand the session built by get_session() to the Keras TensorFlow backend.
KTF.set_session(get_session())

In [None]:
# Dropout rate used by the GRU model (SpatialDropout1D and Dropout layers)
DROPOUT = 0.1
# Mini-batch size passed to model.fit
BATCH_SIZE = 64
# Upper bound on training epochs; early stopping on val_loss may end training sooner
EPOCHS = 20
# Number of times each experiment is repeated
RUNS = 10
GUARDIAN = True #True to run experiments on user comments from TheGuardian.com. False to run them on product reviews from Amazon.com
# Name of the DataFrame column that holds the comment/review text
COLUMN = 'comment_text'
# Passed to the Keras Tokenizer as num_words and used to cap the embedding matrix size
VOCABULARY_SIZE = 200000
# Texts are cut to their last MAX_SEQUENCE_LENGTH whitespace tokens and padded to this length
MAX_SEQUENCE_LENGTH = 125

In [None]:
# Load the fastText model that matches the selected corpus
# (Guardian comments when GUARDIAN is True, Amazon reviews otherwise).
ft_model = load_model('guardian-300.bin' if GUARDIAN else 'amazon-300.bin')
# Width of the word vectors; later used as the embedding matrix's second dimension.
EMBEDDING_DIM = ft_model.get_dimension()

In [None]:
def metric_readability(comment_words):
    """Return the ARI readability score of a text.

    Args:
        comment_words: The text handed to ``Readability``.

    Returns:
        The ``score`` of the ARI result, or 1 when ``Readability`` raises a
        ``ReadabilityException`` for the given text.
    """
    try:
        ari_score = Readability(comment_words).ari().score
    except ReadabilityException:
        # Fall back to a fixed score when the library cannot rate the text.
        ari_score = 1
    return ari_score
    
def enrich_df_with_features(df):
    """Compute per-author average features from a comment DataFrame.

    The input frame is modified in place: the columns ``ARI`` (readability
    score of ``comment_text``) and ``comment_length`` (character count of
    ``comment_text``) are added or overwritten.

    Args:
        df: DataFrame with the columns ``comment_author_id``,
            ``comment_text``, ``upvotes`` and ``children``.

    Returns:
        A new DataFrame with one row per ``comment_author_id`` and the columns
        ``comment_author_id``, ``ARI_AVG``, ``comment_length_AVG``,
        ``upvotes_AVG`` and ``children_AVG``.
    """
    df['ARI'] = df['comment_text'].apply(metric_readability)
    df['comment_length'] = df['comment_text'].apply(len)
    feature_columns = ['ARI', 'comment_length', 'upvotes', 'children']
    # Select the numeric feature columns before aggregating: averaging the whole
    # frame would also try to average the text column, which newer pandas rejects.
    df_user_info = (
        df.groupby('comment_author_id')[feature_columns]
        .mean()
        .add_suffix('_AVG')
        .reset_index()
    )
    return df_user_info

In [None]:
def normalize(s):
    """Lowercase a piece of text and separate its punctuation.

    Args:
        s: Any value; it is converted with ``str`` first.

    Returns:
        The lowercased text with listed punctuation marks surrounded by
        spaces and ``; : |`` and newlines replaced by a space. If the
        lowercased text contains ``http`` or ``www`` anywhere, the whole text
        is replaced by ``url`` before the substitutions run.
    """
    text = str(s).lower()
    # Anything that mentions a link collapses to the single placeholder 'url'.
    if any(marker in text for marker in ('http', 'www')):
        text = 'url'
    # Isolate punctuation
    spaced = re.sub(r'([\'\"\.\(\)\!\?\-\\\/\,])', r' \1 ', text)
    # Remove some special characters
    return re.sub(r'([\;\:\|\n])', ' ', spaced)

def probabilities_to_classes(a):
    """Turn a pair of class scores into a one-hot class list.

    Args:
        a: Indexable with at least two comparable entries.

    Returns:
        ``[1, 0]`` when the first entry is strictly greater than the second,
        otherwise ``[0, 1]`` (ties therefore map to ``[0, 1]``).
    """
    return [1, 0] if a[0] > a[1] else [0, 1]

In [None]:
def load_data(category,percentage='10',fraction=1.0):
    """Load the top and bottom samples of a dataset as one shuffled DataFrame.

    When ``GUARDIAN`` is true, a single CSV with a ``class`` column is read and
    split into top (``class == 1``) and bottom (``class == 0``) rows. Otherwise
    two single-column review CSVs (top and bottom) are read, and ``category``
    is not used.

    Args:
        category: Dataset category that is part of the Guardian file name.
        percentage: Percentage string that is part of the file name.
        fraction: Share of the combined rows to sample (``random_state=42``).

    Returns:
        DataFrame holding the sampled rows with the added columns
        ``comment_length`` (character count of the untruncated text), ``top``
        and ``bottom`` (boolean labels). The text column ``COLUMN`` is cut to
        its last ``MAX_SEQUENCE_LENGTH`` whitespace tokens and cast to ``str``.
    """
    if GUARDIAN:
        # Read the file once and split it by label; copy the slices so that the
        # column assignments below operate on independent frames.
        comments = pd.read_csv('comments_top_and_bottom_'+category+'_'+percentage+'percent.csv', usecols=['comment_id', 'comment_author_id', 'comment_text', 'class','children', 'upvotes'])
        top = comments[comments['class']==1].copy()
        bottom = comments[comments['class']==0].copy()
    else:
        top = pd.read_csv('reviews_books_top_'+percentage+'percent.csv')
        top.columns = [COLUMN]
        bottom = pd.read_csv('reviews_books_bottom_'+percentage+'percent.csv')
        bottom.columns = [COLUMN]

    for frame, is_top in ((top, True), (bottom, False)):
        # Length is measured before truncation.
        frame['comment_length'] = frame[COLUMN].astype(str).apply(len)
        #If there are more than MAX_SEQUENCE_LENGTH tokens, use only the last MAX_SEQUENCE_LENGTH tokens of the comment
        frame[COLUMN] = frame[COLUMN].apply(lambda x: ' '.join(str(x).split()[-MAX_SEQUENCE_LENGTH:]))
        frame['top'] = is_top
        frame['bottom'] = not is_top

    # DataFrame.append is gone in pandas 2.0; pd.concat keeps the original row indices as append did.
    top_and_bottom = pd.concat([top, bottom])
    top_and_bottom = top_and_bottom.sample(frac=fraction, random_state=42)
    top_and_bottom[COLUMN] = top_and_bottom[COLUMN].astype(str)
    return top_and_bottom

def gru_model(embedding_layer):
    """Build the bidirectional GRU classifier.

    Args:
        embedding_layer: Keras layer applied to the integer token input.

    Returns:
        An uncompiled Keras ``Model`` that maps a sequence of
        ``MAX_SEQUENCE_LENGTH`` token ids to a 2-way softmax output.
    """
    token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    hidden = embedding_layer(token_ids)
    hidden = SpatialDropout1D(DROPOUT)(hidden)
    # Only the final state of each direction is kept (return_sequences=False).
    hidden = Bidirectional(CuDNNGRU(32, return_sequences=False))(hidden)
    hidden = Dense(16, activation="relu")(hidden)
    hidden = Dropout(DROPOUT)(hidden)
    class_probabilities = Dense(2, activation="softmax")(hidden)
    return Model(inputs=token_ids, outputs=class_probabilities)

def cnn_model(embedding_layer):
    """Build the multi-kernel 1D CNN classifier.

    Four parallel convolution branches (kernel sizes 3, 4, 5 and 6, i.e.
    n-grams of that many tokens) are applied to the embedded input, each
    followed by batch normalisation, dropout, ReLU and global max pooling.
    The pooled branches are concatenated and fed through a small dense head.

    Args:
        embedding_layer: Keras layer applied to the integer token input.

    Returns:
        An uncompiled Keras ``Model`` that maps a sequence of
        ``MAX_SEQUENCE_LENGTH`` token ids to a 2-way softmax output.
    """
    conv_filters = 128
    kernel_sizes = (3, 4, 5, 6)

    token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded = embedding_layer(token_ids)

    # One branch per kernel size; layers are created in the order
    # conv -> batch norm -> dropout -> activation -> pooling for each branch.
    pooled_branches = []
    for kernel_size in kernel_sizes:
        branch = Conv1D(filters=conv_filters, kernel_size=kernel_size)(embedded)
        branch = BatchNormalization()(branch)
        branch = Dropout(0.2)(branch)
        branch = Activation('relu')(branch)
        pooled_branches.append(GlobalMaxPooling1D()(branch))

    # Gather all convolution branches
    merged = concatenate(pooled_branches, axis=1)
    merged = Dropout(0.2)(merged)

    hidden = Dense(32, activation='relu')(merged)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.2)(hidden)

    class_probabilities = Dense(2, activation='softmax')(hidden)
    return Model(inputs=token_ids, outputs=class_probabilities)

def evaluate_model(x_test_text, model_type, x_train, y_train, x_val, y_val, x_test, y_test, embedding_layer):
    """Train a GRU or CNN classifier and return its accuracy on the test set.

    The per-sample test predictions are also written to
    ``test_dataset_predictions.csv`` (overwritten on every call).

    Args:
        x_test_text: Raw test texts, stored next to the predictions in the CSV.
        model_type: ``'GRU'`` selects ``gru_model``; any other value selects
            ``cnn_model``.
        x_train, y_train: Training inputs and two-column labels.
        x_val, y_val: Validation data monitored by early stopping.
        x_test, y_test: Test inputs and two-column labels.
        embedding_layer: Embedding layer handed to the model builder.

    Returns:
        The accuracy computed by ``accuracy_score`` on the test set.
    """
    build_model = gru_model if model_type == 'GRU' else cnn_model
    model = build_model(embedding_layer)

    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    # Stop once the validation loss has not improved for two epochs.
    stop_early = EarlyStopping(monitor='val_loss', patience=2, verbose=0, mode='auto')
    model.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        callbacks=[stop_early],
        verbose=0,
    )

    probabilities = np.asarray(model.predict(x_test, verbose=0, batch_size=512))
    # Convert each row of class probabilities into a one-hot class row.
    predicted_classes = np.apply_along_axis(probabilities_to_classes, 1, probabilities)
    score = accuracy_score(y_test, predicted_classes)

    # Only the first label column is exported for truth and prediction.
    true_first_column = y_test[:,0]
    predictions = pd.DataFrame(
        {
            'y_true': true_first_column,
            'y_pred': predicted_classes[:,0],
            'x_test_text': x_test_text,
        },
        columns=['y_true', 'y_pred', 'x_test_text'],
        index=np.arange(1, len(true_first_column)+1, 1),
    )
    predictions.to_csv('test_dataset_predictions.csv', index=False)

    return score

In [None]:
def print_results(finished_runs = 10, model_types=['GRU','CNN']):
    """Append the per-run accuracies to ``logfile.txt`` and print their mean.

    Reads the module-level ``results`` mapping, keyed by
    ``(run, category, percentage, model_type)``. For every category
    (``upvotes``, ``replies``), percentage (``10``, ``25``, ``50``) and model
    type, the score of each finished run is written on its own line, and the
    mean over the finished runs is printed.

    Args:
        finished_runs: Number of completed runs; runs ``1..finished_runs`` are
            reported.
        model_types: Model names used as the last component of the result key.
            The default list is never modified.
    """
    print(str(finished_runs)+' finished_run(s)\n')
    with open('logfile.txt', 'a') as the_file:
        the_file.write(str(finished_runs)+' finished_run(s)\n')
        for category in ['upvotes', 'replies']:
            the_file.write(category+'\n')
            for percentage in ['10','25','50']:
                the_file.write(percentage+'\n')
                for model_type in model_types:
                    the_file.write(str(model_type)+'\n')
                    scores = [results[(run,category,percentage,model_type)]
                              for run in range(1,finished_runs+1)]
                    for score in scores:
                        the_file.write(str(score)+'\n')
                    # Report the mean over the finished runs, not their sum.
                    mean_score = sum(scores)/len(scores) if scores else 0.0
                    print("Accuracy: {:.3f}".format(mean_score))

In [None]:
def print_results_for_t_test(finished_runs = 10, model_types=['GRU','CNN']):
    """Append the per-run accuracies as R-style vectors to ``logfile_t_test.txt``.

    Reads the module-level ``results`` mapping, keyed by
    ``(run, category, percentage, model_type)``. For every category
    (``upvotes``, ``replies``), percentage (``10``, ``25``, ``50``) and model
    type, the model name is written on one line, followed by a line of the
    form ``<-c(0.1234, 0.5678)`` with one four-decimal score per finished run.

    Args:
        finished_runs: Number of completed runs; runs ``1..finished_runs`` are
            reported.
        model_types: Model names used as the last component of the result key.
            The default list is never modified.
    """
    print(str(finished_runs)+' finished_run(s)\n')
    with open('logfile_t_test.txt', 'a') as the_file:
        the_file.write(str(finished_runs)+' finished_run(s)\n')
        for category in ['upvotes', 'replies']:
            the_file.write(category+'\n')
            for percentage in ['10','25','50']:
                the_file.write(percentage+'\n')
                for model_type in model_types:
                    the_file.write(str(model_type)+'\n')
                    # Join with ", " so the vector has no trailing separator.
                    formatted_scores = ", ".join(
                        "{:.4f}".format(results[(run,category,percentage,model_type)])
                        for run in range(1,finished_runs+1)
                    )
                    the_file.write("<-c(" + formatted_scores + ")\n")

In [None]:
# run experiments with GRU and CNN models
# `results` maps (run, category, percentage, model_type) -> test accuracy; unknown keys read as 0.0
results = defaultdict(lambda: 0.0)
if GUARDIAN:
    categories = ['upvotes', 'replies']
else:
    categories = ['upvotes']
for run in range(1,RUNS+1):
    for category in categories:
        #we always use 10% of the "top/flop 10% dataset" as test data
        test_data = load_data(category,'10', fraction=0.1)
        test_data.to_csv(str(GUARDIAN)+''+category+''+str(run)+'test_data.csv', index=False)
        

        for percentage in ['10','25','50']:
            #we use varying training datasets (either "top/flop 10% dataset", "top/flop 25% dataset", or "top/flop 50% dataset")
            train_data = load_data(category,percentage, fraction =1.0)
            #we ensure that there is no overlap of training and test data by removing all test samples from the training data
            #(the test rows are concatenated twice so that drop_duplicates(keep=False) removes every copy of them)
            train_data = pd.concat([train_data, test_data, test_data]).drop_duplicates(keep=False)
            #the tokenizer uses the comment texts from both training and test data to define the vocabulary
            texts = pd.concat([train_data, test_data])[COLUMN].values
            train_data, val_data = train_test_split(train_data, test_size=20/80)
            
            x_train = train_data[COLUMN].values
            x_val = val_data[COLUMN].values
            x_test = test_data[COLUMN].values

            y_train = train_data[['top', 'bottom']].values
            y_val = val_data[['top', 'bottom']].values
            y_test = test_data[['top', 'bottom']].values
            
            tokenizer = Tokenizer(num_words=VOCABULARY_SIZE)
            tokenizer.fit_on_texts(texts)
            
            #keep the raw test texts so that evaluate_model can store them next to the predictions
            x_test_text = x_test
            x_train = tokenizer.texts_to_sequences(x_train)
            x_train = pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH)            
            x_val = tokenizer.texts_to_sequences(x_val)
            x_val = pad_sequences(x_val, maxlen=MAX_SEQUENCE_LENGTH)
            x_test = tokenizer.texts_to_sequences(x_test)
            x_test = pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH)
            
            #build the embedding matrix from the fastText vectors; row 0 stays all-zero
            word_index = tokenizer.word_index
            num_words = min(VOCABULARY_SIZE, len(word_index)) + 1
            embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
            for word, i in word_index.items():
                if i > VOCABULARY_SIZE:
                    continue
                embedding_matrix[i] = ft_model.get_word_vector(word.lower()).astype('float32')

            embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
            
            for model_type in ['GRU','CNN']:
                #key the score by model_type, the same key print_results and print_results_for_t_test read
                results[(run,category,percentage,model_type)] = evaluate_model(x_test_text, model_type, x_train, y_train, x_val, y_val, x_test, y_test, embedding_layer)
            print_results(run)
            print_results_for_t_test(run)

In [None]:
# run experiments with Logistic Regression model
# NOTE(review): this rebinds `results`, so scores stored by an earlier cell are no longer reachable through it.
results = defaultdict(lambda: 0.0)

if GUARDIAN:
    categories = ['upvotes', 'replies']
else:
    categories = ['upvotes']
for run in range(1,RUNS+1):
    for category in categories:
        for percentage in ['10','25','50']:
            #we always use 10% of the "top/flop 10% dataset" as test data
            test_data_tmp = load_data(category,'10', fraction=0.1) 
            #we use varying training datasets (either "top/flop 10% dataset", "top/flop 25% dataset", or "top/flop 50% dataset")
            train_data = load_data(category,percentage, fraction =1.0)            
            #we ensure that there is no overlap of training and test data by removing all test samples from the training data
            train_data = pd.concat([train_data, test_data_tmp, test_data_tmp]).drop_duplicates(keep=False) 

            # Per-comment features for the test set: readability score and character count.
            test_data_tmp['ARI'] = test_data_tmp['comment_text'].apply(metric_readability)
            test_data_tmp['comment_length'] = test_data_tmp['comment_text'].apply(len)
            # Per-author averages are computed from the training data only.
            # NOTE(review): enrich_df_with_features already writes 'ARI' and 'comment_length'
            # into train_data, so the next two assignments recompute the same columns.
            df_user_info = enrich_df_with_features(train_data)
            train_data['ARI'] = train_data['comment_text'].apply(metric_readability)
            train_data['comment_length'] = train_data['comment_text'].apply(len)
            # Attach the author averages; the outer joins can add rows without a comment, which are dropped right after.
            train_data = pd.merge(train_data, df_user_info, on='comment_author_id', how='outer')
            train_data.dropna(subset=['comment_text'], inplace= True)
            test_data = pd.merge(test_data_tmp, df_user_info, on='comment_author_id', how='outer')
            test_data.dropna(subset=['comment_text', 'top'], inplace= True)
            train_data, val_data = train_test_split(train_data, test_size=20/80)
            # Remaining missing values in the test frame (e.g. averages of authors absent from training) become 0.
            test_data.fillna(0, inplace=True)
            
            # Alternative feature set, currently disabled:
            #x_train = train_data[['ARI','comment_length','upvotes_AVG','children_AVG','ARI_AVG', 'comment_length_AVG']].values#.reshape(-1, 1)
            #x_val = val_data[['ARI','comment_length','upvotes_AVG','children_AVG','ARI_AVG', 'comment_length_AVG']].values#.reshape(-1, 1)
            #x_test = test_data[['ARI','comment_length','upvotes_AVG','children_AVG','ARI_AVG', 'comment_length_AVG']].values#.reshape(-1, 1)
            # Active feature set: the comment length is the only input feature.
            x_train = train_data[['comment_length']].values.reshape(-1, 1)
            x_val = val_data[['comment_length']].values.reshape(-1, 1)
            x_test = test_data[['comment_length']].values.reshape(-1, 1)
            
            # The 'top' flag is the binary target.
            # NOTE(review): x_val / y_val are not used by the code visible in this cell.
            y_train = train_data[['top']].values.ravel()
            y_val = val_data[['top']].values.ravel()
            y_test = test_data[['top']].astype(bool).values.ravel()
            
            for model_type in ['LogisticRegression']:
                logreg = LogisticRegression()
                logreg.fit(x_train, y_train)
                # NOTE(review): y_pred is not used afterwards in the visible code; the score comes from logreg.score.
                y_pred = logreg.predict(x_test)
                score = logreg.score(x_test, y_test)
                print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(score))
                results[(run,category,percentage,model_type)] = score
            print_results(run, model_types=['LogisticRegression'])
            print_results_for_t_test(run, model_types=['LogisticRegression'])