In [None]:
%matplotlib notebook
from matplotlib import pyplot as plt
import pandas
import re
import nltk
import numpy as np
from numpy import *
import csv
import theano.tensor as T
import os.path
from nltk.collocations import *
from optparse import OptionParser
from collections import Counter
from copy import copy
import cPickle
import csv
import warnings

from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

from keras.layers.convolutional import MaxPooling1D, Convolution1D
from keras.layers.recurrent import LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.models import Sequential, Graph
from keras.engine.training import slice_X
from keras.layers.core import Layer, Dense, Dropout, Activation,\
    Reshape, Flatten, Lambda
from keras.regularizers import Regularizer
from keras.optimizers import SGD
from keras.constraints import maxnorm
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.optimizers import Adadelta
from keras.callbacks import Callback


from IPython.utils.io import CapturedIO
from gensim.models import Word2Vec
from pkg_resources import resource_filename
import utils
import datasets
from unidecode import unidecode

# Yoon Kim's tokenization
def my_process(string):
    """
    Clean and tokenize a raw text string.

    A fixed list of regular-expression rewrites is applied in order:
    unsupported characters become spaces, English contractions are expanded,
    and punctuation that directly follows (or, for "(", precedes) a word
    character is split off with surrounding spaces. Runs of whitespace are
    then collapsed and the result is stripped.

    No case folding is done here.
    """
    rewrite_rules = (
        # anything outside the allowed character set becomes a space
        (r"[^\w(),|!?\'\`\:\-\.;\$%#]", " "),
        # contractions
        (r"\'s", " is"),
        (r"\'ve", " have"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        # punctuation attached to a word; the ellipsis must be handled before
        # the single full stop
        (r"(?<=\w)\.\.\.", " ... "),
        (r"(?<=\w)\.", " . "),
        (r"(?<=\w),", " , "),
        (r"(?<=\w);", " ; "),
        (r"(?<=\w)!", " ! "),
        (r"\((?=\w)", " ( "),
        (r"(?<=\w)\)", " ) "),
        (r"(?<=\w)\?", " ? "),
        # collapse the extra whitespace introduced above
        (r"\s{2,}", " "),
    )
    cleaned = string
    for pattern, replacement in rewrite_rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def mixed_score(y_true, y_probs, th=None):
    """
    Compute precision, recall, F1 and ROC AUC for a set of scored samples.

    y_true:  array-like of ground-truth labels (precision/recall are obtained
             by summing it, so 0/1 labels are assumed -- TODO confirm).
    y_probs: array-like of predicted scores, one per sample.
    th:      decision threshold; a sample counts as predicted positive when
             its score is >= th. When omitted, the distinct scores are tried
             as thresholds and the one with the highest F1 on the given data
             is used (0 if no threshold reaches an F1 above 0).

    Returns a dict with the keys "precision", "recall", "f1" and "auc".
    """
    y_true, y_probs = asarray(y_true), asarray(y_probs)
    if th is None:
        max_f1, best_th = 0, 0
        # unique() already returns the distinct scores in ascending order
        for candidate in unique(y_probs):
            f1 = f1_score(y_true, (y_probs >= candidate)*1)
            if f1 > max_f1:
                max_f1 = f1
                best_th = candidate
        th = best_th
    predicted_pos = y_probs >= th
    return {
        "precision": y_true[predicted_pos].sum()*1./predicted_pos.sum(),
        "recall": y_true[predicted_pos].sum()*1./y_true.sum(),
        "f1": f1_score(y_true, predicted_pos*1),
        "auc": roc_auc_score(y_true, y_probs)
    }

# Score a sequential model, using the F1-optimal threshold found on the
# validation part of its last fit.
def seq_score(model, X, y):
    """
    Evaluate `model` on (X, y) with a threshold tuned on validation data.

    The validation rows are the tail of model.last_fit_X / model.last_fit_y,
    starting at the row given by the 'validation_split' entry of
    model.last_fit_params (0 when absent). Every distinct validation score is
    tried as a threshold; the one with the highest F1 is passed to
    mixed_score together with the predictions for X.
    """
    held_out_fraction = model.last_fit_params.get('validation_split', 0.)
    first_val_row = int(model.last_fit_X.shape[0] * (1. - held_out_fraction))
    X_val = model.last_fit_X[first_val_row:]
    y_val = model.last_fit_y[first_val_row:]
    val_probs = model.predict(X_val).flatten()

    max_f1, best_threshold = 0, 0
    for candidate in sorted(unique(val_probs)):
        candidate_f1 = f1_score(y_val, (val_probs >= candidate)*1)
        if candidate_f1 > max_f1:
            max_f1, best_threshold = candidate_f1, candidate

    test_probs = model.predict(X).flatten()
    return mixed_score(y, test_probs, best_threshold)
    
# Same as seq_score, but for a graph model whose inputs and outputs are
# passed as one dict.
def graph_score(model, data):
    """
    Evaluate a graph `model` on `data` with a threshold tuned on validation data.

    The validation part is every array of model.last_fit_data sliced from the
    row given by the 'validation_split' entry of model.last_fit_params
    (0 when absent). Every distinct validation score of the 'output' node is
    tried as a threshold; the one with the highest F1 is passed to
    mixed_score together with the 'output' predictions for `data`.
    """
    val_split = model.last_fit_params.get('validation_split', 0.)
    n_fit_rows = model.last_fit_data['output'].shape[0]
    split_at = int(n_fit_rows * (1. - val_split))

    data_val = {}
    for name, values in model.last_fit_data.items():
        data_val[name] = slice_X(values, split_at)
    val_probs = model.predict(data_val)['output'].flatten()

    max_f1, best_threshold = 0, 0
    for candidate in sorted(unique(val_probs)):
        candidate_f1 = f1_score(data_val['output'], (val_probs >= candidate)*1)
        if candidate_f1 > max_f1:
            max_f1, best_threshold = candidate_f1, candidate

    test_probs = model.predict(data)['output'].flatten()
    return mixed_score(data['output'], test_probs, best_threshold)

def seq_auc(model, X, y):
    """Return the ROC AUC of the flattened predictions of `model` on X against y."""
    return roc_auc_score(y, model.predict(X).flatten())
    
def graph_auc(model, data):
    """Return the ROC AUC of the graph model's 'output' predictions against data['output']."""
    scores = model.predict(data)['output'].flatten()
    truth = data['output']
    return roc_auc_score(truth, scores)

# Evaluation functions handed to the cross-validation helpers below.
seq_eval_f = seq_score
graph_eval_f = graph_score
# One row per (fold, model) is appended to this frame by the cells below.
# It is created with an (empty) 'model' column because those cells first
# filter on results['model'], which would raise KeyError on a column-less frame.
results = pandas.DataFrame(columns=['model'])

In [None]:
def mixed_score(y_true, y_probs, th):
    """
    Compute precision, recall, F1 and ROC AUC at a fixed decision threshold.

    A sample counts as predicted positive when its score in `y_probs` is
    >= `th`. Both inputs are converted to arrays first, so plain lists are
    accepted. Returns a dict with the keys "precision", "recall", "f1", "auc".
    """
    truth, scores = asarray(y_true), asarray(y_probs)
    predicted_pos = scores >= th
    hits = truth[predicted_pos].sum()
    metrics = {}
    metrics["precision"] = hits*1./predicted_pos.sum()
    metrics["recall"] = hits*1./truth.sum()
    metrics["f1"] = f1_score(truth, predicted_pos*1)
    metrics["auc"] = roc_auc_score(truth, scores)
    return metrics

# Load ADR Twitter data set

In [None]:
data_path = "/home/trung/data/psb2016/"
tweets, clean_tweets, labels = [], [], []
with open(os.path.join(data_path, 'tweets.txt')) as f:
    for line in f:
        # Each line holds four tab-separated fields: user id, tweet id, label
        # and tweet text. Splitting at most three times keeps any tab inside
        # the tweet text as part of the text instead of breaking the unpacking.
        user_id, tweet_id, label, tweet = line.strip().split('\t', 3)
        # Decode the UTF-8 bytes and transliterate to plain ASCII.
        tweets.append(unidecode(tweet.decode('utf-8')))
        labels.append(int(label))

np.random.seed(0)
# Shuffle the data up front because Keras does not shuffle before taking the
# validation split. Without this, training could end too early, since early
# stopping is used for regularisation.
idx = np.random.permutation(len(tweets))
tweets, labels = asarray(tweets)[idx], asarray(labels)[idx]
# Stratified 10-fold split, materialised so the same folds are reused by every model.
skf = list(StratifiedKFold(labels, n_folds=10))
texts = asarray(tweets, dtype='str')

In [None]:
# Display the shape of the shuffled label array as a sanity check on the load.
labels.shape

In [None]:
# Sum of the integer labels (the number of positive tweets, assuming the
# labels are 0/1 -- TODO confirm).
labels.sum()

# The baselines

In [None]:
import pandas
from sklearn.linear_model import LogisticRegression
from zhang_adr.concept_matching import run_cm
from zhang_adr.maxent_tfidf import run_tfidf
from zhang_adr.maxent_nblcr import run_nblcr
from zhang_adr.maxent_we import run_we
from zhang_adr.tweetnlp import tweet_tagger
from zhang_adr.preprocess import clean_tweet

# Run the tweet tagger over all texts, then clean every tweet from its
# (tokens, tags) pair.
tokens, tags = tweet_tagger.runtagger_parse(texts)
zhang_clean_texts = asarray(
    [clean_tweet(token, tag) for token, tag in zip(tokens, tags)]
)

In [None]:
import warnings
warnings.filterwarnings("ignore")

BASELINE_MODELS = ['CM', 'ME-TFIDF', 'ME-NBLCR', 'ME-WE']

# Drop the rows left by a previous run of this cell so that re-running it does
# not duplicate them. On a fresh kernel `results` has no 'model' column yet,
# so the filter is only applied once that column exists.
if 'model' in results.columns:
    results = results[~results['model'].isin(BASELINE_MODELS)]


def _fold_frame(indices):
    """Build the id/label/text DataFrame passed to the zhang_adr runners for the given sample indices."""
    return pandas.DataFrame([
        {"id": None, "label": labels[j], "text": zhang_clean_texts[j]}
        for j in indices
    ])


clf = LogisticRegression(class_weight="auto")
baseline_rows = []
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for i, (train_idx, test_idx) in enumerate(skf, 1):
        print("### Fold {}:".format(i))
        train = _fold_frame(train_idx)
        test = _fold_frame(test_idx)
        y_test = test['label'].values

        # Concept matching against the ADR lexicon.
        y_pred_cm = run_cm(train, test, resource_filename('zhang_adr', 'data/ADR-lexicon.txt'))
        result = mixed_score(y_test, y_pred_cm, 0.5)
        result['model'] = 'CM'
        baseline_rows.append(result)

        # For the three classifiers below, column 1 of the returned
        # probabilities is scored against the labels.
        _, y_prob_tfidf = run_tfidf(train, test, grams='123', n_dim=40000, clf=clf)
        result = mixed_score(y_test, asarray(y_prob_tfidf[:, 1]), 0.5)
        result['model'] = 'ME-TFIDF'
        baseline_rows.append(result)

        _, y_prob_nblcr = run_nblcr(train, test, 'nblcr', grams='123', clf=clf)
        result = mixed_score(y_test, y_prob_nblcr[:, 1], 0.5)
        result['model'] = 'ME-NBLCR'
        baseline_rows.append(result)

        _, y_prob_we = run_we(train, test, resource_filename('zhang_adr', 'data/w2v_150.txt'), 150, clf=clf)
        result = mixed_score(y_test, y_prob_we[:, 1], 0.5)
        result['model'] = 'ME-WE'
        baseline_rows.append(result)

# Append all fold results at once instead of growing the frame row by row.
results = pandas.concat([results, pandas.DataFrame(baseline_rows)])

In [None]:
# Mean of every metric per model, taken over the rows collected so far.
results.groupby('model').mean()

# Our methods

In [None]:
# Pre-trained embeddings that can be used, as (path, is_binary) pairs.
# Only one of them is loaded: the original cell loaded all five in turn and
# kept just the last one, so the earlier loads only cost time and memory.
EMBEDDING_FILES = {
    "twitter": ('/home/trung/data/embeddings/word2vec_twitter_model/word2vec_twitter_model.bin', True),
    "googlenews": ('/home/trung/data/embeddings/GoogleNews.bin', True),
    "wlin": ("/home/trung/data/embeddings/wlin/struc_skip_600.txt", False),
    "zhang": (resource_filename('zhang_adr', 'data/w2v_150.txt'), False),
    "glovec": ('/home/trung/data/embeddings/glovec/tmp', False),
}
# "glovec" is the embedding that was in effect after the original cell ran.
EMBEDDING_NAME = "glovec"

_embedding_path, _embedding_is_binary = EMBEDDING_FILES[EMBEDDING_NAME]
w2v = Word2Vec.load_word2vec_format(_embedding_path, binary=_embedding_is_binary)

# Dimensionality of the loaded vectors.
dim = w2v.layer1_size

In [None]:
from zhang_adr.TextUtility import TextUtility

MOST_FREQUENT_WORDS = 20000
USE_CACHE = False
INCLUDE_UNKNOWN_WORDS = False

# Tokenise every cleaned tweet; unless INCLUDE_UNKNOWN_WORDS is set, keep only
# words the embedding model knows. `w in w2v` is the same membership test used
# below when filling the embedding matrix, and it avoids scanning the
# index2word list once per token.
docs = [[w for w in TextUtility.text_to_wordlist(text)
         if INCLUDE_UNKNOWN_WORDS or w in w2v]
        for text in zhang_clean_texts]
all_words = Counter([w for doc in docs for w in doc])
# Vocabulary: the MOST_FREQUENT_WORDS most frequent words, indexed by rank.
top_words = sorted(all_words.items(), key=lambda t: t[1], reverse=True)
top_words = top_words[:MOST_FREQUENT_WORDS]
V = {w:i for i, (w, freq) in enumerate(top_words)}
X = utils.vectorize(docs, V)

# initialize embedding matrix
# NOTE(review): np.random.normal(-.25, .25) draws from a normal distribution
# with mean -0.25 and standard deviation 0.25; if a uniform init over
# [-0.25, 0.25] was intended this should be np.random.uniform -- confirm.
my_embeddings = np.random.normal(-.25, .25, size=(X.max() + 1, dim))
for w in V:
    if w in w2v:
        my_embeddings[V[w]] = w2v[w]

# set embedding of padded character as 0s.
# NOTE(review): assumes utils.vectorize uses index len(V) + 1 for padding and
# that this row exists (the matrix has X.max() + 1 rows) -- confirm.
my_embeddings[len(V) + 1] = np.zeros((dim, ))

### Logistic regression of sum of embeddings

In [None]:
def T_sum(x, **kwargs):
    """
    Sum the tensor `x` over its second-to-last axis (axis=-2).

    Extra keyword arguments are accepted and ignored.
    """
    # theano is imported inside the function, as in the original definition
    import theano.tensor as tensor
    summed = tensor.sum(x, axis=-2)
    return summed
    
def mk_lr_model_f(max_len, embeddings, embedding_fixed=False,
                  optimizer='adadelta', loss='binary_crossentropy'):
    """
    Return a zero-argument factory for the "logistic regression on summed
    embeddings" model.

    Each call of the factory builds and compiles a fresh Sequential model:
    an embedding layer initialised with `embeddings` (utils.FixedEmbedding
    when `embedding_fixed` is true, otherwise the regular Embedding), a
    Lambda layer applying T_sum, and a single sigmoid unit.
    """
    def lr_model():
        net = Sequential()
        if embedding_fixed:
            embedding_layer_cls = utils.FixedEmbedding
        else:
            embedding_layer_cls = Embedding
        net.add(embedding_layer_cls(*embeddings.shape,
                                    input_length=max_len,
                                    weights=[embeddings]))
        # T_sum leaves one vector of the embedding width per sample
        net.add(Lambda(T_sum, output_shape=(embeddings.shape[1],)))
        net.add(Dense(1, activation='sigmoid'))
        net.compile(loss=loss, optimizer=optimizer, class_mode='binary')
        return net

    return lr_model

In [None]:
# Cross-validate the logistic regression on summed embeddings (sequential model).
early_stopper = utils.MyEarlyStopping(monitor='val_loss', patience=5, verbose=1)
scores = utils.seq_cross_validate(
    mk_lr_model_f(X.shape[1], my_embeddings, optimizer='adadelta'),
    X, labels,
    skf, eval_f=seq_eval_f,
    fit_params={
        "callbacks": [early_stopper],
        "validation_split": .1,
        "batch_size": 50,
        "nb_epoch": 100,
    })
# Store the per-fold scores as rows tagged with the model name, the layout
# used for every other model in `results`. Assigning `scores` as a column
# does not fit that layout and fails once `results` already holds rows.
df = pandas.DataFrame(scores)
model_name = "my-lr-sum-embedding-dynamic-embedding"
df["model"] = model_name
if "model" in results.columns:
    # replace rows from an earlier run of this cell
    results = results[results["model"] != model_name]
results = pandas.concat([results, df])

### CNN

In [None]:
# Error analysis: train one CNN (sequential model) on the first fold and write
# the misclassified test tweets to incorrect.csv.
early_stopper = utils.MyEarlyStopping(monitor='val_loss', patience=5, verbose=1)
model = utils.mk_yk_model_f(X.shape[1], my_embeddings, embedding_fixed=True, n_filters=300)()
# `skf` holds the cross-validation folds defined in this notebook; the name
# `skf5` used here before is not defined anywhere in it.
train_idx, test_idx = skf[0]
fit_params = {
    "callbacks": [early_stopper],
    "validation_split": .1,
    "batch_size": 50
}
model.fit(X[train_idx], labels[train_idx], **fit_params)

# Test samples whose prediction at threshold 0.5 differs from the label.
idx = test_idx[(model.predict(X[test_idx]) >= 0.5).flatten()*1 != labels[test_idx]]
wrongs = pandas.DataFrame(data={"y": labels[idx], "docs": asarray(docs)[idx],
                                "zhang_clean_texts": zhang_clean_texts[idx],
                                "texts": texts[idx]})
wrongs.to_csv('incorrect.csv')

In [None]:
# Cross-validate the CNN (sequential model) over the folds in skf.
model_name = "CNN"
early_stopper = utils.MyEarlyStopping(monitor='val_loss', patience=5, verbose=0)
scores = utils.seq_cross_validate(
    utils.mk_yk_model_f(X.shape[1], my_embeddings, n_filters=300),
    X, labels,
    skf,
    eval_f=seq_eval_f,
    verbose=0,
    fit_params=dict(
        callbacks=[early_stopper],
        validation_split=.1,
        batch_size=50
    )
)
# One row per fold, tagged with the model name.
df = pandas.DataFrame(scores)
df["model"] = model_name
# Replace any rows a previous run of this cell left in `results`.
results = pandas.concat([results[results["model"] != model_name], df])
df.mean()

### GRU

In [None]:
# Reload the local utils module so that edits to it are picked up without
# restarting the kernel.
reload(utils)

In [None]:
# Cross-validate the GRU (sequential model) over the folds in skf.
model_name = "GRU"
early_stopper = utils.MyEarlyStopping(monitor='val_loss', patience=5, verbose=0)
scores = utils.seq_cross_validate(
    utils.mk_gru_model_f(X.shape[1], my_embeddings),
    # every row of X is fed in reversed order along its second axis
    X[:, ::-1], labels,
    skf,
    eval_f=seq_eval_f,
    fit_params=dict(
        callbacks=[early_stopper],
        validation_split=.1,
        batch_size=50
    ),
    verbose=0
)
# One row per fold, tagged with the model name.
df = pandas.DataFrame(scores)
df["model"] = model_name
# Replace any rows a previous run of this cell left in `results`.
results = pandas.concat([results[results["model"] != model_name], df])
df.mean()

In [None]:
# Mean of every metric per model, taken over the rows collected so far.
results.groupby('model').mean()

### CNN with Attention

In [None]:
# Cross-validate the attention-based CNN (graph model) over the folds in skf.
model_name = "CNNA"
early_stopper = utils.MyEarlyStopping(monitor='val_loss', patience=5, verbose=0)
scores = utils.graph_cross_validate(
    utils.mk_attention_based_model_f(X.shape[1], my_embeddings, attention_l2=0.1),
    dict(tokens=X, output=labels),
    skf,
    verbose=0,
    eval_f=graph_eval_f,
    fit_params=dict(
        callbacks=[early_stopper],
        validation_split=.1,
        batch_size=50
    )
)
# One row per fold, tagged with the model name.
df = pandas.DataFrame(scores)
df["model"] = model_name
# Replace any rows a previous run of this cell left in `results`.
results = pandas.concat([results[results["model"] != model_name], df])
df.mean()

In [None]:
# Fit a single attention-based model on the training part of the first fold;
# the cells below inspect its attention weights.
m = utils.mk_attention_based_model_f(X.shape[1], my_embeddings)()
early_stopper = utils.MyEarlyStopping(monitor='val_loss', patience=5, verbose=1)
m.fit(
    dict(tokens=X[skf[0][0]], output=labels[skf[0][0]]),
    validation_split=.1,
    batch_size=50,
    callbacks=[early_stopper]
)

In [None]:
from keras import backend as K
# Backend functions mapping the input of the "embedding" node to the output of
# the "norm_attn_weights" node (attentions) and of the "features" node (features).
attentions, features = (
    K.function([m.nodes["embedding"].get_input()],
               [m.nodes[node_name].get_output()])
    for node_name in ("norm_attn_weights", "features")
)

In [None]:
"had" in w2v.index2word

In [None]:
# Print the attention weights for positive training tweets of the first fold
# that have at least 8 kept tokens.
fold0_train_idx = skf[0][0]
# Look at the first 1000 training samples at most, without running past the
# end of the fold when it holds fewer samples than that.
for i in xrange(min(1000, len(fold0_train_idx))):
    sample = fold0_train_idx[i]
    if len(docs[sample]) >= 8 and labels[sample] == 1:
        print(tweets[sample])
        print(zhang_clean_texts[sample])
        # NOTE(review): zip stops at the shorter sequence, so tokens and
        # weights only line up if the tokens occupy the leading positions of
        # the vectorised row -- confirm against utils.vectorize.
        print(list(zip(docs[sample],
                       attentions([X[fold0_train_idx[i:i+1]]])[0].flatten())))
        print("")

In [None]:
# Ad-hoc inspection of the second training sample of the first fold
# (slice 1:2 of skf[0][0]).
# NOTE(review): the value of the first expression is not displayed, since only
# the last expression of a cell is shown -- confirm whether it is still needed.
asarray(docs[skf[0][0][1:2]])
# Attention weights computed for that sample.
attentions([X[skf[0][0][1:2]]])[0]