In [28]:
%load_ext jupyternotify

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


In [29]:
import pandas as pd
import numpy as np

import urllib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import re

In [30]:
from sklearn import metrics

In [31]:
from sklearn.metrics import confusion_matrix

In [33]:
# download annotated comments and annotations

ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634' 
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637' 


def download_file(url, fname):
    urllib.request.urlretrieve(url, fname)

                
download_file(ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv')
download_file(ANNOTATIONS_URL, 'attack_annotations.tsv')

In [34]:
comments = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0)
annotations = pd.read_csv('attack_annotations.tsv',  sep = '\t')

In [35]:
comments.head()

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train
44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,False,article,random,train
49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,False,article,random,train
89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev
93890,This page will need disambiguation.,2002,True,article,random,train


In [36]:
annotations.head()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,1362,0.0,0.0,0.0,0.0,0.0
1,37675,2408,0.0,0.0,0.0,0.0,0.0
2,37675,1493,0.0,0.0,0.0,0.0,0.0
3,37675,1439,0.0,0.0,0.0,0.0,0.0
4,37675,170,0.0,0.0,0.0,0.0,0.0


In [37]:
# labels a comment as an atack if the majority of annoatators did so
labels = annotations.groupby('rev_id')['attack'].mean() > 0.6
# labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

In [38]:
# join labels and comments
comments['attack'] = labels

In [39]:
# remove newline and tab tokens
comments['comment'] = comments['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("TAB_TOKEN", " "))

In [40]:
comments.head()

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,attack
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37675,`- This is not ``creative``. Those are the di...,2002,False,article,random,train,False
44816,` :: the term ``standard model`` is itself le...,2002,False,article,random,train,False
49851,"True or false, the situation as of March 200...",2002,False,article,random,train,False
89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev,False
93890,This page will need disambiguation.,2002,True,article,random,train,False


In [41]:
# Percentage of train, test, and dev data in original data
comments.split.value_counts() / len(comments.index)

train    0.600066
test     0.200045
dev      0.199890
Name: split, dtype: float64

In [42]:
# split training, development and testing
train_comments = comments.query("split=='train'")
dev_comments = comments.query("split=='dev'")
test_comments = comments.query("split=='test'")

In [43]:
# get rid of rev_id, year, logged_in, ns, sample column that will not be used in training
# only keep comments as training feature
def get_X_Y(data):
    X = data.comment
    Y = data.iloc[:, -1]
    return X, Y

In [44]:
X_train, Y_train = get_X_Y(train_comments)
X_dev, Y_dev = get_X_Y(dev_comments)
X_test, Y_test = get_X_Y(test_comments)

## Text cleaning

In [45]:
## Make sur nltk is downloaded
import nltk.data

In [46]:
nltk.data.path = ['/Users/liangpengzhuang/Downloads/nltk_data']

In [47]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

In [None]:
# Code reference https://www.kaggle.com/c/word2vec-nlp-tutorial

In [48]:
# Clean comment string, and return a list of words
# Only consider character and number
def comment_to_wordlist(review, remove_stopwords=True):
    # Remove non-letters 
    review = re.sub("[^a-zA-Z ]"," ", review)
    words = review.lower().split()
    if remove_stopwords:
        words = [w for w in words if not w in stop_words]
    return words

In [49]:
# Load the punkt tokenizer
# tokenizer = nltk.data.load('/Users/liangpengzhuang/Downloads/nltk_data/tokenizers/punkt/english.pickle')
tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')


# function to split a review into parsed sentences. Comments will be divided to sentences first, and then transform to 
# list of words
def review_to_sentences( comment, tokenizer, remove_stopwords):
    # Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(comment.strip())

    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( comment_to_wordlist(raw_sentence, remove_stopwords))
            
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

## N-gram features
### cleanning method 1 - Liangpeng

In [50]:
def clean_text(comment):
    words = comment_to_wordlist(comment)
    return ' '.join(words)

In [51]:
X_train_clean = X_train.apply(clean_text)
X_test_clean = X_test.apply(clean_text)

### cleanning method 2 - Kingston
 true recall + 0.01

In [52]:
# import nltk
# nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

def text_clean(text):
    tokens = word_tokenize(text)
#     tokens = [w.replace("NEWLINE_TOKEN", " ") for w in tokens]
#     tokens = [w.replace("TAB_TOKEN", " ") for w in tokens]
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('','', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [w for w in tokens if w.isalpha()]
    stop_words_list = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words_list]
    filted_text = ' '.join(tokens)
    return filted_text

In [53]:
X_train_clean = X_train.apply(text_clean)
X_test_clean = X_test.apply(text_clean)

### Character level

In [54]:
clf_char = Pipeline([
    ('vect', CountVectorizer(analyzer='char', max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', LinearSVC()),
])
clf_char = clf_char.fit(X_train_clean, Y_train)

In [55]:
Y_pred = clf_char.predict(X_test_clean)
auc = accuracy_score(Y_test, Y_pred)
conf = confusion_matrix (Y_test, Y_pred)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

      False       0.94      0.99      0.97     20913
       True       0.87      0.42      0.56      2265

avg / total       0.93      0.94      0.93     23178



### Word level
#### LinearSVC

In [56]:
clf_word = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features = 10000, ngram_range = (1,1))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', LinearSVC(
#         dual=False,
#         tol = 0.01,
#         loss='hinge',
        class_weight='balanced'
    )),
])
clf_word = clf_word.fit(X_train_clean, Y_train)
Y_pred = clf_word.predict(X_test_clean)
conf = confusion_matrix (Y_test, Y_pred)
auc = accuracy_score(Y_test, Y_pred)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

      False       0.98      0.95      0.96     20913
       True       0.63      0.81      0.71      2265

avg / total       0.95      0.94      0.94     23178



#### SVM

In [None]:
clf_word = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', svm.SVC()),
])
clf_word = clf_word.fit(X_train_clean, Y_train)
Y_pred = clf_word.predict(X_test_clean)
conf = confusion_matrix (Y_test, Y_pred)
auc = accuracy_score(Y_test, Y_pred)
print('Test ROC AUC: %.3f' %auc)
print('Confusion matrix: ', conf)

## Hyper-parameter Tunning
### LinearSVC


In [57]:
text_clf = Pipeline([
    ('vect', CountVectorizer(max_features = 10000, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', LinearSVC(verbose=1)),
])

In [58]:
param_grid = { 
#             'clf__penalty':['l1','l2'],
            'clf__loss':['hinge','squared_hinge'],
#             'clf__dual':[False],
            'clf__tol':[1e-2,1e-4],
           "clf__C":[1.0],
#            "clf__kernel" : ['rbf', 'linear', 'poly','sigmoid','precomputed'],
#            "clf__degree" : [1, 2, 3, 4, 5],
           'clf__class_weight':[None],
}

from pprint import pprint
pprint(param_grid)

{'clf__C': [1.0],
 'clf__class_weight': [None],
 'clf__loss': ['hinge', 'squared_hinge'],
 'clf__tol': [0.01, 0.0001]}


In [59]:
from sklearn.model_selection import GridSearchCV

gs_clf = GridSearchCV(estimator=text_clf, param_grid=param_grid, n_jobs=-1)
gs_clf.fit(X_train_clean, Y_train)



[LibLinear]

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        stri...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=1))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'clf__loss': ['hinge', 'squared_hinge'], 'clf__tol': [0.01, 0.0001], 'clf__C': [1.0], 'clf__class_weight': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [60]:
print(gs_clf.best_score_)
for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.9533699623162558
clf__C: 1.0
clf__class_weight: None
clf__loss: 'hinge'
clf__tol: 0.01


### SVM

In [None]:
text_clf = Pipeline([
    ('vect', CountVectorizer(max_features = 10000, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', svm.SVC()),
])

In [None]:
param_grid = { 
           "clf__C" : [0,1,2,3],
           "clf__kernel" : ['rbf', 'linear', 'poly','sigmoid','precomputed'],
           "clf__degree" : [1, 2, 3, 4, 5],
           'clf__tol':[1e-2,1e-1,1e-3],
            'class_weight':['balanced',None]
}

from pprint import pprint
pprint(param_grid)

### Stemmer
Stemmers remove morphological affixes from words, leaving only the word stem.

In [61]:
from nltk.stem import PorterStemmer

In [62]:
ps = PorterStemmer()

In [63]:
def word_to_stem(comment):
    words = comment_to_wordlist(comment)
    words_out = []
    for word in words:
        words_out.append(ps.stem(word))
    return ' '.join(words_out)

In [64]:
X_train_stem = X_train.apply(word_to_stem)

In [65]:
X_test_stem = X_test.apply(word_to_stem)

In [66]:
clf_word_stem = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', LinearSVC()),
])
clf_word_stem = clf_word_stem.fit(X_train_stem , Y_train)
Y_pred = clf_word_stem.predict(X_test_stem)
conf = confusion_matrix (Y_test, Y_pred)
auc = accuracy_score(Y_test, Y_pred)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

      False       0.96      0.99      0.97     20913
       True       0.86      0.63      0.72      2265

avg / total       0.95      0.95      0.95     23178



### Lemmatizer
Lemmatize using WordNet's built-in morphy function.
Returns the input word unchanged if it cannot be found in WordNet.

In [67]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

In [68]:
from nltk import pos_tag

In [69]:
def word_lemm(comment, tokenizer, remove_stopwords=True):
    sentences = review_to_sentences(comment, tokenizer, remove_stopwords)
    feature_str = ''
    for sent in sentences:
        tag = pos_tag(sent)
        for tuple_pair in tag:
            pos = get_wordnet_tag(tuple_pair[1])
            if len(pos) > 0:
                feature_str += lemmatizer.lemmatize(tuple_pair[0], pos=pos) + ' '
            else:
                feature_str += lemmatizer.lemmatize(tuple_pair[0]) + ' '
    return feature_str

In [70]:
def get_wordnet_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

In [71]:
X_train_lemm = X_train.apply(lambda x: word_lemm(x, tokenizer))
X_test_lemm = X_test.apply(lambda x: word_lemm(x, tokenizer))

In [72]:
clf_word_lemm = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', LinearSVC()),
])
clf_word_lemm = clf_word_lemm.fit(X_train_lemm , Y_train)
Y_pred = clf_word_lemm.predict(X_test_lemm)
conf = confusion_matrix (Y_test, Y_pred)
auc = accuracy_score(Y_test, Y_pred)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

      False       0.96      0.99      0.97     20913
       True       0.86      0.62      0.72      2265

avg / total       0.95      0.95      0.95     23178



In [73]:
% notify -m 'cool'
from sklearn import metrics
print(metrics.classification_report(Y_test, Y_pred))

<IPython.core.display.Javascript object>

             precision    recall  f1-score   support

      False       0.96      0.99      0.97     20913
       True       0.86      0.62      0.72      2265

avg / total       0.95      0.95      0.95     23178



## Embedding derived features

### Word2Vec

In [74]:
# Code reference: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

#### Self-trained Word2Vec model

In [75]:
from gensim.models import word2vec

In [76]:
X_total, Y_total = get_X_Y(comments)

In [77]:
# Get a list of all sentences in over comments for word2vec model training
def get_all_sent(data):
    all_sent = []
    for idx, value in data.iteritems():
        all_sent += review_to_sentences(value, tokenizer, remove_stopwords=False)
    return all_sent
all_sent = get_all_sent(X_total)       

In [78]:
# Train a w2v model using all comments 
def train_w2v(sentences, num_features):
    model = word2vec.Word2Vec(sentences, sg=1, workers=4, \
            size=num_features, min_count = 1, \
            window = 10, sample = 1e-3, iter=5)
    return model

In [79]:
# Train the model with 300 features
w2v_model = train_w2v(all_sent, 300)

In [80]:
# Get all word vectors from trained model
word_vectors = w2v_model.wv

In [81]:
w2v_model.save('w2v_model')

In [82]:
X_train_w2v = X_train.apply(lambda x: comment_to_wordlist(x,remove_stopwords=False))
X_test_w2v = X_test.apply(lambda x: comment_to_wordlist(x,remove_stopwords=False))

In [83]:
# Word2vec return a 300 dimension vector for each word in the comment, to transfrom a entire comment to a vector
# Averaging word vectors for all words in a text.
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = 300
            
    def fit(self, X, y):
        return self 

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [84]:
clf_w2v_self = Pipeline([
    ('w2v-embed', MeanEmbeddingVectorizer(word_vectors)),
    ('clf', LinearSVC()),
])
clf_w2v_self = clf_w2v_self.fit(X_train_w2v, Y_train)
Y_pred = clf_w2v_self.predict(X_test_w2v)
auc = accuracy_score(Y_test, Y_pred)
conf = confusion_matrix(Y_test, Y_pred)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

      False       0.95      0.99      0.97     20913
       True       0.88      0.52      0.65      2265

avg / total       0.94      0.95      0.94     23178



In [85]:
from collections import Counter, defaultdict

In [86]:
# Similar with previous MeanEmbeddingVectorizer, instead, using Tfidf
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim=300
        
    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, 
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    
        return self
    
    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [87]:
clf_w2v_self_tfid = Pipeline([
    ('w2v-embed', TfidfEmbeddingVectorizer(word_vectors)),
    ('clf', LinearSVC()),
])
clf_w2v_self_tfid = clf_w2v_self_tfid.fit(X_train_w2v, Y_train)
Y_pred = clf_w2v_self_tfid.predict(X_test_w2v)
auc = accuracy_score(Y_test, Y_pred)
conf = confusion_matrix(Y_test, Y_pred)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

      False       0.95      0.99      0.97     20913
       True       0.88      0.51      0.65      2265

avg / total       0.94      0.95      0.94     23178



### Doc2Vec
#### Self-trained Doc2Vec model

In [88]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [89]:
X_total, Y_total = get_X_Y(comments)

In [90]:
X_total_d2v = X_total.apply(lambda x: comment_to_wordlist(x, remove_stopwords=True))

In [91]:
# Tagged doc is the required input format for doc2vec training
# Each tagged document has a list of words, and a unique tag for furture vector retrival
def build_taggedDoc(data):
    data_total = []
    for idx, words in data.iteritems():
        doc = TaggedDocument(words, ['SENT' + str(idx)])
        data_total.append(doc)
    return data_total

In [92]:
tagged = build_taggedDoc(X_total_d2v)

In [93]:
def train_d2v(lr, T, data):
    model = Doc2Vec(vector_size=300, min_count=1, sample=len(data), alpha=lr, min_alpha=lr, epochs=T)
    model.build_vocab(data)
    model.train(data, epochs=model.epochs, total_examples=model.corpus_count)
    return model
d2v_model = train_d2v(0.025, 20, tagged)

In [94]:
def get_vector(d2v_model, doc_data):
    x_data = []
    for idx, words in doc_data.iteritems():
        label = 'SENT' + str(idx)
        x_data.append(d2v_model.docvecs[label])
    return np.array(x_data)      

In [95]:
X_train_d2v, Y_train = get_X_Y(train_comments)
X_test_d2v, Y_test = get_X_Y(test_comments)

In [96]:
X_train_d2v = X_train_d2v.apply(lambda x: comment_to_wordlist(x, remove_stopwords=True))
X_test_d2v = X_test_d2v.apply(lambda x: comment_to_wordlist(x, remove_stopwords=True))

In [97]:
X_train_d2v_v = get_vector(d2v_model, X_train_d2v)
X_test_d2v_v = get_vector(d2v_model, X_test_d2v)

In [98]:
clf_d2v = LinearSVC()
clf_d2v.fit(X_train_d2v_v, Y_train)
Y_pred =clf_d2v.predict(X_test_d2v_v)
auc = accuracy_score(Y_test, Y_pred)
conf = confusion_matrix(Y_test, Y_pred)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

      False       0.92      0.99      0.96     20913
       True       0.74      0.24      0.36      2265

avg / total       0.90      0.92      0.90     23178



## Syntactic features

### Text and part-of-speech(POS) tagging

In [99]:
# Instead word as feature, and its POS tag as addition feature
# Eg: 'I like cat' -> 'i i_NNS like like_VBP cat cat_NN'
def get_pos_feature(data, tokenizer, remove_stopwords=False):
    sentences = review_to_sentences(data, tokenizer, remove_stopwords)
    feature_str = ''
    for sent in sentences:
        tag = pos_tag(sent)
        for tuple_pair in tag:
            feature_str += tuple_pair[0] + ' '
            feature_str += tuple_pair[0] + '_' + tuple_pair[1] + ' '
    return feature_str

In [100]:
X_train_pos = X_train.apply(lambda x: get_pos_feature(x, tokenizer))
X_test_pos = X_test.apply(lambda x: get_pos_feature(x, tokenizer))

In [101]:
clf_pos_tag = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', LinearSVC()),
])
clf_pos_tag =clf_pos_tag.fit(X_train_pos, Y_train)
Y_pred = clf_pos_tag.predict(X_test_pos)
auc = accuracy_score(Y_test, Y_pred)
conf = confusion_matrix (Y_test, Y_pred)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

      False       0.96      0.99      0.98     20913
       True       0.88      0.63      0.73      2265

avg / total       0.95      0.96      0.95     23178



In [102]:
sets = set(['NN','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ','PRP'])

In [103]:
# Only consider Noun, Verb, Adjective, Pronoun 
def get_pos_feature_wo(data, tokenizer, remove_stopwords=False):
    sentences = review_to_sentences(data, tokenizer, remove_stopwords)
    feature_str = ''
    for sent in sentences:
        tag = pos_tag(sent)
        for tuple_pair in tag:
            if tuple_pair[1] in sets:
                # Ignore the tense in tagging
                # Eg: both took, take will be tagged 'V'
                if 'V' in tuple_pair[1]:
                    feature_str += tuple_pair[0] + '_' + 'V' + ' '
                # Ignore comparable in adjective
                # Eg: Both clean, cleaner, cleanest will be tagged 'J'
                elif 'J' in tuple_pair[1]:
                    feature_str += tuple_pair[0] + '_' + 'J' + ' '
                else:
                    feature_str += tuple_pair[0] + '_' + tuple_pair[1] + ' '
    return feature_str

In [104]:
X_train_pos_wo = X_train.apply(lambda x: get_pos_feature_wo(x, tokenizer))
X_test_pos_wo = X_test.apply(lambda x: get_pos_feature_wo(x, tokenizer))

In [105]:
clf_pos_tag_wo = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', LinearSVC()),
])
clf_pos_tag_wo =clf_pos_tag_wo.fit(X_train_pos_wo , Y_train)
Y_pred = clf_pos_tag_wo.predict(X_test_pos_wo)
auc = accuracy_score(Y_test, Y_pred)
conf = confusion_matrix (Y_test, Y_pred)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

      False       0.96      0.99      0.97     20913
       True       0.86      0.61      0.71      2265

avg / total       0.95      0.95      0.95     23178



## Combine BOW with Word2Vec

In [106]:
clf_vectorize = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
])

In [107]:
x_train_b = clf_vectorize.fit_transform(X_train_clean).todense()

In [108]:
x_test_b = clf_vectorize.fit_transform(X_test_clean).todense()

In [109]:
x_train_em = MeanEmbeddingVectorizer(word_vectors).transform(X_train_clean)

In [110]:
x_test_em = MeanEmbeddingVectorizer(word_vectors).transform(X_test_clean)

In [111]:
x_train_br = np.apply_along_axis(lambda x: np.squeeze(np.asarray(x)), 1, x_train_b)

In [112]:
x_test_br = np.apply_along_axis(lambda x: np.squeeze(np.asarray(x)), 1, x_test_b)

In [113]:
x_train_be = np.c_[x_train_br, x_train_em]

In [114]:
x_test_be = np.c_[x_test_br, x_test_em]

In [115]:
clf_svc = LinearSVC()
clf_svc.fit(x_train_be, Y_train)
Y_pred = clf_svc.predict(x_test_be)
auc = accuracy_score(Y_test, Y_pred)
conf = confusion_matrix (Y_test, Y_pred)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

      False       0.91      0.98      0.95     20913
       True       0.49      0.15      0.23      2265

avg / total       0.87      0.90      0.88     23178

