# Binary Classification of Subjective/Objective tweets
## Features: 
- tfidf count
- POS tags in BoW fashion

In [1]:
import scipy.sparse as sp
import pandas as p
import numpy as np
import nltk
import csv
import re

from nltk.corpus import stopwords as sw
from nltk import pos_tag
from nltk.stem import PorterStemmer

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report as clsr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix as cm

In [2]:
train = p.read_csv('../sub_obj_data/train_new.csv', usecols=(['class', 'text'])).dropna()
test  = p.read_csv('../sub_obj_data/test_ds.csv', usecols=(['class', 'text'])).dropna()
train = train.reindex(np.random.permutation(train.index))

# Preprocessor class
Helper class which tokenises tweets and creates additional features

In [3]:
from __future__ import unicode_literals, division

class Preprocessor:
    """Tweet tokenisor and feature extractor.

       Attributes:
           feats (dict of lists): Contains counts for lexicon features.

    """
    def __init__(self):
        self.stopwords = list(sw.words('english'))
        self.word_re = word_re
        self.emoticon_re = emoticon_re
        self.url_re = url_re
        self.rep_char_re = rep_char_re
        self.hashtag_re = hashtag_re
        self.user_tag_re = user_tag_re
        self.lexicon = self._load_lexicon()
        self.stemmer = PorterStemmer()
        self.feats = {'pos': [], 'neg': []}

    def tokenise(self, tweet, pos_tags=False):
        """Tweet tokenisor method.

            Args:
                tweet (str): Text of a tweet.
                lexicon_feats (bool): Whether to include lexicon
                    features or not.

            Returns:
                Returns list of str.
        """
        tokens = self.word_re.findall(tweet)
        return self._normalise(tokens, pos_tags)

    def reset_feats(self):
        """Re-initialises the feats attribute."""
        self.feats = {k: [] for k, v in self.feats.iteritems()}

    def normalise_vect(self):
        """Normalises each value in the feats attribute."""
        max_val = 0
        for v in self.feats.itervalues():
            temp = max(map(lambda x: x[0], v))
            if temp > max_val:
                max_val = temp

        for k, v in self.feats.iteritems():
            self.feats[k] = map(lambda x: [0] if x[0] == 0 else [(x[0] / max_val) * 15], v)

    def _load_lexicon(self):
        with open('../app/data/lexicon.csv', 'rb') as f:
            reader = csv.reader(f)
            return dict((rows[2], rows[5]) for rows in reader)

    def _pos_tags(self, tokens):
        TAG_MAP = ["NN", "NNP", "NNS", "VBP", "VB", "VBD", 'VBG',
                   "VBN", "VBZ", "MD","UH", "PRP", "PRP$"]
        tags = pos_tag(tokens)
        return [tag[1] for tag in tags if tag[1] in TAG_MAP]

    def _normalise(self, tokens, pos_tags):

        token_list = []

        for t in tokens:
            # Ignore stopwords
            if t in self.stopwords:
                continue

            # lowercase all tokens except for emoticons
            if not self.emoticon_re.search(t):
                t = t.lower()

            # Normalise tokens
            t = self.rep_char_re.sub(r'\1', t)
            t = self.url_re.sub('_URL', t)
            t = self.hashtag_re.sub('_HASH', t)
            t = self.user_tag_re.sub('_USER', t)

            # Get token's stem and append it to the list
            token_list.append(self.stemmer.stem(t))

        # Get list of pos tags and append to token_list
        if pos_tags:
            tags = self._pos_tags(tokens)
            token_list = tags + token_list
        return token_list

    def _lexicon_lookup(self, tokens):
        # Initialise new 'row' to list
        for k, v in self.feats.iteritems():
            self.feats[k].append([0])

        idx = len(self.feats['pos']) - 1

        # Check if token is in lexicon dictionary.
        # If it is increment pos/neg feature count
        for t in tokens:
            t = t.lower()
            if t in self.lexicon:
                if self.lexicon[t] == '4':
                    self.feats['pos'][idx][0] += 1
                elif self.lexicon[t] == '0':
                    self.feats['neg'][idx][0] += 1

"""
    This file is based on the work of Christopher Potts.
    However, the file has been altered and extended for
    my purposes
    http://sentiment.christopherpotts.net/index.html
"""

emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""

regex_strings = (
    # Emoticons:
    emoticon_string,
    # HTML tags:
    r'<[^>]+>',
    # Twitter username:
    r'(?:@[\w_]+)',
    # Links
    r'http\S+',
    # Twitter hashtags:
    r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)',
    # Remaining word types:
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace
    """
)

word_re = re.compile(r'(%s)' % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
emoticon_re = re.compile(regex_strings[0], re.VERBOSE | re.I | re.UNICODE)
html_entity_digit_re = re.compile(r'&#\d+;')
html_entity_alpha_re = re.compile(r'&\w+;')
amp = "&amp;"
url_re = re.compile(r'http\S+')
rep_char_re = re.compile(r'(\w)\1{3,}')
hashtag_re = re.compile(r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)')
user_tag_re = re.compile(r'(?:@[\w_]+)')

# Train and evaluate models

In [4]:
class FeatureCombiner(object):

    def transform(self, X, pre):
        pre.normalise_vect()
        feats = X
        for k, v in pre.feats.iteritems():
            feats = np.c_[feats, np.array(v)]
        return feats

    def fit(self, X, y=None):
        return self


def build_and_evaluate(n_gram, min_df, max_df, norm, clf, pos_tags):

    def preprocess(s):
        return preprocessor.tokenise(s, pos_tags)

    X = train['text'].values
    y = train['class'].values
    X_test = test['text'].values
    y_test = test['class'].values

    # Initialise transformers/estimators
    preprocessor = Preprocessor()
    feat_comb = FeatureCombiner()
    vec = TfidfVectorizer(tokenizer=preprocess,
                          lowercase=False,
                          ngram_range=n_gram,
                          min_df=min_df,
                          max_df=max_df, 
                          norm=norm)  

    # Build model
    print("Building model")
    tfidf_matrix = vec.fit_transform(X)
    clf.fit(tfidf_matrix, y)

    # Evaluate on test set
    tfidf_matrix_test = vec.transform(X_test)
    y_pred = clf.predict(tfidf_matrix_test)

    print("Classification Report:\n")
    print np.mean(y_pred == y_test)
    print cm(y_test, y_pred)
    print(clsr(y_test, y_pred, target_names=['obj', 'sub']))

# Logistic Regression classifiers

## Logit with -
- tfidf, unigrams

In [5]:
clf = LogisticRegression()
n_gram=(1, 1)
model = build_and_evaluate(n_gram, 1, 0.8, 'l2', clf, False)

Building model
Classification Report:

0.665467625899
[[86 53]
 [40 99]]
             precision    recall  f1-score   support

        obj       0.68      0.62      0.65       139
        sub       0.65      0.71      0.68       139

avg / total       0.67      0.67      0.66       278



## Logit with -
- tfidf, bigrams

In [6]:
clf = LogisticRegression()
n_gram=(1, 2)
model = build_and_evaluate(n_gram, 1, 0.8, 'l2', clf, False)

Building model
Classification Report:

0.68345323741
[[97 42]
 [46 93]]
             precision    recall  f1-score   support

        obj       0.68      0.70      0.69       139
        sub       0.69      0.67      0.68       139

avg / total       0.68      0.68      0.68       278



## Logit with -
- tfidf, bigrams, pos tags

In [7]:
clf = LogisticRegression()
n_gram=(1, 2)
model = build_and_evaluate(n_gram, 1, 0.8, 'l2', clf, True)

Building model
Classification Report:

0.625899280576
[[102  37]
 [ 67  72]]
             precision    recall  f1-score   support

        obj       0.60      0.73      0.66       139
        sub       0.66      0.52      0.58       139

avg / total       0.63      0.63      0.62       278



# Naive Bayes classifiers

## Naive Bayes with -
- tfidf, unigrams

In [10]:
clf = MultinomialNB(alpha=0.9)
n_gram = (1, 1)
model = build_and_evaluate(n_gram , 1, 0.8, 'l2', clf, False)

Building model
Classification Report:

0.679856115108
[[ 86  53]
 [ 36 103]]
             precision    recall  f1-score   support

        obj       0.70      0.62      0.66       139
        sub       0.66      0.74      0.70       139

avg / total       0.68      0.68      0.68       278



## Naive Bayes with -
- tfidf, bigrams

In [11]:
clf = MultinomialNB(alpha=0.9)
n_gram = (1, 2)
model = build_and_evaluate(n_gram , 1, 0.8, 'l2', clf, False)

Building model
Classification Report:

0.697841726619
[[98 41]
 [43 96]]
             precision    recall  f1-score   support

        obj       0.70      0.71      0.70       139
        sub       0.70      0.69      0.70       139

avg / total       0.70      0.70      0.70       278



## Naive Bayes with -
- tfidf, bigrams, pos tags

In [12]:
clf = MultinomialNB(alpha=0.9)
n_gram = (1, 2)
model = build_and_evaluate(n_gram , 1, 0.8, 'l2', clf, True)

Building model
Classification Report:

0.651079136691
[[95 44]
 [53 86]]
             precision    recall  f1-score   support

        obj       0.64      0.68      0.66       139
        sub       0.66      0.62      0.64       139

avg / total       0.65      0.65      0.65       278



# SGD Classifiers (SVM)

## SGD with -
- tfidf, unigrams

In [16]:
clf = SGDClassifier()
n_gram=(1, 1)
model = build_and_evaluate(n_gram, 1, 0.8, 'l2', clf, False)

Building model
Classification Report:

0.604316546763
[[ 61  78]
 [ 32 107]]
             precision    recall  f1-score   support

        obj       0.66      0.44      0.53       139
        sub       0.58      0.77      0.66       139

avg / total       0.62      0.60      0.59       278



## SGD with -
- tfidf, bigrams

In [17]:
clf = SGDClassifier()
n_gram=(1, 2)
model = build_and_evaluate(n_gram, 1, 0.8, 'l2', clf, False)

Building model
Classification Report:

0.643884892086
[[85 54]
 [45 94]]
             precision    recall  f1-score   support

        obj       0.65      0.61      0.63       139
        sub       0.64      0.68      0.66       139

avg / total       0.64      0.64      0.64       278



## SGD with -
- tfidf, bigrams, pos tags

In [18]:
clf = SGDClassifier()
n_gram=(1, 2)
model = build_and_evaluate(n_gram, 1, 0.8, 'l2', clf, True)

Building model
Classification Report:

0.589928057554
[[92 47]
 [67 72]]
             precision    recall  f1-score   support

        obj       0.58      0.66      0.62       139
        sub       0.61      0.52      0.56       139

avg / total       0.59      0.59      0.59       278

