# Binary Classification of Subjective/Objective tweets
## Features: 
- POS tags + tfidf count in BoW style
- Binary features for character repetitions in tweet
- Binary features for acronym presence in tweet
- Binary features for presence of happy or sad emojis

In [1]:
import pandas as p
import numpy as np
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk import pos_tag

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report as clsr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix as cm

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cross_validation import train_test_split

In [2]:
train = p.read_csv('./sub_obj_data/train_new.csv', usecols=(['class', 'text'])).dropna()
test  = p.read_csv('./sub_obj_data/test_ds.csv', usecols=(['class', 'text'])).dropna()
train = train.reindex(np.random.permutation(train.index))
training_data, test_data, training_labels, test_labels = train_test_split(train['text'].values, train['class'].values, test_size=0.3, random_state=0) #70-30 split
print len(training_data), len(test_data)
# dev_data, eval_data, dev_labels, eval_labels = train_test_split(training_data, training_labels, test_size=0.5, random_state=0)
# print len(dev_data), len(eval_data)


4162 1784


# Preprocessor class
Helper class which tokenises tweets and includes pos tags and creates binary features from character repitions, presence of acronyms in a tweet and seperate binary features for happy and sad emojis.

In [6]:
from __future__ import unicode_literals, division
import re
import htmlentitydefs
import csv

from nltk.corpus import stopwords as sw
from nltk import pos_tag


class Preprocessor():

    def __init__(self):
        self.stopwords = list(sw.words('english'))
        self.feats = {'reps': [], 'acry': [], 'happy': [], 'sad': []}
        self.emoji_happy = self.emoji_happy()
        self.emoji_sad = self.emoji_sad()
        self.emoji_dic = self.load_emoji_dict()
        self.word_re = word_re
        self.emoticon_re = emoticon_re
        self.html_entity_digit_re = html_entity_digit_re
        self.html_entity_alpha_re = html_entity_alpha_re
        self.amp = amp
        self.punct_re = punct_re
        self.negation_re = negation_re
        self.url_re = url_re
        self.rep_char_re = rep_char_re
        self.hashtag_re = hashtag_re
        self.user_tag_re = user_tag_re
        self.acrynoms = self.load_acrynoms()
        self.stemmer = PorterStemmer()

    def load_acrynoms(self):
        with open('./data/acrynom.csv', 'rb') as f:
            reader = csv.reader(f)
            slang = dict((rows[0], rows[1]) for rows in reader)
            return slang
        
    def load_emoji_dict(self):
        with open('./data/emoji_dict.csv', 'rb') as f:
            reader = csv.reader(f)
            return dict((rows[0], int(rows[1])) for rows in reader)
        
    def pos_tags(self, tokens):
        TAG_MAP = [ "NN", "NNP", "NNS", "VBP", "VB", "VBD", 'VBG', "VBN",
                    "VBZ", "MD","UH", "PRP", "PRP$"]
        tags = pos_tag(tokens)
        return [tag[1] for tag in tags if tag[1] in TAG_MAP]

    
    def reset_feats(self):
        self.feats = {k: [] for k, v in self.feats.iteritems()}

    def normalise(self, tokens):
        
        vect = []
        
        for t in tokens:

            if t in string.punctuation or t in self.stopwords:
                continue
                
            if not self.emoticon_re.search(t):
                t = t.lower()       
            
#             if t in self.acrynoms:
#                 b = t 
#                 t = self.acrynoms[b]
#             if self.rep_char_re.search(t):
#                 vect.append('_REPS')
            
            t = self.rep_char_re.sub(r'\1', t)
            t = self.url_re.sub('_URL', t)
            t = self.hashtag_re.sub('_HASH', t)
            t = self.user_tag_re.sub('_USER', t)
            
            vect.append(self.stemmer.stem(t))

        tags = self.pos_tags(tokens)
        vect = tags + vect
        return vect
    
    def tokenise(self, tweet):
        tweet = self.__html2unicode(tweet)
        tokens = self.word_re.findall(tweet)
#         self.caps_intensifier(tokens)
        self.aryonms(tokens)
        self.happy(tokens)
        self.sad(tokens)
        self.char_repititions(tokens)
        return self.normalise(tokens)
    
    def append_binary_feats(self, intensify, feat):
        if intensify:
            self.feats[feat].append([1])
        else:
            self.feats[feat].append([0])
    
    def happy(self, tokens):
        happy = any(self.emoji_happy.search(word) for word in tokens)
        self.append_binary_feats(happy, 'happy')
    
    def sad(self, tokens):
        sad = any(self.emoji_sad.search(word) for word in tokens)
        self.append_binary_feats(sad, 'sad')
    
    def aryonms(self, tokens):
        acrs = any(word in self.acrynoms for word in tokens)
        self.append_binary_feats(acrs, 'acry')
        
    def char_repititions(self, tokens):
        reps = any(self.rep_char_re.search(word) for word in tokens)
        self.append_binary_feats(reps, 'reps')

    def caps_intensifier(self, tokens):
        caps = any(self.word_has_all_caps(word) for word in tokens)
        self.append_binary_feats(caps, 'caps')

    def word_has_all_caps(self, token):
        if (self.emoticon_re.search(token)
            or self.punct_re.match(token)
                or self.has_num(token)):
            return False

        if (token.upper() == token
            and (token != 'I'
                 and token != 'A')):
            return True

        return False

    def has_num(self, s):
        return any(i.isdigit() for i in s)
    
    def emoji_happy(self):
        try:
            return re.compile(u'['
                              u'\U0001f600-\U0001F60F'
                              u'\U0001F617-\U0001F61D'
                              u'\U0001F638-\U0001F63D'
                              ']+', re.UNICODE)
        except re.error:
            return re.compile(u'('
                              u'\ud83d[\ude00-\ude0f]|'
                              u'\ud83d[\ude17-\ude1d]|'
                              u'\ud83d[\ude38-\ude3d]'
                              ')+', re.UNICODE)

    def emoji_sad(self):
        try:
            return re.compile(u'['
                              u'\U0001F612-\U0001F616'
                              u'\U0001F61E-\U0001F62B'
                              u'\U0001F63E-\U0001F63F'
                              ']+', re.UNICODE)
        except re.error:
            return re.compile(u'('
                              u'\ud83d[\ude12-\ude16]|'
                              u'\ud83d[\ude1e-\ude2b]|'
                              u'\ud83d[\ude3e-\ude3f]'
                              ')+', re.UNICODE)

    def ensure_unicode(self, tweet):
        try:
            return unicode(tweet)
        except UnicodeDecodeError:
            tweet = str(tweet).encode('string_escape')
            return unicode(tweet)

    def __html2unicode(self, s):
        """
        This function is curtosy of Christopher Potts
        http://sentiment.christopherpotts.net/index.html
        Internal metod that seeks to replace all the HTML entities in
        s with their corresponding unicode characters.
        """
        # First the digits:
        ents = set(self.html_entity_digit_re.findall(s))
        if len(ents) > 0:
            for ent in ents:
                entnum = ent[2:-1]
                try:
                    entnum = int(entnum)
                    s = s.replace(ent, unichr(entnum))
                except:
                    pass
        # Now the alpha versions:
        ents = set(self.html_entity_alpha_re.findall(s))
        ents = filter((lambda x: x != amp), ents)
        for ent in ents:
            entname = ent[1:-1]
            try:
                s = s.replace(ent,
                              unichr(htmlentitydefs.name2codepoint[entname]))
            except:
                pass
            s = s.replace(self.amp, " and ")
        return s

    
"""
    This file is based on the work of Christopher Potts
    however the file has been altered and extended for
    my purposes
    http://sentiment.christopherpotts.net/index.html
"""
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""

# The components of the tokenizer:
regex_strings = (
    # Phone numbers:
    r""""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?
      \d{3}          # exchange
      [\-\s.]*
      \d{4}          # base
    )""",
    # Emoticons:
    emoticon_string,
    # HTML tags:
    r'<[^>]+>',
    # Twitter username:
    r'(?:@[\w_]+)',
    # Links
    r'http\S+',
    # Twitter hashtags:
    r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)',
    # Remaining word types:
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                    # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace
    """
    )

negation_words = (
    """
    (?x)(?:
    ^(?:never|no|nothing|nowhere|noone|none|not|
        havent|hasnt|hadnt|cant|couldnt|shouldnt|
        wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
     )$
    )
    |
    n't
    """
    )

# ######################################################################

word_re = re.compile(r'(%s)' % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)
html_entity_digit_re = re.compile(r'&#\d+;')
html_entity_alpha_re = re.compile(r'&\w+;')
amp = "&amp;"
punct_re = re.compile("^[.:;!?]$")
negation_re = re.compile(negation_words)
url_re = re.compile(r'http\S+')
rep_char_re = re.compile(r'(\w)\1{3,}')
hashtag_re = re.compile(r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)')
user_tag_re = re.compile(r'(?:@[\w_]+)')

# Train and evaluate models

In [4]:
import scipy.sparse as sp

class FeatureCombiner(object):

    def transform(self, X, pre):
#         pre.normalise_vect()
        feats = X
        for k, v in pre.feats.iteritems():
            feats = np.c_[feats, np.array(v)]
        return feats

    def fit(self, X, y=None):
        return self


def build_and_evaluate(X, y, X_test, y_test, n_gram, min_df, max_df, norm, clf, outpath=None):

    def preprocess(s):
        return preprocessor.tokenise(s)

    # Initialise transformers/estimators
    preprocessor = Preprocessor()
    feat_comb = FeatureCombiner()
    vec = TfidfVectorizer(tokenizer=preprocess,
                          lowercase=False,
                          ngram_range=n_gram,
                          min_df=min_df,
                          max_df=max_df, 
                          norm=norm)
#                           max_features=5000,
#                           ngram_range=(1, 3))
    

    # Build model
    print("Building model")
    tfidf_matrix = vec.fit_transform(X)
#     feat_matrix = feat_comb.transform(tfidf_matrix.todense(),
#                                       preprocessor)
    clf.fit(tfidf_matrix, y)

    # Evaluate on test set
    preprocessor.reset_feats()
    tfidf_matrix_test = vec.transform(X_test)
#     feat_matrix_test = feat_comb.transform(tfidf_matrix_test.todense(),
#                                            preprocessor)
    y_pred = clf.predict(tfidf_matrix_test)

    print("Classification Report:\n")
    print np.mean(y_pred == y_test)
    print cm(y_test, y_pred)
    print(clsr(y_test, y_pred, target_names=['obj', 'sub']))

# Logistical Regression classifier

In [7]:
clf = LogisticRegression(C=7)
n_gram=(1, 2)
model = build_and_evaluate(training_data, training_labels, test_data, test_labels, n_gram, 1, 0.8, 'l2', clf)

Building model
Classification Report:

0.914798206278
[[847  37]
 [115 785]]
             precision    recall  f1-score   support

        obj       0.88      0.96      0.92       884
        sub       0.95      0.87      0.91       900

avg / total       0.92      0.91      0.91      1784



# Naive Bayes

In [8]:
clf = MultinomialNB()
n_gram = (1, 1)
model = build_and_evaluate(training_data, training_labels, test_data, test_labels, n_gram , 1, 0.8, 'l2', clf)

Building model
Classification Report:

0.913677130045
[[802  82]
 [ 72 828]]
             precision    recall  f1-score   support

        obj       0.92      0.91      0.91       884
        sub       0.91      0.92      0.91       900

avg / total       0.91      0.91      0.91      1784



# Linear SVC classifier

In [None]:
clf = LinearSVC(C=5)#, penalty='l1', dual=False)
n_gram=(1, 2)
model = build_and_evaluate(training_data, training_labels, test_data, test_labels, n_gram, 1, 0.8, 'l2', clf)

# SGD Classifier

In [None]:
clf = SGDClassifier()
n_gram=(1, 2)
model = build_and_evaluate(training_data, training_labels, test_data, test_labels, n_gram, 1, 0.8, 'l2', clf)

# Logistic Regression with seperate test set

In [10]:
clf = LogisticRegression(C=7)
n_gram=(1, 1)
model = build_and_evaluate(train['text'].values, train['class'].values,
                           test['text'].values, test['class'].values,
                           n_gram, 1, 0.8, 'l2', clf)

Building model
Classification Report:

0.604316546763
[[79 60]
 [50 89]]
             precision    recall  f1-score   support

        obj       0.61      0.57      0.59       139
        sub       0.60      0.64      0.62       139

avg / total       0.60      0.60      0.60       278



In [None]:

slang = {}
with open('./data/emoji_dict.csv', 'rb') as f:
    reader = csv.reader(f)
    slang = dict((rows[0], int(rows[1])) for rows in reader)
    
print slang

## remove punct from pos_tags

In [None]:
to = Preprocessor()

tweet = '@test HA helllllo i is ! i\'ve his playing biggestttt'

t = rep_char_re.match(tweet)
t = to.tokenise(tweet.decode('utf-8'))
print t


In [None]:
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']

In [None]:
model = build_and_evaluate(train['text'].values, train['class'].values, test['text'].values, test['class'].values)

In [None]:
model = build_and_evaluate(train['text'].values, train['class'].values, test['text'].values, test['class'].values)

In [None]:
# 