In [1]:
import pandas as p
import numpy as np
import codecs
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.util import ngrams


from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report as clsr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix as cm

p.options.display.max_colwidth = 150

In [2]:
train = p.read_csv('./sub_obj_data/train_new.csv', usecols=(['class', 'text'])).dropna()
test  = p.read_csv('./sub_obj_data/test_ds.csv', usecols=(['class', 'text'])).dropna()
train = train.reindex(np.random.permutation(train.index))

In [22]:
from __future__ import unicode_literals, division
import re
import htmlentitydefs
import csv

from nltk.corpus import stopwords as sw
from nltk import pos_tag


class Preprocessor():

    def __init__(self, corpus):
        self.stopwords = sw.words('english')
        self.word_re = word_re
        self.emoticon_re = emoticon_re
        self.html_entity_digit_re = html_entity_digit_re
        self.html_entity_alpha_re = html_entity_alpha_re
        self.amp = amp
        self.punct_re = punct_re
        self.negation_re = negation_re
        self.url_re = url_re
        self.rep_char_re = rep_char_re
        self.hashtag_re = hashtag_re
        self.user_tag_re = user_tag_re
        self.top_bigrams = self.get_bigrams(corpus)
        
    def helper(self, tweet):
        tweet = self.__html2unicode(tweet)
        tokens = self.word_re.findall(tweet)
        vect = []
        for t in tokens:
            if t in self.stopwords or t in string.punctuation:
                continue
            vect.append(t)
        
        return vect
        
    def get_bigrams(self, corpus):
        tokens = [] #[token for sublist in corpus for token in sublist]
        for tweet in corpus:
            tokens.append(self.helper(tweet.decode('utf-8').lower()))
        tokens = [val for sublist in tokens for val in sublist]
        
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(tokens)
        finder.apply_freq_filter(5)
        return finder.nbest(bigram_measures.likelihood_ratio, 1000)
        

    def load_acrynoms(self):
        with open('../data/acrynom.csv', 'rb') as f:
            reader = csv.reader(f)
            slang = dict((rows[0], rows[1]) for rows in reader)
            return slang


    def normalise(self, tokens):
        vect = []
        
        bigrams = ngrams(tokens, 2)
        
#         for bigram in bigrams:
#             bigram_lower = (bigram[0].lower(), bigram[1].lower())
#             if bigram_lower in self.top_bigrams:
#                 if bigram_lower[0] not in self.stopwords or bigram_lower[1] not in self.stopwords :
#                     vect.append(bigram_lower[0] + '_' + bigram_lower[1])
        
        
        for t in tokens:
#             if (t in self.stopwords and
#                     not self.negation_re.match(t)):
#                 continue
            if t in self.stopwords or t in string.punctuation:
                continue
            
            if not self.emoticon_re.search(t):
                t = t.lower()
            
            t = self.rep_char_re.sub(r'\1', t)
            t = self.url_re.sub('_URL', t)
            t = self.hashtag_re.sub('_HASH', t)
            t = self.user_tag_re.sub('_USER', t)
            
            vect.append(t)
        
        return vect

    def tokenise(self, tweet):
        tweet = self.__html2unicode(tweet)
        tokens = self.word_re.findall(tweet)
        return self.normalise(tokens)

    def ensure_unicode(self, tweet):
        try:
            return unicode(tweet)
        except UnicodeDecodeError:
            tweet = str(tweet).encode('string_escape')
            return unicode(tweet)

    def __html2unicode(self, s):
        """
        This function is curtosy of Christopher Potts
        http://sentiment.christopherpotts.net/index.html
        Internal metod that seeks to replace all the HTML entities in
        s with their corresponding unicode characters.
        """
        # First the digits:
        ents = set(self.html_entity_digit_re.findall(s))
        if len(ents) > 0:
            for ent in ents:
                entnum = ent[2:-1]
                try:
                    entnum = int(entnum)
                    s = s.replace(ent, unichr(entnum))
                except:
                    pass
        # Now the alpha versions:
        ents = set(self.html_entity_alpha_re.findall(s))
        ents = filter((lambda x: x != amp), ents)
        for ent in ents:
            entname = ent[1:-1]
            try:
                s = s.replace(ent,
                              unichr(htmlentitydefs.name2codepoint[entname]))
            except:
                pass
            s = s.replace(self.amp, " and ")
        return s

    
"""
    This file is based on the work of Christopher Potts
    however the file has been altered and extended for
    my purposes
    http://sentiment.christopherpotts.net/index.html
"""
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""

# The components of the tokenizer:
regex_strings = (
    # Phone numbers:
    r""""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?
      \d{3}          # exchange
      [\-\s.]*
      \d{4}          # base
    )""",
    # Emoticons:
    emoticon_string,
    # HTML tags:
    r'<[^>]+>',
    # Twitter username:
    r'(?:@[\w_]+)',
    # Links
    r'http\S+',
    # Twitter hashtags:
    r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)',
    # Remaining word types:
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                    # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace
    """
    )

negation_words = (
    """
    (?x)(?:
    ^(?:never|no|nothing|nowhere|noone|none|not|
        havent|hasnt|hadnt|cant|couldnt|shouldnt|
        wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
     )$
    )
    |
    n't
    """
    )

# ######################################################################

word_re = re.compile(r'(%s)' % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)
html_entity_digit_re = re.compile(r'&#\d+;')
html_entity_alpha_re = re.compile(r'&\w+;')
amp = "&amp;"
punct_re = re.compile("^[.:;!?]$")
negation_re = re.compile(negation_words)
url_re = re.compile(r'http\S+')
rep_char_re = re.compile(r'(\w)\1{3,}')
hashtag_re = re.compile(r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)')
user_tag_re = re.compile(r'(?:@[\w_]+)')

In [14]:
def build_and_evaluate(X, y, X_test, y_test, outpath=None):

    def preprocess(s):
        return preprocessor.tokenise(s)

    # Initialise transformers/estimators
    # clf = MultinomialNB()
#     clf = SGDClassifier()
    clf = LogisticRegression()
    preprocessor = Preprocessor(X_test)
    vec = TfidfVectorizer(tokenizer=preprocess,
                          lowercase=False,
                          ngram_range=(1, 1),
                          max_features=5000)

    # Build model
    print("Building model")
    matrix = vec.fit_transform(X)
    clf.fit(matrix, y)

    # Evaluate on test set
    matrix = vec.transform(X_test)
    y_pred = clf.predict(matrix)

    print("Classification Report:\n")
    print np.mean(y_pred == y_test)
    print cm(y_test, y_pred)
    print(clsr(y_test, y_pred, target_names=['obj', 'sub']))

In [23]:
model = build_and_evaluate(train['text'].values, train['class'].values, test['text'].values, test['class'].values)

Building model
Classification Report:

0.672661870504
[[ 85  54]
 [ 37 102]]
             precision    recall  f1-score   support

        obj       0.70      0.61      0.65       139
        sub       0.65      0.73      0.69       139

avg / total       0.68      0.67      0.67       278



## make corpus of sentances

In [21]:
model = build_and_evaluate(train['text'].values, train['class'].values, test['text'].values, test['class'].values)

Building model
Classification Report:

0.676258992806
[[ 85  54]
 [ 36 103]]
             precision    recall  f1-score   support

        obj       0.70      0.61      0.65       139
        sub       0.66      0.74      0.70       139

avg / total       0.68      0.68      0.67       278



In [6]:
# def get_bigrams(myString):
#     tokenizer = Preprocessor()
#     tokens = tokenizer.tokenise(myString)
#     stemmer = PorterStemmer()
#     bigram_finder = BigramCollocationFinder.from_words(tokens)
#     bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)

#     for bigram_tuple in bigrams:
#         x = "%s %s" % bigram_tuple
#         tokens.append(x)

#     return ['_'.join([stemmer.stem(w).lower() for w in x.split() if w not in string.punctuation]) for x in tokens ]

# get_bigrams('@some royal flying rumble basketball edition, YAAY!')

In [12]:
tokenizer = Preprocessor(train['text'])
s = tokenizer.tokenise('3 hours to midnight @hel')
print s

[u'3_hours', u'3', u'hours', u'midnight', u'_USER']


In [13]:
tokenizer.top_bigrams

[(u'twitter', u'com'),
 (u'new', u'york'),
 (u'last', u'night'),
 (u'amanda', u'knox'),
 (u"i'm", u'going'),
 (u'pic', u'twitter'),
 (u'michael', u'jackson'),
 (u'real', u'madrid'),
 (u'\xef', u'\xbf'),
 (u'tampa', u'bay'),
 (u'nuit', u'blanche'),
 (u'footymad', u'attempt'),
 (u"can't", u'wait'),
 (u'droid', u'bionic'),
 (u'constitution', u'hall'),
 (u'jason', u'vernau'),
 (u'attempt', u'assist'),
 (u'abu', u'dhabi'),
 (u'daily', u'zap'),
 (u'fu', u'panda'),
 (u'andy', u'rooney'),
 (u'port', u'abu'),
 (u'kung', u'fu'),
 (u'bayer', u'leverkusen'),
 (u'pj', u'harvey'),
 (u'assist', u'form'),
 (u'brooklyn', u'bridge'),
 (u'good', u'morning'),
 (u':/', u'www'),
 (u'tony', u'romo'),
 (u'trayvon', u'martin'),
 (u'form', u'guide'),
 (u'subtitles', u'tentatively'),
 (u'yushin', u'okami'),
 (u'saturday', u'night'),
 (u'2d', u'3d'),
 (u"don't", u'know'),
 (u'ghost', u'rider'),
 (u'monday', u'night'),
 (u'indonesian', u'subtitles'),
 (u'windows', u'7'),
 (u'arrival', u'departure'),
 (u'fm', u'pre

In [9]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
text = "Hi How are you? i am fine and you"
token=nltk.word_tokenize(text)
bigrams=ngrams(token,2)

In [10]:
print token
for b in bigrams:
    print b

[u'Hi', u'How', u'are', u'you', u'?', u'i', u'am', u'fine', u'and', u'you']
(u'Hi', u'How')
(u'How', u'are')
(u'are', u'you')
(u'you', u'?')
(u'?', u'i')
(u'i', u'am')
(u'am', u'fine')
(u'fine', u'and')
(u'and', u'you')
