In [3]:
import json
from collections import Counter
from random import shuffle
from sklearn import linear_model
import nltk
import os
import statistics
nltk.download('averaged_perceptron_tagger')

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.vq import whiten

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/himankyadav/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
class AuthorshipMasterClassifier(object):

    def __init__(self, author_path, rando_path):
        with open(author_path, mode='r') as f:
            text = f.read()
            self.author_text = json.loads(text)
        shuffle(self.author_text)
        #randomly get samples
        self.rando_text = []
        #file_list = os.listdir(rando_path)
        file_list = ['_vargas_.data', '-eDgaR-.data', 'Abe_lincolin.data',
                     'Ambiguously_Ironic.data', 'anutensil.data', 'APOSTOLATE.data',
                    'awildsketchappeared.data', 'axolotl_peyotl.data','boib.data']
        """
        'DaedalusMinion.data',
                    'Donald_Keyman.data', 'Elaus.data', 'dick-nipples.data', 'IAmTheRedWizards.data'
        """
        shuffle(file_list)
        for filename in file_list:
            real_path = rando_path + filename
            if filename.startswith('.') or real_path == author_path:
                continue
            with open(real_path, mode='r') as f:
                text = f.read()
                text_json = json.loads(text)
                shuffle(text_json)
                self.rando_text.extend(text_json[:len(self.author_text)/20])
            if len(self.rando_text) >= len(self.author_text):
                break
        shuffle(self.rando_text)
        self.subclassifier_list = []
        self._train_test_split(0.7)

    def _train_test_split(self, split):
        self.train_samples = [ (comment, 1) for comment in self.author_text[:int(len(self.author_text) * split)]]
        self.train_samples.extend([(comment, 0) for comment in self.rando_text[:int(len(self.rando_text) * split)]])
        self.test_samples = [ (comment, 1) for comment in self.author_text[int(len(self.author_text) * split):]]
        self.test_samples.extend([(comment, 0) for comment in self.rando_text[int(len(self.rando_text) * split):]])
        shuffle(self.train_samples)
        shuffle(self.test_samples)

    def add_subclassifier(self, slave):
        slave.initialize(self.author_text, self.rando_text)
        slave.build_model(self.train_samples)
        self.subclassifier_list.append(slave)
        return self

    def predict(self, comment):
        one_counts = 0
        zero_counts = 0
        for slave in self.subclassifier_list:
            result = slave.predict(comment)
            if result == 1:
                one_counts += 1
            elif result == 0:
                zero_counts += 1
            else:
                raise Exception("Sad")
        return int(one_counts > zero_counts)
    
    def test(self):
        correct = 0.0
        for comment, expected in self.test_samples:
            predicted = self.predict(comment)
            if predicted == expected:
                correct += 1
        return correct / len(self.test_samples)

In [5]:
class BagOfWordsClassifier(object):
    def __init__(self, top_most=500):
        self.tokenizer = nltk.tokenize.casual.TweetTokenizer()
        self.logreg = linear_model.LogisticRegression()
        self.top_most = top_most
    def initialize(self, author_text, rando_text):
        author_bag = Counter()
        for comment in author_text:
            for word in self.tokenizer.tokenize(comment):
                author_bag[word] += 1
        self.feature_set = set(word for word, count in author_bag.most_common(self.top_most))

    def build_feature_row(self, comment):
        row = [0] * (len(self.feature_set))
        comment_set = set(self.tokenizer.tokenize(comment))
        for i, word in enumerate(self.feature_set):
            if word in comment_set:
                row[i] = 1
        return row
        
    def build_model(self, train_samples):
        X = []
        Y = []
        for comment, expected in train_samples:
            X.append(self.build_feature_row(comment))
            Y.append(expected)
        self.logreg.fit(X, Y)
    def predict(self, comment):
        X = [self.build_feature_row(comment)]
        return self.logreg.predict(X)

In [6]:
class CharacterNGramsClassifier(object):
    def __init__(self, n=2, top_most=500):
        self.n = n
        self.top_most = top_most
        self.logreg = linear_model.LogisticRegression()

    def initialize(self, author_text, rando_text):
        author_bag = Counter()
        for comment in author_text:
            for gram in nltk.ngrams(comment, self.n):
                author_bag[gram] += 1
        self.feature_set = {gram for gram, freq in author_bag.most_common(self.top_most)}

    def build_feature_row(self, comment):
        row = [0] * (len(self.feature_set))
        ngram_set = set(nltk.ngrams(comment, self.n))
        for i, gram in enumerate(self.feature_set):
            if gram in ngram_set:
                row[i] = 1
        return row

    def build_model(self, train_samples):
        X = []
        Y = []
        for comment, expected in train_samples:
            X.append(self.build_feature_row(comment))
            Y.append(expected)
        self.logreg.fit(X, Y)

    def predict(self, comment):
        X = [self.build_feature_row(comment)]
        return self.logreg.predict(X)

In [7]:
class PartOfSpeechClassifier(object):
    def __init__(self, n=2, top_most=400):
        self.tokenizer = nltk.tokenize.casual.TweetTokenizer()
        self.n = n
        self.top_most = top_most
        self.logreg = linear_model.LogisticRegression()

    def initialize(self, author_text, rando_text):
        author_pos = []
        for comment in author_text:
            tokens = self.tokenizer.tokenize(comment)
            pos = nltk.pos_tag(tokens)
            author_pos.append([tag for w, tag in pos])
        author_counter = Counter()
        for pos in author_pos:
            for gram in nltk.ngrams(pos, self.n):
                author_counter[gram] += 1
        self.feature_set = {pos for pos, freq in author_counter.most_common(self.top_most)}

    def build_feature_row(self, comment):
        row = [0] * (len(self.feature_set))
        tokens = self.tokenizer.tokenize(comment)
        pos = nltk.pos_tag(tokens)
        ngram_set = set(nltk.ngrams(pos, self.n))
        for i, gram in enumerate(self.feature_set):
            if gram in ngram_set:
                row[i] = 1
        return row

    def build_model(self, train_samples):
        X = []
        Y = []
        for comment, expected in train_samples:
            X.append(self.build_feature_row(comment))
            Y.append(expected)
        self.logreg.fit(X, Y)

    def predict(self, comment):
        X = [self.build_feature_row(comment)]
        return self.logreg.predict(X)

In [8]:
# PARAMs TO TWEAK
class ShortMessageVerificationClassifier(object):
    def __init__(self, NUM_BUCKET = 10, SPLIT_PERCENTAGE = 0.8, GAMMA = 1, NGRAM = 4):
        self.NUM_BUCKET = NUM_BUCKET
        self.SPLIT_PERCENTAGE = SPLIT_PERCENTAGE
        self.GAMMA = GAMMA
        self.NGRAM = NGRAM
        self.word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        self.OTHER_USERS = ['Thehealeroftri.data', 'User_Name13.data', 'maxwellhill.data', 
                       'illuminatedwax.data', 'Rlight.data', 'straydog1980.data', 'themightiestduck.data', 
                       'qgyh2.data', 'BritishEnglishPolice.data', 'IAmAN00bie.data', 'manbra.data', 
                       'MaiaNyx.data', 'nix0n.data', 'Jux_.data', '_vargas_.data', '-eDgaR-.data', 
                       'Abe_lincolin.data', 'Ambiguously_Ironic.data', 'anutensil.data', 'APOSTOLATE.data',
                    'awildsketchappeared.data', 'axolotl_peyotl.data','boib.data']
        self.NUM_OTHERS = len(self.OTHER_USERS)
        
    def initialize(self, author_text, rando_text):
        pass
        
    def split_comments(self, comments):
        total_comments = len(comments)
        split1 = comments[:int(total_comments * self.SPLIT_PERCENTAGE)]
        split2 = comments[int(total_comments * self.SPLIT_PERCENTAGE):]
        return (split1, split2)
            
    def build_model(self, train_samples):
        # DECIDE TRANINING SAMPLE FOR SAHIL
        author_train_samples = [comment for (comment, expected) in train_samples if expected]
        self.sahil_split1, self.sahil_split2 = self.split_comments(author_train_samples)
        # Data of other sample users
        others = []
        for user in self.OTHER_USERS:
            with open('./data/' + user, 'r') as f:
                other = f.read()
                other = json.loads(other)
                others.append(other)
                
        other_blocks_ru_values = []
        
        for other_comments in others:
            other_comments1, other_comments2 = self.split_comments(other_comments)
            partitions = self.parition_ngram(other_comments2)
            ru_values = [self.calculate_ru_block(n_gram_block) for n_gram_block in partitions]
            other_blocks_ru_values.append(ru_values)

        ru_values_of_all_blocks = [self.get_ru_for_user_block(block) for block in xrange(self.NUM_BUCKET)]
        sample_mean = statistics.mean(ru_values_of_all_blocks)
        sample_variance = statistics.variance(ru_values_of_all_blocks)
        sample_std_dev = statistics.stdev(ru_values_of_all_blocks)
        self.threshold = self.train_for_user_threshold(sample_mean, sample_variance, ru_values_of_all_blocks, other_blocks_ru_values)
                
    def get_all_ngrams(self, comments):
        # returns the set of ngrams given 
        all_text = " ".join(comments)
        words = self.word_tokenizer.tokenize(all_text.lower())
        n_grams = set(nltk.ngrams(" ".join(words), self.NGRAM))
        return n_grams

    def parition_ngram(self, comments):
        # returns ngrams of buckets of text
        all_text = " ".join(comments)
        words = self.word_tokenizer.tokenize(all_text.lower())
        buckets = [words[i::self.NUM_BUCKET] for i in xrange(self.NUM_BUCKET)]
        n_gram_buckets = [self.get_all_ngrams(bucket) for bucket in buckets]
        return n_gram_buckets

    def get_ngram_for_user_block(self, block):
        # Returns ngram given a user block
        assert  block < self.NUM_BUCKET
        return self.parition_ngram(self.sahil_split2)[block]

    def calculate_ru_block(self, n_gram_block):
        # calculates percentage of unique n-gram models for given block
        num = len(set.intersection(n_gram_block, self.get_all_ngrams(self.sahil_split1)))
        den = len(n_gram_block)
        return num / float(den)

    def get_ru_for_user_block(self, block):
        # calculates percentage of unique n-gram models for a block of user data
        assert block < self.NUM_BUCKET
        n_gram_block = self.get_ngram_for_user_block(block)
        return self.calculate_ru_block(n_gram_block)
    
    def train_for_user_threshold(self, sample_mean, sample_std_dev, ru_values_of_all_blocks, other_blocks_ru_values):
        up = False
        down = False
        delta = 1
        threshold = sample_mean - (sample_std_dev / 2.0)
        while delta > 0.0001:
            FAR, FRR = self.calculate(threshold, ru_values_of_all_blocks, other_blocks_ru_values)
            if FRR - FAR > 0:
                down = True
                threshold -= delta
            if FAR - FRR > 0:
                up = True
                threshold += delta
            if up and down:
                up = False
                down = False
                delta = delta / 10.0
        return threshold
    
    def calculate(self, threshold, ru_values_of_all_blocks, other_blocks_ru_values):
        FA, FR = 0.0, 0.0
        threshold_gamma_sum = threshold + self.GAMMA
        for i in xrange(self.NUM_BUCKET):
            if ru_values_of_all_blocks[i] < threshold_gamma_sum:
                FR += 1
        FRR = FR / float(self.NUM_BUCKET)
        for k in xrange(self.NUM_OTHERS):       
            for j in xrange(self.NUM_BUCKET):
                if other_blocks_ru_values[k][j] >= threshold_gamma_sum:
                    FA += 1
        FAR = FA / float(self.NUM_BUCKET * self.NUM_OTHERS)
        if FAR == FRR:
            FRR += 0.00001
        return (FAR, FRR)
    
    def predict(self, comment):
        ngrams = self.get_all_ngrams([comment])
        if not ngrams:
            return 0
        return 1 if self.calculate_ru_block(ngrams) > (self.threshold + self.GAMMA) else 0    

In [None]:
class LexicalKMeansClustering():
    def __init__(self):
        self.sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        self.word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        
    def initialize(self, author_text, rando_text):
        pass
    
    def build_model(self, train_samples):
        # print pacling.classify('\n'.join(sahil), '\n'.join(sahil))
        comments = [comment for (comment, expected) in train_samples if expected]
        self.comments = filter(lambda k: k != "", comments)

        fvs_lexical = np.zeros((len(comments), 3), np.float64)
        fvs_punct = np.zeros((len(comments), 3), np.float64)
        for e, ch_text in enumerate(comments):
            tokens = nltk.word_tokenize(ch_text.lower())
            words = self.word_tokenizer.tokenize(ch_text.lower())
            sentences = self.sentence_tokenizer.tokenize(ch_text)
            vocab = set(words)
            words_per_sentence = np.array([len(self.word_tokenizer.tokenize(s)) for s in sentences])

            fvs_lexical[e, 0] = words_per_sentence.mean()
            fvs_lexical[e, 1] = words_per_sentence.std()
            if words:
                fvs_lexical[e, 2] = len(vocab) / float(len(words))
            if sentences:
                fvs_punct[e, 0] = tokens.count(',') / float(len(sentences))
                fvs_punct[e, 1] = tokens.count(';') / float(len(sentences))
                fvs_punct[e, 2] = tokens.count(':') / float(len(sentences))
    
        # apply whitening to decorrelate the features
        fvs_lexical = whiten(fvs_lexical)
        fvs_punct = whiten(fvs_punct)
        feature_sets = list((np.nan_to_num(fvs_lexical), np.nan_to_num(fvs_punct)))
        self.kmeans_cluster = self.predict_authors(feature_sets[0])
        results = self.kmeans_cluster.labels_
        self.sahil = 1 if sum(results) > len(results)/2 else 0
        
    def predict_authors(self, fvs):
        km = KMeans(n_clusters=2, init='k-means++', n_init=10, verbose=0)
        km.fit(fvs)
        return km 
    
    def predict(self, comment):
        tokens = nltk.word_tokenize(comment.lower())
        words = self.word_tokenizer.tokenize(comment.lower())
        sentences = self.sentence_tokenizer.tokenize(comment)
        vocab = set(words)
        words_per_sentence = np.array([len(self.word_tokenizer.tokenize(s)) for s in sentences])
        fvs_lexical = np.zeros((1, 3), np.float64)
        fvs_lexical[0][0] = words_per_sentence.mean()
        fvs_lexical[0][1] = words_per_sentence.std()
        fvs_lexical[0][2] = len(vocab) / float(len(words)) if words else 0
        return 1 if self.sahil == self.kmeans_cluster.predict(fvs_lexical)[0] else 0

In [None]:
test_times = 10
total = 0.0
for i in xrange(test_times):
    cl = AuthorshipMasterClassifier('./data/vrckid.data', './data/')
    cl.add_subclassifier(CharacterNGramsClassifier())
    cl.add_subclassifier(BagOfWordsClassifier())
    cl.add_subclassifier(PartOfSpeechClassifier(n=2))
    cl.add_subclassifier(ShortMessageVerificationClassifier())
    cl.add_subclassifier(LexicalKMeansClustering())
    score = cl.test()
    print 'Test ', i + 1, ' score=', score
    total += score
print 'Average score = ', total / test_times

Test  1  score= 0.787037037037
Test  2  score= 0.777777777778
Test  3  score= 0.780092592593
Test  4  score= 0.803240740741
Test  5  score= 0.770833333333
Test  6  score= 0.798611111111
