In [168]:
import json
from collections import Counter
from random import shuffle
from sklearn import linear_model
import nltk
import os

In [179]:
class AuthorshipMasterClassifier(object):

    def __init__(self, author_path, rando_path):
        with open(author_path, mode='r') as f:
            text = f.read()
            self.author_text = json.loads(text)
        shuffle(self.author_text)
        #randomly get samples
        self.rando_text = []
        #file_list = os.listdir(rando_path)
        file_list = ['_vargas_.data', '-eDgaR-.data', 'Abe_lincolin.data',
                     'Ambiguously_Ironic.data', 'anutensil.data', 'APOSTOLATE.data',
                    'awildsketchappeared.data', 'axolotl_peyotl.data','boib.data']
        """
        'DaedalusMinion.data',
                    'Donald_Keyman.data', 'Elaus.data', 'dick-nipples.data', 'IAmTheRedWizards.data'
        """
        shuffle(file_list)
        for filename in file_list:
            real_path = rando_path + filename
            if filename.startswith('.') or real_path == author_path:
                continue
            with open(real_path, mode='r') as f:
                text = f.read()
                text_json = json.loads(text)
                shuffle(text_json)
                self.rando_text.extend(text_json[:len(self.author_text)/20])
            if len(self.rando_text) >= len(self.author_text):
                break
        shuffle(self.rando_text)
        self.subclassifier_list = []
        self._train_test_split(0.7)

    def _train_test_split(self, split):
        self.train_samples = [ (comment, 1) for comment in self.author_text[:int(len(self.author_text) * split)]]
        self.train_samples.extend([(comment, 0) for comment in self.rando_text[:int(len(self.rando_text) * split)]])
        self.test_samples = [ (comment, 1) for comment in self.author_text[int(len(self.author_text) * split):]]
        self.test_samples.extend([(comment, 0) for comment in self.rando_text[int(len(self.rando_text) * split):]])
        shuffle(self.train_samples)
        shuffle(self.test_samples)

    def add_subclassifier(self, slave):
        slave.initialize(self.author_text, self.rando_text)
        slave.build_model(self.train_samples)
        self.subclassifier_list.append(slave)
        return self

    def predict(self, comment):
        one_counts = 0
        zero_counts = 0
        for slave in self.subclassifier_list:
            result = slave.predict(comment)
            if result == 1:
                one_counts += 1
            elif result == 0:
                zero_counts += 1
            else:
                raise Exception("Sad")
        return int(one_counts > zero_counts)
    
    def test(self):
        correct = 0.0
        for comment, expected in self.test_samples:
            predicted = self.predict(comment)
            if predicted == expected:
                correct += 1
        return correct / len(self.test_samples)

In [180]:
class BagOfWordsClassifier(object):
    def __init__(self, top_most=500):
        self.tokenizer = nltk.tokenize.casual.TweetTokenizer()
        self.logreg = linear_model.LogisticRegression()
        self.top_most = top_most
    def initialize(self, author_text, rando_text):
        author_bag = Counter()
        for comment in author_text:
            for word in self.tokenizer.tokenize(comment):
                author_bag[word] += 1
        self.feature_set = set(word for word, count in author_bag.most_common(self.top_most))

    def build_feature_row(self, comment):
        row = [0] * (len(self.feature_set))
        comment_set = set(self.tokenizer.tokenize(comment))
        for i, word in enumerate(self.feature_set):
            if word in comment_set:
                row[i] = 1
        return row
        
    def build_model(self, train_samples):
        X = []
        Y = []
        for comment, expected in train_samples:
            X.append(self.build_feature_row(comment))
            Y.append(expected)
        self.logreg.fit(X, Y)
    def predict(self, comment):
        X = [self.build_feature_row(comment)]
        return self.logreg.predict(X)

In [181]:
class CharacterNGramsClassifier(object):
    def __init__(self, n=2, top_most=500):
        self.n = n
        self.top_most = top_most
        self.logreg = linear_model.LogisticRegression()

    def initialize(self, author_text, rando_text):
        author_bag = Counter()
        for comment in author_text:
            for gram in nltk.ngrams(comment, self.n):
                author_bag[gram] += 1
        self.feature_set = {gram for gram, freq in author_bag.most_common(self.top_most)}

    def build_feature_row(self, comment):
        row = [0] * (len(self.feature_set))
        ngram_set = set(nltk.ngrams(comment, self.n))
        for i, gram in enumerate(self.feature_set):
            if gram in ngram_set:
                row[i] = 1
        return row

    def build_model(self, train_samples):
        X = []
        Y = []
        for comment, expected in train_samples:
            X.append(self.build_feature_row(comment))
            Y.append(expected)
        self.logreg.fit(X, Y)

    def predict(self, comment):
        X = [self.build_feature_row(comment)]
        return self.logreg.predict(X)

In [182]:
class PartOfSpeechClassifier(object):
    def __init__(self, n=2, top_most=400):
        self.tokenizer = nltk.tokenize.casual.TweetTokenizer()
        self.n = n
        self.top_most = top_most
        self.logreg = linear_model.LogisticRegression()

    def initialize(self, author_text, rando_text):
        author_pos = []
        for comment in author_text:
            tokens = self.tokenizer.tokenize(comment)
            pos = nltk.pos_tag(tokens)
            author_pos.append([tag for w, tag in pos])
        author_counter = Counter()
        for pos in author_pos:
            for gram in nltk.ngrams(pos, self.n):
                author_counter[gram] += 1
        self.feature_set = {pos for pos, freq in author_counter.most_common(self.top_most)}

    def build_feature_row(self, comment):
        row = [0] * (len(self.feature_set))
        tokens = self.tokenizer.tokenize(comment)
        pos = nltk.pos_tag(tokens)
        ngram_set = set(nltk.ngrams(pos, self.n))
        for i, gram in enumerate(self.feature_set):
            if gram in ngram_set:
                row[i] = 1
        return row

    def build_model(self, train_samples):
        X = []
        Y = []
        for comment, expected in train_samples:
            X.append(self.build_feature_row(comment))
            Y.append(expected)
        self.logreg.fit(X, Y)

    def predict(self, comment):
        X = [self.build_feature_row(comment)]
        return self.logreg.predict(X)

In [184]:
test_times = 10
total = 0.0
for i in xrange(test_times):
    cl = AuthorshipMasterClassifier('./data/vrckid.data', './data/')
    cl.add_subclassifier(CharacterNGramsClassifier())
    cl.add_subclassifier(BagOfWordsClassifier())
    cl.add_subclassifier(PartOfSpeechClassifier(n=2))
    score = cl.test()
    print 'Test ', i + 1, ' score=', score
    total += score
print 'Average score = ', total / test_times

Test  1  score= 0.756944444444
Test  2  score= 0.75462962963
Test  3  score= 0.747685185185
Test  4  score= 0.759259259259
Test  5  score= 0.74537037037
Test  6  score= 0.766203703704
Test  7  score= 0.74537037037
Test  8  score= 0.738425925926
Test  9  score= 0.766203703704
Test  10  score= 0.75
Average score =  0.753009259259
