In [116]:
import json
import random
import statistics
from collections import Counter
from random import shuffle

import nltk

In [137]:
# Parameters to tweak
# Tokenizer that keeps runs of word characters (regex \w+).
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
# Number of buckets the held-out text is partitioned into.
NUM_BUCKET = 10
# Fraction of a comment list that goes into the first split.
SPLIT_PERCENTAGE = 0.8
# Constant added to the trained threshold whenever a score is compared to it.
GAMMA = 1
# Length of the n-grams passed to nltk.ngrams.
NGRAM = 4
# Data files (under ./data/) of the users the target user is compared against.
OTHER_USERS = [
    'Thehealeroftri.data',
    'User_Name13.data',
    'maxwellhill.data',
    'illuminatedwax.data',
    'Rlight.data',
    'straydog1980.data',
    'themightiestduck.data',
    'qgyh2.data',
    'BritishEnglishPolice.data',
    'IAmAN00bie.data',
    'manbra.data',
    'MaiaNyx.data',
    'nix0n.data',
    'Jux_.data',
    '_vargas_.data',
    '-eDgaR-.data',
    'Abe_lincolin.data',
    'Ambiguously_Ironic.data',
    'anutensil.data',
    'APOSTOLATE.data',
    'awildsketchappeared.data',
    'axolotl_peyotl.data',
    'boib.data',
]
NUM_OTHERS = len(OTHER_USERS)

In [138]:
def split_comments(comments, split_percentage=None):
    """Split ``comments`` into a leading part and the remaining tail.

    Args:
        comments: sliceable sequence of comments.
        split_percentage: fraction (0..1) of the items that go into the
            first part. Defaults to the module-level SPLIT_PERCENTAGE,
            looked up at call time.

    Returns:
        Tuple ``(split1, split2)`` where ``split1`` holds the first
        ``int(len(comments) * split_percentage)`` items and ``split2``
        holds the rest.
    """
    if split_percentage is None:
        split_percentage = SPLIT_PERCENTAGE
    # Compute the cut index once so both halves are guaranteed to agree.
    cut = int(len(comments) * split_percentage)
    return (comments[:cut], comments[cut:])

# Load the target user's comments (a JSON document) from disk.
with open('./data/vrckid.data', 'r') as f:
    sahil = json.load(f)

# Decide the training sample for sahil: only the first 700 comments are
# split here; everything from index 700 on is left untouched.
sahil_split1, sahil_split2 = split_comments(sahil[:700])
len(sahil), len(sahil[700:])

(994, 294)

In [139]:
# Comments of the other sample users, one entry per file in OTHER_USERS
# (same order as that list).
others = []
for user in OTHER_USERS:
    with open('./data/' + user, 'r') as f:
        other = json.load(f)
        others.append(other)
len(others)

23

In [140]:
def get_all_ngrams(comments):
    """Return the set of n-grams found in ``comments``.

    The strings in ``comments`` are joined with spaces, lower-cased and
    tokenized with ``word_tokenizer``; the tokens are then re-joined with
    single spaces. That normalized *string* is what gets passed to
    ``nltk.ngrams``, so the result contains tuples of NGRAM consecutive
    characters (spaces included), not tuples of words.
    """
    lowered = " ".join(comments).lower()
    normalized = " ".join(word_tokenizer.tokenize(lowered))
    return set(nltk.ngrams(normalized, NGRAM))

def parition_ngram(comments=None):
    """Partition ``comments`` into NUM_BUCKET buckets and return their n-grams.

    Args:
        comments: list of comment strings. When omitted (or None) the
            module-level ``sahil_split2`` is used. It is looked up at call
            time, so re-running the cell that defines ``sahil_split2`` is
            picked up without having to re-define this function.

    Returns:
        List of NUM_BUCKET sets, one per bucket, as produced by
        ``get_all_ngrams``.
    """
    if comments is None:
        comments = sahil_split2
    words = word_tokenizer.tokenize(" ".join(comments).lower())
    # Strided assignment: bucket i receives words i, i + NUM_BUCKET, ...
    # so the words inside one bucket are not adjacent in the original text.
    buckets = [words[i::NUM_BUCKET] for i in xrange(NUM_BUCKET)]
    return [get_all_ngrams(bucket) for bucket in buckets]

def get_ngram_for_user_block(block):
    """Return the n-gram set of bucket number ``block``.

    The buckets come from ``parition_ngram()`` called with its default
    comments; the partition is recomputed on every call.
    """
    assert block < NUM_BUCKET
    user_buckets = parition_ngram()
    return user_buckets[block]

def calculate_ru_block(n_gram_block, reference_ngrams=None):
    """Return the fraction of a block's n-grams that occur in a reference set.

    Args:
        n_gram_block: set of n-grams for the block being scored.
        reference_ngrams: iterable of n-grams to compare against. Defaults
            to ``get_all_ngrams(sahil_split1)``, which is rebuilt on each
            call; callers scoring many blocks can build it once and pass
            it in.

    Returns:
        ``len(block & reference) / len(block)`` as a float, or 0.0 when the
        block is empty (there is nothing that could match).
    """
    # Guard first: avoids a ZeroDivisionError and skips building the
    # default reference set when it would not be used.
    if not n_gram_block:
        return 0.0
    if reference_ngrams is None:
        reference_ngrams = get_all_ngrams(sahil_split1)
    shared = len(n_gram_block.intersection(reference_ngrams))
    return shared / float(len(n_gram_block))

def get_ru_for_user_block(block):
    """Score bucket number ``block`` of the default user data.

    Fetches the bucket's n-gram set with ``get_ngram_for_user_block`` and
    returns the value ``calculate_ru_block`` computes for it.
    """
    assert block < NUM_BUCKET
    return calculate_ru_block(get_ngram_for_user_block(block))

In [141]:
# One list of per-bucket scores for each entry in `others`.
other_blocks_ru_values = []
for other_comments in others:
    # Only the second split of each user's comments is partitioned and scored.
    other_comments1, other_comments2 = split_comments(other_comments)
    partitions = parition_ngram(comments=other_comments2)
    ru_values = list(map(calculate_ru_block, partitions))
    other_blocks_ru_values.append(ru_values)

# Per-bucket scores for the default (target user) data, plus summary statistics.
ru_values_of_all_blocks = list(map(get_ru_for_user_block, xrange(NUM_BUCKET)))
sample_mean = statistics.mean(ru_values_of_all_blocks)
sample_std_dev = statistics.stdev(ru_values_of_all_blocks)
sample_variance = statistics.variance(ru_values_of_all_blocks)
sample_mean, sample_variance, sample_std_dev

(0.8790858314799795, 0.00014092618112231532, 0.011871233344615685)

In [142]:
def calculate(threshold):
    """Return ``(FAR, FRR)`` for the cut-off ``threshold + GAMMA``.

    FRR is the fraction of ``ru_values_of_all_blocks`` entries that fall
    below the cut-off; FAR is the fraction of ``other_blocks_ru_values``
    entries (over all NUM_OTHERS users and NUM_BUCKET buckets) that are at
    or above it.
    """
    cutoff = threshold + GAMMA

    rejected = sum(1 for i in xrange(NUM_BUCKET)
                   if ru_values_of_all_blocks[i] < cutoff)
    FRR = rejected / float(NUM_BUCKET)

    accepted = sum(1
                   for k in xrange(NUM_OTHERS)
                   for j in xrange(NUM_BUCKET)
                   if other_blocks_ru_values[k][j] >= cutoff)
    FAR = accepted / float(NUM_BUCKET * NUM_OTHERS)

    # Never report the two rates as exactly equal: train_for_user_threshold
    # only moves the threshold when they differ.
    if FAR == FRR:
        FRR += 0.00001
    return (FAR, FRR)

In [143]:
def train_for_user_threshold():
    """Search for the threshold at which FAR and FRR cross.

    Starts at ``sample_mean - sample_std_dev / 2`` and moves the threshold
    by ``step``: down while FRR exceeds FAR, up while FAR exceeds FRR.
    Once the threshold has moved in both directions the step is divided by
    ten; the loop ends when the step is no longer greater than 0.0001.

    Returns:
        The threshold value reached when the loop ends.
    """
    step = 1
    moved_up = moved_down = False
    threshold = sample_mean - (sample_std_dev / 2.0)
    while step > 0.0001:
        FAR, FRR = calculate(threshold)
        gap = FRR - FAR
        if gap > 0:
            moved_down = True
            threshold -= step
        elif gap < 0:
            moved_up = True
            threshold += step
        if moved_up and moved_down:
            # The crossing point was passed at this step size; refine.
            moved_up = moved_down = False
            step = step / 10.0
    return threshold

In [144]:
# Run the search once and keep the result in the module-level `threshold`
# that classify_comment reads.
threshold = train_for_user_threshold()
threshold

-0.14184978519232827

In [145]:
def classify_comment(comment):
    """Classify a single comment string.

    Returns:
        1 when the comment's score from ``calculate_ru_block`` is at or
        above ``threshold + GAMMA``, otherwise 0. A comment that yields no
        n-grams is classified as 0.
    """
    comment_ngrams = get_all_ngrams([comment])
    if not comment_ngrams:
        return 0
    # Use the same acceptance rule as calculate(), which counts a score of
    # exactly threshold + GAMMA as accepted (>=), so that training and
    # classification agree at the boundary.
    if calculate_ru_block(comment_ngrams) >= (threshold + GAMMA):
        return 1
    return 0

In [146]:
num_sahil_test_comments = len (sahil[700:])
test_others = []
test_user_list = [
 'Dacvak.data',
 'X019.data',
 'Slouching2Bethlehem.data',
 'DaedalusMinion.data',
 'stopscopiesme.data',
 'girafa.data']

for user in test_user_list:
    with open('./data/' + user, 'r') as f:
        other = f.read()
        other = json.loads(other)
        shuffle(other)
        test_others.extend(other[:50])

expected_val = [0] * len(test_others) + [1] * num_sahil_test_comments
        
test_others = test_others + sahil[700:]
res = [classify_comment(comment) for comment in test_others]

correct_count = 0
for predicted, expected in zip(res, expected_val):
    if predicted == expected:
        correct_count += 1
print 'Accuracy=', correct_count /float(len(res))

Accuracy= 0.659932659933
