In [1]:
from collections import defaultdict
import numpy as np
import math
import sys

import random
from nltk.corpus import stopwords
import operator

In [2]:
def read_data_file(filename):
    data_file = open(filename)
    dict = {}
    for line in data_file:
        l = line.split()
        docIdx = eval(l[0])
        wordIdx = eval(l[1])
        count = eval(l[2])
        if docIdx not in dict:
            dict[docIdx] = defaultdict(int)
        dict[docIdx][wordIdx] += count
    data_file.close()
    return dict


def read_data_label(filename):
    data_file = open(filename)
    dict = defaultdict(int)
    map = defaultdict(int)

    count = 0
    for line in data_file:
        groupId = eval(line)
        dict[groupId] += 1
        count += 1
        map[count] = groupId
    data_file.close()
    return dict, map

voc_file = open("data/vocabulary.txt")
voc_dict = defaultdict(int)
count = 0
for line in voc_file:
    count += 1
    voc_dict[count] = line.strip()
voc_file.close()

train_data = read_data_file("data/train.data")
train_label, train_map = read_data_label("data/train.label")

group_num = len(train_label)
doc_num = len(train_map)
voc_num = len(voc_dict)

In [None]:


pi = [train_label[groupId] * 1.0 / doc_num for groupId in range(1, 21)]
# smoothing
p = np.ones((group_num, voc_num))

for docId in train_data:
    for vId in train_data[docId]:
        p[train_map[docId] - 1][vId - 1] += train_data[docId][vId]

for groupId in range(len(p)):
    group_sum = sum(p[groupId])
    for vId in range(len(p[groupId])):
        p[groupId][vId] = p[groupId][vId] / group_sum



In [None]:

# routine groupId: 1 - 20
def helper(data, groupId):
    result = 0.0
    result += math.log(pi[groupId - 1])
    for wordId in data:
        result += data[wordId] * math.log(p[groupId - 1][wordId - 1])
    return result

In [None]:

def choose(data):
    m = -sys.maxint - 1
    result = 0
    for groupId in range(1, 21):
        temp = helper(data, groupId)
        if temp > m:
            m = temp
            result = groupId
    return result

In [None]:

test_data = read_data_file("data/test.data")
test_label, test_map = read_data_label("data/test.label")


In [None]:

test_num = len(test_map)
error_num = 0
for docId in test_map:
    if choose(test_data[docId]) != test_map[docId]:
#         print docId
        error_num += 1

error_rate = error_num * 1.0 / test_num
print "error_rate " + str(error_rate)

0.21892071952

In [3]:
def para(freq_log, remove_stopwords, voc_size, t_data, t_label, t_map, v_data, v_label, v_map):
    group_num = len(t_label)
    doc_num = len(t_map)
    voc_num = voc_size
    stop_words = stopwords.words("english")
    voc_counter = defaultdict(int)
    for docId in t_data:
        for vocId in t_data[docId]:
            voc_counter[vocId] += t_data[docId][vocId]
    sorted_voc = sorted(voc_counter.items(), key=operator.itemgetter(1), reverse=True)
    voc_indexs = defaultdict(int)
    i = 0
    for voc in sorted_voc:
        if voc_size <= 0:
            break
        if remove_stopwords and voc_dict[voc[0]] in stop_words:
            continue
        voc_indexs[i] = voc[0]
        i += 1
        voc_size -= 1

    rev_voc_indexs = {v: k for k, v in voc_indexs.iteritems()}

    pi = [t_label[groupId] * 1.0 / doc_num for groupId in range(1, 21)]
    p = np.ones((group_num, voc_num))

    for docId in t_data:
        for v in range(voc_num):
            p[t_map[docId] - 1][v] += t_data[docId][voc_indexs[v]]

    for groupId in range(len(p)):
        group_sum = sum(p[groupId])
        for vId in range(len(p[groupId])):
            p[groupId][vId] = p[groupId][vId] / group_sum
            if freq_log:
                p[groupId][vId] = math.log(1 + p[groupId][vId])
                      
    def helper(data, groupId):
        result = 0.0
        result += math.log(pi[groupId - 1])
        for wordId in data:
            if wordId in rev_voc_indexs:
                result += data[wordId] * math.log(p[groupId - 1][rev_voc_indexs[wordId]])
        return result

    def choose(data):
        m = -sys.maxint - 1
        result = 0
        for groupId in range(1, 21):
            temp = helper(data, groupId)
            if temp > m:
                m = temp
                result = groupId
        return result

    v_num = len(v_map)
    error_num = 0
    for docId in v_map:
        if choose(v_data[docId]) != v_map[docId]:
            error_num += 1

    error_rate = error_num * 1.0 / v_num
                      
    return error_rate

In [None]:
print "False, False, 10000 ", para(False, False, 10000)
print "False, True, 10000 ", para(False, True, 10000)
print "True, False, 10000 ", para(True, False, 10000)
print "True, True, 10000 ", para(True, True, 10000)

False, False, 10000  error_rate 0.160603371783

False, True, 10000  error_rate 0.157497781721

True, False, 10000  error_rate 0.160603371783

True, True, 10000  error_rate 0.157941437445

In [None]:
print "False, True, 500 ", para(False, True, 500)
print "False, True, 1000 ", para(False, True, 1000)
print "False, True, 2000 ", para(False, True, 2000)
print "False, True, 3000 ", para(False, True, 3000)
print "False, True, 5000 ", para(False, True, 5000)

False, True, 500  error_rate 0.416149068323

False, True, 1000  error_rate 0.326530612245

False, True, 2000  error_rate 0.251552795031

False, True, 3000  error_rate 0.217391304348

False, True, 5000  error_rate 0.188997338066

In [4]:
train_indexs = random.sample(range(1, doc_num + 1), doc_num * 8 / 10)
train_indexs.sort()
validation_indexs = [i for i in range(1, doc_num + 1) if i not in train_indexs]

In [None]:
t_data = {}
t_label = defaultdict(int)
t_map = {}
v_data = {}
v_label = defaultdict(int)
v_map = {}

for docId in train_data:
    if docId in train_indexs:
        t_data[docId] = train_data[docId]
        t_map[docId] = train_map[docId]
        t_label[train_map[docId]] += 1
    else:
        v_data[docId] = train_data[docId]
        v_map[docId] = train_map[docId]
        v_label[train_map[docId]] += 1

In [None]:
print "False, True, 8000 ", para(False, True, 8000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 15000 ", para(False, True, 15000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 20000 ", para(False, True, 20000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 30000 ", para(False, True, 30000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 50000 ", para(False, True, 50000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 70000 ", para(False, True, 70000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 100000 ", para(False, True, 100000, t_data, t_label, t_map, v_data, v_label, v_map)

False, True, 8000  0.16149068323

False, True, 15000  0.143744454303

False, True, 20000  0.139307897072

False, True, 30000  0.132653061224

False, True, 35000  0.131765749778

False, True, 40000  0.131322094055

False, True, 45000  0.130434782609

False, True, 50000  0.132209405501

False, True, 55000  0.135314995563

False, True, 60000  0.136645962733

False, True, 65000  0.137976929902

False, True, 70000  0.140195208518

False, True, 100000  0.145075421473

In [None]:
print "False, True, 44000 ", para(False, True, 44000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 46000 ", para(False, True, 46000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 43000 ", para(False, True, 43000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 47000 ", para(False, True, 47000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 42000 ", para(False, True, 42000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 48000 ", para(False, True, 48000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 41000 ", para(False, True, 41000, t_data, t_label, t_map, v_data, v_label, v_map)
print "False, True, 49000 ", para(False, True, 49000, t_data, t_label, t_map, v_data, v_label, v_map)

False, True, 44000  0.134427684117
False, True, 46000  0.133984028394
False, True, 43000 

In [None]:
import nltk
nltk.download()

In [None]:
print "True, True, 61188 ", para(True, True, 61188, train_data, train_label, train_map, test_data, test_label, test_map)