In [1]:
from collections import defaultdict
import numpy as np
import math
import sys

import random
from nltk.corpus import stopwords
import operator

In [2]:
def read_data_file(filename):
    """Parse a sparse bag-of-words file into per-document word counts.

    Every non-blank line must contain at least three whitespace-separated
    integers: ``docIdx wordIdx count``.  Counts for a repeated
    (docIdx, wordIdx) pair are summed.

    Args:
        filename: path of the data file to read.

    Returns:
        A dict mapping docIdx -> defaultdict(int) mapping wordIdx -> count.
        Documents that never appear in the file have no entry.

    Raises:
        ValueError: if one of the first three fields is not an integer literal.
        IndexError: if a non-blank line has fewer than three fields.
    """
    docs = {}
    # "with" guarantees the handle is closed even when parsing raises.
    with open(filename) as data_file:
        for line in data_file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # int() instead of eval(): file contents must never be executed.
            docIdx = int(fields[0])
            wordIdx = int(fields[1])
            count = int(fields[2])
            if docIdx not in docs:
                docs[docIdx] = defaultdict(int)
            docs[docIdx][wordIdx] += count
    return docs


def read_data_label(filename):
    """Read a label file holding one integer group id per line.

    The 1-based line number is used as the document id.

    Args:
        filename: path of the label file to read.

    Returns:
        A tuple ``(group_sizes, doc_to_group)`` of two defaultdict(int):
        ``group_sizes`` maps groupId -> number of lines with that id, and
        ``doc_to_group`` maps line number (starting at 1) -> groupId.

    Raises:
        ValueError: if a line is not a single integer literal.  Blank lines
            are rejected rather than skipped, because skipping one would
            shift the document ids of every following line.
    """
    group_sizes = defaultdict(int)
    doc_to_group = defaultdict(int)

    # "with" guarantees the handle is closed even when parsing raises.
    with open(filename) as data_file:
        for docIdx, line in enumerate(data_file, 1):
            # int() instead of eval(): file contents must never be executed.
            groupId = int(line)
            group_sizes[groupId] += 1
            doc_to_group[docIdx] = groupId
    return group_sizes, doc_to_group

# Load the vocabulary: voc_dict maps the 1-based line number of
# data/vocabulary.txt to the word on that line (surrounding whitespace removed).
voc_dict = defaultdict(int)
count = 0
# "with" guarantees the handle is closed even if reading raises.
with open("data/vocabulary.txt") as voc_file:
    for count, line in enumerate(voc_file, 1):
        voc_dict[count] = line.strip()

# Training documents: docIdx -> {wordIdx: count}.
train_data = read_data_file("data/train.data")
# train_label: groupId -> number of training documents with that label.
# train_map:   docIdx (1-based line number of the label file) -> groupId.
train_label, train_map = read_data_label("data/train.label")

# Sizes used to shape the model below.
group_num = len(train_label)  # number of distinct labels seen
doc_num = len(train_map)      # number of labelled training documents
voc_num = len(voc_dict)       # number of vocabulary entries

# Class priors: pi[g - 1] is the fraction of training documents labelled g.
# The class count comes from group_num instead of a hard-coded 20.
pi = [train_label[groupId] * 1.0 / doc_num for groupId in range(1, group_num + 1)]

# Word counts per class, initialised to one for add-one (Laplace) smoothing.
# p[g - 1][w - 1] refers to class g and word w (both ids are 1-based).
p = np.ones((group_num, voc_num))

for docId, word_counts in train_data.items():
    class_row = p[train_map[docId] - 1]  # NumPy view: updates write through to p
    for vId, occurrences in word_counts.items():
        class_row[vId - 1] += occurrences

# Normalise every row so it sums to one (per-class word distribution).
p /= p.sum(axis=1, keepdims=True)



In [3]:

# helper: groupId is a 1-based class label (1 .. number of classes)
def helper(data, groupId):
    """Return the unnormalised log-posterior of one class for one document.

    Args:
        data: mapping wordId -> count for the document (wordId is 1-based).
        groupId: 1-based class id.

    Returns:
        log(pi[groupId - 1]) plus, for every word in ``data``, its count times
        log(p[groupId - 1][wordId - 1]).  ``pi`` and ``p`` are module-level.
    """
    log_score = math.log(pi[groupId - 1])
    for wordId, occurrences in data.items():
        word_prob = p[groupId - 1][wordId - 1]
        log_score += occurrences * math.log(word_prob)
    return log_score

In [4]:

def choose(data):
    """Return the class id whose log-posterior for ``data`` is largest.

    Args:
        data: mapping wordId -> count for one document.

    Returns:
        The 1-based class id with the highest ``helper(data, groupId)`` score;
        on ties the lowest id wins.  Returns 0 if no class scores above
        negative infinity (for example when ``pi`` is empty).
    """
    # float("-inf") replaces -sys.maxint - 1: sys.maxint does not exist in
    # Python 3, and the scores being compared are floats anyway.
    best_score = float("-inf")
    best_group = 0
    # Iterate over however many classes the priors cover instead of a fixed 20.
    for groupId in range(1, len(pi) + 1):
        score = helper(data, groupId)
        if score > best_score:
            best_score = score
            best_group = groupId
    return best_group

In [5]:

# Test documents: docIdx -> {wordIdx: count}.
test_data = read_data_file("data/test.data")
# test_label: groupId -> number of test documents; test_map: docIdx -> groupId.
test_label, test_map = read_data_label("data/test.label")


In [6]:

# Classify every labelled test document and report the misclassification rate.
test_num = len(test_map)
error_num = 0
for docId, true_group in test_map.items():
    # A labelled document with no word entries is absent from test_data;
    # score it as an empty bag of words instead of raising KeyError.
    if choose(test_data.get(docId, {})) != true_group:
        error_num += 1

error_rate = error_num * 1.0 / test_num
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("error_rate " + str(error_rate))

error_rate 0.21892071952


In [7]:
# Randomly hold out 20% of the training documents for validation.
# NOTE(review): no random seed is set, so the split differs on every run.
# "//" keeps the sample size an integer on Python 3 as well as Python 2
# (random.sample rejects a float size).
train_indexs = random.sample(range(1, doc_num + 1), doc_num * 8 // 10)
train_indexs.sort()
# Set membership is O(1); testing against the list made this step quadratic.
train_index_set = set(train_indexs)
validation_indexs = [i for i in range(1, doc_num + 1) if i not in train_index_set]
print(len(train_indexs))
print(len(validation_indexs))

9015
2254


In [8]:

# Split the training set into a training part (t_*) and a validation part (v_*).
#   *_data:  docId -> {wordId: count}
#   *_map:   docId -> groupId
#   *_label: groupId -> number of documents in that part
t_data = {}
t_label = defaultdict(int)
t_map = {}
v_data = {}
v_label = defaultdict(int)
v_map = {}

# Build the set once: "docId in train_indexs" on the list is a linear scan per
# document, which made the whole loop quadratic.
train_id_set = set(train_indexs)

for docId in train_data:
    groupId = train_map[docId]
    if docId in train_id_set:
        t_data[docId] = train_data[docId]
        t_map[docId] = groupId
        t_label[groupId] += 1
    else:
        v_data[docId] = train_data[docId]
        v_map[docId] = groupId
        v_label[groupId] += 1

In [19]:
# Options for the reduced-vocabulary experiment in the cells below.
# freq_log: when True, each normalised word probability p is replaced by log(1 + p).
freq_log = True
# remove_stopwords: when True, words in NLTK's English stop-word list are
# excluded from the reduced vocabulary.
remove_stopwords = True
# voc_size: maximum number of (most frequent) words kept in the reduced vocabulary.
voc_size = 5000

In [20]:
# Build the reduced vocabulary: the voc_size most frequent words of the
# training split, optionally skipping English stop words.
stop_words = stopwords.words("english")
stop_word_set = set(stop_words)  # O(1) membership instead of scanning the list

# Total occurrences of every word id across the training split.
voc_counter = defaultdict(int)
for docId in t_data:
    for vocId in t_data[docId]:
        voc_counter[vocId] += t_data[docId][vocId]
sorted_voc = sorted(voc_counter.items(), key=operator.itemgetter(1), reverse=True)

# voc_indexs maps a new 0-based column index -> original word id.
voc_indexs = defaultdict(int)
for vocId, _total in sorted_voc:
    # Compare against voc_size instead of counting the setting down to zero,
    # so re-running this cell does not produce an empty vocabulary.
    if len(voc_indexs) >= voc_size:
        break
    if remove_stopwords and voc_dict[vocId] in stop_word_set:
        print(voc_dict[vocId])
        continue
    voc_indexs[len(voc_indexs)] = vocId

print(len(voc_indexs))
print(voc_indexs)

voc_num = len(voc_indexs)


the
to
of
and
in
is
that
it
for
you
this
on
be
are
not
have
with
as
or
if
but
was
they
can
from
by
at
an
there
my
what
will
all
we
do
about
he
so
your
has
no
any
some
me
who
which
out
don
more
just
when
their
other
up
were
only
how
than
them
been
had
his
does
then
these
should
am
because
very
now
why
into
most
such
those
here
where
same
its
being
our
over
did
after
too
off
him
before
both
while
own
through
down
under
few
between
each
again
against
she
above
having
her
doing
during
once
until
themselves
myself
itself
further
himself
nor
yourself
below
whom
yours
ourselves
ours
5000
defaultdict(<type 'int'>, {0: 775, 1: 100, 2: 1003, 3: 778, 4: 80, 5: 770, 6: 67, 7: 44, 8: 476, 9: 73, 10: 630, 11: 531, 12: 282, 13: 1319, 14: 137, 15: 792, 16: 902, 17: 245, 18: 143, 19: 492, 20: 2045, 21: 31, 22: 131, 23: 574, 24: 316, 25: 1576, 26: 834, 27: 289, 28: 481, 29: 456, 30: 393, 31: 419, 32: 1023, 33: 993, 34: 1413, 35: 972, 36: 439, 37: 279, 38: 84, 39: 465, 40: 1334, 41: 1863, 42: 1557, 43: 8

In [None]:
# Re-fit the model on the training split using the reduced vocabulary.
# NOTE(review): helper() indexes p by the original wordId - 1, which does not
# match the reduced column layout built here - confirm how this p is consumed.
group_num = len(t_label)
doc_num = len(t_map)

# Class priors; the class count comes from group_num instead of a hard-coded 20.
pi = [t_label[groupId] * 1.0 / doc_num for groupId in range(1, group_num + 1)]

# Word counts per class, initialised to one for add-one (Laplace) smoothing.
# Column v of p corresponds to the original word id voc_indexs[v].
p = np.ones((group_num, voc_num))

# Invert voc_indexs so each document only visits the words it contains.
# The original inner loop read t_data[docId][vId] with a stale vId left over
# from an earlier loop, so every column received the count of one unrelated
# word; that lookup also inserted zero entries into each document's
# defaultdict.
column_of = dict((vocId, col) for col, vocId in voc_indexs.items())
for docId in t_data:
    class_row = p[t_map[docId] - 1]  # NumPy view: updates write through to p
    for vocId, occurrences in t_data[docId].items():
        col = column_of.get(vocId)
        if col is not None:  # word was not kept in the reduced vocabulary
            class_row[col] += occurrences

# Normalise every row so it sums to one.
p /= p.sum(axis=1, keepdims=True)
if freq_log:
    # Same transform as log(1 + p); log1p is more accurate for tiny p.
    p = np.log1p(p)
