# CS4248 Assignment 3

### Perceptron text classifier in Python with feature selection

##### https://github.com/jia1/pyceptron

In [1]:
import os
import re
import sys
from decimal import Decimal
from string import punctuation
from porter import PorterStemmer

p = PorterStemmer()

In [2]:
src_dir = os.path.abspath('tc')
dst_dir = os.path.abspath('tc_proc')

In [3]:
'''
class_to_text
<class 'dict'>
KEY: <class 'str'>
VAL: <class 'set'>
E.g: {'c1': {'0000', '0001', '0002', ... }, 'c2': {'1000', '1001', '1002', ... } }

text_to_freq
<class 'dict'>
KEY: <class 'str'>
VAL: <class 'dict'>
     KEY: <class 'str'>
     VAL: <class 'int'>
E.g: {'0000': {'word0': 1, 'word1': 3, 'word2': 5, ... }, '0001': { ... }}

class_to_feat
<class 'dict'>
KEY: <class 'str'>
VAL: <class 'list'>
     VAL: <class 'tuple'>
          <class 'str'>
          <class 'float'>
E.g: {'c1': [ ('word1', 99.00), ('word2', 90.00), ... ], 'c2': [ ... ] }

nxx_dict
chi_dict
'''

k = 2
train_percent = 0.8
classes, nxxs = ['c1', 'c2', 'c3', 'c4', 'c5'], ['n00', 'n01', 'n10', 'n11']
class_to_text, text_to_freq = {}, {}
class_to_feat = {}
nxx_dict = { n: {} for n in nxxs }
chi_dict = {}

In [4]:
# A. Split each line by space into tokens
# B. Strip all default white space characters from each token
# C. Remove punctuation from each token
# D. Return a list of tokens which are not stop words

def strip_and_filter_line(ln):
    tokens = map(lambda t: t.strip().strip(punctuation).lower(), ln.split(' '))
    return list(filter(lambda t: t and len(t) > 2 and t.isalpha() and t not in stop_list, tokens))

In [5]:
def is_in(a, b):
    return 1 if a in b else 0

def is_not_in(a, b):
    return 1 if a not in b else 0

def count_nxx(nxx, w, c):
    global classes, class_to_text, text_to_freq
    answer = 0
    if nxx == 'n00':
        for class_name in filter(lambda x: x != c, classes):
            for text in class_to_text[class_name]:
                answer += is_not_in(w, text_to_freq[text])
    elif nxx == 'n01':
        for text in class_to_text[c]:
            answer += is_not_in(w, text_to_freq[text])
    elif nxx == 'n10':
        for class_name in filter(lambda x: x != c, classes):
            for text in class_to_text[class_name]:
                answer += is_in(w, text_to_freq[text])
    elif nxx == 'n11':
        for text in class_to_text[c]:
            answer += is_in(w, text_to_freq[text])
    return answer

In [6]:
# n00 is the number of training texts that do not contain w and are not in class c.
# n01 is the number of training texts that do not contain w and are in class c.
# n10 is the number of training texts that contain w and are not in class c.
# n11 is the number of training texts that contain w and are in class c.

def chi_square(w, c):
    global nxxs, nxx_dict
    ns_dict = {}
    for n in nxxs:
        if w not in nxx_dict[n]:
            nxx_dict[n][w] = {}
        if c not in nxx_dict[n][w]:
            nxx_dict[n][w][c] = count_nxx(n, w, c)
        ns_dict[n] = nxx_dict[n][w][c]
    n00, n01, n10, n11 = ns_dict['n00'], ns_dict['n01'], ns_dict['n10'], ns_dict['n11']
    return ((n11+n10+n01+n00)*(n11*n00-n10*n01)**2)/((n11+n01)*(n11+n10)*(n10+n00)*(n01+n00))

In [7]:
def put_chi_dict(c, w, chi_square_value):
    global chi_dict
    if w not in chi_dict[c]:
        chi_dict[c][w] = chi_square_value
    else:
        chi_dict[c][w] = max(chi_dict[c][w], chi_square_value)

In [8]:
def gen_feat():
    global classes, chi_dict, class_to_feat
    max_feat_len = sys.maxsize
    feat_queue_dict = { c: [] for c in classes }
    for c in chi_dict:
        feat_queue_dict[c] = sorted(chi_dict[c].items(), key = lambda x: x[1], reverse = True)
        max_feat_len = min(max_feat_len, len(feat_queue_dict[c]))
    if max_feat_len > 1000:
        max_feat_len //= 100
    elif max_feat_len > 100:
        max_feat_len //= 10
    class_to_feat = { c: feat_queue_dict[c][:max_feat_len] for c in feat_queue_dict }

In [9]:
def feat_select():
    global classes, class_to_text, text_to_freq, class_to_feat
    for c in classes:
        for text in class_to_text[c]:
            for w in text_to_freq[text]:
                put_chi_dict(c, w, chi_square(w, c))
                gen_feat()

In [10]:
# Load all stop words into a list

with open('stopword-list', 'r') as s:
    stop_list = list(map(lambda ln: ln.strip(), s.readlines()))

In [11]:
for curr_dir, sub_dir, files in os.walk(src_dir):
    if not files:
        classes = sub_dir
        class_to_text = { c: set() for c in classes }
        class_to_feat = { c: set() for c in classes }
        chi_dict = { c: {} for c in classes }
        continue
    curr_class = re.split('[(\\\\)(\\)(\/)]', curr_dir)[-1] # curr_dir.split('\\')[-1]
    for i in range(int(len(files) * train_percent)):
        file = files[i]
        flat_text = []
        freq_dict = {}
        with open(os.path.join(curr_dir, file), 'r') as f:
            processed_lines = map(lambda ln: strip_and_filter_line(ln), f.readlines())
            for line in processed_lines:
                flat_text.extend(list(map(lambda word: p.stem(word, 0, len(word) - 1), line)))
            for word in flat_text:
                if word not in freq_dict:
                    freq_dict[word] = 1
                else:
                    freq_dict[word] += 1
            fin_freq_dict = { word: freq for word, freq in freq_dict.items() if freq >= k }
            if not fin_freq_dict:
                fin_freq_dict = freq_dict
            sum_freq = sum(fin_freq_dict.values())
            normalized_freq_dict = { word: freq / sum_freq for word, freq in fin_freq_dict.items() }
            class_to_text[curr_class].add(file)
            text_to_freq[file] = normalized_freq_dict

In [12]:
print(text_to_freq)

{'37261': {'robert': 0.03968253968253968, 'lipman': 0.03968253968253968, 'call': 0.015873015873015872, 'present': 0.0873015873015873, 'navi': 0.03968253968253968, 'seminar': 0.047619047619047616, 'distribut': 0.015873015873015872, 'organ': 0.015873015873015872, 'carderock': 0.023809523809523808, 'divis': 0.023809523809523808, 'bethesda': 0.031746031746031744, 'scientif': 0.03968253968253968, 'visual': 0.03968253968253968, 'virtual': 0.03968253968253968, 'realiti': 0.03968253968253968, 'june': 0.015873015873015872, 'naval': 0.015873015873015872, 'surfac': 0.015873015873015872, 'warfar': 0.015873015873015872, 'center': 0.023809523809523808, 'david': 0.015873015873015872, 'taylor': 0.015873015873015872, 'research': 0.015873015873015872, 'maryland': 0.023809523809523808, 'sponsor': 0.015873015873015872, 'inform': 0.015873015873015872, 'work': 0.015873015873015872, 'type': 0.015873015873015872, 'minut': 0.015873015873015872, 'length': 0.015873015873015872, 'videotap': 0.015873015873015872, 

In [13]:
from time import time
start = time()
feat_select()
stop = time()
print(stop - start)

109.17125511169434


In [15]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(class_to_feat)

{   'c1': [   ('graphic', 151.82349136760618),
              ('imag', 80.52800974251913),
              ('polygon', 79.15479363912124),
              ('algorithm', 65.88223970740393),
              ('point', 61.38488003897211),
              ('gif', 61.23035205441677),
              ('cview', 56.394763343403824),
              ('surfac', 53.87375266285458),
              ('file', 45.685344738527654),
              ('tiff', 42.75870078587744),
              ('anim', 42.30976880081301),
              ('packag', 42.18278480461047),
              ('viewer', 40.20100502512563),
              ('intersect', 38.75335345405768)],
    'c2': [   ('id', 165.7080723936733),
              ('bu', 147.73041383667666),
              ('drive', 141.86224948695073),
              ('isa', 107.77830262950717),
              ('vlb', 93.07030854830552),
              ('scsi', 89.44543828264759),
              ('control', 87.5290162490995),
              ('card', 87.3777445559178),
              ('gatewai', 62

In [None]:
# k = 2
# num_feat_per_class = 1600+ // 10 = 16

class_to_feat = {
    'c1': [   ('graphic', 151.82349136760618),
              ('imag', 80.52800974251913),
              ('polygon', 79.15479363912124),
              ('algorithm', 65.88223970740393),
              ('point', 61.38488003897211),
              ('gif', 61.23035205441677),
              ('cview', 56.394763343403824),
              ('surfac', 53.87375266285458),
              ('file', 45.685344738527654),
              ('tiff', 42.75870078587744),
              ('anim', 42.30976880081301),
              ('packag', 42.18278480461047),
              ('viewer', 40.20100502512563),
              ('intersect', 38.75335345405768)],
    'c2': [   ('id', 165.7080723936733),
              ('bu', 147.73041383667666),
              ('drive', 141.86224948695073),
              ('isa', 107.77830262950717),
              ('vlb', 93.07030854830552),
              ('scsi', 89.44543828264759),
              ('control', 87.5290162490995),
              ('card', 87.3777445559178),
              ('gatewai', 62.91344321112232),
              ('irq', 60.45340050377834),
              ('disk', 59.731480168297644),
              ('esdi', 52.34021137393055),
              ('floppi', 50.760502172863355),
              ('adaptec', 40.20100502512563)],
    'c3': [   ('appl', 244.89445746537285),
              ('mac', 130.20472862851912),
              ('centri', 108.73500792034397),
              ('duo', 76.72892478546188),
              ('adb', 68.58295511850731),
              ('lciii', 64.51612903225806),
              ('quadra', 51.79399485449308),
              ('simm', 47.40562312430895),
              ('powerbook', 40.20100502512563),
              ('vram', 40.20100502512563),
              ('powerpc', 36.16273229532898),
              ('iii', 34.75935828877005),
              ('nubu', 34.75935828877005),
              ('iisi', 32.1285140562249)],
    'c4': [   ('window', 230.80490975970545),
              ('server', 159.96728207525123),
              ('widget', 158.87699701837926),
              ('motif', 147.6363002527422),
              ('xterm', 130.08130081300814),
              ('client', 96.6246090801251),
              ('xlib', 84.89135927235978),
              ('manag', 81.84143222506394),
              ('applic', 76.90259032540912),
              ('pixmap', 76.72892478546188),
              ('event', 68.73977086743044),
              ('compil', 66.11501975740386),
              ('sun', 65.19696199411501),
              ('implement', 60.80001558724963)],
    'c5': [   ('circuit', 95.4816460135609),
              ('voltag', 88.97876643073812),
              ('cool', 68.58295511850731),
              ('amp', 66.96511597013736),
              ('power', 65.50188127228039),
              ('ground', 65.1953992654659),
              ('nuclear', 60.45340050377834),
              ('radio', 54.82925907258065),
              ('outlet', 50.79764903442485),
              ('tower', 49.43011238343208),
              ('detector', 48.2897384305835),
              ('water', 48.2897384305835),
              ('cold', 48.2897384305835),
              ('radar', 46.77384548985757)]
}

In [None]:
# k = 3
# num_feat_per_class = 1400+ // 10 = 14

class_to_feat = {
    'c1': [
        ('graphic', 169.17137744140854),
        ('imag', 98.3116817832547),
        ('algorithm', 88.7812752219532),
        ('polygon', 83.03566726931246),
        ('tiff', 70.87953629032258),
        ('point', 70.21568856675239),
        ('file', 69.66309823677582),
        ('cview', 56.31536604987932),
        ('surfac', 54.7409671900161),
        ('gif', 53.01195979162081),
        ('packag', 52.06390785056943),
        ('librari', 39.77719093998164),
        ('viewer', 38.706625133976424),
        ('format', 37.34571649131328)
    ],
    'c2': [
        ('id', 185.69556314573137),
        ('drive', 160.46240000462572),
        ('scsi', 103.22117547434439),
        ('control', 83.98206256019533),
        ('card', 82.96180195710245),
        ('isa', 73.76560004212669),
        ('bu', 65.37960011590843),
        ('disk', 63.53762350800936),
        ('vlb', 60.36217303822938),
        ('gatewai', 53.01195979162081),
        ('floppi', 49.494949494949495),
        ('motherboard', 45.85597826086956),
        ('irq', 44.194455604660504),
        ('hard', 40.65040650406504)
    ],
    'c3': [
        ('appl', 213.91769935344828),
        ('mac', 179.77787774609058),
        ('duo', 80.64516129032258),
        ('centri', 69.00620961227787),
        ('powerbook', 56.31536604987932),
        ('quadra', 53.23840725806452),
        ('nubu', 46.70799333409953),
        ('lciii', 40.16064257028113),
        ('adb', 40.16064257028113),
        ('pd', 40.16064257028113),
        ('simm', 38.68938394548647),
        ('price', 36.54766215667556),
        ('acceler', 30.90305757005999),
        ('modem', 30.607070414372647)
    ],
    'c4': [
        ('window', 172.49840831918505),
        ('widget', 160.8511081405711),
        ('motif', 142.91158536585365),
        ('xterm', 117.36139214892756),
        ('server', 114.93389423076923),
        ('applic', 65.96930396095334),
        ('event', 64.48345838762337),
        ('client', 60.640274236676824),
        ('pixmap', 60.36217303822938),
        ('manag', 55.81061733405483),
        ('expos', 54.7409671900161),
        ('compil', 50.96258116777068),
        ('patch', 49.809291416929234),
        ('xlib', 49.809291416929234)
    ], 'c5': [
        ('radar', 76.5820233776703),
        ('circuit', 73.76560004212669),
        ('cool', 66.83690786821953),
        ('ground', 65.04405750294009),
        ('wire', 56.81818181818182),
        ('detector', 56.31536604987932),
        ('tower', 54.7409671900161),
        ('amp', 54.7409671900161),
        ('nuclear', 52.27181342983514),
        ('car', 52.27181342983514),
        ('outlet', 52.27181342983514),
        ('power', 50.137415866316125),
        ('voltag', 46.70799333409953),
        ('electron', 39.13612345180398)
    ]
}

In [16]:
class_to_feat_set = {c: set() for c in classes}
num_class = len(classes)
for c in class_to_feat:
    for p in class_to_feat[c]:
        w = p[0]
        class_to_feat_set[c].add(w)
    for nc in class_to_feat:
        if nc != c:
            for i in range(len(class_to_feat[nc]) // (num_class - 1)):
                w = class_to_feat[nc][i][0]
                class_to_feat_set[c].add(w) # if many overlapping words across class_to_feat, then fewer features

class_to_feat_list = {c: sorted(list(class_to_feat_set[c])) for c in classes}
class_to_feat_to_index = {c: {} for c in classes}
for c in class_to_feat_list:
    for i in range(len(class_to_feat_list[c])):
        class_to_feat_to_index[c][class_to_feat_list[c][i]] = i

In [17]:
pp.pprint(class_to_feat_to_index)

{   'c1': {   'algorithm': 0,
              'anim': 1,
              'appl': 2,
              'bu': 3,
              'centri': 4,
              'circuit': 5,
              'cool': 6,
              'cview': 7,
              'drive': 8,
              'file': 9,
              'gif': 10,
              'graphic': 11,
              'id': 12,
              'imag': 13,
              'intersect': 14,
              'mac': 15,
              'packag': 16,
              'point': 17,
              'polygon': 18,
              'server': 19,
              'surfac': 20,
              'tiff': 21,
              'viewer': 22,
              'voltag': 23,
              'widget': 24,
              'window': 25},
    'c2': {   'adaptec': 0,
              'appl': 1,
              'bu': 2,
              'card': 3,
              'centri': 4,
              'circuit': 5,
              'control': 6,
              'cool': 7,
              'disk': 8,
              'drive': 9,
              'esdi': 10,
              '

In [None]:
# k = 2

class_to_feat_to_index = {
    'c1': {   'algorithm': 0,
              'anim': 1,
              'appl': 2,
              'bu': 3,
              'centri': 4,
              'circuit': 5,
              'cool': 6,
              'cview': 7,
              'drive': 8,
              'file': 9,
              'gif': 10,
              'graphic': 11,
              'id': 12,
              'imag': 13,
              'intersect': 14,
              'mac': 15,
              'packag': 16,
              'point': 17,
              'polygon': 18,
              'server': 19,
              'surfac': 20,
              'tiff': 21,
              'viewer': 22,
              'voltag': 23,
              'widget': 24,
              'window': 25},
    'c2': {   'adaptec': 0,
              'appl': 1,
              'bu': 2,
              'card': 3,
              'centri': 4,
              'circuit': 5,
              'control': 6,
              'cool': 7,
              'disk': 8,
              'drive': 9,
              'esdi': 10,
              'floppi': 11,
              'gatewai': 12,
              'graphic': 13,
              'id': 14,
              'imag': 15,
              'irq': 16,
              'isa': 17,
              'mac': 18,
              'polygon': 19,
              'scsi': 20,
              'server': 21,
              'vlb': 22,
              'voltag': 23,
              'widget': 24,
              'window': 25},
    'c3': {   'adb': 0,
              'appl': 1,
              'bu': 2,
              'centri': 3,
              'circuit': 4,
              'cool': 5,
              'drive': 6,
              'duo': 7,
              'graphic': 8,
              'id': 9,
              'iii': 10,
              'iisi': 11,
              'imag': 12,
              'lciii': 13,
              'mac': 14,
              'nubu': 15,
              'polygon': 16,
              'powerbook': 17,
              'powerpc': 18,
              'quadra': 19,
              'server': 20,
              'simm': 21,
              'voltag': 22,
              'vram': 23,
              'widget': 24,
              'window': 25},
    'c4': {   'appl': 0,
              'applic': 1,
              'bu': 2,
              'centri': 3,
              'circuit': 4,
              'client': 5,
              'compil': 6,
              'cool': 7,
              'drive': 8,
              'event': 9,
              'graphic': 10,
              'id': 11,
              'imag': 12,
              'implement': 13,
              'mac': 14,
              'manag': 15,
              'motif': 16,
              'pixmap': 17,
              'polygon': 18,
              'server': 19,
              'sun': 20,
              'voltag': 21,
              'widget': 22,
              'window': 23,
              'xlib': 24,
              'xterm': 25},
    'c5': {   'amp': 0,
              'appl': 1,
              'bu': 2,
              'centri': 3,
              'circuit': 4,
              'cold': 5,
              'cool': 6,
              'detector': 7,
              'drive': 8,
              'graphic': 9,
              'ground': 10,
              'id': 11,
              'imag': 12,
              'mac': 13,
              'nuclear': 14,
              'outlet': 15,
              'polygon': 16,
              'power': 17,
              'radar': 18,
              'radio': 19,
              'server': 20,
              'tower': 21,
              'voltag': 22,
              'water': 23,
              'widget': 24,
              'window': 25}
}

In [None]:
# k = 3

class_to_feat_to_index = {
    'c1': {
        'algorithm': 0, 'appl': 1, 'circuit': 2, 'cool': 3, 'cview': 4, 'drive': 5, 'duo': 6,
        'file': 7, 'format': 8, 'gif': 9, 'graphic': 10, 'id': 11, 'imag': 12, 'librari': 13,
        'mac': 14, 'motif': 15, 'packag': 16, 'point': 17, 'polygon': 18, 'radar': 19, 'scsi': 20,
        'surfac': 21, 'tiff': 22, 'viewer': 23, 'widget': 24, 'window': 25
    },
    'c2': {
        'algorithm': 0, 'appl': 1, 'bu': 2, 'card': 3, 'circuit': 4, 'control': 5, 'cool': 6,
        'disk': 7, 'drive': 8, 'duo': 9, 'floppi': 10, 'gatewai': 11, 'graphic': 12, 'hard': 13,
        'id': 14, 'imag': 15, 'irq': 16, 'isa': 17, 'mac': 18, 'motherboard': 19, 'motif': 20,
        'radar': 21, 'scsi': 22, 'vlb': 23, 'widget': 24, 'window': 25
    },
    'c3': {
        'acceler': 0, 'adb': 1, 'algorithm': 2, 'appl': 3, 'centri': 4, 'circuit': 5, 'cool': 6,
        'drive': 7, 'duo': 8, 'graphic': 9, 'id': 10, 'imag': 11, 'lciii': 12, 'mac': 13,
        'modem': 14, 'motif': 15, 'nubu': 16, 'pd': 17, 'powerbook': 18, 'price': 19, 'quadra': 20,
        'radar': 21, 'scsi': 22, 'simm': 23, 'widget': 24, 'window': 25
    },
    'c4': {
        'algorithm': 0, 'appl': 1, 'applic': 2, 'circuit': 3, 'client': 4, 'compil': 5, 'cool': 6,
        'drive': 7, 'duo': 8, 'event': 9, 'expos': 10, 'graphic': 11, 'id': 12, 'imag': 13,
        'mac': 14, 'manag': 15, 'motif': 16, 'patch': 17, 'pixmap': 18, 'radar': 19, 'scsi': 20,
        'server': 21, 'widget': 22, 'window': 23, 'xlib': 24, 'xterm': 25
    },
    'c5': {
        'algorithm': 0, 'amp': 1, 'appl': 2, 'car': 3, 'circuit': 4, 'cool': 5, 'detector': 6,
        'drive': 7, 'duo': 8, 'electron': 9, 'graphic': 10, 'ground': 11, 'id': 12, 'imag': 13,
        'mac': 14, 'motif': 15, 'nuclear': 16, 'outlet': 17, 'power': 18, 'radar': 19, 'scsi': 20,
        'tower': 21, 'voltag': 22, 'widget': 23, 'window': 24, 'wire': 25
    }
}

In [None]:
alpha = 0.05

In [18]:
# Credits: Dr. Jason Brownlee

from random import seed
from random import randrange

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# Make a prediction with weights
def predict(row, weights):
    activation = weights[0]
    for i in range(len(row)-1):
        activation += weights[i + 1] * row[i]
    return 1.0 if activation >= 0.0 else 0.0

# Estimate Perceptron weights using stochastic gradient descent
def train_weights(train, l_rate, n_epoch):
    weights = [0.0 for i in range(len(train[0]))]
    for epoch in range(n_epoch):
        for row in train:
            prediction = predict(row, weights)
            error = row[-1] - prediction
            weights[0] = weights[0] + l_rate * error
            for i in range(len(row)-1):
                weights[i + 1] = weights[i + 1] + l_rate * error * row[i]
    return weights

# Perceptron Algorithm With Stochastic Gradient Descent
def perceptron(train, test, l_rate, n_epoch):
    predictions = list()
    weights = train_weights(train, l_rate, n_epoch)
    for row in test:
        prediction = predict(row, weights)
        predictions.append(prediction)
    return(predictions)

# Test the Perceptron algorithm on the sonar dataset
seed(1)

# load and prepare data
class_to_mat = {c: [] for c in classes}
for c in classes:
    for d in classes:
        texts = class_to_text[d]
        num_texts = len(texts)
        texts = iter(texts)
        if c != d:
            num_texts_to_train = int((1 - train_percent) * num_texts)
        else:
            num_texts_to_train = num_texts
        for i in range(num_texts_to_train):
            text = next(texts)
            feat_vec = [0 for i in range(len(class_to_feat_to_index[d]) + 1)]
            for word in text_to_freq[text]:
                if word in class_to_feat_to_index[d]:
                    index = class_to_feat_to_index[d][word]
                    feat_vec[index] = text_to_freq[text][word]
            feat_vec[-1] = 1 if c == d else 0
            class_to_mat[c].append(feat_vec)

dataset = class_to_mat['c1']

# evaluate algorithm
n_folds = 3
l_rate = 0.05
n_epoch = 500

# scores = evaluate_algorithm(dataset, perceptron, n_folds, l_rate, n_epoch)

# print('Scores: %s' % scores)
# print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

In [19]:
for feat_vec in dataset:
    print(feat_vec)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0.15384615384615385, 0, 0.07692307692307693, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0.15384615384615385, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.21875, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0.02112676056338028, 0, 0, 0.014084507042253521, 0, 0, 0, 0, 0, 0, 0.014084507042253521, 0, 0, 0, 0.014084507042253521, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0.06521739130434782, 0, 0, 0, 0, 0, 0, 0.06521739130434782, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.08, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0,