# CS4248 Assignment 3

### Perceptron text classifier in Python with feature selection

##### https://github.com/jia1/pyceptron

In [None]:
import os
import re
import sys
from string import punctuation
from porter import PorterStemmer

# Shared Porter stemmer instance; the corpus-loading cell calls p.stem() on every kept token.
p = PorterStemmer()

In [None]:
import numpy as np

In [None]:
# Absolute paths, resolved against the current working directory.
# src_dir is the corpus root that the loading cell walks; dst_dir is not used
# in the visible cells (presumably the destination for processed output).
src_dir, dst_dir = (os.path.abspath(name) for name in ('tc', 'tc_proc'))

In [None]:
'''
class_to_text
<class 'dict'>
KEY: <class 'str'>
VAL: <class 'set'>
E.g: {'c1': {'0000', '0001', '0002', ... }, 'c2': {'1000', '1001', '1002', ... } }

text_to_freq
<class 'dict'>
KEY: <class 'str'>
VAL: <class 'dict'>
     KEY: <class 'str'>
     VAL: <class 'int'>
E.g: {'0000': {'word0': 1, 'word1': 3, 'word2': 5, ... }, '0001': { ... }}

class_to_feat
<class 'dict'>
KEY: <class 'str'>
VAL: <class 'list'>
     VAL: <class 'tuple'>
          <class 'str'>
          <class 'float'>
E.g: {'c1': [ ('word1', 99.00), ('word2', 90.00), ... ], 'c2': [ ... ] }

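nxx_dict
<class 'dict'>
KEY: <class 'str'> (one of 'n00', 'n01', 'n10', 'n11')
VAL: <class 'dict'>
     KEY: <class 'str'> (word)
     VAL: <class 'dict'>
          KEY: <class 'str'> (class)
          VAL: <class 'int'>
E.g: {'n00': {'word0': {'c1': 12, 'c2': 7, ... }, ... }, 'n01': { ... }, ... }

chi_dict
<class 'dict'>
KEY: <class 'str'> (class)
VAL: <class 'dict'>
     KEY: <class 'str'> (word)
     VAL: <class 'float'>
E.g: {'c1': {'word0': 99.00, 'word1': 90.00, ... }, 'c2': { ... } }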
'''

# Minimum number of occurrences a stem needs inside one text to be kept.
k = 3
# Fraction of the files in each class directory that is read for training.
train_percent = 0.8

# Names of the four contingency-table cells; they are the top-level keys of nxx_dict.
nxxs = ['n00', 'n01', 'n10', 'n11']

# Containers that the later cells fill in.
classes = []
class_to_text = {}
text_to_freq = {}
class_to_feat = {}
chi_dict = {}
# One independent (initially empty) cache per contingency cell.
nxx_dict = dict((cell, {}) for cell in nxxs)

In [None]:
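# A. Split each line on single spaces into tokens
# B. Strip surrounding whitespace, then leading/trailing punctuation, from each token and lowercase it
# C. Drop tokens that are shorter than three characters or not purely alphabetic
# D. Return a list of the remaining tokens which are not stop words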

def strip_and_filter_line(ln):
    """Tokenise one line and return the tokens worth keeping, in order.

    The line is split on single spaces. Each piece has surrounding whitespace
    and then leading/trailing punctuation removed, and is lowercased. A token
    is kept only if it is longer than two characters, consists solely of
    alphabetic characters, and is not in the module-level ``stop_list``.
    """
    kept = []
    for piece in ln.split(' '):
        token = piece.strip().strip(punctuation).lower()
        # The stop-word lookup comes last so it only runs for plausible tokens.
        if len(token) > 2 and token.isalpha() and token not in stop_list:
            kept.append(token)
    return kept

In [None]:
def is_in(a, b):
    """Return 1 when ``a`` is a member of ``b``, otherwise 0."""
    return int(a in b)

def is_not_in(a, b):
    """Return 1 when ``a`` is not a member of ``b``, otherwise 0."""
    return int(a not in b)

def count_nxx(nxx, w, c):
    """Count the training texts that fall into one contingency-table cell.

    ``nxx`` names the cell as 'n' + word flag + class flag: the first digit is
    1 when the text must contain ``w`` and the second digit is 1 when the text
    must belong to class ``c``. Any other label yields 0.

    Reads the module-level ``classes``, ``class_to_text`` and ``text_to_freq``.
    """
    if nxx not in ('n00', 'n01', 'n10', 'n11'):
        return 0
    wants_word = nxx[1] == '1'
    if nxx[2] == '1':
        candidates = class_to_text[c]
    else:
        # Every text that belongs to some class other than c.
        candidates = [
            text
            for other in classes if other != c
            for text in class_to_text[other]
        ]
    return sum(1 for text in candidates if (w in text_to_freq[text]) == wants_word)

In [None]:
# n00 is the number of training texts that do not contain w and are not in class c.
# n01 is the number of training texts that do not contain w and are in class c.
# n10 is the number of training texts that contain w and are not in class c.
# n11 is the number of training texts that contain w and are in class c.

def chi_square(w, c):
    """Return the chi-square score of word ``w`` with respect to class ``c``.

    The four contingency counts are taken from the module-level ``nxx_dict``
    cache, and computed with ``count_nxx`` and stored there on first use.

    When any marginal total is zero (for example ``w`` occurs in every text or
    in none, or one side of the class split is empty) the statistic is 0/0.
    The word then carries no information about the class, so 0.0 is returned
    instead of raising ZeroDivisionError.
    """
    global nxxs, nxx_dict
    counts = {}
    for n in nxxs:
        per_word = nxx_dict[n].setdefault(w, {})
        if c not in per_word:
            per_word[c] = count_nxx(n, w, c)
        counts[n] = per_word[c]
    n00, n01, n10, n11 = counts['n00'], counts['n01'], counts['n10'], counts['n11']
    denominator = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
    if denominator == 0:
        # A zero marginal also forces n11*n00 - n10*n01 to zero.
        return 0.0
    return ((n11 + n10 + n01 + n00) * (n11 * n00 - n10 * n01) ** 2) / denominator

In [None]:
def put_chi_dict(c, w, chi_square_value):
    """Record ``chi_square_value`` for word ``w`` under class ``c``.

    The module-level ``chi_dict[c]`` keeps the largest value seen so far for
    each word. A KeyError is raised if ``c`` has no entry in ``chi_dict``.
    """
    scores = chi_dict[c]
    scores[w] = max(scores[w], chi_square_value) if w in scores else chi_square_value

In [None]:
def gen_feat():
    """Rebuild the module-level ``class_to_feat`` from ``chi_dict``.

    Each class's (word, score) pairs are sorted by descending score. All
    classes are then cut to a common length: the smallest per-class count,
    divided by 100 when it exceeds 1000 or by 10 when it exceeds 100.
    """
    global class_to_feat
    ranked = {label: [] for label in classes}
    for label, scores in chi_dict.items():
        ranked[label] = sorted(scores.items(), key = lambda pair: pair[1], reverse = True)
    # sys.maxsize is the starting value, so an empty chi_dict leaves it in place.
    limit = min([sys.maxsize] + [len(ranked[label]) for label in chi_dict])
    if limit > 1000:
        limit //= 100
    elif limit > 100:
        limit //= 10
    class_to_feat = {label: pairs[:limit] for label, pairs in ranked.items()}

In [None]:
def feat_select():
    """Score every (class, word) pair and then build the per-class feature lists.

    For each class, each of its training texts and each word in that text, the
    chi-square score is recorded through ``put_chi_dict``. ``gen_feat`` then
    rebuilds ``class_to_feat`` from the finished score tables.

    ``gen_feat`` recomputes everything from ``chi_dict`` on each call, so only
    its last invocation matters. It is therefore called once after the loops
    rather than once per word occurrence, which re-sorted every class table each
    time. As a consequence it now also runs when there are no words at all,
    leaving ``class_to_feat`` holding empty lists.
    """
    for c in classes:
        for text in class_to_text[c]:
            for w in text_to_freq[text]:
                put_chi_dict(c, w, chi_square(w, c))
    gen_feat()

In [None]:
# Refactor this
def pla(xn, yn, maxIter = 1000, w = None):
    """Run a randomised perceptron loop and return the scaled sum of the weights.

    Parameters:
        xn: 2-D array of samples; columns 0 and 1 are used as the two features.
        yn: labels, indexed like the rows of ``xn``.
        maxIter: number of random draws.
        w: initial weights (bias, w1, w2). A caller-supplied array is still
            updated in place, as before. When omitted, a fresh zero vector is
            created for this call.

    Returns:
        numpy array: the accumulated weight vectors divided by ``maxIter + 1``.

    Fixes relative to the previous version:
      * the default ``np.zeros(3)`` was created once and mutated, so weights
        leaked from one default call into the next;
      * ``summ_w`` was an alias of ``w``, so each accumulation doubled ``w``;
      * ``weight/maxIter+1`` divided by ``maxIter`` and then added 1 to every
        weight instead of dividing by ``maxIter + 1``;
      * a lazy ``map`` object was returned instead of an array.

    NOTE(review): ``nr`` and ``g`` are not defined in the visible part of the
    file; ``nr`` is presumably ``numpy.random`` and ``g`` the current
    hypothesis. Weights are accumulated only after a misclassification, as in
    the original. Confirm whether every iteration should contribute.
    """
    if w is None:
        w = np.zeros(3)
    N = xn.shape[0]
    # Independent float copy, so accumulating never feeds back into w.
    summ_w = np.array(w, dtype=float)
    for _ in range(maxIter):
        i = nr.randint(N)
        if yn[i] != g(xn[i, :]):
            w[0] += yn[i]
            w[1] += yn[i] * xn[i][0]
            w[2] += yn[i] * xn[i][1]
            summ_w += w
    return summ_w / (maxIter + 1)

In [None]:
# Load all stop words into a list

with open('stopword-list', 'r') as s:
    # One entry per line of the file, with surrounding whitespace (including the newline) removed.
    stop_list = [entry.strip() for entry in s]

In [None]:
# Walk the corpus. The root directory lists the class sub-directories; each class
# directory contributes the first `train_percent` of its files as training texts.
for curr_dir, sub_dir, files in os.walk(src_dir):
    if curr_dir == src_dir:
        # The root is recognised by its path rather than by "contains no files".
        # The old test reset all bookkeeping on an empty class directory and
        # raised a KeyError when the root held a stray file.
        classes = sub_dir
        class_to_text = { c: set() for c in classes }
        # Lists, matching the documented structure and what gen_feat() produces.
        class_to_feat = { c: [] for c in classes }
        chi_dict = { c: {} for c in classes }
        continue
    # os.walk builds paths with the native separator, so basename is the class name.
    # The previous regex split used an invalid '\/' escape and also split on parentheses.
    curr_class = os.path.basename(curr_dir)
    for i in range(int(len(files) * train_percent)):
        file = files[i]
        flat_text = []
        freq_dict = {}
        with open(os.path.join(curr_dir, file), 'r') as f:
            for ln in f:
                flat_text.extend(
                    p.stem(word, 0, len(word) - 1) for word in strip_and_filter_line(ln)
                )
        for word in flat_text:
            freq_dict[word] = freq_dict.get(word, 0) + 1
        # Keep stems seen at least k times; fall back to all stems if none qualify.
        fin_freq_dict = { word: freq for word, freq in freq_dict.items() if freq >= k }
        if not fin_freq_dict:
            fin_freq_dict = freq_dict
        sum_freq = sum(fin_freq_dict.values())
        normalized_freq_dict = { word: freq / sum_freq for word, freq in fin_freq_dict.items() }
        # NOTE: texts are keyed by bare file name, so names must be unique across classes.
        class_to_text[curr_class].add(file)
        text_to_freq[file] = normalized_freq_dict

In [None]:
from time import time

# Run one complete feature-selection pass and print how long it took, in seconds.
start = time()
feat_select()
stop = time()
elapsed = stop - start
print(elapsed)

In [None]:
# print(text_to_freq)
# print(nxx_dict)
# print(class_to_feat)
# print(len(class_to_feat['c1']))

In [None]:
# k = 2
# num_feat_per_class = 1600+ // 100 = 16   (gen_feat divides lengths above 1000 by 100)
#
# Hard-coded snapshot, presumably pasted from a feat_select() run: for each class,
# (stem, chi-square score) pairs in descending score order.
# NOTE: when the cells run top to bottom, the k = 3 assignment in the next cell replaces this value.

class_to_feat = {
    'c1': [
        ('graphic', 166.49374732865473),
        ('imag', 101.01010101010101),
        ('polygon', 87.09654348411416),
        ('file', 74.43622269203665),
        ('algorithm', 73.76560004212669),
        ('cview', 68.46556584776481),
        ('point', 66.21075119517303),
        ('surfac', 65.74271499644634),
        ('gif', 65.04405750294009),
        ('tiff', 62.79937774196436),
        ('anim', 55.81061733405483),
        ('packag', 50.13840590326237),
        ('pov', 48.231511254019296),
        ('aspect', 44.194455604660504),
        ('sphere', 44.194455604660504),
        ('curv', 44.194455604660504)
    ],
    'c2': [
        ('id', 188.3321723952954),
        ('bu', 171.25324776186847),
        ('drive', 156.31142209801806),
        ('isa', 123.8238690103303),
        ('vlb', 109.17913465426608),
        ('control', 105.08732215080168),
        ('scsi', 101.22399468783168),
        ('card', 100.476702538723),
        ('disk', 87.4716887572537),
        ('gatewai', 83.03566726931246),
        ('irq', 64.4122383252818),
        ('motherboard', 60.758636764895506),
        ('bio', 57.75472538662254),
        ('esdi', 56.31536604987932),
        ('floppi', 54.77638143603884),
        ('adaptec', 44.194455604660504)
    ],
    'c3': [
        ('appl', 321.3403768946428),
        ('mac', 235.2407894400082),
        ('centri', 124.49186991869918),
        ('duo', 96.93053311793214),
        ('quadra', 80.98006644518273),
        ('simm', 72.14822404371584),
        ('lciii', 68.46556584776481),
        ('adb', 68.46556584776481),
        ('powerbook', 64.4122383252818),
        ('nubu', 54.7409671900161),
        ('vram', 46.70799333409953),
        ('iisi', 44.194455604660504),
        ('powerpc', 44.194455604660504),
        ('pd', 40.16064257028113),
        ('iii', 34.100398997865824),
        ('jon', 33.95372233400403)
    ],
    'c4': [
        ('window', 261.0570533731099),
        ('widget', 199.47265168342295),
        ('motif', 188.3321723952954),
        ('server', 183.49783907675143),
        ('xterm', 146.1038961038961),
        ('client', 104.31758910460736),
        ('manag', 100.86384422034463),
        ('sun', 91.5923224456585),
        ('pixmap', 88.7812752219532),
        ('applic', 83.62768061488656),
        ('xlib', 78.97910705114094),
        ('event', 72.48655853589133),
        ('implement', 72.15275872431049),
        ('compil', 64.9846387326029),
        ('user', 64.66729746317512),
        ('displai', 62.426542841178396)
    ],
    'c5': [
        ('circuit', 130.60868420186324),
        ('voltag', 109.17913465426608),
        ('amp', 95.2307847770661),
        ('radio', 83.03566726931246),
        ('ground', 76.42741562644476),
        ('power', 71.63220052705117),
        ('cool', 68.46556584776481),
        ('detector', 68.46556584776481),
        ('car', 62.79937774196436),
        ('radar', 62.79937774196436),
        ('nuclear', 60.36217303822938),
        ('wire', 55.13215018367444),
        ('electron', 54.77638143603884),
        ('outlet', 54.7409671900161),
        ('audio', 49.88131074867832),
        ('tower', 49.335475932879355)
    ]
}

In [None]:
# k = 3
# num_feat_per_class = 1400+ // 100 = 14   (gen_feat divides lengths above 1000 by 100)
#
# Hard-coded snapshot, presumably pasted from a feat_select() run: for each class,
# (stem, chi-square score) pairs in descending score order.
# This assignment replaces whatever class_to_feat held before the cell ran.

class_to_feat = {
    'c1': [
        ('graphic', 169.17137744140854),
        ('imag', 98.3116817832547),
        ('algorithm', 88.7812752219532),
        ('polygon', 83.03566726931246),
        ('tiff', 70.87953629032258),
        ('point', 70.21568856675239),
        ('file', 69.66309823677582),
        ('cview', 56.31536604987932),
        ('surfac', 54.7409671900161),
        ('gif', 53.01195979162081),
        ('packag', 52.06390785056943),
        ('librari', 39.77719093998164),
        ('viewer', 38.706625133976424),
        ('format', 37.34571649131328)
    ],
    'c2': [
        ('id', 185.69556314573137),
        ('drive', 160.46240000462572),
        ('scsi', 103.22117547434439),
        ('control', 83.98206256019533),
        ('card', 82.96180195710245),
        ('isa', 73.76560004212669),
        ('bu', 65.37960011590843),
        ('disk', 63.53762350800936),
        ('vlb', 60.36217303822938),
        ('gatewai', 53.01195979162081),
        ('floppi', 49.494949494949495),
        ('motherboard', 45.85597826086956),
        ('irq', 44.194455604660504),
        ('hard', 40.65040650406504)
    ],
    'c3': [
        ('appl', 213.91769935344828),
        ('mac', 179.77787774609058),
        ('duo', 80.64516129032258),
        ('centri', 69.00620961227787),
        ('powerbook', 56.31536604987932),
        ('quadra', 53.23840725806452),
        ('nubu', 46.70799333409953),
        ('lciii', 40.16064257028113),
        ('adb', 40.16064257028113),
        ('pd', 40.16064257028113),
        ('simm', 38.68938394548647),
        ('price', 36.54766215667556),
        ('acceler', 30.90305757005999),
        ('modem', 30.607070414372647)
    ],
    'c4': [
        ('window', 172.49840831918505),
        ('widget', 160.8511081405711),
        ('motif', 142.91158536585365),
        ('xterm', 117.36139214892756),
        ('server', 114.93389423076923),
        ('applic', 65.96930396095334),
        ('event', 64.48345838762337),
        ('client', 60.640274236676824),
        ('pixmap', 60.36217303822938),
        ('manag', 55.81061733405483),
        ('expos', 54.7409671900161),
        ('compil', 50.96258116777068),
        ('patch', 49.809291416929234),
        ('xlib', 49.809291416929234)
    ], 'c5': [
        ('radar', 76.5820233776703),
        ('circuit', 73.76560004212669),
        ('cool', 66.83690786821953),
        ('ground', 65.04405750294009),
        ('wire', 56.81818181818182),
        ('detector', 56.31536604987932),
        ('tower', 54.7409671900161),
        ('amp', 54.7409671900161),
        ('nuclear', 52.27181342983514),
        ('car', 52.27181342983514),
        ('outlet', 52.27181342983514),
        ('power', 50.137415866316125),
        ('voltag', 46.70799333409953),
        ('electron', 39.13612345180398)
    ]
}

In [None]:
# Build each class's feature vocabulary and a stable word -> column index mapping.
#
# A class's vocabulary is all of its own selected words plus, from every other
# class, that class's top len(features) // (num_class - 1) words.
#
# The loop variable is deliberately not called `p`: that name is the module-level
# PorterStemmer, and rebinding it here broke any later re-run of the loading cell.
class_to_feat_set = {c: set() for c in classes}
num_class = len(classes)
for c in class_to_feat:
    for feat in class_to_feat[c]:
        class_to_feat_set[c].add(feat[0])
    for nc in class_to_feat:
        if nc != c:
            top_n = len(class_to_feat[nc]) // (num_class - 1)
            for rank in range(top_n):
                # Words shared between classes collapse in the set, so the
                # vocabulary can end up smaller than the nominal size.
                class_to_feat_set[c].add(class_to_feat[nc][rank][0])

# Alphabetical order makes the index assignment deterministic.
class_to_feat_list = {c: sorted(class_to_feat_set[c]) for c in classes}
class_to_feat_to_index = {
    c: {word: index for index, word in enumerate(words)}
    for c, words in class_to_feat_list.items()
}

In [None]:
# k = 2
#
# Hard-coded snapshot of the per-class word -> feature index mapping (32 words per
# class, indices in alphabetical order of the stems). Presumably pasted from the cell
# that builds class_to_feat_to_index.
# NOTE: when the cells run top to bottom, the k = 3 assignment in the next cell replaces this value.

class_to_feat_to_index = {
    'c1': {
        'algorithm': 0, 'amp': 1, 'anim': 2, 'appl': 3, 'aspect': 4, 'bu': 5, 'centri': 6, 'circuit': 7,
        'curv': 8, 'cview': 9, 'drive': 10, 'duo': 11, 'file': 12, 'gif': 13, 'graphic': 14, 'id': 15,
        'imag': 16, 'isa': 17, 'mac': 18, 'motif': 19, 'packag': 20, 'point': 21, 'polygon': 22, 'pov': 23,
        'radio': 24, 'server': 25, 'sphere': 26, 'surfac': 27, 'tiff': 28, 'voltag': 29, 'widget': 30, 'window': 31
    },
    'c2': {
        'adaptec': 0, 'amp': 1, 'appl': 2, 'bio': 3, 'bu': 4, 'card': 5, 'centri': 6, 'circuit': 7,
        'control': 8, 'disk': 9, 'drive': 10, 'duo': 11, 'esdi': 12, 'file': 13, 'floppi': 14, 'gatewai': 15,
        'graphic': 16, 'id': 17, 'imag': 18, 'irq': 19, 'isa': 20, 'mac': 21, 'motherboard': 22, 'motif': 23,
        'polygon': 24, 'radio': 25, 'scsi': 26, 'server': 27, 'vlb': 28, 'voltag': 29, 'widget': 30, 'window': 31
    },
    'c3': {
        'adb': 0, 'amp': 1, 'appl': 2, 'bu': 3, 'centri': 4, 'circuit': 5, 'drive': 6, 'duo': 7,
        'file': 8, 'graphic': 9, 'id': 10, 'iii': 11, 'iisi': 12, 'imag': 13, 'isa': 14, 'jon': 15,
        'lciii': 16, 'mac': 17, 'motif': 18, 'nubu': 19, 'pd': 20, 'polygon': 21, 'powerbook': 22, 'powerpc': 23,
        'quadra': 24, 'radio': 25, 'server': 26, 'simm': 27, 'voltag': 28, 'vram': 29, 'widget': 30, 'window': 31
    },
    'c4': {
        'amp': 0, 'appl': 1, 'applic': 2, 'bu': 3, 'centri': 4, 'circuit': 5, 'client': 6, 'compil': 7,
        'displai': 8, 'drive': 9, 'duo': 10, 'event': 11, 'file': 12, 'graphic': 13, 'id': 14, 'imag': 15,
        'implement': 16, 'isa': 17, 'mac': 18, 'manag': 19, 'motif': 20, 'pixmap': 21, 'polygon': 22, 'radio': 23,
        'server': 24, 'sun': 25, 'user': 26, 'voltag': 27, 'widget': 28, 'window': 29, 'xlib': 30, 'xterm': 31
    },
    'c5': {
        'amp': 0, 'appl': 1, 'audio': 2, 'bu': 3, 'car': 4, 'centri': 5, 'circuit': 6, 'cool': 7,
        'detector': 8, 'drive': 9, 'duo': 10, 'electron': 11, 'file': 12, 'graphic': 13, 'ground': 14, 'id': 15,
        'imag': 16, 'isa': 17, 'mac': 18, 'motif': 19, 'nuclear': 20, 'outlet': 21, 'polygon': 22, 'power': 23,
        'radar': 24, 'radio': 25, 'server': 26, 'tower': 27, 'voltag': 28, 'widget': 29, 'window': 30, 'wire': 31
    }
}

In [None]:
# k = 3
#
# Hard-coded snapshot of the per-class word -> feature index mapping (26 words per
# class, indices in alphabetical order of the stems). Presumably pasted from the cell
# that builds class_to_feat_to_index.
# This assignment replaces whatever class_to_feat_to_index held before the cell ran.

class_to_feat_to_index = {
    'c1': {
        'algorithm': 0, 'appl': 1, 'circuit': 2, 'cool': 3, 'cview': 4, 'drive': 5, 'duo': 6,
        'file': 7, 'format': 8, 'gif': 9, 'graphic': 10, 'id': 11, 'imag': 12, 'librari': 13,
        'mac': 14, 'motif': 15, 'packag': 16, 'point': 17, 'polygon': 18, 'radar': 19, 'scsi': 20,
        'surfac': 21, 'tiff': 22, 'viewer': 23, 'widget': 24, 'window': 25
    },
    'c2': {
        'algorithm': 0, 'appl': 1, 'bu': 2, 'card': 3, 'circuit': 4, 'control': 5, 'cool': 6,
        'disk': 7, 'drive': 8, 'duo': 9, 'floppi': 10, 'gatewai': 11, 'graphic': 12, 'hard': 13,
        'id': 14, 'imag': 15, 'irq': 16, 'isa': 17, 'mac': 18, 'motherboard': 19, 'motif': 20,
        'radar': 21, 'scsi': 22, 'vlb': 23, 'widget': 24, 'window': 25
    },
    'c3': {
        'acceler': 0, 'adb': 1, 'algorithm': 2, 'appl': 3, 'centri': 4, 'circuit': 5, 'cool': 6,
        'drive': 7, 'duo': 8, 'graphic': 9, 'id': 10, 'imag': 11, 'lciii': 12, 'mac': 13,
        'modem': 14, 'motif': 15, 'nubu': 16, 'pd': 17, 'powerbook': 18, 'price': 19, 'quadra': 20,
        'radar': 21, 'scsi': 22, 'simm': 23, 'widget': 24, 'window': 25
    },
    'c4': {
        'algorithm': 0, 'appl': 1, 'applic': 2, 'circuit': 3, 'client': 4, 'compil': 5, 'cool': 6,
        'drive': 7, 'duo': 8, 'event': 9, 'expos': 10, 'graphic': 11, 'id': 12, 'imag': 13,
        'mac': 14, 'manag': 15, 'motif': 16, 'patch': 17, 'pixmap': 18, 'radar': 19, 'scsi': 20,
        'server': 21, 'widget': 22, 'window': 23, 'xlib': 24, 'xterm': 25
    },
    'c5': {
        'algorithm': 0, 'amp': 1, 'appl': 2, 'car': 3, 'circuit': 4, 'cool': 5, 'detector': 6,
        'drive': 7, 'duo': 8, 'electron': 9, 'graphic': 10, 'ground': 11, 'id': 12, 'imag': 13,
        'mac': 14, 'motif': 15, 'nuclear': 16, 'outlet': 17, 'power': 18, 'radar': 19, 'scsi': 20,
        'tower': 21, 'voltag': 22, 'widget': 23, 'window': 24, 'wire': 25
    }
}

In [None]:
# NOTE(review): presumably the perceptron learning rate; it is not used in the visible cells. Confirm.
alpha = 0.05