# CS4248 Assignment 3

### Perceptron text classifier in Python with feature selection

##### https://github.com/jia1/pyceptron

In [None]:
import os
import re
import sys
from string import punctuation
from porter import PorterStemmer

# Shared stemmer instance from the `porter` module; the corpus-loading cell
# calls `p.stem(...)` on every token that survives filtering.
p = PorterStemmer()

In [None]:
import numpy as np

In [None]:
# Absolute paths (resolved against the current working directory) of the raw
# corpus directory `tc` and of the `tc_proc` directory.
src_dir, dst_dir = map(os.path.abspath, ('tc', 'tc_proc'))

In [None]:
# Minimum raw count a stem needs inside one document to be kept in that
# document's frequency table.
k = 2

# Class labels; filled in when the corpus directory is walked.
classes = []

# Names of the four cells of the (word, class) contingency table.
nxxs = ['n00', 'n01', 'n10', 'n11']

# class name -> collection of the texts that belong to it.
class_to_text = {}

# text name -> {stem: normalized frequency}.
text_to_freq = {}

# class name -> selected features for that class.
class_to_feat = {}

# Cache of contingency counts, indexed as nxx_dict[cell][word][class].
nxx_dict = {cell: {} for cell in nxxs}

# chi_dict[class][word] -> chi-square score.
chi_dict = {}

In [None]:
def strip_and_filter_line(ln):
    """Tokenize one line of text and keep only the useful tokens.

    Steps:
      A. Split the line on any run of whitespace (spaces, tabs, newlines), so
         that e.g. tab-separated words are not fused into one token and then
         thrown away by the alphabetic check below.
      B. Strip leading/trailing punctuation from each token and lowercase it.
      C. Keep a token only if it is longer than two characters, purely
         alphabetic, and not in the module-level `stop_list`.

    Args:
        ln: a single line of raw text.

    Returns:
        A list of cleaned tokens, in their original order.
    """
    cleaned = (t.strip(punctuation).lower() for t in ln.split())
    return [
        t for t in cleaned
        if len(t) > 2 and t.isalpha() and t not in stop_list
    ]

In [None]:
def isIn(a, b):
    """Return 1 when `a` is a member of `b`, otherwise 0."""
    return int(a in b)


def isNotIn(a, b):
    """Return 1 when `a` is not a member of `b`, otherwise 0."""
    return int(a not in b)


def count_nxx(nxx, w, c):
    """Count training texts for one cell of the (word, class) contingency table.

    The cell name encodes two flags: the first digit says whether the text must
    contain `w`, the second whether the text must belong to class `c`.
    Reads the module-level `classes`, `class_to_text` and `text_to_freq`.

    Args:
        nxx: one of 'n00', 'n01', 'n10', 'n11'; any other value yields 0.
        w: the word to look for.
        c: the class of interest.

    Returns:
        The number of matching texts.
    """
    if nxx not in ('n00', 'n01', 'n10', 'n11'):
        return 0
    must_contain = nxx[1] == '1'
    in_class = nxx[2] == '1'
    if in_class:
        groups = [class_to_text[c]]
    else:
        groups = [class_to_text[name] for name in classes if name != c]
    return sum(
        1
        for group in groups
        for text in group
        if (w in text_to_freq[text]) == must_contain
    )

In [None]:
# n00 is the number of training texts that do not contain w and are not in class c.
# n01 is the number of training texts that do not contain w and are in class c.
# n10 is the number of training texts that contain w and are not in class c.
# n11 is the number of training texts that contain w and are in class c.

def chi_square(w, c):
    """Return the chi-square statistic between word `w` and class `c`.

    The four contingency counts are looked up in the module-level cache
    `nxx_dict[cell][w][c]` and computed with `count_nxx` on a cache miss.

    Args:
        w: the word.
        c: the class.

    Returns:
        The chi-square score as a float. When any marginal total of the
        contingency table is zero (for example `w` occurs in every text, or
        there is only one class), the statistic is 0/0; in that case the word
        carries no information about the class and 0.0 is returned instead of
        raising ZeroDivisionError.
    """
    counts = {}
    for n in nxxs:
        per_class = nxx_dict[n].setdefault(w, {})
        if c not in per_class:
            per_class[c] = count_nxx(n, w, c)
        counts[n] = per_class[c]
    n00, n01, n10, n11 = counts['n00'], counts['n01'], counts['n10'], counts['n11']
    denominator = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
    if denominator == 0:
        # A zero marginal forces the numerator to zero as well.
        return 0.0
    total = n11 + n10 + n01 + n00
    return (total * (n11 * n00 - n10 * n01) ** 2) / denominator

In [None]:
def put_chi_dict(c, w, chi_square_value):
    """Record a chi-square score for word `w` under class `c`.

    Writes into the module-level `chi_dict`. If a score is already stored for
    the pair, the larger of the stored and the new value is kept.
    """
    scores = chi_dict[c]
    try:
        best = max(scores[w], chi_square_value)
    except KeyError:
        # First score seen for this word in this class.
        best = chi_square_value
    scores[w] = best

In [None]:
def gen_feats():
    """Rebuild the module-level `class_to_feat` from `chi_dict`.

    For each class the (word, score) pairs are sorted by descending score.
    Every class then keeps the same number of top pairs: the size of the
    smallest per-class ranking, integer-divided by 100.
    """
    global class_to_feat
    ranked = {c: [] for c in classes}
    for c, scores in chi_dict.items():
        ranked[c] = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    shortest = min((len(ranked[c]) for c in chi_dict), default=sys.maxsize)
    keep = shortest // 100
    class_to_feat = {c: pairs[:keep] for c, pairs in ranked.items()}

In [None]:
def feature_select():
    """Score every (class, word) pair and then select the features per class.

    Walks every word of every text of every class, stores its chi-square score
    through `put_chi_dict`, and finally calls `gen_feats()` once to rebuild
    the per-class feature lists. Operates on the module-level `classes`,
    `class_to_text` and `text_to_freq`.
    """
    for c in classes:
        for text in class_to_text[c]:
            for w in text_to_freq[text]:
                put_chi_dict(c, w, chi_square(w, c))
    # Rank once, after all scores are known. Re-ranking inside the loops gave
    # the same final result but re-sorted every class for every single word.
    gen_feats()

In [None]:
def pla(xn, yn, maxIter=1000, w=None):
    """Train an averaged perceptron with randomly drawn samples.

    Each iteration draws one sample uniformly at random (via `np.random`),
    and if the current weights misclassify it, applies the perceptron update
    `w += y * [1, x]`. The returned vector is the mean of the initial weights
    and the weights after each of the `maxIter` iterations.

    Args:
        xn: 2-D array-like of shape (n_samples, n_features).
        yn: 1-D array-like of labels; assumed to be -1 / +1 -- TODO confirm
            against the caller.
        maxIter: number of random draws to perform.
        w: optional initial weights of length n_features + 1 (bias first).
            Defaults to all zeros. The caller's array is never modified.

    Returns:
        A NumPy array of length n_features + 1 holding the averaged weights.
    """
    xn = np.asarray(xn, dtype=float)
    yn = np.asarray(yn)
    if w is None:
        weights = np.zeros(xn.shape[1] + 1)
    else:
        # Work on a private float copy so neither the caller's array nor a
        # shared default is mutated between calls.
        weights = np.array(w, dtype=float)
    n_samples = xn.shape[0]
    if n_samples == 0:
        return weights
    # Separate accumulator (not an alias of `weights`).
    weight_sum = weights.copy()
    for _ in range(maxIter):
        i = np.random.randint(n_samples)
        activation = weights[0] + np.dot(weights[1:], xn[i])
        # A non-positive margin means the sample is misclassified (or lies
        # exactly on the boundary, as with all-zero starting weights).
        if yn[i] * activation <= 0:
            weights[0] += yn[i]
            weights[1:] += yn[i] * xn[i]
        weight_sum += weights
    # maxIter + 1 vectors were summed: the initial one plus one per iteration.
    return weight_sum / (maxIter + 1)

In [None]:
%%timeit

# Load all stop words into a list

with open('stopword-list', 'r') as s:
    stop_list = list(map(lambda ln: ln.strip(), s.readlines()))

In [None]:
%%timeit

for curr_dir, sub_dir, files in os.walk(src_dir):
    if not files:
        classes = sub_dir
        class_to_text = { c: set() for c in classes }
        class_to_feat = { c: set() for c in classes }
        chi_dict = { c: {} for c in classes }
        continue
    curr_class = re.split('[(\\\\)(\\)(\/)]', curr_dir)[-1] # curr_dir.split('\\')[-1]
    for file in files:
        flat_text = []
        freq_dict = {}
        with open(os.path.join(curr_dir, file), 'r') as f:
            processed_lines = map(lambda ln: strip_and_filter_line(ln), f.readlines())
            for line in processed_lines:
                flat_text.extend(list(map(lambda word: p.stem(word, 0, len(word) - 1), line)))
            for word in flat_text:
                if word not in freq_dict:
                    freq_dict[word] = 1
                else:
                    freq_dict[word] += 1
            fin_freq_dict = { word: freq for word, freq in freq_dict.items() if freq >= k }
            if not fin_freq_dict:
                fin_freq_dict = freq_dict
            sum_freq = sum(fin_freq_dict.values())
            normalized_freq_dict = { word: freq / sum_freq for word, freq in fin_freq_dict.items() }
            class_to_text[curr_class].add(file)
            text_to_freq[file] = normalized_freq_dict

In [None]:
%%timeit

feature_select()

In [None]:
# print(classes)
# print(nxxs)
# print(class_to_text)
# print(text_to_freq)
# print(nxx_dict)
# print(class_to_feat)
# print(len(class_to_feat['c1']))

In [None]:
# List the IPython magic commands available in this session.
%lsmagic