In [7]:
import os
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import random
import math
import time
import operator

In [8]:
def cleaning(raw_data):
    """Reduce raw text to a list of lowercase, purely alphabetic, non-stop-word tokens.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lowercased and split on whitespace, and tokens found in NLTK's English
    stop-word list are dropped.

    raw_data: the text of one document (str).
    Returns: list of str, in document order (duplicates kept).
    """
    tokens = re.sub("[^a-zA-Z]", " ", raw_data).lower().split()
    # A set gives constant-time membership tests in the filter below.
    stop_words = set(stopwords.words("english"))
    return [tok for tok in tokens if tok not in stop_words]

In [9]:
def load_data(dirc):
    """Read and clean every file in a directory.

    dirc: path of a directory containing one text document per file; a
        trailing path separator is accepted but not required.
    Returns: list with one entry per file, each entry being the token list
        produced by ``cleaning`` for that file, ordered by file name.
    """
    string_list = []
    # os.listdir yields entries in arbitrary order; sorting makes the sample
    # order (and everything derived from it) repeatable between runs.
    for name in sorted(os.listdir(dirc)):
        file_dir = os.path.join(dirc, name)
        # The context manager closes each handle as soon as it has been read,
        # instead of leaving the open files to the garbage collector.
        with open(file_dir, encoding='latin-1') as handle:
            string_list.append(cleaning(handle.read()))
    return string_list

In [10]:
# Directories holding the positive and the negative review files.
pos_dire = '/Users/kunqi/Downloads/review-polarity/tokens/pos/'
neg_dire = '/Users/kunqi/Downloads/review-polarity/tokens/neg/'

# One cleaned token list per review file; all_data is the positive reviews
# followed by the negative ones.
pos_data, neg_data = load_data(pos_dire), load_data(neg_dire)
all_data = pos_data + neg_data

def word_appeared(word_list):
    """Collect every distinct word of a corpus, in order of first appearance.

    word_list: iterable of documents, each an iterable of (hashable) words.
    Returns: list of the distinct words; a word's position is determined by
        where it is first encountered while scanning the documents in order.
    """
    # Membership is tested against a set (constant time) rather than against
    # the growing result list, which would make the scan quadratic in the
    # vocabulary size.
    seen = set()
    word_appear = []
    for samp in word_list:
        for w in samp:
            if w not in seen:
                seen.add(w)
                word_appear.append(w)
    return word_appear

# Vocabulary: every distinct cleaned word across the positive and negative
# reviews, in first-seen order.
w_appeared = word_appeared(all_data)


In [13]:
# Peek at a slice of the vocabulary (entries 200 through 239).
print(w_appeared[200:240])

['undoing', 'wins', 'consequences', 'dire', 'intensely', 'personal', 'filmmaker', 'interiors', 'hearts', 'minds', 'particular', 'trademarks', 'stripped', 'minimalist', 'style', 'flat', 'expressionless', 'dialogue', 'use', 'natural', 'sounds', 'music', 'background', 'twice', 'narration', 'segment', 'credits', 'heavy', 'drumbeat', 'accompanying', 'bagpipes', 'rest', 'scored', 'punctuate', 'thematic', 'elements', 'incessant', 'clanking', 'creaking', 'armor']


In [14]:
# The feature vector has one slot per vocabulary word, so its length equals
# the number of distinct words that appeared.
def vectorize(to_vec,voc):
    """Count how often each vocabulary word occurs in one document.

    to_vec: iterable of words (one document).
    voc: sequence of vocabulary words; slot ``i`` of the result counts ``voc[i]``.
    Returns: list of int of length ``len(voc)``.
    Raises: ValueError if a word of ``to_vec`` is not in ``voc``.
    """
    # Build the word -> index lookup once per call instead of scanning `voc`
    # with list.index for every token. setdefault keeps the FIRST position of
    # a repeated vocabulary entry, which is the index list.index would return.
    position = {}
    for idx, word in enumerate(voc):
        position.setdefault(word, idx)

    vec = [0] * len(voc)
    for word in to_vec:
        if word not in position:
            # Same exception type list.index raises for a missing element.
            raise ValueError("%r is not in vocabulary" % (word,))
        vec[position[word]] += 1
    return vec

def normalize(vec):
    """Standardise a vector to zero mean and unit (population) standard deviation.

    vec: array-like of numbers.
    Returns: numpy array ``(vec - mean) / std``. If every element is equal
        (std == 0) the centred vector, i.e. all zeros, is returned instead of
        dividing by zero; an empty input yields an empty float array.
    """
    arr = np.asarray(vec)
    if arr.size == 0:
        # mean/std of nothing are undefined; hand back an empty float array.
        return arr.astype(float)
    centered = arr - np.mean(arr)
    spread = np.std(arr)
    if spread == 0:
        # Constant vector (e.g. a document with no words left after cleaning):
        # dividing would fill the features with NaN and poison training.
        return centered
    return centered / spread

def get_set(data,label,voc):
    """Turn tokenised documents into (normalised feature vector, label) pairs.

    data: list of documents, each a list of words.
    label: class label attached to every document of ``data``.
    voc: vocabulary passed through to ``vectorize``.
    Returns: list of ``(features, label)`` tuples, one per document, where
        ``features`` is the document's word-count vector after ``normalize``.
    """
    return [
        (normalize(np.array(vectorize(doc, voc))), label)
        for doc in data
    ]

# Fix the state of the global `random` and `np.random` generators so the
# shuffle below, and anything drawn from them afterwards, is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Number of shuffled samples kept for training in the hold-out split.
TRAIN_SIZE = 1000

# Positive reviews are labelled +1, negative reviews -1.
pos_set = get_set(pos_data,1,w_appeared)
neg_set = get_set(neg_data,-1,w_appeared)

pos_neg = pos_set + neg_set

# Hold-out split: shuffle once so both classes are mixed, then slice.
random.shuffle(pos_neg)
training_set = pos_neg[:TRAIN_SIZE]
test_set = pos_neg[TRAIN_SIZE:]
       

In [37]:
class SVM(object):
    """Linear binary classifier for labels in {-1, +1}.

    Despite the name, ``train`` uses an error-driven, perceptron-style update
    (``w += eta * (y - prediction) * x``); no margin or regularisation term
    appears in this class.
    """

    def __init__(self,dia):
        """Draw the initial weights and bias uniformly from [0, 1).

        dia: number of input features (length of the weight vector).
        """
        # One vectorised draw instead of a Python-level loop of scalar draws.
        self.w = np.random.rand(dia)
        self.b = np.random.rand()

    def activation(self,x):
        """Predict the class of one sample: 1 if ``w . x + b > 0``, else -1."""
        # Only the sign of the score is needed. Testing the raw score, rather
        # than squashing it through the sigmoid first, avoids exp() overflow
        # warnings for very negative scores and keeps tiny positive scores from
        # rounding to exactly 0 (which would be reported as -1).
        score = np.dot(self.w,x) + self.b
        if score > 0:
            return 1
        return -1

    def sigmoid(self,t):
        """Logistic function ``1 / (1 + exp(-t))``.

        Not used by ``activation``; kept as part of the class interface.
        """
        return 1/(1+np.exp(-t))

    def train(self,training_data, eta, epoch, test_data=None):
        """Fit the weights with per-sample updates.

        training_data: sequence of ``(x, y)`` pairs, ``x`` a numpy feature
            vector and ``y`` a label in {-1, +1}. The sequence itself is not
            modified.
        eta: learning rate.
        epoch: maximum number of passes over the data; training stops early
            after a pass with no misclassified sample.
        test_data: optional sequence of ``(x, y)`` pairs; when non-empty it is
            scored with ``evaluate`` after every pass.
        """
        # Shuffle a private copy so the caller's list keeps its order.
        samples = list(training_data)
        for i in range(epoch):
            sum_se = 0
            random.shuffle(samples)
            for point in samples:
                x = point[0]
                y = point[1]
                # error is 0 for a correct prediction, +2 or -2 otherwise.
                error = y - self.activation(x)
                sum_se += error ** 2
                self.w += eta * error * x
                self.b += eta * error
            print("epoch%d: se:%.3f" % (i,sum_se))
            if test_data:
                self.evaluate(test_data)
            if sum_se == 0:
                break

    def evaluate(self,test_set):
        """Print ``performance: correct/total`` and return the correct count.

        test_set: sequence of ``(x, y)`` pairs.
        """
        corr = sum(int(self.activation(x) == y) for (x,y) in test_set)
        print("performance: %d/%d" % (corr,len(test_set)))
        return corr

In [39]:
# K-fold cross-validation: each fold in turn is held out for testing while a
# freshly initialised classifier is trained on the remaining folds.
k = 7
print(len(pos_neg))
batch_num = len(pos_neg) // k   # samples per fold
print(batch_num)
dia = len(w_appeared)           # feature dimension = vocabulary size
for i in range(k):
    print("%d-fold:"%i)
    start = time.time()
    # Bounds of the held-out fold inside pos_neg.
    lo, hi = i*batch_num, (i+1)*batch_num
    test_set = pos_neg[lo:hi]
    train_set = pos_neg[:lo] + pos_neg[hi:]
    svm = SVM(dia)
    svm.train(train_set,0.5,15,test_set)
    end = time.time()
    print("Elapsed time for a fold:",end-start)

1400
200
0-fold:


  


epoch0: se:1516.000
performance: 138/200
epoch1: se:576.000
performance: 159/200
epoch2: se:208.000
performance: 153/200
epoch3: se:152.000
performance: 164/200
epoch4: se:84.000
performance: 160/200
epoch5: se:8.000
performance: 160/200
epoch6: se:8.000
performance: 162/200
epoch7: se:32.000
performance: 159/200
epoch8: se:16.000
performance: 164/200
epoch9: se:0.000
performance: 164/200
Elapsed time for a fold: 2.096766948699951
1-fold:
epoch0: se:1492.000
performance: 138/200
epoch1: se:540.000
performance: 143/200
epoch2: se:196.000
performance: 158/200
epoch3: se:132.000
performance: 163/200
epoch4: se:52.000
performance: 162/200
epoch5: se:16.000
performance: 158/200
epoch6: se:40.000
performance: 157/200
epoch7: se:32.000
performance: 156/200
epoch8: se:16.000
performance: 160/200
epoch9: se:20.000
performance: 160/200
epoch10: se:0.000
performance: 160/200
Elapsed time for a fold: 1.6096088886260986
2-fold:
epoch0: se:1544.000
performance: 153/200
epoch1: se:544.000
performance