In [1]:
import glob
import re
import numpy as np

In [2]:
# Collect the paths of all .txt files in the working directory.
# (`fcf` is not referenced again in the cells visible here.)
txt_pattern = './*.txt'
fcf = glob.glob(txt_pattern)

In [3]:
# Read the first document in full; `text1` is used below to prototype the
# feature-counting loop on a single file.
# The context manager closes the handle as soon as the read is done (the
# original left the file open).
with open('01.txt', 'r') as f:
    text1 = f.read()

In [4]:
# Load the vocabulary: the content of feature.txt is split on backticks,
# spaces and newlines, and every resulting token becomes one feature word.
# The context manager closes the handle as soon as the read is done.
with open('feature.txt', 'r') as f:
    features = f.read()

# Drop empty tokens (produced by a trailing newline or doubled separators) so
# that an empty string can never become a feature column.
features = [token for token in re.split(r'[` \n]', features) if token]
print("Features: ",features)

Features:  ['car', 'passenger', 'seat', 'drive', 'power', 'highway', 'purchase', 'hotel', 'room', 'night', 'staff', 'water', 'location']


In [5]:
# Accumulator for per-document feature-count rows (one list of counts per file).
Xtrain = []

In [6]:
# Prototype on the single document `text1`: count how many tokens are exactly
# equal to each feature word (the comparison is case-sensitive).
# The text is tokenised once instead of once per feature; the result is the same.
# NOTE(review): newline and tab are not in the separator set, so a word next to
# a line break stays glued to its neighbour and is not counted - confirm this
# is acceptable for the input files.
tokens = re.split(r'[` \-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', text1)
file_features = [tokens.count(feature) for feature in features]

Xtrain.append(file_features)

In [7]:
# Show the single-document count row next to the vocabulary it is indexed by.
print("Xtrain: ", Xtrain)
print(features)

Xtrain:  [[1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]]
['car', 'passenger', 'seat', 'drive', 'power', 'highway', 'purchase', 'hotel', 'room', 'night', 'staff', 'water', 'location']


In [8]:
# Reset the accumulator so the exploratory row appended above is discarded
# before the full training matrix is built.
Xtrain = []

In [9]:
# Build the training rows from files 01.txt .. 40.txt: one row per file, one
# count per feature word (exact, case-sensitive token match).
for file_number in range(1, 41):

    file_name = "{:02d}.txt".format(file_number)
    # Close each file right after reading it (the original leaked all 40 handles).
    with open(file_name, 'r') as f:
        text = f.read()

    # Tokenise once per file rather than once per feature.
    # NOTE(review): newline/tab are not separators here - confirm intended.
    tokens = re.split(r'[` \-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', text)
    file_features = [tokens.count(feature) for feature in features]

    Xtrain.append(file_features)

In [10]:
# Convert the list of per-file count rows into a 2-D array
# (rows = files, columns = features) and display its shape.
Xtrain = np.asarray(Xtrain)
Xtrain.shape

(40, 13)

In [11]:
# Accumulator for the per-file feature-count rows of the test documents.
Xtest = []

In [12]:
# Build the test rows from files 41.txt .. 50.txt, using the same tokenisation
# and exact-match counting as for the training files.
for file_number in range(41, 51):

    file_name = "{:02d}.txt".format(file_number)
    # Close each file right after reading it (the original leaked the handles).
    with open(file_name, 'r') as f:
        text = f.read()

    # Tokenise once per file rather than once per feature.
    # NOTE(review): newline/tab are not separators here - confirm intended.
    tokens = re.split(r'[` \-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', text)
    file_features = [tokens.count(feature) for feature in features]

    Xtest.append(file_features)

In [13]:
# Convert the list of per-file count rows into a 2-D array
# (rows = test files, columns = features) and display its shape.
Xtest = np.asarray(Xtest)
Xtest.shape

(10, 13)

# Q4

In [14]:
import matplotlib.pyplot as plt
import matplotlib.colors
from cvxopt import solvers
from cvxopt import matrix

## Two Features

In [15]:
# Keep only columns 0 ("car") and 7 ("hotel") of the training counts.
# Rows 0-19 are treated as class +1 and rows 20-39 as class -1; the class -1
# rows are negated so that every sample can be tested against the single
# condition  a^T y > 0.
car_hotel = Xtrain[:, [0, 7]]
Xtrain_two = np.vstack((car_hotel[:20], -car_hotel[20:]))

# Leading column of +1 / -1: the (sign-normalised) bias term of each sample.
class_signs = np.concatenate((np.ones(20), -np.ones(20))).reshape(40, 1)
Y_two = np.hstack((class_signs, Xtrain_two))

Y_two.shape

(40, 3)

In [16]:
# Test samples restricted to the "car" (0) and "hotel" (7) columns, with a
# leading column of ones for the bias term. Test rows are not sign-normalised.
bias_column = np.ones((10, 1))
Xtest_two = np.hstack((bias_column, Xtest[:, [0, 7]]))

### Using Perceptron

In [17]:
# Batch perceptron on the sign-normalised samples in Y_two.
# A sample y is misclassified when a^T y <= 0; on every pass the sum of all
# misclassified samples is added to the weight vector. The loop stops when no
# sample is misclassified or after 1000 passes.
a = np.zeros((Y_two.shape[1], 1))   # initial weights: [bias, car, hotel]
a_iter = a
k = 0           # number of passes performed
sum_wrong = 1   # misclassified count; starts non-zero so the loop is entered

while k < 1000 and sum_wrong > 0:

    misclassified = np.dot(Y_two, a_iter) <= 0       # boolean column, one entry per sample
    sum_wrong = misclassified.sum(axis=0)
    correction = (misclassified * Y_two).sum(axis=0)  # sum of the misclassified rows
    a_iter = a_iter + correction.reshape(-1, 1)

    k += 1

print("Final a = {} after k={} iterations".format(a_iter.T, k))

Final a = [[  2.  37. -33.]] after k=1000 iterations


#### The perceptron does not converge with two features because the dataset contains the observation (0, 0) for [car, hotel] in both classes. In other words, each class contains documents in which neither keyword "car" nor "hotel" occurs, so these identical points cannot be separated by a linear classifier such as the perceptron. Thus, a separating classifier cannot be obtained.

### Using SVM

In [18]:
# Assemble the quadratic program handed to solvers.qp(Q, q, A, b) in the next
# cell:  minimise (1/2) a^T Q a + q^T a  subject to  A a <= b.
# NOTE(review): with A = Y_two and b = -1 the constraint is Y_two a <= -1,
# i.e. the negation of the usual margin constraint y_i a^T x_i >= 1, so the
# solution appears to be the negated separating vector; the SVM accuracy cell
# compares with `> 0` where the perceptron one uses `< 0` - confirm intended.
n_samples, n_weights = Y_two.shape
n_features = Xtrain_two.shape[1]

A = matrix(Y_two, tc='d')
b = matrix(-np.ones((n_samples, 1)), tc='d')

# Q = 2 * diag(0, 1, ..., 1): only the feature weights are penalised, the
# bias (first component) is not.
q1 = np.zeros((1, n_weights))
Q2 = np.hstack((np.zeros((n_features, 1)), np.eye(n_features)))
Q = matrix(2 * np.vstack((q1, Q2)), tc='d')

# No linear term in the objective.
q = matrix(np.zeros((n_weights, 1)), tc='d')

In [19]:
# Silence the solver's per-iteration log, then solve the QP built above.
# NOTE(review): the returned dict's 'status' entry is not inspected here, so a
# non-optimal outcome passes silently and sol['x'] is still used later on.
solvers.options['show_progress'] = False
sol=solvers.qp(Q,q,A,b)

#### The SVM does not converge with two features because the dataset contains the observation (0, 0) for [car, hotel] in both classes. In other words, each class contains documents in which neither keyword "car" nor "hotel" occurs, so the linear (hard-margin) SVM cannot separate the data. Thus, we cannot obtain the classifier.

### Testing accuracy

In [20]:
# Ground-truth labels of the 10 test files, given as 0/1 and recoded to -1/+1
# so they can be multiplied with the classifier scores.
raw_labels = [0,1,1,0,0,0,1,1,0,1]
Y_true = [-1 if label == 0 else label for label in raw_labels]

In [21]:
# Perceptron classifier: score each test sample with the final weight vector.
a_con_p = a_iter
ans = np.dot(a_con_p.T, Xtest_two.T)   # row vector, one score per test file
scores = ans[0]

# A test file counts as a match when its label and score have opposite signs.
# NOTE(review): this implies Y_true uses the opposite sign convention from the
# training labels - confirm.
match = sum(1 for i, truth in enumerate(Y_true) if truth * scores[i] < 0)
print("Accuracy is equal to {} percent".format(match/len(Y_true)*100))
print("Note: the classifier did not converge")

Accuracy is equal to 70.0 percent
Note: the classifier did not converge


In [22]:
# SVM classifier: score each test sample with the QP solution vector.
a_con_s = sol['x']
ans = np.dot(np.transpose(a_con_s), Xtest_two.T)   # row vector, one score per test file
scores = ans[0]

# A test file counts as a match when its label and score have the same sign
# (the perceptron cell uses the opposite comparison).
match = sum(1 for i, truth in enumerate(Y_true) if truth * scores[i] > 0)
print("Accuracy is equal to {} percent".format(match/len(Y_true)*100))
print("Note: the classifier did not converge")

Accuracy is equal to 90.0 percent
Note: the classifier did not converge


## All Features

In [23]:
# Use every feature column of the training counts.
# Rows 0-19 are treated as class +1 and rows 20-39 as class -1; the class -1
# rows are negated so that every sample can be tested against the single
# condition  a^T y > 0.
Xtrain_all = np.vstack((Xtrain[:20], -Xtrain[20:]))

# Leading column of +1 / -1: the (sign-normalised) bias term of each sample.
class_signs = np.concatenate((np.ones(20), -np.ones(20))).reshape(40, 1)
Y_all = np.hstack((class_signs, Xtrain_all))

Y_all.shape

(40, 14)

In [24]:
# Test samples with all features, preceded by a column of ones for the bias
# term. Test rows are not sign-normalised.
bias_column = np.ones((10, 1))
Xtest_all = np.hstack((bias_column, Xtest))

Xtest_all.shape

(10, 14)

### Using Perceptron

In [25]:
# Batch perceptron on the sign-normalised samples in Y_all.
# A sample y is misclassified when a^T y <= 0; on every pass the sum of all
# misclassified samples is added to the weight vector. The loop stops when no
# sample is misclassified or after 1000 passes.
a = np.zeros((Y_all.shape[1], 1))   # initial weights: bias followed by one weight per feature
a_iter = a
k = 0           # number of passes performed
sum_wrong = 1   # misclassified count; starts non-zero so the loop is entered

while k < 1000 and sum_wrong > 0:

    misclassified = np.dot(Y_all, a_iter) <= 0       # boolean column, one entry per sample
    sum_wrong = misclassified.sum(axis=0)
    correction = (misclassified * Y_all).sum(axis=0)  # sum of the misclassified rows
    a_iter = a_iter + correction.reshape(-1, 1)

    k += 1

print("Final a = {} after k={} iterations".format(a_iter.T, k))

Final a = [[  1.  37.   2.   5.   8.   6.   5.   3. -33. -30.  -5.  -9.  -1. -15.]] after k=3 iterations


### Using SVM

In [26]:
# Assemble the quadratic program handed to solvers.qp(Q, q, A, b) in the next
# cell:  minimise (1/2) a^T Q a + q^T a  subject to  A a <= b.
# NOTE(review): with A = Y_all and b = -1 the constraint is Y_all a <= -1,
# i.e. the negation of the usual margin constraint y_i a^T x_i >= 1, so the
# solution appears to be the negated separating vector; the SVM accuracy cell
# compares with `> 0` where the perceptron one uses `< 0` - confirm intended.
n_samples, n_weights = Y_all.shape
n_features = Xtrain_all.shape[1]

A = matrix(Y_all, tc='d')
b = matrix(-np.ones((n_samples, 1)), tc='d')

# Q = 2 * diag(0, 1, ..., 1): only the feature weights are penalised, the
# bias (first component) is not.
q1 = np.zeros((1, n_weights))
Q2 = np.hstack((np.zeros((n_features, 1)), np.eye(n_features)))
Q = matrix(2 * np.vstack((q1, Q2)), tc='d')

# No linear term in the objective.
q = matrix(np.zeros((n_weights, 1)), tc='d')

In [27]:
# Re-enable the solver's per-iteration log (it was switched off for the
# two-feature run), then solve the QP built above.
solvers.options['show_progress'] = True
sol=solvers.qp(Q,q,A,b)

     pcost       dcost       gap    pres   dres
 0:  6.5331e-01  1.8617e+01  1e+02  2e+00  1e+02
 1:  4.1640e+00 -1.4621e+01  3e+01  5e-01  3e+01
 2:  5.4075e+00  2.5870e+00  3e+00  1e-02  5e-01
 3:  4.4849e+00  4.0273e+00  5e-01  1e-03  6e-02
 4:  4.3851e+00  4.3407e+00  4e-02  5e-16  9e-15
 5:  4.3765e+00  4.3740e+00  2e-03  6e-16  6e-15
 6:  4.3761e+00  4.3760e+00  1e-04  6e-16  3e-14
 7:  4.3761e+00  4.3761e+00  1e-05  5e-16  7e-14
 8:  4.3761e+00  4.3761e+00  2e-06  5e-16  4e-15
Optimal solution found.


### Testing accuracy

In [28]:
# Perceptron classifier: score each test sample with the final weight vector.
a_con_p = a_iter
ans = np.dot(a_con_p.T, Xtest_all.T)   # row vector, one score per test file
scores = ans[0]

# A test file counts as a match when its label and score have opposite signs.
# NOTE(review): this implies Y_true uses the opposite sign convention from the
# training labels - confirm.
match = sum(1 for i, truth in enumerate(Y_true) if truth * scores[i] < 0)
print("Accuracy is equal to {} percent".format(match/len(Y_true)*100))
print("Note: the classifier converged after 3 iterations")

Accuracy is equal to 90.0 percent
Note: the classifier converged after 3 iterations


In [29]:
# SVM classifier: score each test sample with the QP solution vector.
a_con_s = sol['x']
ans = np.dot(np.transpose(a_con_s), Xtest_all.T)   # row vector, one score per test file
scores = ans[0]

# A test file counts as a match when its label and score have the same sign
# (the perceptron cell uses the opposite comparison).
match = sum(1 for i, truth in enumerate(Y_true) if truth * scores[i] > 0)
print("Accuracy is equal to {} percent".format(match/len(Y_true)*100))
print("Note: the classifier converged after 8 iterations")

Accuracy is equal to 100.0 percent
Note: the classifier converged after 8 iterations
