In [None]:
import os.path
from urllib.request import urlretrieve

# Fetch the review-polarity archive into the working directory, unless a file
# with the same name as the last URL path component is already there.
url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"
archive_name = url.rsplit("/", 1)[-1]
already_downloaded = os.path.exists(archive_name)
if not already_downloaded:
    urlretrieve(url, archive_name)
    print("Downloaded", url)

Downloaded http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz


In [None]:
import tarfile
import numpy as np
import math
from sklearn.model_selection import ParameterSampler

# model

class feature_transform:
    """
    Encode whitespace-tokenised documents as binary bag-of-words vectors.

    Each document becomes one row of the output matrix. Column
    ``vocabulary[token]`` is 1 when the token occurs in the document and 0
    otherwise. Tokens that are not in the vocabulary are ignored.
    """

    def __init__(self, vocabulary):
        # Mapping from token (str) to column index in the output matrix.
        self.vocabulary = vocabulary

    def fit_transform(self, X):
        """
        Return a ``(len(X), len(vocabulary))`` float array of 0/1 indicators.

        ``X`` is a sequence of document strings; each one is split on
        whitespace to obtain its tokens.
        """
        documents = X
        # One row per document actually passed in (not a fixed corpus size).
        encoded = np.zeros((len(documents), len(self.vocabulary)))
        for row, document in enumerate(documents):
            # set() so that each distinct token is looked up only once.
            for token in set(document.split()):
                column = self.vocabulary.get(token)
                if column is not None:
                    encoded[row, column] = 1
        return encoded

class model:
    """
    Linear classifier trained with batch sub-gradient descent on the
    L2-regularised hinge loss.

    Labels are assumed to be +1 / -1 (that is what the margin test
    ``y * (w . x) < 1`` and ``predict`` rely on).
    """

    def __init__(self, learning_rate, reguliser_dampening, max_iteration,
                 n_features=None, tolerance=0.001):
        """
        learning_rate:       step size of each gradient update.
        reguliser_dampening: weight of the L2 penalty on the parameters.
        max_iteration:       upper bound on the number of gradient updates.
        n_features:          optional; when given, the weight vector is drawn
                             immediately, otherwise it is drawn on the first
                             call to ``fit`` from the width of the training data.
        tolerance:           training stops once an update moves the weights
                             by less than this Euclidean distance.
        """
        self.learning_rate = learning_rate
        self.reguliser_dampening = reguliser_dampening
        self.max_iteration = max_iteration
        self.tolerance = tolerance
        # Do not read any module-level data here: the weight size comes from
        # n_features or from the matrix passed to fit().
        self.para = None if n_features is None else np.random.normal(size=n_features)

    def _require_parameters(self):
        """Raise a clear error when the weights have not been initialised yet."""
        if self.para is None:
            raise RuntimeError("model has no parameters yet; call fit() first "
                               "or pass n_features to the constructor")

    def fit(self, X, y):
        """
        Train the weights on feature matrix X (one row per sample) and labels y.

        Runs at most ``max_iteration`` gradient updates. After every update the
        Euclidean distance between the previous and the new weight vector is
        measured; training stops early when it drops below ``tolerance``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if self.para is None:
            self.para = np.random.normal(size=X.shape[1])

        for _ in range(self.max_iteration):
            # Only samples inside the margin contribute to the hinge sub-gradient.
            margins = y * (X @ self.para)
            violating = margins < 1
            hinge_gradient = -(y[violating] @ X[violating])
            gradient = self.reguliser_dampening * self.para + hinge_gradient

            previous = self.para
            self.para = previous - self.learning_rate * gradient
            if np.linalg.norm(previous - self.para) < self.tolerance:
                break

    def predict(self, X):
        """
        Predict a label for each row of X as the sign of its dot product with
        the weights (1 or -1; 0 when a row lies exactly on the hyperplane).
        """
        self._require_parameters()
        return np.sign(np.dot(X, self.para))

    def score(self, X, y):
        """
        Return the regularised hinge loss of the current weights on (X, y):
        ``reguliser_dampening * ||w||^2 / 2 + sum(max(0, 1 - y_i * w . x_i))``.

        It is not used as the stop criterion in ``fit`` (which uses the distance
        between successive weight vectors); it is kept so the two criteria can
        be compared.
        """
        self._require_parameters()
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        penalty = self.reguliser_dampening * np.sum(self.para ** 2) / 2
        hinge_losses = np.maximum(0, 1 - y * (X @ self.para))
        return penalty + np.sum(hinge_losses)

# Read the archive and build the raw documents (X_raw) and their labels (y).
# A member whose path contains a "neg" component is labelled -1, one with a
# "pos" component is labelled +1; every other member is skipped without
# being read. Archive order is preserved.
X_raw = []
y = []
# The context manager closes the archive even if reading fails part-way.
with tarfile.open('review_polarity.tar.gz') as tar:
    for member in tar.getmembers():
        path_parts = member.name.split('/')
        if "neg" in path_parts:
            label = -1
        elif "pos" in path_parts:
            label = 1
        else:
            continue
        handle = tar.extractfile(member)
        if handle is None:
            # extractfile() returns None for members that are not regular
            # files or links (e.g. directory entries).
            continue
        y.append(label)
        X_raw.append(handle.read().decode())
y = np.array(y)

# Create a vocabulary mapping every distinct whitespace-separated token in
# X_raw to a column index.
# The distinct tokens are collected directly into a set (no intermediate list
# of every occurrence) and then sorted, so the token -> index assignment is
# the same on every run instead of depending on set iteration order.
ordered_vocabulary = sorted({word for x in X_raw for word in x.split()})

vocabulary = {word: i for i, word in enumerate(ordered_vocabulary)}

# Encode the raw documents as binary feature vectors.
my_feature = feature_transform(vocabulary)
X = my_feature.fit_transform(X_raw)

# Divide the X, y dataset into train and test parts (8:2).
# Rows 0-799 and 1000-1799 go to training, rows 800-999 and 1800 onwards to
# testing. NOTE(review): this is balanced only if rows 0-999 hold one class
# and the rows from 1000 on hold the other -- confirm against the archive order.
X_train = np.concatenate((X[:800], X[1000:1800]))
X_test = np.concatenate((X[800:1000], X[1800:]))
y_train = np.concatenate((y[:800], y[1000:1800]))
y_test = np.concatenate((y[800:1000], y[1800:]))

# Find the best learning rate and reguliser_dampening by random search.
# Both hyperparameters are drawn from the same 10 log-spaced candidates
# between 0.0001 and 3; 10 combinations are sampled and the one with the
# highest training-set accuracy is kept (the first one wins on ties).
log_spaced_values = np.exp(np.linspace(np.log(0.0001), np.log(3), 10))
parameter_distribution = {
    'learning_rate': log_spaced_values,
    'reguliser_dampening': log_spaced_values,
}

max_iteration = 500
best_hyperparameters = None
print("Learning rate:\tReg.dampening:\tTraining set accuracy:")

for hyperparameters in ParameterSampler(parameter_distribution, n_iter=10):
    learning_rate = hyperparameters['learning_rate']
    reguliser_dampening = hyperparameters['reguliser_dampening']

    # Train a fresh model with this combination.
    train_model = model(learning_rate, reguliser_dampening, max_iteration)
    train_model.fit(X_train, y_train)

    # Fraction of training samples whose predicted label matches the true one.
    training_accuracy = np.mean(train_model.predict(X_train) == y_train)

    # Keep this combination only if it strictly beats the best one so far.
    is_new_best = (best_hyperparameters is None
                   or training_accuracy > best_hyperparameters[1])
    if is_new_best:
        best_hyperparameters = (hyperparameters, training_accuracy)
    print("%.5f\t\t%.5f\t\t%.1f%%" % (learning_rate, reguliser_dampening, 100*training_accuracy))

best_learning_rate = best_hyperparameters[0]['learning_rate']
best_reguliser_dampening = best_hyperparameters[0]['reguliser_dampening']
print("Best parameters: %.5f, %.5f" % (best_learning_rate, best_reguliser_dampening))

# Retrain a fresh model with the selected hyperparameters and measure the
# fraction of test samples it labels correctly.
test_model = model(best_learning_rate, best_reguliser_dampening, max_iteration)
test_model.fit(X_train, y_train)
test_predictions = test_model.predict(X_test)
test_accuracy = np.mean(test_predictions == y_test)

print("Test set accuracy %.1f%%" % (100*test_accuracy))

"""
I have learned more about NumPy and how to build a basic model structure that can be trained and evaluated.
The most challenging part of this assignment was implementing the objective function and gradient descent with vectorized data.
It took a lot of time to figure out how to plug the vectorized data into the different formulas to get the correct calculations.

When I tested my model on the test set with the best learning rate and reguliser dampening found during the search,
I got 85.8% accuracy, which means that my model managed to classify about 86% of the test documents correctly.
This result shows that the model learned from the training data to perform as a classifier with about 86% accuracy.
"""

Learning rate:	Reg.dampening:	Training set accuracy:
0.03071		0.00977		100.0%
0.09655		0.00099		100.0%
0.00099		3.00000		99.9%
0.30353		0.95425		50.0%
0.00099		0.03071		100.0%
3.00000		0.95425		50.0%
0.00311		0.00311		100.0%
0.00010		0.00977		87.0%
0.03071		0.30353		99.9%
0.09655		0.00311		100.0%
Best parameters: 0.03071, 0.00977
Test set accuracy 85.8%


'\nI have learned more about numpy and how to get a basic model strcutre where we can train and evaluate it.\nThe most challenging part in this assignment was how to implement objective function and gredient descent with vectorized data.\nIt took alot of time to figure it out to plug the vectorized data into different formula to get the correct calculations.\n\nWhen I tested my model with the best learning rate and reguliser dampening given after trainig the model 10 times,\nI got 54.8% accuracy which means that my model manganged to classify the data about 55% correctly.\nThus, it is hard to say the model is good or not as a classifier but it show that it works as a classifier.\n'