# Imports

In [3]:
import os
import tarfile
import jdc      #for class handling in jupyter
import numpy as np

# Part1

In [4]:
#!wget -N http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

In [5]:
def read_tar():
    """Unpack ``review_polarity.tar.gz`` into the current directory.

    The archive is looked up relative to the current working directory and
    its members are extracted next to it.

    Raises:
        FileNotFoundError: if the archive does not exist.
        tarfile.ReadError: if the file cannot be read as a tar archive.
    """
    # The context manager closes the archive even when extraction fails;
    # the explicit close() it replaces was skipped if extractall() raised.
    # NOTE: extractall() trusts the member paths stored in the archive, so
    # this should only be used on archives from a trusted source.
    with tarfile.open('review_polarity.tar.gz') as review_file:
        review_file.extractall('.')

In [6]:
"""_summary_
grab data from directory paths and lists
"""
def get_data(dir_list, dir_path, review_polarity):
    """Read review files from one directory and collect their vocabulary.

    Args:
        dir_list: file names (relative to ``dir_path``) to read. Entries that
            are not ``str`` are skipped.
        dir_path: directory that contains the files.
        review_polarity: label recorded once for every review that is read.

    Returns:
        A ``(reviews, sentiment, words)`` tuple: a NumPy array with the full
        text of each file, a NumPy array of the same length filled with
        ``review_polarity``, and the set of whitespace-separated tokens seen
        across all reviews.
    """
    words = set()
    reviews = []
    sentiment = []

    for file_name in dir_list:
        if not isinstance(file_name, str):
            continue
        # ``with`` closes the file even if read() raises; a manual close()
        # after the read would be skipped in that case.
        with open(os.path.join(dir_path, file_name), 'r') as f:
            review = f.read()
        reviews.append(review)
        # In-place update avoids rebuilding the whole vocabulary set per file.
        words.update(review.split())
        sentiment.append(review_polarity)
    return np.array(reviews), np.array(sentiment), words

In [7]:
index_dict = dict()  # word -> column index in the bag-of-words vector
# read_tar()  # left disabled; presumably run once to unpack the archive

# Directories holding the negative and the positive review files.
neg_path, pos_path = 'txt_sentoken/neg', 'txt_sentoken/pos'
neg_files, pos_files = os.listdir(neg_path), os.listdir(pos_path)

# Negative reviews are labelled -1, positive reviews +1.
neg_rev, neg_sent, neg_words = get_data(neg_files, neg_path, -1)
pos_rev, pos_sent, pos_words = get_data(pos_files, pos_path, 1)

# Joint vocabulary, raw review texts (negatives first) and matching labels.
words = neg_words.union(pos_words)
X_raw = np.concatenate([neg_rev, pos_rev])
y = np.concatenate([neg_sent, pos_sent])
sorted_words = sorted(words)

In [8]:
def shuffle(data, labels):
    """Reorder ``data`` and ``labels`` with one shared random permutation.

    A single permutation of ``range(len(data))`` is drawn from NumPy's global
    random state and used to index both arrays, so entries that were at the
    same position before the shuffle are still at the same position after it.

    Returns:
        The tuple ``(data[p], labels[p])`` for the drawn permutation ``p``.
    """
    order = np.random.permutation(len(data))
    shuffled_data = data[order]
    shuffled_labels = labels[order]
    return shuffled_data, shuffled_labels

In [9]:
# Shuffle reviews and labels with the same permutation: X_raw holds all
# negative reviews followed by all positive ones, so this mixes the classes
# while keeping each review paired with its label.
X_shuffled, y_shuffled = shuffle(X_raw, y)

In [10]:
# Notebook sanity check: display the shape of the shuffled review array.
X_shuffled.shape

(2000,)

Dimensions look good! Now to move on to implementing the bag-of-words (BoW) representation.

# Part2


In [11]:
"""_summary_
transform x into a sparse BoW array. implementation part 2
"""
class Vectorizer:
    """Turn review strings into binary bag-of-words vectors.

    Args:
        sorted_vocab: iterable of vocabulary words; the position of each word
            becomes its column index in the output matrix.
        tokenizer: optional callable mapping one review to an iterable of
            tokens. Any non-callable value (including the default ``'none'``)
            falls back to ``str.split``, i.e. whitespace tokenisation.

    Attributes:
        tokenizer: the value passed as ``tokenizer``.
        index_dict: mapping ``word -> column index``.
    """

    def __init__(self, sorted_vocab, tokenizer='none'):
        self.tokenizer = tokenizer
        self.index_dict = {word: i for i, word in enumerate(sorted_vocab)}

    def transform(self, data):
        """Encode each review as a 0/1 vector over the vocabulary.

        Args:
            data: sized sequence (list or 1-D array) of review strings.

        Returns:
            A float array of shape ``(len(data), len(self.index_dict))`` where
            entry ``[r, c]`` is 1 if review ``r`` contains the word with index
            ``c`` and 0 otherwise. Empty input yields an array with zero rows.
        """
        tokenize = self.tokenizer if callable(self.tokenizer) else str.split

        # Allocate the whole matrix once instead of building one array per
        # review and copying them all into a second array at the end.
        bow = np.zeros((len(data), len(self.index_dict)))
        for row, review in enumerate(data):
            for word in tokenize(review):
                col = self.index_dict.get(word)
                # Tokens outside the vocabulary are ignored rather than
                # raising KeyError, so unseen text can still be encoded.
                if col is not None:
                    bow[row, col] = 1
        return bow

In [12]:
# Build the vectorizer over the sorted corpus vocabulary, so a word's column
# index equals its position in sorted_words.
victor = Vectorizer(sorted_words)

In [13]:
# Map every vocabulary word to its position in the sorted vocabulary, i.e.
# its feature (column) index in the bag-of-words vectors.
index_dict = {word: i for i, word in enumerate(sorted_words)}

for word in ['dolphin', 'the', 'coffee']:
    # Membership is tested on the dict (hash lookup) instead of the sorted
    # list, which would be scanned linearly for every query.
    if word in index_dict:
        print("'%s' is represented as feature dimension %i" % (word, index_dict[word]))
    else:
        print("'%s' is not in the vocabulary" % word)

'dolphin' is represented as feature dimension 13868
'the' is represented as feature dimension 45372
'coffee' is represented as feature dimension 9677


In [14]:
# Vocabulary as a set; a later cell compares its size with the length of the
# learned weight vector.
vocabulary = set(sorted_words)

In [15]:
# Encode every shuffled review as a bag-of-words row: one row per review,
# one column per vocabulary word.
X = victor.transform(X_shuffled)

In [16]:
# Notebook preview: display the first ten encoded reviews.
X[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Part3
**Implementation task:** You should implement your versions of the following parts (you can also find this in the slides):

In [17]:
class model:
    """Linear hinge-loss classifier trained by full-batch subgradient descent.

    The objective that is minimised is::

        alpha / 2 * ||w||^2  +  sum_i max(0, 1 - y_i * (w . x_i + b))

    with labels ``y_i`` in ``{-1, +1}``.

    The parameter vector ``weights`` has ``n_features + 1`` entries: index 0
    is the bias ``b`` and ``weights[1:]`` holds one coefficient per feature
    column. This is the layout the inspection cells of this notebook rely on
    (``omega[1:]`` and ``len(omega) - 1 == len(vocabulary)``). The bias is not
    regularised.

    Args:
        eta0: constant step size of the gradient updates.
        loss: loss name; only ``'hinge'`` is implemented.
        alpha: strength of the L2 regularisation term.
        learning_rate: learning-rate schedule name; stored only, the step size
            is always the constant ``eta0``.
        penalty: penalty name; stored only, the L2 penalty is always used.
        random_state: seed for the weight initialisation. ``None`` or the
            string ``'none'`` draws from NumPy's global random state.
        tol: number of consecutive iterations without a loss improvement of
            at least 0.001 after which training stops.
        n_max_iterations: upper bound on the number of gradient steps taken
            by one call to ``fit``.

    Attributes:
        weights: parameter vector, ``None`` until the first fit.
        loss_history: loss before training and after every gradient step.
        weight_history: copy of ``weights`` at initialisation and after every
            gradient step (one full vector per step, so memory grows with the
            number of iterations).
    """

    def __init__(self, eta0=0.001, loss='hinge', alpha=.0001, learning_rate="constant", penalty='L2',
                 random_state='none', tol=5, n_max_iterations=1000):
        self.loss = loss
        self.loss_history = []
        self.lr, self.eta0 = eta0, eta0
        self.alpha = alpha
        self.learning_rate = learning_rate
        self.penalty = penalty
        self.random_state = random_state
        self.tol = tol
        self.n_max_iterations = n_max_iterations
        # The number of features is only known once data is seen, so the
        # weights are created lazily by fit() / _fit().
        self.weights = None
        self.weight_history = []

    def _init_weights(self, n_features):
        """Create the weight vector (bias first) if it does not exist yet."""
        if self.weights is not None:
            return
        seed = self.random_state
        unseeded = seed is None or (isinstance(seed, str) and seed.lower() == 'none')
        rng = np.random if unseeded else np.random.RandomState(seed)
        self.weights = rng.rand(n_features + 1)
        self.weight_history.append(self.weights.copy())

    def _decision(self, X):
        """Return the raw score ``w . x + b`` for every row of ``X``."""
        if self.weights is None:
            raise RuntimeError("model has no weights yet; call fit() first")
        return X.dot(self.weights[1:]) + self.weights[0]

    def fit(self, X, y):
        """Train on ``X`` (one data point per row) and flat labels ``y``.

        Runs at most ``n_max_iterations`` gradient steps and stops early once
        ``tol`` consecutive steps fail to lower the best loss by 0.001.

        Returns:
            ``self``, so calls can be chained.
        """
        self._init_weights(X.shape[1])
        best_score = self.score(X, y)
        self.loss_history.append(best_score)
        iter_wo_improve = 0

        for _ in range(self.n_max_iterations):
            self._fit(X, y)
            current = self.loss_history[-1]
            if best_score - .001 < current:
                # Not a meaningful improvement over the best loss so far.
                iter_wo_improve += 1
            else:
                best_score = current
                iter_wo_improve = 0
            if iter_wo_improve >= self.tol:  # stop criterion
                break
        return self

    def _fit(self, X, y):
        """Take one full-batch subgradient step and record loss and weights."""
        self._init_weights(X.shape[1])
        margins = y * self._decision(X)
        # Only samples with margin < 1 contribute to the hinge subgradient;
        # for those the contribution is -y_i * x_i (and -y_i for the bias).
        pull = y * (margins < 1)

        grad = np.empty_like(self.weights)
        grad[0] = -np.sum(pull)
        grad[1:] = self.alpha * self.weights[1:] - X.T.dot(pull)

        self.weights -= self.lr * grad

        self.loss_history.append(self.score(X, y))
        # Store a copy: the update above mutates self.weights in place, so
        # storing the array itself would make every history entry identical.
        self.weight_history.append(self.weights.copy())

    def predict(self, X):
        """Return the sign of the decision value for every row of ``X``.

        The result has one entry per row: 1.0, -1.0, or 0.0 for a point that
        lies exactly on the separating hyperplane.
        """
        return np.sign(self._decision(X))

    def score(self, X, y):
        """Return the regularised hinge loss on ``(X, y)`` (lower is better).

        Raises:
            ValueError: if ``self.loss`` is not ``'hinge'``.
            RuntimeError: if the model has no weights yet.
        """
        if self.loss != 'hinge':
            raise ValueError("unsupported loss %r; only 'hinge' is implemented" % (self.loss,))
        hinge_term = np.sum(np.maximum(0, 1 - y * self._decision(X)))
        coef = self.weights[1:]
        penalty_term = self.alpha / 2 * np.dot(coef, coef)
        return penalty_term + hinge_term

In [18]:
# Hyperparameters for the training run below.
reguliser_dampening = 0.001   # lambda: passed to the model as alpha (regularisation strength)
learning_rate = .001          # gamma: passed to the model as eta0 (step size)

# Create the untrained classifier
classy = model(loss='hinge', penalty='l2',
                      alpha=reguliser_dampening,
                      learning_rate='constant', eta0=learning_rate)

# Train the classifier on the bag-of-words matrix and the shuffled labels
classy.fit(X, y_shuffled)

# Get the parameter vector
omega = classy.weights

In [None]:
import matplotlib.pyplot as plt

# Plot every learned parameter except omega[0], in index order.
plt.figure(figsize=(20, 3))
plt.plot(omega[1:])
# xlabel() used to be called twice, so "Value" was overwritten and the
# y-axis stayed unlabelled; the value axis is the y-axis.
plt.xlabel("Weights")
plt.ylabel("Value")
plt.show()

In [None]:
# omega carries one more entry than there are vocabulary words; index 0 is
# skipped everywhere below, so omega[i + 1] pairs with sorted_words[i].
assert (len(omega)-1) == len(vocabulary)

# Sort by absolute value
idx = np.argsort(np.abs(omega[1:]))

print("                Word   Weight  Occurences")
for i in idx[-20:]:   # Pick those with highest 'voting' values
    # sorted_words is the list the feature columns were indexed from; the
    # name ordered_vocabulary used here before is never defined.
    word = sorted_words[i]
    # NOTE: `word in review` is a substring test on the raw text, so this
    # counts the reviews whose text contains the word anywhere.
    n_reviews = np.sum([word in review for review in X_raw])
    print("%20s   %.3f\t%i " % (word, omega[i+1], n_reviews))

# Part4
**Implementation task:** Implement code for printing a sorted table of your sampled hyperparameters. Note, you do not have to reimplement the grid search.