# Imports

In [1]:
import os
import tarfile
import jdc      #for class handling in jupyter
import numpy as np

# Model definition

In [1]:
class model:
  def __init__(self, eta0 = 0.1, loss = 'hinge', alpha = .0001, learning_rate = "constant", penalty = 'L2', random_state = 'none',
               tol = 5):
    self.index_dict = {} #dictionary of words:index in bag of words vector
    #read_tar()
    neg_path = 'txt_sentoken/neg'
    pos_path = 'txt_sentoken/pos'
    neg_files = os.listdir(neg_path)
    pos_files = os.listdir(pos_path)

    neg_rev, neg_sent, neg_words = self.get_data(neg_files, neg_path, -1)
    pos_rev, pos_sent, pos_words = self.get_data(pos_files, pos_path, 1)
    #form word bag, X_raw, and prediction sets
    words = neg_words|pos_words
    self.X_raw = np.concatenate((neg_rev,pos_rev),axis=0)
    self.y = np.concatenate((neg_sent,pos_sent),axis=0)
    self.sorted_words = sorted(words)
    
    self.loss = loss  #hinge loss
    self.loss_history = []
    self.eta0 = eta0  #starting learning rate
    self.alpha = alpha #regularization dampener
    self.learning_rate = learning_rate  #do we use constant learning rate or schedule?
    self.penalty = penalty #penalty for hinge loss i think?
    self.random_state = random_state #random permutation desired for input? maybe not needed
    self.tol = tol  #tolerance of learning rate scheduler if i ever get around to implementing that
    self.weights = np.random.rand(self.X_raw.shape)
    
  def fit(self, X, y):
    """Does the training. X is a matrix with one data point per row, while y is flat."""
    X = X.ravel()
    best_score = self.score(X,y)
    self.loss.append(best_score)
    early_stop,iter_wo_improve = 20,0
    
    for i in range(self.n_max_iterations):
      self._fit(X,y)
      if best_score == self.loss[-1]: #if the new theta isn't better than last, then increment iter_wo_improve.
        iter_wo_improve += 1
      else:
        best_score = self.loss[-1]
      if iter_wo_improve >= early_stop: #implement learning rate scheduler here
        break
      
  #gradient descent here?
  def _fit(self, X, y): 
    """Internal method that performs one iteration of the training. Should store a loss value in self.loss .""" 
    #is new param vector better than last one? compare score residual, if better keep, otherwise discard.
    old_score = self.loss[-1] 
    old_theta = self.theta.copy()
    self.theta += np.random.normal(size=len(self.theta)) #replace with gradient descent.
    new_score = self.score(X,y)
    if old_score > new_score:
      self.loss.append(new_score)
    else:
      self.theta = old_theta

  #hyperplane model here?
  def predict(self, X):
    for review in X:
      proddy = np.dot(review,self.weights)
      if proddy < 0:
        return -1
      elif X == 0:
        return 0
      else:
        return 1

  #hinge loss for base
  def score(self, X, y):
    #how to define t? if on right side of hyperplane, 1, otherwise -1
    if self.loss == 'hinge':
      L2_sum = sum(max(0,1-))
      
      return max(0,1-t*y)

# Part1
**Implementation task:** Implement a parser for the dataset. The output should be a list/array of strings (`X_raw`) and a list/array of labels (`y`) encoded as {-1,1}.

In [None]:
#!wget -N http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

In [2]:
def read_tar():
    # open file 
    review_file = tarfile.open('review_polarity.tar.gz') 
  
    # extracting file 
    review_file.extractall('.') 
  
    review_file.close() 

In [5]:
"""_summary_
grab data from directory paths and lists
"""
%%add_to model
def get_data(self, dir_list, dir_path, review_polarity):
  words = set()
  reviews = []
  sentiment = []
  
  for file_name in dir_list:
      if isinstance(file_name, str):
          f = open(os.path.join(dir_path, file_name),'r')
          review = f.read()
          reviews.append(review)
          words = words|set(review.split()) #.strip("") for future comparison
          sentiment.append(review_polarity)
          f.close()
  return np.array(reviews), np.array(sentiment), words

In [7]:
#shuffles both predictions and raw data with same permutation.
%%add_to model
    def shuffle(data, labels):
        p = np.random.permutation(len(data))
        return data[p], labels[p]

In [8]:
classifier = model()
classifier.X_raw

dimensions look good! now to move onto implementing BOW

# Part2

**Implementation task:** You should re-implement the feature extraction above. The list/array called `ordered_vocabulary` should contain the words for each feature dimension, and X should contain the BOW binary vectors. Remember to use the same method names as the original sklearn class.

In [None]:
"""_summary_
transform x into a sparse BoW array. implementation part 2
"""
class Vectorizer:
  def __init__(self, tokenizer = 'none'):
    self.tokenizer = tokenizer
  
  def transform(self, data):
    sparse_data = []
    for review in data:
      review_list = review.split()            #replace with tokenizer if needed
      sparse_review = np.zeros(len(review_list))
      for word in review_list:
          sparse_review[self.index_dict[word]] = 1
    sparse_data.append(sparse_review)
            
    assert len(data) == len(sparse_data)        #is the output array the same length as input?
    assert sum(sparse_data[0]) != 0             #do these sparse arrays contain anything?
    
    return np.array(sparse_data)

In [None]:
victor = Vectorizer()

In [9]:
index_dict = dict()
for i, word in enumerate(classifier.sorted_words):
  index_dict[word] = i

for word in ['dolphin', 'the', 'coffee']:
  if word in classifier.words:
    print("'%s' is represented as feature dimension %i" %(word, index_dict[word]))
  else:
    print("'%s' is not in the vocabulary" % word)

'dolphin' is represented as feature dimension 13868
'the' is represented as feature dimension 45372
'coffee' is represented as feature dimension 9677


In [15]:
vocabulary = set(classifier.sorted_words)      #I will take it as a given this will be useful.

In [10]:
X = victor.transform(classifier.X_raw)            #output should be bag of words rep of sentence

TypeError: transform() missing 1 required positional argument: 'data'

# Part3
**Implementation task:** You should implement your versions of the following parts (you can also find this in the slides):

In [None]:
# Set hyperparameters (these variables are only here for clarity)
reguliser_dampening = 0.001   # lambda
learning_rate = .1            # gamma

# Create the untrained classifier
classy = model(loss='hinge', penalty='l2',
                      alpha=reguliser_dampening,
                      learning_rate='constant', eta0=learning_rate)

# Train the classifier
model.fit(X, y)

# Get the parameter vector
omega = np.concatenate([model.intercept_, model.coef_.ravel()])

In [None]:
import matplotlib.pyplot as plt
 
plt.figure(figsize=(20, 3))
plt.plot(omega[1:])
plt.xlabel("Value")
plt.xlabel("Weights")
plt.show()

In [None]:
assert (len(omega)-1) == len(vocabulary)

# Sort by absolute value
idx = np.argsort(np.abs(omega[1:]))

print("                Word   Weight  Occurences")
for i in idx[-20:]:   # Pick those with highest 'voting' values
  print("%20s   %.3f\t%i " % (ordered_vocabulary[i], omega[i+1], np.sum([ordered_vocabulary[i] in d for d in X_raw])))

# Part4
**Implementation task:** Implement code for printing a sorted table of your sampled hyperparameters. Note, you do not have to reimplement the grid search.