# Notebook for experimenting with and trying to improve the model

### Ideas for improvement
- Stopwords
- Word2vec
- Bigger n-grams
- BERT?
- Max words

In [1]:
import gzip
import json
import torch 
from nltk.tokenize import TweetTokenizer
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import classification_report
import tensorflow as tf

In [27]:
def build_vocab(filepath, padding = False):
    """Read a gzipped JSON-lines review file and build a token vocabulary.

    Each line is parsed as one JSON record. Records that have a
    'reviewText' key contribute their raw text, a label and their tokens;
    records without it are only noted by line number.

    Args:
        filepath: Path to the gzip-compressed file, one JSON object per line.
        padding: When true, reserve index 0 for the '<PAD>' token so real
            tokens start at index 1. Otherwise tokens start at index 0.

    Returns:
        A dict with the keys:
            'line_count'      - number of lines read,
            'review_count'    - number of records that had 'reviewText',
            'vocab_size'      - number of vocabulary entries (incl. '<PAD>'),
            'no_text_reviews' - 1-based line numbers lacking 'reviewText',
            'labels'          - review index -> 1 ('positive') / 0 ('negative');
                                reviews with any other or missing sentiment
                                get no entry,
            'vocabulary'      - token -> integer index,
            'sentences'       - review index -> raw review text.
    """
    tokenizer = TweetTokenizer()
    train_vocab = {}
    if padding:
        train_vocab['<PAD>'] = 0
    no_reviewText = []
    labels = {}
    sentences = {}
    line_count = 0
    # Context manager guarantees the gzip handle is closed, even on errors.
    with gzip.open(filepath) as train:
        for line_count, line in enumerate(train, start=1):
            # Parse each line exactly once.
            record = json.loads(line)
            if 'reviewText' not in record:
                no_reviewText.append(line_count)
                continue
            review_index = len(sentences)
            text = record['reviewText']
            sentences[review_index] = text
            # .get() so a record without a sentiment field is skipped for
            # labelling instead of aborting the whole pass.
            sentiment = record.get('sentiment')
            if sentiment == 'positive':
                labels[review_index] = 1
            elif sentiment == 'negative':
                labels[review_index] = 0
            for word in tokenizer.tokenize(text):
                if word not in train_vocab:
                    # Next free index equals the current vocabulary size.
                    train_vocab[word] = len(train_vocab)
    return {'line_count' : line_count,
            'review_count' : len(sentences),
            'vocab_size' : len(train_vocab),
            'no_text_reviews' : no_reviewText,
            'labels' : labels,
            'vocabulary' : train_vocab,
            'sentences' : sentences}

def sen_vectorizer(filepath, cutoff = False):
    """Tokenize a gzipped JSON-lines review file and index its vocabulary.

    Each line is parsed as one JSON record. Records that have a
    'reviewText' key are tokenized (optionally truncated) and contribute
    to the vocabulary; records without it are only noted by line number.
    Index 0 of the vocabulary is always reserved for '<PAD>'.

    Args:
        filepath: Path to the gzip-compressed file, one JSON object per line.
        cutoff: If truthy, keep only the first `cutoff` tokens of each
            review. The default (False) keeps every token.

    Returns:
        A dict with the keys:
            'line_count'      - number of lines read,
            'review_count'    - number of records that had 'reviewText',
            'vocab_size'      - number of vocabulary entries incl. '<PAD>',
            'no_text_reviews' - 1-based line numbers lacking 'reviewText',
            'labels'          - review index -> 1 ('positive') / 0 ('negative');
                                reviews with any other or missing sentiment
                                get no entry,
            'vocabulary'      - token -> integer index,
            'sentences'       - review index -> list of tokens,
            'inverse_vocab'   - integer index -> token.
    """
    tokenizer = TweetTokenizer()
    vocab = {'<PAD>': 0}
    no_reviewText = []
    sentences = {}
    labels = {}
    line_count = 0
    # Context manager guarantees the gzip handle is closed, even on errors.
    with gzip.open(filepath) as data:
        for line_count, line in enumerate(data, start=1):
            # Parse each line exactly once.
            record = json.loads(line)
            if 'reviewText' not in record:
                no_reviewText.append(line_count)
                continue
            tokens = tokenizer.tokenize(record['reviewText'])
            if cutoff:
                tokens = tokens[:cutoff]
            review_index = len(sentences)
            sentences[review_index] = tokens
            # .get() so a record without a sentiment field is skipped for
            # labelling instead of aborting the whole pass.
            sentiment = record.get('sentiment')
            if sentiment == 'positive':
                labels[review_index] = 1
            elif sentiment == 'negative':
                labels[review_index] = 0
            for word in tokens:
                if word not in vocab:
                    # Next free index equals the current vocabulary size
                    # ('<PAD>' already occupies index 0).
                    vocab[word] = len(vocab)
    inverse_vocab = {idx: token for token, idx in vocab.items()}
    return {'line_count' : line_count,
            'review_count' : len(sentences),
            # Previously reported from a counter that was never incremented,
            # so it was always 0; report the real vocabulary size.
            'vocab_size' : len(vocab),
            'no_text_reviews' : no_reviewText,
            'labels' : labels,
            'vocabulary' : vocab,
            'sentences' : sentences,
            'inverse_vocab' : inverse_vocab}
    
def create_onehot(vocab, sentences, tokenzier):
    """Build a binary bag-of-words matrix for a collection of sentences.

    Args:
        vocab: Mapping of token -> column index.
        sentences: Collection indexable by 0..len(sentences)-1 whose items
            are iterables of tokens.
        tokenzier: Unused; kept so existing call sites keep working.

    Returns:
        A torch tensor of shape (len(sentences), len(vocab)) where entry
        [i, j] is 1 if sentence i contains the token mapped to column j,
        and 0 otherwise. Tokens missing from `vocab` are ignored.
    """
    row_count = len(sentences)
    onehot = torch.zeros(row_count, len(vocab))
    for row in range(row_count):
        # Resolve the known tokens of this sentence to their columns first,
        # then switch those cells on.
        columns = [vocab[token] for token in sentences[row] if token in vocab]
        for column in columns:
            onehot[row, column] = 1
    return onehot

def create_batches(matrix, batch_size,labels):
    """Split a feature matrix and its labels into equally sized batches.

    Rows (and labels) that do not fill a complete batch are dropped.

    Args:
        matrix: 2-D torch tensor with one row per example.
        batch_size: Number of examples per batch.
        labels: Dict whose values are the numeric labels, in example order.

    Returns:
        A tuple (feats_batches, label_batches) with shapes
        (num_batches, batch_size, matrix.shape[1]) and
        (num_batches, batch_size, 1) respectively. The batch count of each
        is derived independently from the matrix and from the labels.
    """
    # Features: keep only the rows that fit into whole batches.
    feature_batch_count = int(len(matrix) / batch_size)
    usable_rows = matrix[:batch_size * feature_batch_count]
    feats_batches = usable_rows.view(
        feature_batch_count, batch_size, matrix.shape[1]
    )

    # Labels: same truncation, applied to the flattened label values.
    label_tensor = torch.FloatTensor(list(labels.values()))
    label_batch_count = int(len(label_tensor) / batch_size)
    usable_labels = label_tensor[:batch_size * label_batch_count]
    label_batches = usable_labels.view(label_batch_count, batch_size, 1)

    return feats_batches, label_batches

# Locations of the gzipped review files, keyed by split name.
paths = dict(
    train='../classification/music_reviews_train.json.gz',
    test='../classification/music_reviews_test_masked.json.gz',
    dev='../classification/music_reviews_dev.json.gz',
)
# Module-level tokenizer instance available to the cells below.
tokenizer = TweetTokenizer()

In [28]:
# Tokenize the training split, keeping at most the first 100 tokens per review.
train_data = sen_vectorizer(paths['train'], cutoff = 100)
# Binary bag-of-words matrix: one row per review, one column per vocabulary entry.
# The third argument is not used by create_onehot.
train_matrix = create_onehot(train_data['vocabulary'], train_data['sentences'], TweetTokenizer)

In [29]:
# Token list of the first training review (key 0 in 'sentences').
bingus = train_data['sentences'][0]

In [30]:
# Display the token list selected above.
bingus

['So',
 'creative',
 '!',
 'Love',
 'his',
 'music',
 '-',
 'the',
 'words',
 ',',
 'the',
 'message',
 '!',
 'Some',
 'of',
 'my',
 'favorite',
 'songs',
 'on',
 'this',
 'CD',
 '.',
 'I',
 'should',
 'have',
 'bought',
 'it',
 'years',
 'ago',
 '!']

In [37]:
# Map the tokens of the example review to their vocabulary indices.
example_sequence = [train_data['vocabulary'][word] for word in bingus]
window_size = 2
# Generate (target, context) pairs only; negative_samples=0 requests no negative pairs.
# The second return value is discarded.
# NOTE(review): confirm train_data['vocab_size'] equals len(train_data['vocabulary'])
# before relying on vocabulary_size here.
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=train_data['vocab_size'],
      window_size=window_size,
      negative_samples=0)
positive_skip_grams

[[14, 16],
 [21, 22],
 [19, 20],
 [5, 4],
 [3, 4],
 [20, 22],
 [16, 15],
 [23, 24],
 [3, 1],
 [5, 3],
 [3, 12],
 [16, 17],
 [4, 6],
 [8, 11],
 [10, 8],
 [3, 2],
 [24, 22],
 [11, 12],
 [24, 25],
 [4, 3],
 [10, 8],
 [10, 9],
 [23, 22],
 [27, 26],
 [3, 8],
 [25, 27],
 [11, 8],
 [18, 16],
 [12, 11],
 [11, 10],
 [2, 1],
 [22, 24],
 [7, 5],
 [19, 17],
 [14, 15],
 [13, 14],
 [13, 3],
 [3, 13],
 [8, 10],
 [3, 27],
 [9, 10],
 [27, 25],
 [19, 21],
 [8, 10],
 [7, 8],
 [5, 7],
 [12, 14],
 [25, 26],
 [2, 3],
 [21, 23],
 [4, 5],
 [8, 9],
 [8, 6],
 [3, 5],
 [20, 18],
 [21, 19],
 [24, 26],
 [6, 7],
 [7, 9],
 [12, 13],
 [21, 20],
 [19, 18],
 [15, 17],
 [4, 2],
 [6, 8],
 [8, 9],
 [3, 26],
 [18, 19],
 [8, 3],
 [9, 8],
 [13, 12],
 [2, 4],
 [15, 14],
 [23, 25],
 [17, 18],
 [24, 23],
 [15, 16],
 [22, 23],
 [26, 3],
 [1, 2],
 [17, 16],
 [6, 5],
 [10, 11],
 [20, 19],
 [17, 19],
 [15, 13],
 [7, 6],
 [23, 21],
 [16, 14],
 [17, 15],
 [1, 3],
 [3, 11],
 [12, 3],
 [22, 21],
 [14, 12],
 [11, 3],
 [25, 23],
 [22, 20

In [33]:
# Print the first five (target, context) index pairs next to the tokens they map to.
for target, context in positive_skip_grams[:5]:
    print(f"({target}, {context}): ({train_data['inverse_vocab'][target]}, {train_data['inverse_vocab'][context]})")

(25, 27): (it, ago)
(5, 6): (his, music)
(17, 19): (on, CD)
(8, 7): (the, -)
(20, 22): (., should)


In [35]:
# Generate skip-grams for every review and store them under a running counter.
# `window_size` must already be defined by another cell.
# NOTE(review): the sequences passed here are the token lists from
# train_data['sentences'], not the integer indices used for example_sequence —
# confirm this is intended.
# NOTE(review): each stored value is the full return value of skipgrams, which
# the example cell unpacks into two parts — confirm both parts are wanted.
skip_grams = {}
counter = 0
for line in train_data['sentences']:
    a = tf.keras.preprocessing.sequence.skipgrams(
    train_data['sentences'][line],
    vocabulary_size=train_data['vocab_size'],
    window_size=window_size,
    negative_samples=0)
    skip_grams[counter] = a
    counter+=1

In [25]:
# Scratch cell: tokenize a sample string with the shared tokenizer.
bruh = 'this is a string'
lol = tokenizer.tokenize(bruh)
# IPython help query on the tokenizer object (not plain Python syntax).
tokenizer?

In [36]:
# Display the skip-grams stored under key 0 (the first review processed).
skip_grams[0]

([['my', 'favorite'],
  ['the', 'music'],
  ['songs', 'favorite'],
  ['have', 'should'],
  ['.', 'I'],
  ['CD', 'on'],
  ['favorite', 'on'],
  ['it', 'ago'],
  ['!', 'his'],
  ['should', '.'],
  ['years', '!'],
  ['on', 'favorite'],
  ['!', 'Some'],
  ['message', ','],
  ['!', 'ago'],
  ['music', 'the'],
  ['I', 'CD'],
  ['favorite', 'of'],
  ['-', 'music'],
  ['So', '!'],
  ['I', 'have'],
  ['!', 'creative'],
  ['!', 'years'],
  ['!', 'the'],
  ['of', 'my'],
  ['should', 'bought'],
  ['ago', 'years'],
  ['his', 'music'],
  ['!', 'Love'],
  ['of', '!'],
  ['Some', '!'],
  ['creative', 'Love'],
  ['bought', 'years'],
  ['I', 'should'],
  ['Some', 'my'],
  ['on', 'CD'],
  ['ago', 'it'],
  ['CD', 'this'],
  ['his', '!'],
  ['words', '-'],
  [',', 'the'],
  ['Love', '!'],
  ['bought', 'it'],
  ['have', 'I'],
  ['the', 'words'],
  ['his', 'Love'],
  ['songs', 'my'],
  ['!', 'So'],
  ['words', ','],
  ['of', 'Some'],
  ['words', 'the'],
  [',', 'message'],
  ['!', 'of'],
  ['Love', 'music'],