# Notebook for experimenting and trying to improve model

### Ideas for improving the model
- Stopwords
- Word2Vec
- Bigger n-grams
- BERT?
- Max words

In [23]:
import gzip
import json
import torch 
import torch.nn as nn
from nltk.tokenize import TweetTokenizer
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import classification_report
import tensorflow as tf

In [2]:
def build_vocab(filepath, padding = False):
    """Collect raw reviews, labels and a token vocabulary from a gzipped file.

    Every line of the file is parsed as one JSON object. Records that have a
    'reviewText' key are kept: the raw text is stored, the 'sentiment' field
    is turned into a label, and each token produced by ``TweetTokenizer`` is
    given the next free integer id the first time it is seen.

    Args:
        filepath: Path of the gzip-compressed file, one JSON object per line.
        padding: When truthy, id 0 is reserved for the '<PAD>' token.

    Returns:
        A dict with the keys
        'line_count' (number of lines read),
        'review_count' (number of records with a 'reviewText'),
        'vocab_size' (number of entries in the vocabulary),
        'no_text_reviews' (1-based line numbers without a 'reviewText'),
        'labels' (review index -> 1 for 'positive', 0 for 'negative'; reviews
        with any other sentiment value get no entry),
        'vocabulary' (token -> id) and
        'sentences' (review index -> raw review text).

    Raises:
        KeyError: If a record has 'reviewText' but no 'sentiment' key.
    """
    vocab = {}
    if padding:
        vocab['<PAD>'] = 0
    sentences = {}
    labels = {}
    no_review_text = []
    line_count = 0
    tokenizer = TweetTokenizer()
    # The context manager closes the gzip handle even if parsing fails.
    with gzip.open(filepath) as reviews:
        for line_count, line in enumerate(reviews, start=1):
            # Parse each line once and reuse the record.
            record = json.loads(line)
            if 'reviewText' not in record:
                no_review_text.append(line_count)
                continue
            review_idx = len(sentences)
            text = record['reviewText']
            sentences[review_idx] = text
            sentiment = record['sentiment']
            if sentiment == 'positive':
                labels[review_idx] = 1
            elif sentiment == 'negative':
                labels[review_idx] = 0
            for word in tokenizer.tokenize(text):
                if word not in vocab:
                    # Ids are dense, so the next free id is the current size.
                    vocab[word] = len(vocab)
    return {'line_count' : line_count,
            'review_count' : len(sentences),
            'vocab_size' : len(vocab),
            'no_text_reviews' : no_review_text,
            'labels' : labels,
            'vocabulary' : vocab,
            'sentences' : sentences}

def sen_vectorizer(filepath, cutoff = False):
    """Tokenize reviews from a gzipped file and build an id vocabulary.

    Every line of the file is parsed as one JSON object. Records that have a
    'reviewText' key are tokenized with ``TweetTokenizer`` (optionally
    truncated), their 'sentiment' field is turned into a label, and each new
    token is given the next free integer id. Id 0 is always reserved for
    '<PAD>'.

    Args:
        filepath: Path of the gzip-compressed file, one JSON object per line.
        cutoff: When truthy, keep only the first ``cutoff`` tokens of each
            review.

    Returns:
        A dict with the keys
        'line_count' (number of lines read),
        'review_count' (number of records with a 'reviewText'),
        'vocab_size' (number of vocabulary entries, '<PAD>' included),
        'no_text_reviews' (1-based line numbers without a 'reviewText'),
        'labels' (review index -> 1 for 'positive', 0 for 'negative'; reviews
        with any other sentiment value get no entry),
        'vocabulary' (token -> id),
        'sentences' (review index -> list of tokens) and
        'inverse_vocab' (id -> token).

    Raises:
        KeyError: If a record has 'reviewText' but no 'sentiment' key.
    """
    vocab = {'<PAD>': 0}
    sentences = {}
    labels = {}
    no_review_text = []
    line_count = 0
    tokenizer = TweetTokenizer()
    # The context manager closes the gzip handle even if parsing fails.
    with gzip.open(filepath) as data:
        for line_count, line in enumerate(data, start=1):
            # Parse each line once and reuse the record.
            record = json.loads(line)
            if 'reviewText' not in record:
                no_review_text.append(line_count)
                continue
            tokens = tokenizer.tokenize(record['reviewText'])
            if cutoff:
                tokens = tokens[:cutoff]
            review_idx = len(sentences)
            sentences[review_idx] = tokens
            sentiment = record['sentiment']
            if sentiment == 'positive':
                labels[review_idx] = 1
            elif sentiment == 'negative':
                labels[review_idx] = 0
            for word in tokens:
                if word not in vocab:
                    # Ids are dense and start after '<PAD>', so the next free
                    # id equals the current vocabulary size.
                    vocab[word] = len(vocab)
    inverse_vocab = {index: token for token, index in vocab.items()}
    return {'line_count' : line_count,
            'review_count' : len(sentences),
            # Derived from the vocabulary so it always matches the id range.
            'vocab_size' : len(vocab),
            'no_text_reviews' : no_review_text,
            'labels' : labels,
            'vocabulary' : vocab,
            'sentences' : sentences,
            'inverse_vocab' : inverse_vocab}
    
def create_onehot(vocab, sentences, tokenzier):
    """Build a binary bag-of-words matrix for a collection of token lists.

    Args:
        vocab: Mapping from token to column index.
        sentences: Container indexable by 0 .. len(sentences) - 1, each entry
            an iterable of tokens.
        tokenzier: Accepted for call compatibility; not used.

    Returns:
        A float tensor with one row per sentence and one column per vocabulary
        entry; a cell is 1 when that token occurs in the sentence, else 0.
        Tokens missing from ``vocab`` are ignored.
    """
    n_rows = len(sentences)
    onehot = torch.zeros(n_rows, len(vocab))
    row = 0
    while row < n_rows:
        for token in sentences[row]:
            if token not in vocab:
                continue
            onehot[row, vocab[token]] = 1
        row += 1
    return onehot

def create_batches(matrix, batch_size, labels):
    """Split a feature matrix and its labels into equally sized batches.

    Rows that do not fill a complete batch at the end are dropped; the same
    rule is applied separately to the features and to the labels.

    Args:
        matrix: 2-D tensor of features, one row per example.
        batch_size: Number of examples per batch.
        labels: Dict whose values are the numeric labels, in example order.

    Returns:
        A tuple ``(feats_batches, label_batches)`` viewed as
        (num_batches, batch_size, matrix.shape[1]) and
        (num_batches, batch_size, 1) respectively.
    """
    def _full_batches(n_items):
        # Number of complete batches that fit into n_items examples.
        return int(n_items / batch_size)

    feat_batch_count = _full_batches(len(matrix))
    usable_rows = matrix[:batch_size * feat_batch_count]
    feats_batches = usable_rows.view(feat_batch_count, batch_size, matrix.shape[1])

    label_tensor = torch.FloatTensor(list(labels.values()))
    label_batch_count = _full_batches(len(label_tensor))
    usable_labels = label_tensor[:batch_size * label_batch_count]
    label_batches = usable_labels.view(label_batch_count, batch_size, 1)
    return feats_batches, label_batches

# Locations of the gzipped review files for each data split.
paths = dict(
    train='../classification/music_reviews_train.json.gz',
    test='../classification/music_reviews_test_masked.json.gz',
    dev='../classification/music_reviews_dev.json.gz',
)
# Notebook-level tokenizer instance, available to the cells below.
tokenizer = TweetTokenizer()

In [3]:
# Tokenize the training reviews (first 100 tokens of each) and build the vocabulary.
train_data = sen_vectorizer(paths['train'], cutoff = 100)
# Pass the tokenizer instance rather than the TweetTokenizer class itself.
train_matrix = create_onehot(train_data['vocabulary'], train_data['sentences'], tokenizer)

In [4]:
# Keep the first tokenized training review as a small worked example.
bingus = train_data['sentences'][0]

In [5]:
# Display the tokens of the example review.
bingus

['So',
 'creative',
 '!',
 'Love',
 'his',
 'music',
 '-',
 'the',
 'words',
 ',',
 'the',
 'message',
 '!',
 'Some',
 'of',
 'my',
 'favorite',
 'songs',
 'on',
 'this',
 'CD',
 '.',
 'I',
 'should',
 'have',
 'bought',
 'it',
 'years',
 'ago',
 '!']

In [6]:
# Map the example review's tokens to their vocabulary ids.
example_sequence = [train_data['vocabulary'][word] for word in bingus]
window_size = 2
# Take vocabulary_size from the vocabulary itself so it always matches the
# range of ids that can appear in example_sequence.
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=len(train_data['vocabulary']),
      window_size=window_size,
      negative_samples=0)
positive_skip_grams

[[6, 4],
 [3, 13],
 [20, 22],
 [8, 11],
 [11, 12],
 [3, 4],
 [3, 12],
 [19, 20],
 [19, 21],
 [3, 5],
 [8, 6],
 [18, 17],
 [12, 11],
 [4, 6],
 [21, 19],
 [5, 6],
 [4, 2],
 [11, 3],
 [2, 1],
 [19, 18],
 [8, 9],
 [9, 8],
 [22, 23],
 [27, 3],
 [7, 6],
 [10, 8],
 [20, 19],
 [3, 8],
 [7, 5],
 [26, 24],
 [23, 25],
 [3, 26],
 [23, 21],
 [20, 18],
 [23, 22],
 [3, 1],
 [21, 22],
 [26, 3],
 [6, 7],
 [13, 12],
 [10, 8],
 [5, 3],
 [15, 16],
 [1, 2],
 [8, 7],
 [19, 17],
 [24, 26],
 [22, 20],
 [15, 14],
 [3, 2],
 [17, 18],
 [15, 17],
 [7, 9],
 [14, 16],
 [8, 9],
 [14, 12],
 [13, 15],
 [21, 20],
 [22, 24],
 [17, 19],
 [26, 27],
 [9, 7],
 [9, 10],
 [18, 20],
 [16, 14],
 [26, 25],
 [10, 11],
 [16, 18],
 [25, 24],
 [3, 11],
 [9, 8],
 [8, 10],
 [14, 15],
 [5, 7],
 [7, 8],
 [11, 8],
 [17, 15],
 [2, 3],
 [10, 9],
 [21, 23],
 [4, 5],
 [15, 13],
 [24, 25],
 [25, 27],
 [27, 25],
 [1, 3],
 [16, 15],
 [25, 23],
 [18, 19],
 [6, 8],
 [20, 21],
 [3, 27],
 [6, 5],
 [4, 3],
 [8, 10],
 [18, 16],
 [25, 26],
 [5, 4],
 [

In [7]:
# Show the first five (target, context) id pairs next to the tokens they
# stand for, looked up through the id -> token mapping.
for target, context in positive_skip_grams[:5]:
    print(f"({target}, {context}): ({train_data['inverse_vocab'][target]}, {train_data['inverse_vocab'][context]})")

(6, 4): (music, Love)
(3, 13): (!, of)
(20, 22): (., should)
(8, 11): (the, message)
(11, 12): (message, Some)


In [32]:
# Generate unshuffled positive skip-grams (window of 2) for every training
# review; each entry stores the full result returned by skipgrams().
skip_grams = {}
for counter, tokens in enumerate(train_data['sentences'].values()):
    example_sequence = [train_data['vocabulary'][word] for word in tokens]
    skip_grams[counter] = tf.keras.preprocessing.sequence.skipgrams(
        example_sequence,
        # Taken from the vocabulary itself so it always matches the id range.
        vocabulary_size=len(train_data['vocabulary']),
        window_size=2,
        negative_samples=0,
        shuffle=False)

In [33]:
# Quick check of the tokenizer on an arbitrary string.
bruh = 'this is a string'
lol = tokenizer.tokenize(bruh)

In [37]:
# Inspect the first element of the stored skip-gram result for review 0.
skip_grams[0][0]

[[1, 2],
 [1, 3],
 [2, 1],
 [2, 3],
 [2, 4],
 [3, 1],
 [3, 2],
 [3, 4],
 [3, 5],
 [4, 2],
 [4, 3],
 [4, 5],
 [4, 6],
 [5, 3],
 [5, 4],
 [5, 6],
 [5, 7],
 [6, 4],
 [6, 5],
 [6, 7],
 [6, 8],
 [7, 5],
 [7, 6],
 [7, 8],
 [7, 9],
 [8, 6],
 [8, 7],
 [8, 9],
 [8, 10],
 [9, 7],
 [9, 8],
 [9, 10],
 [9, 8],
 [10, 8],
 [10, 9],
 [10, 8],
 [10, 11],
 [8, 9],
 [8, 10],
 [8, 11],
 [8, 3],
 [11, 10],
 [11, 8],
 [11, 3],
 [11, 12],
 [3, 8],
 [3, 11],
 [3, 12],
 [3, 13],
 [12, 11],
 [12, 3],
 [12, 13],
 [12, 14],
 [13, 3],
 [13, 12],
 [13, 14],
 [13, 15],
 [14, 12],
 [14, 13],
 [14, 15],
 [14, 16],
 [15, 13],
 [15, 14],
 [15, 16],
 [15, 17],
 [16, 14],
 [16, 15],
 [16, 17],
 [16, 18],
 [17, 15],
 [17, 16],
 [17, 18],
 [17, 19],
 [18, 16],
 [18, 17],
 [18, 19],
 [18, 20],
 [19, 17],
 [19, 18],
 [19, 20],
 [19, 21],
 [20, 18],
 [20, 19],
 [20, 21],
 [20, 22],
 [21, 19],
 [21, 20],
 [21, 22],
 [21, 23],
 [22, 20],
 [22, 21],
 [22, 23],
 [22, 24],
 [23, 21],
 [23, 22],
 [23, 24],
 [23, 25],
 [24, 22],
 [24

In [66]:
# Tokenizer instance handed to rob_skipgram below.
tokenizer = TweetTokenizer()

def rob_skipgram(tokenized_sents, tokenizer, word2idx, window_size):
    """Build fixed-width context windows and their target ids.

    For every token position in every sentence, the ids of the
    ``window_size`` tokens before it and the ``window_size`` tokens after it
    are collected, in left-to-right order. Positions that fall outside the
    sentence are filled with the id of '<PAD>'.

    Args:
        tokenized_sents: Mapping from sentence key to a list of tokens.
        tokenizer: Accepted for call compatibility; not used.
        word2idx: Mapping from token to id; must contain '<PAD>' whenever
            padding is needed.
        window_size: Number of context tokens taken on each side.

    Returns:
        A tuple ``(fullData, labels)``: a list of context windows (each a
        list of 2 * window_size ids) and the list of matching target ids.

    Raises:
        KeyError: If a token (or '<PAD>', when needed) is not in ``word2idx``.
    """
    pad_token = '<PAD>'
    # Relative positions of the context tokens: furthest-left first, then
    # nearest-right to furthest-right.
    offsets = [-dist for dist in range(window_size, 0, -1)]
    offsets += list(range(1, window_size + 1))

    contexts, targets = [], []
    for key in tokenized_sents:
        tokens = tokenized_sents[key]
        n_tokens = len(tokens)
        for position in range(n_tokens):
            targets.append(word2idx[tokens[position]])
            window = []
            for offset in offsets:
                neighbour = position + offset
                if 0 <= neighbour < n_tokens:
                    window.append(word2idx[tokens[neighbour]])
                else:
                    window.append(word2idx[pad_token])
            contexts.append(window)
    return contexts, targets
# Build context windows (2 tokens on each side) and target ids for all
# training sentences, then convert both to tensors.
data, labels = rob_skipgram(train_data['sentences'], tokenizer, train_data['vocabulary'], 2)
labels = torch.tensor(labels)
data = torch.tensor(data)

In [67]:
# Size of the learned word vectors.
embed_dim = 64


class CBOW(nn.Module):
    """Continuous bag-of-words model that returns its own training loss.

    Context ids are embedded, the embeddings are summed over the first
    dimension, and a linear layer scores every vocabulary entry. The forward
    pass returns the cross-entropy between those scores and the gold ids.
    """

    def __init__(self, emb_dim, vocab_dim):
        """Create the layers.

        Args:
            emb_dim: Dimensionality of the embedding vectors.
            vocab_dim: Number of vocabulary entries (embedding rows and
                output scores).
        """
        super().__init__()
        # nn.EmbeddingBag could replace this and make the sum in forward()
        # unnecessary.
        self.embeddings = nn.Embedding(vocab_dim, emb_dim)
        self.linear = nn.Linear(emb_dim, vocab_dim)
        self.loss_function = nn.CrossEntropyLoss()

    def forward(self, inputs, gold):
        """Return the cross-entropy loss of predicting ``gold`` from ``inputs``.

        Args:
            inputs: Tensor of context token ids; embeddings are summed over
                its first dimension.
            gold: Tensor of target ids passed to the loss function.
        """
        summed = self.embeddings(inputs).sum(dim=0)
        scores = self.linear(summed)
        return self.loss_function(scores, gold)


# Instantiate the model with one embedding row and one output score per
# vocabulary entry, then show its layer structure.
cbow = CBOW(embed_dim,len(train_data['vocabulary']))
print(cbow)

CBOW(
  (embeddings): Embedding(84746, 64)
  (linear): Linear(in_features=64, out_features=84746, bias=True)
  (loss_function): CrossEntropyLoss()
)


In [None]:
import torch.optim as optim

# Train the CBOW model with plain SGD, one context window per update.
optimizer = optim.SGD(cbow.parameters(), lr=0.001)

for epoch in range(10):  # loop over the dataset multiple times
    running_loss = 0.0
    for window, label in zip(data, labels):
        # Reshape to a column of context ids and a single-element target.
        window = window.view(-1, 1)
        label = label.view(1)
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        # Call the module itself rather than .forward() so that any
        # registered hooks are run.
        loss = cbow(window, label)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Summed loss over all windows of this epoch.
    print(running_loss)

print('Finished Training')


In [62]:
# Check the container type of data (a plain list when this cell was run).
type(data)

list