In [50]:
import sys
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from collections import defaultdict
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import pickle
from scipy import sparse
from sklearn.model_selection import train_test_split
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [51]:
import warnings
warnings.filterwarnings('ignore')

In [52]:
class SentimentClassifier(nn.Module):

  def __init__(self):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained('bert-base-cased')
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, 3)
  
  def forward(self, input_ids, attention_mask):
    bert_out = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    pooled_output = bert_out['pooler_output']
    # print(pooled_output)
    output = self.drop(pooled_output)
    prob = F.softmax(self.out(output))
    return prob

In [53]:
MAX_LEN = 250

In [54]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [55]:
# model = torch.load('../models/bert_sentiment_model.pt', map_location=torch.device('cpu'))
model = SentimentClassifier()
model.load_state_dict(torch.load('../models/bert_sentiment_model.pt', map_location=torch.device('cpu')))
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [56]:
def get_sentence_sentiment(s):
    text = s
    encoded_review = tokenizer.encode_plus(
    text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    output = model(input_ids, attention_mask)
    return int(torch.argmax(output) - 1)


# in this function, we need the scalar, the n (number of rows and columns), and m, the row to apply the scalar to
def generate_matrix(scalar, n, r):
    mat = np.identity(n)
    mat[:,r] *= scalar
    return mat

In [57]:
with open("../pickles/masks.pkl","rb") as f:
    mask_dict = pickle.load(f)
# need to process this to be able to use it
mask_vectors = list(mask_dict.values())
mask_vectors = [i.todense() for i in mask_vectors]
mask_matrix = np.stack(mask_vectors)

with open("../pickles/restaurant_tfidf_dict.pkl","rb") as f:
    rid_sid_dict = pickle.load(f)

with open("../pickles/restaurant_sentence_list_dict.pkl","rb") as f:
    rid_tfidf_dict = pickle.load(f)

In [58]:
# iterate through the rids
rids = list(rid_sid_dict.keys())
num_rids = len(rids)

rid_feature_dict = {}
for i in range(num_rids):
    # progress tracker
    if (i % 10 == 0) :
        print((i / num_rids) * 100, '% complete')
        
    rid = rids[i]
    # get the sentences in order
    sentences = list(rid_sid_dict[rid].values)
    sentiments = np.array([get_sentence_sentiment(s) for s in sentences])
    
#     print('sentiments calculated: ', len(sentiments))
     
    tfidf_matrix = rid_tfidf_dict[rid]
    tfidf_matrix_w = tfidf_matrix.copy()
    tfidf_matrix_w = (tfidf_matrix_w.T.multiply(sentiments)).T # multiply the sentiments through for weighted matrix
    tfidf_matrix_w = np.array(tfidf_matrix_w.todense())
    tfidf_matrix = np.array(tfidf_matrix.todense())
        
    # sum over all columns of the matrix, for both weighted and not weighted
    freq_vec = np.sum(tfidf_matrix, axis=0)
    freq_vec_w = np.sum(tfidf_matrix_w, axis=0)
    
    # normalize the vectors
    norm = np.linalg.norm(freq_vec)
    norm_w = np.linalg.norm(freq_vec_w)
    freq_vec = freq_vec / norm
    freq_vec_w = freq_vec_w / norm_w
    
    # apply the mask for params and save both weighted and non weighted vectors for this rid
    rid_feature_dict[rid] = (np.matmul(mask_matrix, freq_vec.T).T, np.matmul(mask_matrix, freq_vec_w.T).T)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


0.0 % complete
10.0 % complete
20.0 % complete
30.0 % complete
40.0 % complete
50.0 % complete
60.0 % complete
70.0 % complete
80.0 % complete
90.0 % complete


In [59]:
# pickle the dictionary
with open('../pickles/rid_feature_dict.pkl', 'wb') as handle:
    pickle.dump(rid_feature_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [78]:
len(rid_feature_dict)

high_so_far = 0
high_rid = None
rid_scores = []
for key, value in rid_feature_dict.items():
    rid_scores.append((key, sum(value[1]).item()))
    if sum(value[1]).item() > high_so_far:
        high_so_far = sum(value[1]).item()
        high_rid = key

rid_scores = sorted(rid_scores, key=lambda x: x[1], reverse = True)
rid_scores

[('X9sz3xeaLujW9PhsSLCQyg', 0.14871434114858917),
 ('VPqWLp9kMiZEbctCebIZUA', 0.1345436262560355),
 ('kvXWN5kB7CEzmMCf557Xug', 0.1256071647116429),
 ('JgjMHbZ4A407ZOmvEEoa6g', 0.11327886708919),
 ('GDs0ymtRPWWHlUMBfNT5yg', 0.10766604287171508),
 ('SCwzUgW_RUdahXNObknkTg', 0.10476257632362315),
 ('p2BkIrOuIsxGqtV0lwOZUw', 0.10181158419779535),
 ('eUbq0uNxRlXQ6sy7phM7yA', 0.0980889325752394),
 ('AvXqLbcGCxdIEF_qZTY0Kw', 0.09803051194041147),
 ('8zehGz9jnxPqXtOc7KaJxA', 0.09634001588805935),
 ('C--wIpxJ4j1y01G_raHdbA', 0.09223970154339176),
 ('DiRIdYhGyuTNZurKyuWf7A', 0.08739411298674277),
 ('zmZ3HkVCeZPBefJJxzdJ7A', 0.08159229670766677),
 ('0WqROvvlHjvpeHIP0fg9EQ', 0.08073619180284945),
 ('CoZmZKv2lCYd-UoAsAUobA', 0.07794962362088068),
 ('hpz2qRnei2IJROLJUJVvPQ', 0.07574199166562945),
 ('paHbKmPjwirnIP7esoTPCQ', 0.07479472345968627),
 ('mOnesB4IF9j6-ZmHoOHOig', 0.07209447350170148),
 ('DMC9ZMkDHNQmlhdPDmz-Cw', 0.07206927920029216),
 ('xdpH27x6qGSG21LLa6TaXQ', 0.07130060436626756),
 ('bZi

In [60]:
rid_feature_dict['zmZ3HkVCeZPBefJJxzdJ7A']

(matrix([[0.06492115],
         [0.00284262]]),
 matrix([[0.04342017],
         [0.00556748]]))