In [75]:
import sys
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from collections import defaultdict
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [76]:
class SentimentClassifier(nn.Module):

  def __init__(self):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained('bert-base-cased')
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, 3)
  
  def forward(self, input_ids, attention_mask):
    bert_out = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    pooled_output = bert_out['pooler_output']
    # print(pooled_output)
    output = self.drop(pooled_output)
    prob = F.softmax(self.out(output))
    return prob

In [77]:
MAX_LEN = 250

In [78]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [79]:
# model = torch.load('../models/bert_sentiment_model.pt', map_location=torch.device('cpu'))
model = SentimentClassifier()
model.load_state_dict(torch.load('../models/bert_sentiment_model.pt', map_location=torch.device('cpu')))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [80]:
def get_sentence_sentiment(s):
    text = s
    encoded_review = tokenizer.encode_plus(
    text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    output = model(input_ids, attention_mask)
    return int(torch.argmax(output) - 1)


# in this function, we need the scalar, the n (number of rows and columns), and m, the row to apply the scalar to
def generate_matrix(scalar, n, r):
    mat = np.identity(n)
    mat[:,r] *= scalar
    return mat

In [74]:
with open("../pickles/masks.pkl","rb") as f:
    mask_dict = pickle.load(f)
print(mask_dict)

with open("../pickles/restaurant_tfidf_dict.pkl","rb") as f:
    rid_sid_dict = pickle.load(f)
print(rid_sid_dict)

with open("../pickles/restaurant_sentence_list_dict.pkl","rb") as f:
    rid_tfidf_dict = pickle.load(f)
print(rid_tfidf_dict)


# mask_dict = pickle.load("../script_data/masks.pkl")
# rid_sid_dict = pickle.load("../script_data/restaurant_tfidf_dict.pkl")
# rid_tfidf_dict = pickle.load("../script_data/restaurant_sentence_list_dict.pkl")

  mask_dict = pickle.load(f)
  rid_tfidf_dict = pickle.load(f)


{'quality': <1x44645 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>, 'price': <1x44645 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>}
{'-L69Ix0-xX4BlHA61fGvrQ': 33361                recommended want something quick fast
33347    word mouth advertising worst best depending cu...
33312                             twice seem mix order lot
33317                                                     
33309                           chicken express right year
                               ...                        
33350                                         disorganized
33314                                   chicken cross road
33340                                     meal count fresh
33323                                    told maybe minute
33372    managerbossowner right left car would said som...
Name: pre_process, Length: 131, dtype: object, '-_GnwXmzC3DXsHR9nyaC2g': 

{'-L69Ix0-xX4BlHA61fGvrQ': <131x44645 sparse matrix of type '<class 'numpy.float64'>'
	with 794 stored elements in Compressed Sparse Row format>, '-_GnwXmzC3DXsHR9nyaC2g': <2582x44645 sparse matrix of type '<class 'numpy.float64'>'
	with 15828 stored elements in Compressed Sparse Row format>, '05pmc_4J0TxoZrft1QxmJg': <72x44645 sparse matrix of type '<class 'numpy.float64'>'
	with 591 stored elements in Compressed Sparse Row format>, '0AzLzHfOJgL7ROwhdww2ew': <785x44645 sparse matrix of type '<class 'numpy.float64'>'
	with 5306 stored elements in Compressed Sparse Row format>, '0BGoel6on7yGvojzOqOEAQ': <1196x44645 sparse matrix of type '<class 'numpy.float64'>'
	with 7834 stored elements in Compressed Sparse Row format>, '0WqROvvlHjvpeHIP0fg9EQ': <1328x44645 sparse matrix of type '<class 'numpy.float64'>'
	with 7417 stored elements in Compressed Sparse Row format>, '1BvysshfkDS2eJ0k8XiDjQ': <439x44645 sparse matrix of type '<class 'numpy.float64'>'
	with 2344 stored elements in Compres

In [83]:
# imagine that we have 2 dictionaries
# one dictionary is rid to list of sentences
# one dictionary is rid to tf idf matrix

# iterate through the rids
rids = list(rid_sid_dict.keys())
num_rids = len(rids)

rid_feature_dict = {}
for i in range(num_rids):
    # progress tracker
    if (i % 10 == 0) :
        print(i / num_rids)
        
    rid = rids[i]
    # get the sentences in order
    sentences = list(rid_sid_dict[rid].values)
    sentiments = [get_sentence_sentiment(s) for s in sentences]
    
    
    # this is the number of rows in the tf idf matrix
    n = len(sentiments)
    
    # create the matrices for sentiments (to be able to multiply through)
    sentiment_matrices = []
    for i in range(0, n):
        sentiment_matrices.append(generate_matrix(sentiments[i], n, i))
        
    tfidf_matrix = rid_tfidf_dict[rid]
    tfidf_matrix_w = tfidf_matrix.copy()
    
    print(tfidf_matrix_w)
    print(sentiment_matrices)
    
    # for each sentiment matrix, multiply it by the tfidf weighted
    for m in sentiment_matrices:
        tfidf_matrix_w = np.matmul(m, tfidf_matrix_w)
        
    # sum over all columns of the  m
    # sum over all columns of the matrix, for both weighted and not weighted
    freq_vec = np.sum(tfidf_matrix, axis=0)
    freq_vec_w = np.sum(tfidf_matrix_w, axis=0)
    
    # apply the mask for params and save both weighted and non weighted vectors for this rid
    rid_feature_dict[rid] = (np.matmul(param_mask, freq_vec.T).T, np.matmul(param_mask, freq_vec_w.T).T)


0.0


  prob = F.softmax(self.out(output))


  (0, 42883)	0.39181708098508566
  (0, 36746)	0.41212775699483933
  (0, 32250)	0.48920111785766207
  (0, 31684)	0.4685433966665391
  (0, 14202)	0.46666844621178893
  (1, 43944)	0.35982364875798567
  (1, 43888)	0.36031383274669215
  (1, 35158)	0.20901712415634563
  (1, 25510)	0.32880765049265176
  (1, 10597)	0.4189925621302442
  (1, 9760)	0.2985228572832634
  (1, 3680)	0.2263953581723859
  (1, 507)	0.5224679252066659
  (2, 40969)	0.491693380435377
  (2, 34836)	0.4910913295597493
  (2, 27766)	0.33028843526215707
  (2, 25041)	0.5148271824441424
  (2, 22983)	0.37806011085007485
  (4, 44265)	0.4622131926820483
  (4, 33349)	0.4123196043541221
  (4, 13804)	0.6867288612051954
  (4, 7047)	0.38046679167010555
  (5, 37179)	0.3122854549761323
  (5, 30735)	0.3921477381189993
  (5, 24743)	0.44469251112210606
  :	:
  (124, 1566)	0.4236964360284717
  (125, 40995)	0.25689280636480616
  (125, 28820)	0.3129817392892757
  (125, 27766)	0.22717825837542158
  (125, 25013)	0.38603723572159443
  (125, 19659)	0

ValueError: matmul: Input operand 1 does not have enough dimensions (has 0, gufunc core with signature (n?,k),(k,m?)->(n?,m?) requires 1)

In [54]:
rid_sid_dict[list(rid_sid_dict.keys())[0]].values[0]

'recommended want something quick fast'

In [30]:
rid_sid_dict

{'-L69Ix0-xX4BlHA61fGvrQ': 33361                recommended want something quick fast
 33347    word mouth advertising worst best depending cu...
 33312                             twice seem mix order lot
 33317                                                     
 33309                           chicken express right year
                                ...                        
 33350                                         disorganized
 33314                                   chicken cross road
 33340                                     meal count fresh
 33323                                    told maybe minute
 33372    managerbossowner right left car would said som...
 Name: pre_process, Length: 131, dtype: object,
 '-_GnwXmzC3DXsHR9nyaC2g': 8282    take seat patio overlook parking lot watch car...
 8392    chicken bean andmy wife ordered guacamole pret...
 8107    justlyfamous avocado margarita perfect unlike ...
 8160    anyway last meal austin sadly way leave town g...
 914

In [84]:
generate_matrix(2, 6, 1)

array([[1., 0., 0., 0., 0., 0.],
       [0., 2., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [None]:

rid_tfidf_dict[rid]

In [89]:
rid = rids[0]
tfidf_matrix = rid_tfidf_dict[rid]
tfidf_matrix

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]