# User input

In [None]:
# This session should be connected to GPU

In [None]:
# path of the saved sentiment classification model
model_loc = "/content/drive/MyDrive/roberta_model"

# path of the saved tokenizer used together with the model
tokenizer_loc = "/content/drive/MyDrive/roberta_tokenizer"

# CSV file that holds the source sentences
source_dataset_loc = "/content/drive/MyDrive/nasdaq.csv"

# folder that receives every dataset produced by the extraction
lexicon_folder_loc = "/content/drive/MyDrive/nasdaq"

# Installations

In [None]:
!pip install transformers

In [None]:
pip install pytorch_transformers

In [None]:
pip install shap

In [None]:
pip install shortuuid

# Word Extraction Class

In [None]:
import pandas as pd
import numpy as np
import torch
import transformers
import shap
from datetime import datetime
import pytz
from transformers import pipeline
import shortuuid
from os import listdir
from os.path import isfile, join

class WordExtractor:
  """Collect per-word SHAP contributions of a sentiment classifier.

  For every sentence the extractor stores the model prediction and the SHAP
  explanation in a log, and files each token under a positive or a negative
  word list depending on the sign of its SHAP value for the predicted label.
  The lists are written to CSV files named ``<execution id>--<prefix><dataset>.csv``
  so that an interrupted session can be resumed from the files in ``drive_loc``.
  """

  def __init__(self, model, explainer, drive_loc):
    """Store the collaborators and initialise the empty result lists.

    Args:
      model: callable that receives a list of sentences and returns, per
        sentence, a sequence of dicts with a ``'score'`` entry per class.
      explainer: callable that receives a list of sentences and returns an
        object exposing ``data``, ``values`` and ``base_values``.
      drive_loc: folder where the datasets are saved; a trailing ``/`` is
        appended when missing.
    """
    self.model = model
    self.explainer = explainer

    self.drive_loc = drive_loc if drive_loc.endswith('/') else f'{drive_loc}/'
    # id of this execution, used as the prefix of every file it saves
    self.log_uuid = shortuuid.uuid()

    # positions of the classes inside a prediction
    self.negative_class = 0
    self.positive_class = 1
    self.score_col = 'score'

    self.log_df_list = []
    self.positive_words_df_list = []
    self.negative_words_df_list = []

    self.log_columns = ['uuid', 'sentence', 'positive_score', 'negative_score', 'label', 'values', "base_values", "data"]
    self.pos_neg_columns = ['word', 'count', 'shap_values', 'sentence_uuid']

    self.tz_utc = pytz.timezone('UTC')

  def extracting_sentence_label(self, sentence):
    """Predict the label of ``sentence``.

    Returns:
      Tuple ``(sentence, label, neg_score, pos_score)``; ties are resolved in
      favour of the negative class.
    """
    # prediction
    pred = self.model([sentence])[0]

    # extracting positive and negative score from the prediction
    neg_score = pred[self.negative_class][self.score_col]
    pos_score = pred[self.positive_class][self.score_col]

    # determining the label
    label = self.negative_class if neg_score >= pos_score else self.positive_class

    return sentence, label, neg_score, pos_score

  def shap_explaining_results(self, sentence):
    """Explain the prediction for ``sentence`` with the SHAP explainer.

    Returns:
      Tuple ``(words, values, base_value)`` taken from the first (only)
      entry of the explanation's ``data``, ``values`` and ``base_values``.
    """
    shap_values = self.explainer([sentence])

    words = shap_values.data[0]
    values = shap_values.values[0]
    base_value = shap_values.base_values[0]
    return words, values, base_value

  def log_item(self, uuid, sentence, pos_score, neg_score, label, words, values, base_value):
    """Append one row to the log, ordered as in ``self.log_columns``."""
    self.log_df_list.append([uuid, sentence, pos_score, neg_score, label, values, base_value, words])

  def add_new_word(self, word, probability, dataset, uuid):
    """Append ``word`` to ``dataset`` with count 1 and the value stored as a string."""
    dataset.append([word, 1, str(probability), uuid])

  def save_location(self, location, type_dataset, prefix):
    """Build the CSV path of a dataset.

    Args:
      location: ``'drive'`` to target ``self.drive_loc``; any other value
        targets ``/content/``.
      type_dataset: dataset name used in the file name.
      prefix: text placed right before the dataset name (e.g. ``'tmp_'``).
    """
    if location == 'drive':
      return f'{self.drive_loc}{self.log_uuid}--{prefix}{type_dataset}.csv'

    return f'/content/{self.log_uuid}--{prefix}{type_dataset}.csv'

  def create_df(self):
    """Return ``(log_df, positive_df, negative_df)`` built from the collected lists."""
    log_df = pd.DataFrame(self.log_df_list, columns = self.log_columns)
    positive_df = pd.DataFrame(self.positive_words_df_list, columns = self.pos_neg_columns)
    negative_df = pd.DataFrame(self.negative_words_df_list, columns = self.pos_neg_columns)

    return log_df, positive_df, negative_df

  def save_datasets(self, prefix='tmp_', location = None):
    """Write the three datasets to CSV and return them as dataframes.

    Args:
      prefix: file name prefix; the default marks the files as temporary.
      location: forwarded to ``save_location``.
    """
    log_df, positive_df, negative_df = self.create_df()

    log_df.to_csv(self.save_location(location, 'log_dataset', prefix), index=False)
    positive_df.to_csv(self.save_location(location, 'positive_words', prefix), index=False)
    negative_df.to_csv(self.save_location(location, 'negative_words', prefix), index=False)

    return log_df, positive_df, negative_df

  def print_log(self, count):
    """Print the progress with the current UTC time and save temporary datasets."""
    print('Printing line number ' + str(count))
    datetime_utc = datetime.now(self.tz_utc)
    print("Current time:", datetime_utc.strftime("%H:%M:%S"))
    self.save_datasets()

  def classify_word(self, word, shap_value, label, uuid):
    """File ``word`` under the positive or the negative word list.

    A non-negative SHAP value for the predicted ``label`` puts the lowercased
    word in the list of that label; a negative value puts it, with its
    absolute value, in the list of the opposite label.
    """
    word = word.lower()
    contribution = shap_value[label]

    if label == self.negative_class:
      supporting, opposing = self.negative_words_df_list, self.positive_words_df_list
    else:
      supporting, opposing = self.positive_words_df_list, self.negative_words_df_list

    if contribution >= 0:
      self.add_new_word(word, contribution, supporting, uuid)
    else:
      self.add_new_word(word, abs(contribution), opposing, uuid)

  def execute_process(self, sentence):
    """Predict, explain and log one sentence and classify its words."""
    # sentence uuid attached to every extracted word, so the origin sentence
    # of a word can be identified in the datasets
    uuid = shortuuid.uuid()

    # every 100 sentences: log the progress and save partial datasets, in case
    # the Google Colab session breaks
    count = len(self.log_df_list)
    if count % 100 == 0:
      self.print_log(count)
      self.save_datasets(location = 'drive')

    # label and the scores of the negative and positive class
    sentence, label, neg_score, pos_score = self.extracting_sentence_label(sentence)

    # words, shap values and base values from the SHAP explainer
    words, values, base_value = self.shap_explaining_results(sentence)

    self.log_item(uuid, sentence, pos_score, neg_score, label, words, values, base_value)

    # plain loop: classify_word is called only for its side effects
    for word, shap_value in zip(words, values):
      self.classify_word(word, shap_value, label, uuid)

  def run_extraction(self, df, text_column):
    """Process every sentence of ``df`` that earlier executions did not log.

    Args:
      df: dataframe with the source sentences.
      text_column: name of the column that holds the sentences.

    Returns:
      Tuple ``(log_df, positive_df, negative_df)`` of this execution.

    Raises:
      Exception: when the saved logs already cover all rows of ``df``.
    """
    start_number_point = self.extract_start_point()
    sentences_left = len(df) - start_number_point
    # <= 0: the logs may hold more rows than df when df is not the dataset
    # the earlier executions ran on
    if sentences_left <= 0:
      raise Exception('No more sentences left for processing. Please continue with the next steps')
    print(f'{sentences_left} sentences are left for processing')

    for sentence in df[text_column].iloc[start_number_point:]:
      self.execute_process(sentence)

    # final datasets: once in the drive folder, once in the default location
    self.save_datasets(prefix = '', location = 'drive')
    return self.save_datasets('')

  @staticmethod
  def _split_file_name(file_location):
    """Return ``(execution id, dataset part)`` of a saved file path.

    The folder part of the path is dropped, so folder names never take part
    in the matching. A name without ``--`` yields the whole name as the id
    and an empty dataset part.
    """
    file_name = file_location.split('/')[-1]
    execution, _, dataset = file_name.partition('--')
    return execution, dataset

  def extract_rows_number_from_execution(self, exec, files_locations):
    """Return the number of rows logged by one execution.

    The final log file of the execution is preferred; its temporary log file
    is used when no final one exists.

    Args:
      exec: execution id (the file name part before ``--``). The name shadows
        the builtin and is kept only for interface compatibility.
      files_locations: paths of the candidate files.

    Returns:
      Row count of the selected log file, or 0 when the execution has none.
    """
    log_files = []
    for file_location in files_locations:
      execution, dataset = self._split_file_name(file_location)
      if execution == exec and 'log' in dataset:
        log_files.append((dataset, file_location))

    if not log_files:
      return 0

    final_logs = [file_location for dataset, file_location in log_files if 'tmp' not in dataset]
    log_file = final_logs[0] if final_logs else log_files[0][1]
    return len(pd.read_csv(log_file))

  def extract_start_point(self):
    """Return how many sentences the executions saved in ``drive_loc`` logged."""
    files_locations = [join(self.drive_loc, f) for f in listdir(self.drive_loc) if isfile(join(self.drive_loc, f))]
    if not files_locations:
      return 0

    executions = {self._split_file_name(f)[0] for f in files_locations}
    return sum(self.extract_rows_number_from_execution(execution, files_locations) for execution in executions)

# Explainer creation

In [None]:
import shap
import pandas as pd
import numpy as np
import torch
import transformers
from datetime import datetime
import pytz
from transformers import pipeline


# load the saved model and tokenizer objects from the configured locations
# (to run on CPU, add map_location=torch.device('cpu') to both torch.load calls)
# NOTE(review): recent PyTorch releases may default torch.load to
# weights_only=True, which can reject fully pickled objects — confirm against
# the installed version
model = torch.load(model_loc)
tokenizer = torch.load(tokenizer_loc)

# sentiment-analysis pipeline on device 0 that returns the score of every label
# (to run on CPU, drop the device argument)
md = pipeline(
  task='sentiment-analysis',
  model=model,
  tokenizer=tokenizer,
  return_all_scores=True,
  device=0,
)

# request sigmoid as the activation function of the pipeline
# NOTE(review): whether assigning this attribute after construction affects the
# scores depends on the installed transformers version — confirm
md.function_to_apply = 'sigmoid'

# wrap the pipeline for SHAP without rescaling the scores to logits
shap_p = shap.models.TransformersPipeline(md, rescale_to_logits=False)
# explainer built on top of the wrapped pipeline
explainer = shap.Explainer(shap_p)

# Words Collection

In [None]:
# load the sentences that have to be processed
df = pd.read_csv(source_dataset_loc)

# build the extractor around the pipeline, the explainer and the output folder
word_extractor = WordExtractor(model=md, explainer=explainer, drive_loc=lexicon_folder_loc)
# run the extraction on the 'text' column and keep the three resulting dataframes
log_df, positive_df, negative_df = word_extractor.run_extraction(df=df, text_column='text')