In [None]:
!pip3 install lime #if running for first time, uncomment this line

In [None]:
import lime
import sklearn
import sklearn.ensemble
import sklearn.metrics
import sys
import numpy
import pandas
import nltk


## Fetching data, training a classifier

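In the [previous tutorial](http://marcotcr.github.io/lime-ml/tutorials/Lime%20-%20basic%20usage%2C%20two%20class%20case.html), we looked at lime in the two class case. This notebook is adapted from the follow-up tutorial, which uses the [20 newsgroups dataset](http://scikit-learn.org/stable/datasets/#the-20-newsgroups-text-dataset) again, but this time using all of the classes. Here, instead of 20 newsgroups, we load the dataset selected by `DATA_ID` below and train a binary classifier for one label (`specific_class`) against all other examples.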

In [None]:
# Colab stuff
from IPython.display import clear_output
from IPython.display import Javascript
#%tensorflow_version 1.x

!git clone -b master https://github.com/huridocs/classification-utils.git classification_utils

if not 'classification_utils' in sys.path:
  sys.path += ['classification_utils']

# import python modules
from utils import modeling, optimization, tokenization
from utils.analysis import plot_category_distribution

# gcsfs for saving dev/test set
!pip3 install gcsfs

In [None]:
# Connect to google cloud
# Imports the auth helper from the google.colab package and calls authenticate_user().
# NOTE(review): google.colab is presumably only importable on Colab runtimes — confirm
# before running this notebook elsewhere.
from google.colab import auth
auth.authenticate_user()

In [None]:
#@title Load
# NOTE(review): `load` and `utils` are presumably modules from the cloned
# classification_utils checkout that an earlier cell put on sys.path — confirm.
from load import load_data, load_unique_labels
from utils import io

#@markdown Get possible data ids from https://github.com/huridocs/classification-utils/blob/master/config.yml
DATA_ID = 'UHRI_affected_persons' #@param ["UPR", "PlanInternational_themes", "PlanInternational_persons", "PlanInternational_paragraphs", "UHRI_themes", "UHRI_affected_persons", "UHRI_sdgs", "UPR_action"]
# Path of the configuration file inside the cloned repository directory.
cfg_path = 'classification_utils/config.yml'

# Later cells call .sample()/.drop() on `data` and read its 'text' and
# 'one_hot_labels' columns.
data = load_data(cfg_path, DATA_ID)
# Label names; a later cell looks up a class name in this collection and uses the
# resulting position to index each one-hot label vector.
all_labels = load_unique_labels(data)

In [None]:
#@title Train/dev/test split

train_fraction = 0.8 #@param {type:"slider", min:0.0, max:1.0, step:0.05}

# Draw the training rows with a fixed seed; every row whose index was not drawn
# becomes a test row. Only these two splits are produced here.
train_values = data.sample(frac=train_fraction, random_state=42)
test_values = data.drop(train_values.index)

# Report the size of the full data set and of each split.
for caption, subset in (
  ('# of total examples: {}', data),
  ('# of train examples: {}', train_values),
  ('# of test examples: {}', test_values),
):
  print(caption.format(len(subset)))

In [None]:
# fetch_20newsgroups is not called in the visible cells.
# NOTE(review): confirm nothing depends on this import's side effects before removing it.
from sklearn.datasets import fetch_20newsgroups

# Raw document texts of each split (variable names carried over from the
# 20 newsgroups tutorial this notebook is based on).
newsgroups_train, newsgroups_test = (
  split['text'].tolist() for split in (train_values, test_values)
)
# Every label name known for the loaded data set.
class_names = all_labels

In [None]:
specific_class = 'non-citizens' # the class name we focus on

# Find the position of the focused class in class_names. Fail loudly when it is
# missing: a -1 sentinel would silently select the LAST label column when it is
# later used to index the one-hot label vectors.
matching_indices = [
  position for position, name in enumerate(class_names) if name == specific_class
]
if not matching_indices:
  raise ValueError(f'Class {specific_class!r} was not found in class_names')

# the index for the class we focus on (the last match, as the original loop kept)
class_i = matching_indices[-1]

print(f'The index of {specific_class} is {class_i}')

In [None]:
def get_labels(one_hot_labels_list, class_index=None):
  """Extract one class column from a list of per-example label vectors.

  Args:
    one_hot_labels_list: sequence of label vectors, one per example; each vector
      must be indexable by ``class_index``.
    class_index: position of the class to extract. When omitted, the
      notebook-level ``class_i`` is used, so existing calls behave as before.

  Returns:
    A float numpy array holding, for every example, the entry found at
    ``class_index`` of its label vector (empty input gives an empty array).
  """
  if class_index is None:
    # Backward-compatible default: fall back to the notebook-level selection.
    class_index = class_i
  return numpy.array(
    [label_vector[class_index] for label_vector in one_hot_labels_list],
    dtype=float,
  )

In [None]:
# Per-example targets for the focused class, built from each split's
# 'one_hot_labels' column (train first, then test).
train_labels, test_labels = (
  get_labels(split['one_hot_labels'].to_list())
  for split in (train_values, test_values)
)

In [None]:
def get_count_samples_in_class(arr):
  """Return how many entries of ``arr`` compare equal to 1.0.

  Entries are read by position, from ``arr[0]`` up to ``arr[len(arr) - 1]``.
  """
  return sum(1 for position in range(len(arr)) if arr[position] == 1.0)

In [None]:
# Count the examples of the focused class in train and test, and print those
# counts next to the total size of each split.

train_count, test_count = (
  get_count_samples_in_class(labels) for labels in (train_labels, test_labels)
)

for template, amount in (
  ('Class specific train count is %d', train_count),
  ('Total train amount is %d', len(train_labels)),
  ('Class specific test count is %d', test_count),
  ('Total test amount is %d', len(test_labels)),
):
  print(template % amount)

In [None]:
# Show every label name on a single comma-separated line.
joined_class_names = ','.join(class_names)
print(joined_class_names)

Again, let's use the tfidf vectorizer, commonly used for text.

In [None]:
# Import the vectorizer explicitly: `import sklearn` by itself does not guarantee
# that the sklearn.feature_extraction.text submodule is loaded, so reaching it by
# attribute only works if some other import happened to load it first.
from sklearn.feature_extraction.text import TfidfVectorizer

# lowercase=False: the text is not lowercased before tokenizing.
vectorizer = TfidfVectorizer(lowercase=False)
# Fit on the training texts only; the test texts are transformed with the
# already-fitted vectorizer.
train_vectors = vectorizer.fit_transform(newsgroups_train)
test_vectors = vectorizer.transform(newsgroups_test)

This time we will use Multinomial Naive Bayes for classification, so that we can make reference to [this document](http://scikit-learn.org/stable/datasets/#filtering-text-for-more-realistic-training).

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes (alpha=0.01) trained on the TF-IDF training vectors
# against the labels of the focused class.
nb = MultinomialNB(alpha=0.01)

# Left as the last expression so the fitted estimator is shown as the cell output.
nb.fit(X=train_vectors, y=train_labels)


In [None]:
# Predict the label of every test document, then show the F1 score of those
# predictions against the true labels (computed with average='weighted').
pred = nb.predict(test_vectors)
sklearn.metrics.f1_score(y_true=test_labels, y_pred=pred, average='weighted')

We see that this classifier achieves a very high F1 score. [The sklearn guide to 20 newsgroups](http://scikit-learn.org/stable/datasets/#filtering-text-for-more-realistic-training) indicates that Multinomial Naive Bayes overfits this dataset by learning irrelevant stuff, such as headers, by looking at the features with highest coefficients for the model in general. We now use lime to explain individual predictions instead.

## Explaining predictions using lime

In [None]:
from sklearn.pipeline import make_pipeline
# Chain the fitted vectorizer and classifier into one object; later cells call
# c.predict_proba directly on lists of raw text.
c = make_pipeline(vectorizer, nb)

In [None]:
# Predicted class probabilities for the first test document, rounded to 3 decimals.
first_test_document = newsgroups_test[0]
print(c.predict_proba([first_test_document]).round(3))

In [None]:
# redefine classes
# redefine classes
# Display names for the two outcomes. Later cells index this list with the
# integer-cast predicted/true label, so position 1 is the focused class
# (assumes labels are 0 or 1 — TODO confirm).
class_vect = ['nothing', specific_class]

In [None]:
from lime.lime_text import LimeTextExplainer
# Text explainer configured with the two display names held in class_vect.
explainer = LimeTextExplainer(class_names=class_vect)

Previously, we used the default parameter for label when generating explanation, which works well in the binary case.  
For the multiclass case, we have to determine for which labels we will get explanations, via the 'labels' parameter.  
Below, we generate explanations for labels 0 and 1 (the 'nothing' case and the selected class).

In [None]:
# find the correctly classified class predictions and their indices!!
# Predict the whole test set in one call instead of once per row. This also avoids
# int() on a one-element array (deprecated since NumPy 1.25) and leaves the `pred`
# array from the evaluation cell untouched.
test_predictions = nb.predict(test_vectors)

# A hit is a test document predicted as the focused class (1) whose true label is 1.
true_positive_indices = numpy.flatnonzero(
  (test_predictions == 1) & (test_labels.astype(int) == 1)
)
for document_index in true_positive_indices:
  print('found a successful prediction: %d' % document_index)

In [None]:
# Index of the test document to explain.
# NOTE(review): 41 is tied to one particular data set/split; choose one of the
# indices printed by the "found a successful prediction" search when the data changes.
idx = 41
# Explain the document for both labels, keeping up to 15 features per explanation.
exp = explainer.explain_instance(newsgroups_test[idx], c.predict_proba, num_features=15, labels=[0, 1])
print(exp.available_labels())
print('Document id: %d' % idx)
# predict() returns an array even for a single row; take element 0 explicitly
# rather than calling int() on the array (deprecated since NumPy 1.25).
predicted_label = int(nb.predict(test_vectors[idx])[0])
print('Predicted class =', class_vect[predicted_label])
print(f'True class: {class_vect[int(test_labels[idx])]}')

Now, we can see the explanations for different labels. Notice that the positive and negative signs are with respect to a particular label - so that words that are negative towards class 0 may be positive towards class 1, and vice versa.

In [None]:
# Print the (feature, weight) pairs of the explanation for label 0 and then
# label 1, with one blank line between the two listings.
for position, label in enumerate((0, 1)):
  if position:
    print()
  print('Explanation for class %s' % class_vect[label])
  weighted_features = exp.as_list(label=label)
  print('\n'.join(map(str, weighted_features)))

In [None]:
# SP Lime !!!
from lime import submodular_pick

# Arguments used below:
#   sample_size      - the number of instances to explain if method == 'sample'
#   num_features     - maximum number of features present in explanation
#   num_exps_desired - the number of explanation objects returned
# The default method == 'sample' will sample the data uniformly at random;
# can add "method='full'" to get explanations from entire data.
sp_obj = submodular_pick.SubmodularPick(
  explainer,
  newsgroups_test,
  c.predict_proba,
  sample_size=100,
  num_features=10,
  num_exps_desired=20,
)

In [None]:
# Draw one figure per explanation picked by SP-LIME, each for the first label that
# explanation has available. The loop variable is deliberately not called `exp`,
# so the explanation object created in an earlier cell is left alone.
for picked_explanation in sp_obj.sp_explanations:
  first_label = picked_explanation.available_labels()[0]
  picked_explanation.as_pyplot_figure(label=first_label)