## Train a simple text classification model to apply LIME and totwoLIME explanations

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
#for text pre-processing
import re, string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix,roc_curve,auc
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Read the data
import os

# Directory that holds train.csv / test.csv. The original absolute path stays
# the default; set the NLP_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = os.environ.get("NLP_DATA_DIR", "/Users/ivankondyurin/Downloads/nlp-getting-started")
df_train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
df_test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
# Clean raw text: lowercase, strip, drop HTML-like tags, punctuation and digits, collapse whitespace, remove English stopwords
# Patterns are compiled once instead of on every call. All of them are raw
# strings, which avoids the invalid-escape warning that '\s+' triggers.
_HTML_TAG_RE = re.compile(r'<.*?>')
_ASCII_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_WHITESPACE_RE = re.compile(r'\s+')
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
# Filled on first use so that defining the function does not touch the corpus.
_ENGLISH_STOPWORDS = None


def preprocess(text, stop_words=None):
    """Normalise a raw text string for vectorisation.

    Steps, in order: lowercase and strip; remove ``<...>`` tags; replace ASCII
    punctuation with spaces; delete any remaining non-word, non-space
    characters; replace digits with spaces; collapse whitespace; drop stopwords.

    Parameters
    ----------
    text : str
        Raw input text.
    stop_words : container of str, optional
        Tokens to drop. Defaults to NLTK's English stopword list, which is
        loaded once and cached as a frozenset (the previous version rebuilt
        the list for every single token).

    Returns
    -------
    str
        Remaining tokens joined by single spaces.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS

    text = text.lower().strip()
    text = _HTML_TAG_RE.sub('', text)
    text = _ASCII_PUNCT_RE.sub(' ', text)
    # The former r'\[[0-9]*\]' substitution ran after the brackets had already
    # been replaced as punctuation, so it could never match; the digits it
    # targeted are removed by the digit step below.
    # string.punctuation is ASCII-only; this catches the remaining symbols.
    text = _NON_WORD_RE.sub('', text)
    text = _DIGIT_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text)
    return ' '.join(token for token in text.split() if token not in stop_words)

#LEMMATIZATION
# Create the WordNet lemmatizer once; this single instance is reused by
# lemmatizer() below for every token.
wl = WordNetLemmatizer()
# Map NLTK part-of-speech tags to the WordNet POS constants expected by the lemmatizer
def get_wordnet_pos(tag):
    """Return the WordNet POS constant matching the first letter of *tag*.

    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty one) falls back to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(tag[:1], wordnet.NOUN)
# Tokenize the sentence
def lemmatizer(string):
    """Lemmatize every token of *string* and return them joined by spaces.

    The text is tokenized and POS-tagged with NLTK; each tag is converted by
    get_wordnet_pos() so the lemmatizer receives the matching part of speech.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    return " ".join(
        wl.lemmatize(token, get_wordnet_pos(pos_tag))
        for token, pos_tag in tagged_tokens
    )
def finalpreprocess(text):
    """Run the full cleaning pipeline: preprocess() followed by lemmatizer()."""
    cleaned = preprocess(text)
    return lemmatizer(cleaned)
# Add a column with the cleaned, lemmatized version of every training text.
df_train['cleaned_text'] = df_train['text'].apply(finalpreprocess)

#SPLITTING THE TRAINING DATASET INTO TRAINING AND VALIDATION
# random_state pins the shuffle: later cells pick an instance by position
# (X_val.iloc[20]), which would refer to a different text on every run
# if the split were left unseeded. The value matches the classifier's seed.
X_train, X_val, y_train, y_val = train_test_split(
    df_train["cleaned_text"], df_train["target"],
    test_size=0.2, shuffle=True, random_state=10)

#TF-IDF
# Fit the vocabulary and IDF weights on the training split only, then reuse
# them to transform the validation split.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val)
#model
model=RandomForestClassifier(n_estimators = 100, random_state = 10)
model.fit(X_train_vectors_tfidf, y_train)
# Predict labels and the probability of class index 1 for the validation split
y_pred = model.predict(X_val_vectors_tfidf)
y_prob = model.predict_proba(X_val_vectors_tfidf)[:,1]
print(classification_report(y_val,y_pred))
print('Confusion Matrix:',confusion_matrix(y_val, y_pred))

# ROC curve and area under it, computed from the class-1 probabilities
fpr, tpr, thresholds = roc_curve(y_val, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

In [None]:
# Class labels learned by the forest; scikit-learn orders the columns of
# predict_proba in this same order.
model.classes_

In [None]:
# for LIME import necessary packages
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline
from lime.lime_text import IndexedString,IndexedCharacters
from lime.lime_base import LimeBase
from sklearn.linear_model import Ridge, lars_path
from lime.lime_text import explanation
from functools import partial
import scipy as sp
from sklearn.utils import check_random_state

# Explain which words drive the prediction of label 1 for one validation text.
# The pipeline lets LIME call the model on raw strings (vectorize, then classify).
c = make_pipeline(tfidf_vectorizer, model)
explainer = LimeTextExplainer(class_names=model.classes_)
# Arguments of explain_instance:
#   - the text to explain (validation row at position 20),
#   - the classifier function: takes strings, returns prediction probabilities,
#   - num_features: maximum number of features in the explanation (default is 10),
#   - labels=(1,): only explain label 1.
exp = explainer.explain_instance(
    X_val.iloc[20],
    c.predict_proba,
    num_features=5,
    labels=(1,),
)
exp.show_in_notebook()

In [None]:
# Display the cleaned validation text that was explained above.
X_val.iloc[20]

### Compare the explanations of standard LIME and totwoLIME

In [None]:
# Example input explained by both the standard LIME and the totwoLIME explainers below.
text = 'Perturbed samples are created in the neighbourhood of the instance of interest. &&  nice job? good job calgary transit, co http co rgoguyt lf'

In [None]:
# Standard LIME explainer in bag-of-words mode (bow=True).
explainer_standard = LimeTextExplainer(class_names=model.classes_, bow=True)

In [None]:
# Explain `text` with the standard explainer, querying the vectorizer+model pipeline.
exp_standard = explainer_standard.explain_instance(text, classifier_fn=c.predict_proba)

In [None]:
# Render the explanation together with the highlighted text.
exp_standard.show_in_notebook(text = True)
# Author's label for this output: standard LIME.
# NOTE(review): presumably recorded with a different explainer_standard
# configuration than the next cell — TODO confirm.

In [None]:
# Render the explanation together with the highlighted text.
exp_standard.show_in_notebook(text = True)
# Author's label for this output: standard LIME with bow=True.

In [None]:
# totwoLIME explainer with bow=False and mode='right'.
# NOTE(review): MyLimeTextExplainer is not defined in the visible part of the
# notebook — it must be defined before this cell runs; TODO confirm.
explainer_totwo = MyLimeTextExplainer(class_names=model.classes_, bow=False, mode='right')

In [None]:
# Explain the same `text` with the totwoLIME explainer.
exp_totwo = explainer_totwo.explain_instance(text, classifier_fn=c.predict_proba)

In [None]:
# Render the totwoLIME explanation together with the highlighted text.
exp_totwo.show_in_notebook(text = True)
# Author's label for this output: mode='right'.

In [None]:
# Render the totwoLIME explanation together with the highlighted text.
exp_totwo.show_in_notebook(text = True)
# Author's label for this output: mode='left'.
# NOTE(review): presumably recorded after re-creating explainer_totwo with
# that mode — TODO confirm.

In [None]:
# Render the totwoLIME explanation together with the highlighted text.
exp_totwo.show_in_notebook(text = True)
# Author's label for this output: mode='rand'.
# NOTE(review): presumably recorded after re-creating explainer_totwo with
# that mode — TODO confirm.

## Explore how the explanation is constructed

In [None]:
## Perturbed samples are created in the neighbourhood of the instance of interest.
# classifier_fn is the probability function that takes a string and returns prediction probabilities.
# 5000 samples are requested here via num_samples.
# Cosine distance is computed to calculate the distance between original and perturbed samples(default).
# Keep the IndexedString so column indices of `data` can be translated back to
# words with the same vocabulary LIME used to build the samples.
indexed_instance = IndexedString(X_val.iloc[20])
data,yss,distances=explainer._LimeTextExplainer__data_labels_distances(indexed_instance,classifier_fn=c.predict_proba,num_samples=5000)
## Top 2 closest perturbed samples
df=pd.DataFrame(distances,columns=['distance'])
df1=df.sort_values(by='distance')
# Position 0 of the sorted frame is skipped (expected to be the unperturbed instance).
req_index=df1.index[1:3]
# In bag-of-words mode each column of `data` is a vocabulary entry, not a token
# position, so re.split(...)[i] picks the wrong word as soon as a word repeats.
# IndexedString.word(i) returns the word that column i really stands for.
closest_perturbed_sample=[
    ' '.join(indexed_instance.word(i) for i, kept in enumerate(data[k]) if kept == 1.0)
    for k in req_index
]
closest_perturbed_sample

In [None]:
# Sample the neighbourhood of `text` with the totwoLIME explainer.
# The name-mangled method belongs to MyLimeTextExplainer, so it has to be
# called on explainer_totwo; `explainer` is a plain LimeTextExplainer and has
# no such attribute.
data3,yss3,distances3=explainer_totwo._MyLimeTextExplainer__data_labels_distances(MyIndexedString(text, bow = False),classifier_fn=c.predict_proba,num_samples=5000,mode='rand')

In [None]:
## Giving weightage to the perturbed samples
# Exponential kernel
def kernel(d, kernel_width):
    """Return the proximity weight sqrt(exp(-d**2 / kernel_width**2)).

    d: distance(s) between perturbed samples and the original instance
       (scalar or NumPy array).
    kernel_width: scale of the kernel; a larger width makes weights decay
       more slowly with distance.
    """
    squared_ratio = (d ** 2) / kernel_width ** 2
    return np.sqrt(np.exp(-squared_ratio))
# Fix the kernel width at 25 so the kernel becomes a one-argument function of distance
kernel_fn = partial(kernel, kernel_width=25)
# Weight of every perturbed sample, derived from its distance to the original instance
weights=kernel_fn(distances)

In [None]:
from lime.lime_base import LimeBase

# Predicted probability of class index 1 for every perturbed sample:
# the target the local explanation is fitted to.
labels_column = yss[:, 1]
local_model = LimeBase(kernel_fn, verbose=False)
# feature_selection arguments:
#   data          - the perturbed samples that were created
#   labels_column - the label for which we want the explanation
#   weights       - weights given by the exponential kernel to the samples
#   num_features  - maximum number of features we need in the explanation
#   method        - the method of selecting the features
used_features = local_model.feature_selection(
    data,
    labels_column,
    weights,
    num_features=5,
    method='auto',
)
used_features

In [None]:
# After getting the features Ridge regression is used to fit the local model as default
from sklearn.linear_model import Ridge, lars_path


model_regressor = Ridge(alpha=1, fit_intercept=True)
easy_model = model_regressor

# Restrict the perturbed samples to the selected features once and reuse the view.
selected_columns = data[:, used_features]
easy_model.fit(selected_columns, labels_column, sample_weight=weights)
# Weighted R^2 of the local model on the perturbed samples
prediction_score = easy_model.score(selected_columns, labels_column, sample_weight=weights)

# Local model's prediction for row 0 of the perturbed data
local_pred = easy_model.predict(selected_columns[0].reshape(1, -1))

# final output
l = local_model.explain_instance_with_data(
    data,
    yss,
    distances,
    label=1,
    num_features=5,
    feature_selection='highest_weights',
)


In [None]:
# TextDomainMapper is only imported by a later cell; import it here so this
# cell also works when the notebook is executed from top to bottom.
from lime.lime_text import TextDomainMapper

# NOTE(review): this mapper uses bow=False, while `data`/`yss` were sampled from
# an IndexedString built with the default bow setting — confirm the feature
# indices line up before attaching a local explanation to ret_exp.
domain_mapper = TextDomainMapper(IndexedString(X_val.iloc[20], bow=False))
# The bare name `numpy` is not bound until a later cell; `np` is the alias
# imported at the top of the notebook.
random_state = np.random.RandomState()
# One class name per column of the predicted probabilities
class_names = [str(x) for x in range(yss[0].shape[0])]

ret_exp = explanation.Explanation(domain_mapper=domain_mapper, class_names=class_names, random_state=random_state)


In [None]:
from lime.lime_text import TextDomainMapper
from lime import lime_base
import numpy

def kernel(d, kernel_width):
    """Proximity weight for distance(s) *d*: sqrt(exp(-d**2 / kernel_width**2))."""
    exponent = -(d ** 2) / kernel_width ** 2
    unnormalised = np.exp(exponent)
    return np.sqrt(unnormalised)

# Kernel with the width fixed at 25, shared random state, and the LIME base fitter
kernel_fn = partial(kernel, kernel_width=25)
random_state = numpy.random.RandomState()
base = lime_base.LimeBase(kernel_fn, verbose=False, random_state=random_state)

# Empty explanation object for the validation instance at position 20
domain_mapper = TextDomainMapper(IndexedString(X_val.iloc[20], bow=False))
class_names = [str(x) for x in range(yss[0].shape[0])]
ret_exp = explanation.Explanation(
    domain_mapper=domain_mapper,
    class_names=class_names,
    random_state=random_state,
)

# Fields of the freshly created explanation, before any local model is fitted
print(ret_exp.intercept, ret_exp.local_exp, ret_exp.score, ret_exp.local_pred)

# Fit the local model for label 1 and print what LimeBase returns
for label in (1,):
    b = base.explain_instance_with_data(
        data,
        yss,
        distances,
        label,
        num_features=5,
        model_regressor=None,
        feature_selection='auto',
    )

    print(b)


In [None]:
#from lime.lime_text import TextDomainMapper
from lime import lime_base
import numpy

def kernel(d, kernel_width):
    """Map distance(s) *d* to a weight in (0, 1]; distance 0 gives weight 1.

    Computes sqrt(exp(-d**2 / kernel_width**2)).
    """
    scaled_square = (d ** 2) / (kernel_width ** 2)
    return np.sqrt(np.exp(-scaled_square))

kernel_fn = partial(kernel, kernel_width=25)

random_state = numpy.random.RandomState()

base = lime_base.LimeBase(kernel_fn, verbose=False, random_state=random_state)

# This explanation describes `text`, so everything attached to it must come
# from the totwoLIME sampling of `text` (data3 / yss3 / distances3).
domain_mapper = MyTextDomainMapper(MyIndexedString(text, bow=False))
# `yss` belongs to the earlier run on X_val.iloc[20]; the class count and the
# displayed probabilities are taken from yss3 instead.
class_names = [str(x) for x in range(yss3[0].shape[0])]

ret_exp = MyExplanation(domain_mapper=domain_mapper, class_names=class_names, random_state=random_state)
# Row 0 of yss3 is used as the prediction for `text` itself
# (assumes the first sample is the unperturbed text, as in standard LIME — TODO confirm).
ret_exp.predict_proba = yss3[0]

print(ret_exp.intercept, ret_exp.local_exp, ret_exp.score, ret_exp.local_pred)

for label in (1,):
    b = base.explain_instance_with_data(data3, yss3, distances3, label, num_features=5, model_regressor=None, feature_selection='auto')

    print(b)

    # Store intercept, feature weights, fit score and local prediction for this label
    (ret_exp.intercept[label], ret_exp.local_exp[label], ret_exp.score[label], ret_exp.local_pred[label]) = b

print(ret_exp)

In [None]:
# Display the class probabilities attached to the manually built explanation.
ret_exp.predict_proba

In [None]:
# Render the manually assembled explanation together with the highlighted text.
ret_exp.show_in_notebook(text = True)

In [None]:
## Perturbed samples are created in the neighbourhood of the instance of interest.
# classifier_fn is the probability function that takes a string and returns prediction probabilities.
# 5000 samples are requested here via num_samples.
# The name-mangled method belongs to MyLimeTextExplainer, so it is called on
# explainer_totwo; `explainer` is a plain LimeTextExplainer without it.
data,yss,distances=explainer_totwo._MyLimeTextExplainer__data_labels_distances(MyIndexedString(text, bow = False),classifier_fn=c.predict_proba,num_samples=5000,mode='rand')
## 49 closest perturbed samples
df=pd.DataFrame(distances,columns=['distance'])
df1=df.sort_values(by='distance')
# Take the indices from the frame sorted by distance (df1). The unsorted `df`
# would return the first samples in generation order, not the closest ones.
# Position 0 is skipped (expected to be the unperturbed text).
req_index=df1.index[1:50]
closest_perturbed_sample=[]
for k in req_index:
    # NOTE(review): assumes column i of `data` corresponds to the i-th token of
    # re.split(r'\W+', text) — verify against MyIndexedString's tokenization.
    perturbed_text =' '.join([re.split(r'\W+',text)[i] for i,x in enumerate(data[k]) if x==1.0])
    closest_perturbed_sample.append(perturbed_text)
closest_perturbed_sample