In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, f1_score
from lime.lime_text import LimeTextExplainer
import csv
import json
from nltk.stem import WordNetLemmatizer 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import random

# Load sub-categories from the 20newsgroups dataset

In [2]:
# Newsgroups chosen to match Riveiro & Thill (2021). Headers, footers and quoted
# replies are stripped because those parts of the data may bias the model.
cats = [
    'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
    'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
    'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',
]

# Both splits are fetched with exactly the same options; only `subset` differs.
fetch_options = dict(categories=cats, remove=('headers', 'footers', 'quotes'), random_state=1)
data_train = fetch_20newsgroups(subset='train', **fetch_options)
data_test = fetch_20newsgroups(subset='test', **fetch_options)

# Original (fine-grained) integer labels
train_label = data_train.target
test_label = data_test.target

# Map every fine-grained newsgroup name onto its overarching category:
# 'talk*' -> politics, 'sci*' -> science, everything else -> leisure.
broad_by_prefix = (('talk', 'politics'), ('sci', 'science'))
category_mapping = {
    name: next((broad for prefix, broad in broad_by_prefix if name.startswith(prefix)), 'leisure')
    for name in data_train.target_names
}

# Broad category of each target name, in target-index order
broad_categories = [category_mapping[name] for name in data_train.target_names]

# Translate the integer labels into broad category names by indexing a
# per-split lookup array (position i holds the broad name of target index i).
train_broad_labels_names = np.array(broad_categories)[train_label]
test_broad_labels_names = np.array(
    [category_mapping[name] for name in data_test.target_names]
)[test_label]

# Number of test documents (displayed as the cell output)
len(test_label)


4219

# Pre-Process / Clean 

In [3]:
def preprocessing_text(text):
    """Normalise one raw document before TF-IDF tokenisation.

    Steps, in order:
      1. lowercase the text and split it on whitespace;
      2. drop tokens that are NLTK English stop words and lemmatise the rest;
      3. remove digit runs, then every character that is neither a word
         character nor whitespace;
      4. apply the emoji/symbol pattern;
      5. collapse whitespace runs into a single space.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text. It is not stripped, so it may start or end with a
        single space.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stop-word set once per call. The membership test below runs for
    # every token, so rebuilding the set inside the comprehension is wasted work.
    english_stopwords = set(stopwords.words('english'))
    # NOTE: the pattern is anchored with ^ ... $, so the substitution only fires
    # when the *entire* remaining string consists of matching characters.
    emoji_pattern = r'^(?:[\u2700-\u27bf]|(?:\ud83c[\udde6-\uddff]){1,2}|(?:\ud83d[\udc00-\ude4f]){1,2}|[\ud800-\udbff][\udc00-\udfff]|[\u0021-\u002f\u003a-\u0040\u005b-\u0060\u007b-\u007e]|\u3299|\u3297|\u303d|\u3030|\u24c2|\ud83c[\udd70-\udd71]|\ud83c[\udd7e-\udd7f]|\ud83c\udd8e|\ud83c[\udd91-\udd9a]|\ud83c[\udde6-\uddff]|\ud83c[\ude01-\ude02]|\ud83c\ude1a|\ud83c\ude2f|\ud83c[\ude32-\ude3a]|\ud83c[\ude50-\ude51]|\u203c|\u2049|\u25aa|\u25ab|\u25b6|\u25c0|\u25fb|\u25fc|\u25fd|\u25fe|\u2600|\u2601|\u260e|\u2611|[^\u0000-\u007F])+$'

    text = text.lower()
    # Stop words are filtered on whitespace-delimited tokens *before* punctuation
    # is stripped, so a token such as "the," is not recognised as a stop word here.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in english_stopwords
    ]
    text = ' '.join(tokens)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(emoji_pattern, '', text)
    text = re.sub(r'\s+', ' ', text)

    return text

# Model / Pipeline

In [4]:
# NLTK English stop words, de-duplicated and passed to the vectoriser as a list.
stop_words = list(set(stopwords.words('english')))

# TF-IDF on unigrams followed by an SVC with probability estimates enabled
# (predict_proba is required by the LIME explainer used later).
svc_pipe = Pipeline([
    # preprocessing_text already lowercases the text itself.
    ("vect", TfidfVectorizer(preprocessor=preprocessing_text, lowercase=True,
                             stop_words=stop_words, ngram_range=(1, 1))),
    # random_state pins the data shuffling SVC performs for its probability
    # estimates, so predict_proba (and the explanations built on it) are
    # reproducible across runs, like the other seeded steps in this notebook.
    ("clf", SVC(probability=True, random_state=1)),
])

In [5]:
# Fit the TF-IDF + SVC pipeline on the raw training documents,
# using the broad category names as targets.
svc_pipe.fit(X=data_train.data, y=train_broad_labels_names)

# Save LIME explanations to CSV

In [6]:
def LIMEexplanationsText_toCSV(data, labels, classifier, k=6, n=5):
    """Explain a random sample of documents with LIME and save the results as CSV.

    For each sampled document the row contains: a running id, the stripped
    text, the predicted class, the probability of that predicted class
    (rounded to 3 decimals), the true label, and - for every class - the
    predicted probability and the list of (feature, weight) rationales
    returned by LIME.

    Parameters
    ----------
    data : object with a ``data`` attribute
        ``data.data`` is the sequence of raw text documents to sample from.
    labels : array-like
        True labels, aligned index-for-index with ``data.data``.
    classifier : fitted estimator
        Must provide ``predict`` and ``predict_proba`` on a list of raw strings.
        The columns of ``predict_proba`` are assumed to follow the sorted
        unique values of ``labels``.
    k : int, default 6
        Number of features LIME reports per class.
    n : int, default 5
        Number of documents to sample. ``random.sample`` raises ``ValueError``
        if fewer than ``n`` eligible documents exist.

    Returns
    -------
    None
        Progress is printed and the rows are written to
        ``LIME_Explanations_Faithfulness.csv`` in the working directory.
        Nothing is written when no document was explained (``n == 0``).
    """
    # Private RNG: draws the same seed-42 sample as before, but without
    # reseeding the module-level generator other code may rely on.
    rng = random.Random(42)

    # Only documents with at least 10 characters are eligible.
    eligible_indices = [i for i, instance in enumerate(data.data) if len(instance) >= 10]
    sampled_indices = rng.sample(eligible_indices, n)

    classes = np.unique(labels)
    class_position = {name: pos for pos, name in enumerate(classes)}
    explainer = LimeTextExplainer(random_state=1, class_names=classes)
    explanations = []

    for idx, doc_index in enumerate(sampled_indices):
        instance = data.data[doc_index]
        predicted = classifier.predict([instance])[0]
        probs = classifier.predict_proba([instance])[0]

        # Report the probability of the class actually returned by predict().
        # For SVC, predict() and the argmax of predict_proba() can disagree, so
        # max(probs) could belong to a different class than `pred`. Fall back to
        # the maximum only if the predicted class is not among `classes`.
        if predicted in class_position:
            pred_prob = probs[class_position[predicted]]
        else:
            pred_prob = max(probs)

        row = {
            'id': str(idx),
            'text': instance.strip("\n").strip(),
            'pred': predicted,
            'pred_prob': float(f"{pred_prob:.3f}"),
            'true_label': labels[doc_index],
        }

        exp = explainer.explain_instance(instance, classifier.predict_proba,
                                         num_features=k, labels=range(len(classes)))

        for class_idx in exp.available_labels():
            row[classes[class_idx] + "_prob"] = probs[class_idx]
            row[classes[class_idx] + "_rationales"] = exp.as_list(label=class_idx)

        explanations.append(row)

        print(idx+1," of ",len(sampled_indices), "explained")

    # Nothing to write: avoid failing on an undefined header when n == 0.
    if not explanations:
        return

    # Save to CSV. newline='' is required by the csv module so that embedded
    # newlines inside the quoted text field are written correctly; UTF-8 keeps
    # the output independent of the platform's default encoding.
    csv_file = "LIME_Explanations_Faithfulness.csv"
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(explanations[0].keys()))
        writer.writeheader()
        writer.writerows(explanations)

In [7]:
# Read the JSON parameter file and parse it into a Python dictionary.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open('params_LLM.json', 'r', encoding='utf-8') as file:
    params = json.load(file)

# Save LIME explanations to CSV

In [8]:
# Explain test-set documents; the number of rationales per class (k) and the
# sample size (n) are read from the "rationales" section of the loaded params.
LIMEexplanationsText_toCSV(
    data=data_test,
    labels=test_broad_labels_names,
    classifier=svc_pipe,
    k=params["rationales"]["k"],
    n=params["rationales"]["n"],
)

1  of  50 explained
2  of  50 explained
3  of  50 explained
4  of  50 explained
5  of  50 explained
6  of  50 explained
7  of  50 explained
8  of  50 explained
9  of  50 explained
10  of  50 explained
11  of  50 explained
12  of  50 explained
13  of  50 explained
14  of  50 explained
15  of  50 explained
16  of  50 explained
17  of  50 explained
18  of  50 explained
19  of  50 explained
20  of  50 explained
21  of  50 explained
22  of  50 explained
23  of  50 explained
24  of  50 explained
25  of  50 explained
26  of  50 explained
27  of  50 explained
28  of  50 explained
29  of  50 explained
30  of  50 explained
31  of  50 explained
32  of  50 explained
33  of  50 explained
34  of  50 explained
35  of  50 explained
36  of  50 explained
37  of  50 explained
38  of  50 explained
39  of  50 explained
40  of  50 explained
41  of  50 explained
42  of  50 explained
43  of  50 explained
44  of  50 explained
45  of  50 explained
46  of  50 explained
47  of  50 explained
48  of  50 explained
4