In [1]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from datasets import load_dataset
import shap

In [2]:
model = AutoModelForSeq2SeqLM.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps").cuda()

In [3]:
tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")

In [4]:

# Define function
def f(x):
    inputs = tokenizer(x.tolist(), return_tensors="pt", padding=True).to('cuda')
    with torch.no_grad():
        out = model.generate(**inputs,
        min_length=128, 
        max_length=512, 
        no_repeat_ngram_size=7,
        num_beams=2,
        early_stopping=True,
        num_return_sequences=1)
    sentence = [tokenizer.decode(g, skip_special_tokens=True) for g in out]
    print(sentence)
    return np.array(sentence)

In [5]:
# wrap model with TeacherForcingLogits class
teacher_forcing_model = shap.models.TeacherForcing(f, similarity_model=model, similarity_tokenizer=tokenizer, device=model.device,batch_size=16)
# create a Text masker
masker = shap.maskers.Text(tokenizer, mask_token = "...", collapse_mask_token=True)

In [6]:
explainer_model_agnostic = shap.Explainer(teacher_forcing_model, masker)

In [7]:
text=["""Introduction: cardiovascular diseases account for the highest mortality rate worldwide and are expected to be the major cause of death by 2020 ( 1 ) . studies in iran showed that cardiovascular diseases are the most important cause of death and accounted for 42% of all deaths in 2002 ( 2 ) . timely diagnosis and proactive measures play a significant role in effective treatment , prevention development of heart damages and the faster recovery of cardiovascular diseases . the time it takes to reach a hospital or a coronary care unit ( ccu ) and the time spent for diagnostic and treatment measures are factors influencing the delay or loss of time from the outset of severe cardiac infarction to the beginning of treatment ( 3 ) . overall , the mortality rate of severe cardiovascular deaths is 30% , more than half of which occurs before hospitalization ( 3 ) . however , with the advancement of the epidemic this social trend reverses and people at lower socio - economic classes become more vulnerable ( 3 ) . studies conducted in some developing countries showed that the underprivileged are more susceptible to cardiac arrests compared to affluent people ( 4 ) . furthermore , limited accessibility to health services and the inequality in availability of the services furnished by health and treatment systems deprives these people of acceptable and effective treatment ( 5 ) . limited and unequal accessibility to health services in iran are one of the controversial issues mentioned in several studies ( 5 - 7 ) . in general , one of the most important goals of a health system is to provide public accessibility and equality in receiving health and treatment services . as one measure of development , accessibility to health is one of the basic rights of individuals in the society . one of the main points in meting out fairness in the health system is that the distribution of health services must not be based on the socio - economic status and incomes of individuals ( 8) . inequality in health services in different countries has taken the form of a global challenge affected by various factors , including individual , social , economic and geographical variables ( 9 ) . these factors are stronger in developing countries ( 8,10 ) so that distribution of health services resources in developing countries has turned into a fundamental issue . therefore , the measurement of fair distribution of health services on top of previous measures has been emphasized by world health organization ( who ) ( 11 ) . inequality in distribution of health services is measured by different scales one of the most common is the gini coefficient which is based on the lorenz curve ( 12 ) . although multiple measures of inequality have been shown to be highly correlated with each other ( 7,12 ) , the results of several studies are encouraging and confirming the utility of the gini index in quantifying inequalities in health - related subjects . thus , the gini index and lorenz curve are commonly used in analyzing the inequality in distribution of health care resources ( 13 - 16 ) . since accessibility to intensive healthcare , in particular for cardiovascular conditions , is of utmost importance and since the appropriate distribution of coronary care unit ( ccu ) beds and cardiologists can be taken as a measure , the present study aims to examine the inequality of the geographical distribution of ccu beds and cardiologists in iran using the gini coefficient and the lorenz curve . 

Methods: this study conducted useing demographic data from national census in 2012 collected by the statistics center of iran ( sci ) . the number of ccu beds and cardiologists in public sector by province in 2012 was obtained from iran ministry of health and medical education . in the present study among various possible inequality measures ( such as gini coefficient , decile ratio , robin hood index and atkinson index ) preference was given to the gini coefficient because it has at least three major advantages ( 17 ) . first , it is less sensitive to the number of regions and the size of the numerical values in the data material in general . third , gini coefficient can be presented visually by means of a lorenz curve ; which provides an effective visualization of the inequalities . the lorenz curve compares the distribution of a given variable with the normal distribution ( of same variable ) that represents equality . the grater gap between the lorenz curve and diagonal line means the higher inequality . in this curve , the x axis represents the cumulative percentage of population and the y axis shows the percentage of the variable which is expressed as the proportion of the cumulative population . the 45-degree line is called the equality line because of showing the completely equal distribution ( 5 ) . in our study , the x axis represents the cumulative percentage of population of iranian provinces and the y axis shows the cumulative frequency of cardiologists and ccu beds in the iranian provinces . the gini coefficient is defined as the division of the area between diagonal line and the lorenz curve ( 14 ) . mathematically speaking , g1=1k=1n(xkxk1)(yk+yk1 ) where x represents the cumulative percentage of the population and y represents the cumulative percentage of ccu beds and cardiologists . the gini coefficient ranges between 0 and 1 in which , theoretically , 0 represents complete equality and 1 shows complete inequality of the distribution of a given variable . in practice , however , a coefficient below 0.2 is considered complete equality , one ranging between 0.2 and 0.3 shows high equality , one ranging between 0.3 and 0.4 indicates inequality ; a coefficient varying between 0.4 and 0.6 indicates high inequality and a coefficient larger than 0.6 represents complete inequality ( 14 ) . in this study it is assumed that more cardiologist and ccu beds are associated with more access to cardiovascular health services , so that people in provinces with higher per capital level of cardiologist and ccu beds have more access to cardiovascular health resources / services . finally , in order to calculate gini coefficient , the demographic data and also the number of ccu beds and cardiologists by province were entered and analyzed through ms excel software .

Results: in 2012 , iran had a total population of 75,149,669 with total number of 3665 ccu beds and the total number of 953 cardiologists in public sector . the province of tehran had the largest percentage of ccu beds ( 23.7% ) and cardiologists ( 17% ) . there were respectively 4.8 and 1.3 ccu beds and cardiologists across the nation per 100,000 individuals . table 1 shows the number of cardiologists and ccu beds per 100,000 individuals and the number of cardiologists per 10 ccu beds by province in 2012 . the lorenz curves for the ccu beds and cardiologists are illustrated in figures 1 and2 . the gini coefficients obtained for ccu beds and cardiologists were 0.129 and 0.045 , respectively . 

Discussion: the findings showed that the national mean number of ccu beds for a population of 100,000 is 4.88 , which 23 ( out of a total of 31 ) provinces are below the national mean . however , the obtained gini coefficients prove statically adequate equality for the geographical distribution of ccu beds across iran . a previous study showed that the province of yazd enjoys the most equal distribution of ccu beds on one hand , and the province of ilam suffers from the least equal distribution of ccu beds ( 16 ) . a study on distribution of active hospital bed in iran , reported the gini coefficient for active bed 0.08 and active beds for a population of 10,000 people in 2006 9.2 , while the province of yazd had maximum , and the province of lorestan the minimum beds per population ( 18 ) . another study conducted in 2002 using morris imbalance coefficient , showed the province of yazd stands at the highest rank in terms of the number of hospital beds , and the province of semnan ranks the highest in terms of the number of health centers ( 19 ) . a study conducted in the netherlands on the distribution of ccu beds in 24 university hospitals for the period 2004 - 2006 reported gini coefficients of 0.638 , 0.569 and 0.569 , which reflected unequal distribution ( 20 ) . a study conducted at 2006 reflects equal distribution of non - cardiac intensive care beds in iran . the numbers of icu , post icu and nicu beds per 100,000 individuals in iran reported 5.3 , 0.4 and 1.6 , respectively and the gini coefficient for icu , post icu and nicu beds 0.17 , 0.15 and 0.23 ( 14 ) . another study conducted at 2011 showed high degree of inequality in the distribution of kidney transplant beds in iran . the province of tehran had more than half of the beds installed in its hospitals while 17 provinces had no kidney transplant beds at all ( 5 ) . in south africa , there were 4,168 icu beds counted in 2005 of which 86% were installed in three provinces . the proportion of bed varied greatly in different provinces of this country , from 1:20,000 to 1:80,000 ( 21 ) . the distribution of secondary and tertiary health services in the palestine is also reported to be unequal , mostly concentrated in the downtown . in the gaza strip and in the eastern strip the proportions of iu beds per 1,000 were 1.4 and 1.2 , respectively ( 22 ) . in another study in the u.s.a the trend of gini coefficient during 1970 - 1997 was used to measure the distribution of hospital beds . the northern states have been reported to enjoy an equal distribution of hospital beds ( 15 ) . lorenz curve of ccu beds in 2012 lorenz curve of cardiologists in 2012 the present study findings showed that the mean number of cardiologists per 100,000 individuals is 1.27 . also , the mean number of cardiologists per 10 ccu is 2.6 which 13 provinces are below the national mean ( out of a total of 31 provinces ) . the obtained gini coefficient ( 0.045 ) proves statically adequate equality in the geographical distribution of cardiologists across iran . a study conducted on the distribution of specialist physicians in iran at 2010 showed that the numbers of total specialists per 10,000 individuals and per 10 active beds in the country were 1.7 and 1.9 , respectively and the gini coefficient of distribution of total specialists reported 0.052 ( 18 ) . the gini coefficient was used in a study at 2000 to measure the resource distribution in mental health services in the us , showing the gini coefficient for psychologists as 0.13 ( 15 ) . in another study in japan gini coefficient was used to measure equality in geographical distribution of orthopedic services ; findings showed the orthopedist per 100,000 individuals was 4.3 and the gini coefficient reported 0.36 ( 23 ) . the low gini coefficients obtained in this study suggest that the health system in iran has been relatively successful in establishing a fair geographical distribution of coronary care services , which can be attributed to the implementation of the fair national health services distribution strategic plan . iran is the first east mediterranean country to implement this strategic health plan as its fourth development plan and through iran s 20-year vision plan in the health services department ( 24 ) . furthermore , in a 1992 comprehensive program , the health services classification system was developed in order to avoid unnecessary parallel activities , create fairness in the distribution of inpatient health services , make accessible the health services to all people across the country , reduce the number of unused beds and develop a health services classification system on the basis of different specialties and their respective stations by province . variables such as patient admission , population , mean hospital stay length , national divisions , available facilities and the geographical situation ( in terms of route and accessibility ) were determining factors in locating health centers ( 24,25 ) for instance , the maximum accessibility time to a ccu bed is set at 30 minutes . therefore , if the temporal distance of a province to the next level of numerical reference shows a time length longer than 30 minutes , the calculated beds , which may even be below the installation basis , would belong to that province and would not be transferred ( 26 ) . reports show that in the 2006 - 2007 687 ccu beds were installed in iran , indicating the iranian health system approach to promote health and create the conditions for the fair accessibility of health systems to all citizens ( 27 ) . our study descriptive statistics showed a skewed distribution of ccu beds in iran , so that the capital tehran , accommodating one sixth of the total population of iran , owns one fourth of total ccu beds in iran . in a similar study by ameryoun et al . on the non - cardiac intensive care beds in tehran the ratios of all non - cardiac intensive care beds were almost two times higher than other provinces ( 14 ) . also , in another study by tofighi et al . on the distribution of specialists , tehran owns 23% of all specialists across the country ( 18 ) . however , a study conducted in 2002 ( 19 ) showed that by morris measure , the health centers in tehran do not constitute a balanced distribution , burdened with problems such as traffic jams and other problems rampant in metropolitan cities , causing delays and waste of time from the beginning of acute myocardial infarction symptoms to the initiation of treatment ( accessibility to the ccu bed ) . furthermore , problems such as overcrowded hospitals in tehran , a drop in quality and quantity of health services , inadequacy of services on one hand and waste of funds for the provinces and finally a drop in public health on the other hand are other problems plaguing megacities . therefore , health policy makers should emphasize decentralization in tehran and identify extra health facilities in different cities so that a balanced allocation of resources may be achieved . fair geographical distribution of health services would lead to efficient and cost - effective health system . while existing classificatory models in iran focus on structural development , the number of beds and a model of providing health services , few studies have concentrated on the distribution by area , classification and defining areas on the basis of optimal distribution of health services ( 26 ) . therefore , current study may provide a foundation for large - scale planning for the health systems in order to remove the inequality in the distribution of health services . further research and surveys are strongly recommended to acquire more information and monitor the fairness of distribution of different health systems nationally and across provinces . the aforementioned results represent an initial empirical approximation , the validity of which is limited for the following reasons . first , counting the number of cardiologists and ccu beds is not an appropriate way to measure extant cardiovascular care and access to it ( a more sufficient reliable measure in iran is not available yet ) . second , the methodology used to measure inequality is not perfect and biased with limitations in measuring inequality ; for example , utilizing the gini coefficient showed that there is no significant inequality in the distribution of pubic cardiovascular health services in iran , though , primary information on the distribution of beds and cardiologists showed that about one - fourth of the ccu beds are existed in tehran , where one - sixth of the total population of iran is living . such interpretations are helpful for providing an approximation of existing distributions and can show some limitations of using inequality measures . despite the fact that the present study is one of the few studies conducted on the geographical distribution of one health services ( cardiovascular health services ) in iran , selecting only two factors ( the number of beds and physicians in the distribution of facilities ) and administering only one measure ( the gini coefficient ) may be considered as the limitations of this study . also , because of the unavailability of private sector and province - specific data , the research did not consider the private sector capacity and could not produce detailed provincial results , which are hoped to be overcome in future studies . measuring equality in distribution of private sector resources or all the private and public resources together could have extremely different results . 

Conclusion: using gini measure showed that there is no significant inequality in the distribution of pubic cardiovascular health services in iran . nevertheless , our descriptive statistics showed that there is a skewness in distribution of pubic cardiovascular health services in iran . moreover , even the equal distribution of cardiovascular health facilities such as ccu beds , does not mean they are sufficiently provided in iran ."""]

In [8]:
shap_values_model_agnostic = explainer_model_agnostic(text)

The `device` argument is deprecated and will be removed in v5 of Transformers.


['despite the fact that there is']
['despite the fact that there are']


The `device` argument is deprecated and will be removed in v5 of Transformers.


['despite the fact that there is']


The `device` argument is deprecated and will be removed in v5 of Transformers.


['despite the fact that there is']
['despite the fact that there are']
['a new study has shown that']
['a new study has shown that']
['a new study has shown that']
['a new study has shown that']


Partition explainer: 2it [00:11, 11.41s/it]               


In [10]:
print(shap_values_model_agnostic)

.values =
array([[[-0.00086129, -0.04356976, -0.00918334,  0.0819841 ,
         -0.07590344,  0.13828089,  0.63209124],
        [-0.00086129, -0.04356976, -0.00918334,  0.0819841 ,
         -0.07590344,  0.13828089,  0.63209124],
        [ 0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ]]])

.base_values =
array([[-2.0718706 ,  0.3727606 , -0.10753374,  1.6976804 ,  6.53850959,
         0.85673017, -1.23351705]])

.data =
(array(['Hello ', 'world', ''], dtype=object),)


In [9]:
shap.plots.text(shap_values_model_agnostic)