/
text_classification.py
166 lines (126 loc) · 7.12 KB
/
text_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
from typing import Dict
import numpy as np
from ..utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
from .base import PIPELINE_INIT_ARGS, GenericTensor, Pipeline
if is_tf_available():
from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
def sigmoid(_outputs):
    """
    Apply the logistic sigmoid `1 / (1 + exp(-x))` element-wise.

    Args:
        _outputs: A NumPy array (or NumPy/Python scalar) of raw scores.

    Returns:
        The sigmoid of `_outputs`, with the same shape as the input.
    """
    # sigmoid(x) == exp(-log(1 + exp(-x))). `np.logaddexp(0, -x)` evaluates log(1 + exp(-x)) without ever
    # forming exp(-x) on its own, so very negative inputs no longer overflow (and no longer emit a
    # RuntimeWarning) the way the naive `1 / (1 + np.exp(-x))` does.
    return np.exp(-np.logaddexp(0.0, -_outputs))
def softmax(_outputs):
    """
    Compute a softmax over the last axis of `_outputs`.

    The per-row maximum is subtracted before exponentiating so that the largest exponent is `exp(0) == 1`,
    which keeps the computation from overflowing on large scores.

    Args:
        _outputs: Array of raw scores with at least one dimension.

    Returns:
        An array of the same shape whose entries along the last axis sum to 1.
    """
    row_peak = np.max(_outputs, axis=-1, keepdims=True)
    exponentials = np.exp(_outputs - row_peak)
    normaliser = exponentials.sum(axis=-1, keepdims=True)
    return exponentials / normaliser
class ClassificationFunction(ExplicitEnum):
    """
    Names of the functions `TextClassificationPipeline.postprocess` can apply to the model logits to turn them into
    scores.
    """

    # Scores are `sigmoid(logits)`.
    SIGMOID = "sigmoid"
    # Scores are `softmax(logits)`.
    SOFTMAX = "softmax"
    # Scores are the logits themselves, unchanged.
    NONE = "none"
@add_end_docstrings(
    PIPELINE_INIT_ARGS,
    r"""
        return_all_scores (`bool`, *optional*, defaults to `False`):
            Whether to return all prediction scores or just the one of the predicted class.
        function_to_apply (`str`, *optional*, defaults to `"default"`):
            The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:

            - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model
              has several labels, will apply the softmax function on the output.
            - `"sigmoid"`: Applies the sigmoid function on the output.
            - `"softmax"`: Applies the softmax function on the output.
            - `"none"`: Does not apply any function on the output.
    """,
)
class TextClassificationPipeline(Pipeline):
    """
    Text classification pipeline using any `ModelForSequenceClassification`. See the [sequence classification
    examples](../task_summary#sequence-classification) for more information.

    This text classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"sentiment-analysis"` (for classifying sequences according to positive or negative sentiments).

    If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a softmax
    over the results. If there is a single label, the pipeline will run a sigmoid over the result.

    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
    the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=text-classification).
    """

    # Class-level defaults; the effective values are resolved per call in `_sanitize_parameters` / `postprocess`.
    return_all_scores = False
    function_to_apply = ClassificationFunction.NONE

    def __init__(self, **kwargs):
        """Build the pipeline and check that the model is a sequence classification model for its framework."""
        super().__init__(**kwargs)

        self.check_model_type(
            TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
            if self.framework == "tf"
            else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
        )

    def _sanitize_parameters(self, return_all_scores=None, function_to_apply=None, **tokenizer_kwargs):
        """
        Split the user-supplied keyword arguments into preprocess, forward and postprocess parameters.

        Args:
            return_all_scores (`bool`, *optional*):
                When left to `None`, falls back to `model.config.return_all_scores` if the config defines it.
            function_to_apply (`str` or `ClassificationFunction`, *optional*):
                Name (case-insensitive) of the function to apply to the logits. `"default"` lets `postprocess`
                choose according to the model config.
            tokenizer_kwargs:
                Any remaining keyword arguments; they are forwarded unchanged to `preprocess`.

        Returns:
            A tuple `(preprocess_params, forward_params, postprocess_params)`; `forward_params` is always empty.

        Raises:
            `KeyError`: If `function_to_apply` is a string that is neither `"default"` nor the name of a
            `ClassificationFunction` member.
        """
        preprocess_params = tokenizer_kwargs
        postprocess_params = {}

        if return_all_scores is None and hasattr(self.model.config, "return_all_scores"):
            return_all_scores = self.model.config.return_all_scores
        if return_all_scores is not None:
            postprocess_params["return_all_scores"] = return_all_scores

        if isinstance(function_to_apply, str):
            if function_to_apply.lower() == "default":
                # "default" is a documented value but not an enum member. Forward an explicit `None` so that
                # `postprocess` selects the function from the model config.
                postprocess_params["function_to_apply"] = None
            else:
                postprocess_params["function_to_apply"] = ClassificationFunction[function_to_apply.upper()]
        elif function_to_apply is not None:
            postprocess_params["function_to_apply"] = function_to_apply

        return preprocess_params, {}, postprocess_params

    def __call__(self, *args, **kwargs):
        """
        Classify the text(s) given as inputs.

        Args:
            args (`str` or `List[str]`):
                One or several texts (or one list of prompts) to classify.
            return_all_scores (`bool`, *optional*, defaults to `False`):
                Whether to return scores for all labels.
            function_to_apply (`str`, *optional*, defaults to `"default"`):
                The function to apply to the model outputs in order to retrieve the scores. Accepts four different
                values:

                - `"default"`: The function is chosen according to the number of labels. If the model has a single
                  label, will apply the sigmoid function on the output. If the model has several labels, will apply
                  the softmax function on the output.
                - `"sigmoid"`: Applies the sigmoid function on the output.
                - `"softmax"`: Applies the softmax function on the output.
                - `"none"`: Does not apply any function on the output.

        Return:
            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:

            - **label** (`str`) -- The label predicted.
            - **score** (`float`) -- The corresponding probability.

            If `return_all_scores=True`, one such dictionary is returned per label.
        """
        result = super().__call__(*args, **kwargs)
        # This pipeline is odd: it returns a list even when a single string is given. `args` may be empty when
        # the input was not passed positionally, so check before indexing.
        if args and isinstance(args[0], str):
            return [result]
        return result

    def preprocess(self, inputs, **tokenizer_kwargs) -> Dict[str, GenericTensor]:
        """Tokenize `inputs`, asking the tokenizer for tensors of the pipeline's framework."""
        return_tensors = self.framework
        return self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)

    def _forward(self, model_inputs):
        """Run the model on the tokenized inputs and return its raw outputs."""
        return self.model(**model_inputs)

    def postprocess(self, model_outputs, function_to_apply=None, return_all_scores=False):
        """
        Turn the model logits into label/score dictionaries.

        Args:
            model_outputs:
                Mapping with a `"logits"` entry; only its first element is used (assumes one input per call --
                TODO confirm against the base class batching).
            function_to_apply (`ClassificationFunction`, *optional*):
                Function applied to the logits. When `None`, it is chosen from `model.config`: sigmoid for
                multi-label problems or a single label, softmax for single-label problems or several labels.
            return_all_scores (`bool`, *optional*, defaults to `False`):
                Whether to return one dictionary per label instead of only the best-scoring one.

        Returns:
            A `dict` with `"label"` and `"score"` keys, or a list of such dicts when `return_all_scores` is set.

        Raises:
            `ValueError`: If `function_to_apply` is not a recognized `ClassificationFunction`.
        """
        config = self.model.config

        # Default value before `set_parameters`
        if function_to_apply is None:
            if config.problem_type == "multi_label_classification" or config.num_labels == 1:
                function_to_apply = ClassificationFunction.SIGMOID
            elif config.problem_type == "single_label_classification" or config.num_labels > 1:
                function_to_apply = ClassificationFunction.SOFTMAX
            # NOTE(review): the two fallbacks below are only reachable when `num_labels` is neither 1 nor greater
            # than 1, so `config.function_to_apply` is effectively never consulted -- confirm this is intended.
            elif hasattr(config, "function_to_apply"):
                function_to_apply = config.function_to_apply
            else:
                function_to_apply = ClassificationFunction.NONE

        outputs = model_outputs["logits"][0]
        outputs = outputs.numpy()

        if function_to_apply == ClassificationFunction.SIGMOID:
            scores = sigmoid(outputs)
        elif function_to_apply == ClassificationFunction.SOFTMAX:
            scores = softmax(outputs)
        elif function_to_apply == ClassificationFunction.NONE:
            scores = outputs
        else:
            raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")

        if return_all_scores:
            return [{"label": config.id2label[i], "score": score.item()} for i, score in enumerate(scores)]
        return {"label": config.id2label[scores.argmax().item()], "score": scores.max().item()}