In [313]:
import pandas as pd

csv_list = [
    'OCR Dataset - Bank Jago.csv',
    'OCR Dataset - Shopee Pay.csv',
    'OCR Dataset - BLU.csv',
    'OCR Dataset - Gopay.csv',
    'OCR Dataset - Livin By Mandiri.csv',
    'OCR Dataset - MyBCA.csv',
    'OCR Dataset - MyBCA SS.csv',
    'OCR Dataset - OVO.csv',
    'OCR Dataset - Donasi sintesis.csv',
    'OCR Dataset - Health Sintesis.csv',
    'OCR Dataset - Education Sintesis.csv'
]

dfs = []

for csv_file in csv_list:
    df = pd.read_csv('datasets/' + csv_file)
    df['source'] = csv_file 
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)

df


Unnamed: 0,Nama File,Hasil OCR,Kategori,Method,source
0,Jago_01,Dimsum Nat food\nRp13.500\nID Transaksi\n23878...,Food and Beverages,QRIS,OCR Dataset - Bank Jago.csv
1,Jago_02,KIOS TALENTA\nRp2.200\nID Transaksi\n234014770...,Shopping,QRIS,OCR Dataset - Bank Jago.csv
2,Jago_03,Jus Mbak Yuli\nRp11.000\nID Transaksi\n2365882...,Food and Beverages,QRIS,OCR Dataset - Bank Jago.csv
3,Jago_04,KEDAI YO\nRp6.000\nID Transaksi\n2350877O19\nS...,Food and Beverages,QRIS,OCR Dataset - Bank Jago.csv
4,Jago_05,"SUMARNI, Otomotif\nBOGOR\nRp13.000\nID Transak...",Transport,QRIS,OCR Dataset - Bank Jago.csv
...,...,...,...,...,...
599,blu_565,blu\n Transaction Receipt\n 26 Feb 2025 15:55:...,Education,Virtual Account,OCR Dataset - Education Sintesis.csv
600,blu_566,blu\n Transaction Receipt\n 27 Mar 2024 18:02:...,Education,Transfer,OCR Dataset - Education Sintesis.csv
601,blu_567,blu\n Transaction Receipt\n 01 Apr 2025 21:09:...,Education,Qris,OCR Dataset - Education Sintesis.csv
602,blu_568,blu\n Transaction Receipt\n 02 Mei 2024 00:16:...,Education,Transfer,OCR Dataset - Education Sintesis.csv


In [314]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 604 entries, 0 to 603
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Nama File  604 non-null    object
 1   Hasil OCR  604 non-null    object
 2   Kategori   603 non-null    object
 3   Method     603 non-null    object
 4   source     604 non-null    object
dtypes: object(5)
memory usage: 23.7+ KB


In [315]:
# cek yang null siapa
df.isnull().sum()

Nama File    0
Hasil OCR    0
Kategori     1
Method       1
source       0
dtype: int64

In [316]:
df[df.isnull().any(axis=1)]

Unnamed: 0,Nama File,Hasil OCR,Kategori,Method,source
235,ERROR,ERROR,,,OCR Dataset - Livin By Mandiri.csv


In [317]:
#drop yang null
df = df.dropna().reset_index(drop=True)

df.isnull().sum()

Nama File    0
Hasil OCR    0
Kategori     0
Method       0
source       0
dtype: int64

In [318]:
df['Kategori'].value_counts()

Kategori
Food and Beverages    100
Shopping               98
Transport              81
Other                  79
Donations              77
Education              75
Health                 73
Bills                  11
Entertaiment            9
Name: count, dtype: int64

In [319]:
df['Method'].value_counts()

Method
QRIS               190
E-WALLET           180
TRANSFER           138
VA                  25
Transfer            23
E-Wallet            21
Qris                14
Virtual Account     12
Name: count, dtype: int64

In [320]:
# karena Bills, Entertaiment, Education, Health sedikit, kita gabung menjadi Lifestyle/Reccurring
df['Kategori'] = df['Kategori'].replace({
    'Bills': 'Lifestyle/Reccurring',
    'Entertaiment': 'Lifestyle/Reccurring',
})

In [321]:
#delete all rows with Lifestyle/Reccurring
df = df[df['Kategori'] != 'Lifestyle/Reccurring'].reset_index(drop=True)

In [322]:
df['Kategori'].value_counts()

Kategori
Food and Beverages    100
Shopping               98
Transport              81
Other                  79
Donations              77
Education              75
Health                 73
Name: count, dtype: int64

In [323]:
import re

def clean_ocr_basic(text):
    text = str(text)
    text = text.lower()
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text)   # spasi ganda
    return text.strip()

def remove_ui_noise(text):
    noise_patterns = [
        r"id transaksi",
        r"transaksi berhasil",
        r"success",
        r"ref no",
    ]
    for p in noise_patterns:
        text = re.sub(p, " ", text)
    return re.sub(r"\s+", " ", text).strip()


def clean_ocr(text):
    text = clean_ocr_basic(text)
    text = remove_ui_noise(text)
    return text

df['Hasil OCR'] = df['Hasil OCR'].apply(clean_ocr)
df


Unnamed: 0,Nama File,Hasil OCR,Kategori,Method,source
0,Jago_01,dimsum nat food rp13.500 2387821914 sumber aku...,Food and Beverages,QRIS,OCR Dataset - Bank Jago.csv
1,Jago_02,kios talenta rp2.200 2340147702 sumber akun ke...,Shopping,QRIS,OCR Dataset - Bank Jago.csv
2,Jago_03,jus mbak yuli rp11.000 2365882380 sumber akun ...,Food and Beverages,QRIS,OCR Dataset - Bank Jago.csv
3,Jago_04,kedai yo rp6.000 2350877o19 sumber akun kezia ...,Food and Beverages,QRIS,OCR Dataset - Bank Jago.csv
4,Jago_05,"sumarni, otomotif bogor rp13.000 2574180843 su...",Transport,QRIS,OCR Dataset - Bank Jago.csv
...,...,...,...,...,...
578,blu_565,blu transaction receipt 26 feb 2025 15:55:35 w...,Education,Virtual Account,OCR Dataset - Education Sintesis.csv
579,blu_566,blu transaction receipt 27 mar 2024 18:02:46 w...,Education,Transfer,OCR Dataset - Education Sintesis.csv
580,blu_567,blu transaction receipt 01 apr 2025 21:09:57 w...,Education,Qris,OCR Dataset - Education Sintesis.csv
581,blu_568,blu transaction receipt 02 mei 2024 00:16:08 w...,Education,Transfer,OCR Dataset - Education Sintesis.csv


In [324]:
from sklearn.model_selection import train_test_split

X = df[["Hasil OCR"]]
y = df["Kategori"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [325]:
from sklearn.pipeline import Pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Transformer untuk bersihin kolom teks (inputnya 2D: DataFrame 1 kolom)
# 2) Define vectorizers
word_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    min_df=2,
    sublinear_tf=True
)

char_tfidf = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
    min_df=2,
    sublinear_tf=True
)

# 3) Gabung word + char features dari kolom ke-0 (single text column)
features = ColumnTransformer(
    transformers=[
        ("word", word_tfidf, 0),
        ("char", char_tfidf, 0),
    ],
    remainder="drop"
)

# 4) Final pipeline
pipeline = Pipeline([
    ("feat", features),
    ("clf", LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        solver="lbfgs"   # atau saga
    ))
])


In [326]:
pipeline.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('feat', ...), ('clf', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"transformers  transformers: list of tuples List of (name, transformer, columns) tuples specifying the transformer objects to be applied to subsets of the data. name : str  Like in Pipeline and FeatureUnion, this allows the transformer and  its parameters to be set using ``set_params`` and searched in grid  search. transformer : {'drop', 'passthrough'} or estimator  Estimator must support :term:`fit` and :term:`transform`.  Special-cased strings 'drop' and 'passthrough' are accepted as  well, to indicate to drop the columns or to pass them through  untransformed, respectively. columns : str, array-like of str, int, array-like of int, array-like of bool, slice or callable  Indexes the data on its second axis. Integers are interpreted as  positional columns, while strings can reference DataFrame columns  by name. A scalar string or int should be used where  ``transformer`` expects X to be a 1d array-like (vector),  otherwise a 2d array will be passed to the transformer.  A callable is passed the input data `X` and can return any of the  above. To select multiple columns by name or dtype, you can use  :obj:`make_column_selector`.","[('word', ...), ('char', ...)]"
,"remainder  remainder: {'drop', 'passthrough'} or estimator, default='drop' By default, only the specified columns in `transformers` are transformed and combined in the output, and the non-specified columns are dropped. (default of ``'drop'``). By specifying ``remainder='passthrough'``, all remaining columns that were not specified in `transformers`, but present in the data passed to `fit` will be automatically passed through. This subset of columns is concatenated with the output of the transformers. For dataframes, extra columns not seen during `fit` will be excluded from the output of `transform`. By setting ``remainder`` to be an estimator, the remaining non-specified columns will use the ``remainder`` estimator. The estimator must support :term:`fit` and :term:`transform`. Note that using this feature requires that the DataFrame columns input at :term:`fit` and :term:`transform` have identical order.",'drop'
,"sparse_threshold  sparse_threshold: float, default=0.3 If the output of the different transformers contains sparse matrices, these will be stacked as a sparse matrix if the overall density is lower than this value. Use ``sparse_threshold=0`` to always return dense. When the transformed output consists of all dense data, the stacked result will be dense, and this keyword will be ignored.",0.3
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"transformer_weights  transformer_weights: dict, default=None Multiplicative weights for features per transformer. The output of the transformer is multiplied by these weights. Keys are transformer names, values the weights.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed.",False
,"verbose_feature_names_out  verbose_feature_names_out: bool, str or Callable[[str, str], str], default=True - If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix  all feature names with the name of the transformer that generated that  feature. It is equivalent to setting  `verbose_feature_names_out=""{transformer_name}__{feature_name}""`. - If False, :meth:`ColumnTransformer.get_feature_names_out` will not  prefix any feature names and will error if feature names are not  unique. - If ``Callable[[str, str], str]``,  :meth:`ColumnTransformer.get_feature_names_out` will rename all the features  using the name of the transformer. The first argument of the callable is the  transformer name and the second argument is the feature name. The returned  string will be the new feature name. - If ``str``, it must be a string ready for formatting. The given string will  be formatted using two field names: ``transformer_name`` and ``feature_name``.  e.g. ``""{feature_name}__{transformer_name}""``. See :meth:`str.format` method  from the standard library for more info. .. versionadded:: 1.0 .. versionchanged:: 1.6  `verbose_feature_names_out` can be a callable or a string to be formatted.",True
,"force_int_remainder_cols  force_int_remainder_cols: bool, default=False This parameter has no effect. .. note::  If you do not access the list of columns for the remainder columns  in the `transformers_` fitted attribute, you do not need to set  this parameter. .. versionadded:: 1.5 .. versionchanged:: 1.7  The default value for `force_int_remainder_cols` will change from  `True` to `False` in version 1.7. .. deprecated:: 1.7  `force_int_remainder_cols` is deprecated and will be removed in 1.9.",'deprecated'

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"analyzer  analyzer: {'word', 'char', 'char_wb'} or callable, default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. .. versionchanged:: 0.21  Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data  is first read from the file and then passed to the given callable  analyzer.",'word'
,"stop_words  stop_words: {'english'}, list, default=None If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. If None, no stop words will be used. In this case, setting `max_df` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.",
,"token_pattern  token_pattern: str, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"analyzer  analyzer: {'word', 'char', 'char_wb'} or callable, default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. .. versionchanged:: 0.21  Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data  is first read from the file and then passed to the given callable  analyzer.",'char_wb'
,"stop_words  stop_words: {'english'}, list, default=None If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. If None, no stop words will be used. In this case, setting `max_df` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.",
,"token_pattern  token_pattern: str, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",'balanced'
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [327]:
from sklearn.metrics import classification_report

y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))


                    precision    recall  f1-score   support

         Donations       1.00      0.91      0.95        23
         Education       0.88      0.96      0.92        23
Food and Beverages       0.68      0.87      0.76        30
            Health       0.91      0.91      0.91        22
             Other       0.86      0.79      0.83        24
          Shopping       0.91      0.69      0.78        29
         Transport       0.80      0.83      0.82        24

          accuracy                           0.85       175
         macro avg       0.86      0.85      0.85       175
      weighted avg       0.86      0.85      0.85       175



In [328]:
import sklearn, inspect
from sklearn.linear_model import LogisticRegression

print("sklearn version:", sklearn.__version__)
print("LogisticRegression object:", LogisticRegression)
print("signature:", inspect.signature(LogisticRegression))


sklearn version: 1.8.0
LogisticRegression object: <class 'sklearn.linear_model._logistic.LogisticRegression'>
signature: (penalty='deprecated', *, C=1.0, l1_ratio=0.0, dual=False, tol=0.0001, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, verbose=0, warm_start=False, n_jobs=None)


In [329]:
import numpy as np

def get_feature_names_from_pipeline(pipeline):
    feat = pipeline.named_steps["feat"]  # ColumnTransformer
    # ambil feature names dari masing-masing transformer
    word_names = feat.named_transformers_["word"].get_feature_names_out()
    char_names = feat.named_transformers_["char"].get_feature_names_out()

    # ColumnTransformer output = [word_features | char_features] (concat)
    feature_names = np.concatenate([word_names, char_names])
    return feature_names, len(word_names), len(char_names)

def top_features_per_class(pipeline, top_k=20, only="word"):
    clf = pipeline.named_steps["clf"]
    feature_names, n_word, n_char = get_feature_names_from_pipeline(pipeline)

    classes = clf.classes_
    coefs = clf.coef_  # shape: (n_classes, n_features)

    # pilih subset fitur
    if only == "word":
        idx = np.arange(0, n_word)
    elif only == "char":
        idx = np.arange(n_word, n_word + n_char)
    else:  # "all"
        idx = np.arange(0, n_word + n_char)

    results = {}
    for i, c in enumerate(classes):
        coef = coefs[i, idx]
        names = feature_names[idx]

        top_pos = np.argsort(coef)[-top_k:][::-1]      # paling mendorong kelas c
        top_neg = np.argsort(coef)[:top_k]             # paling menentang kelas c

        results[c] = {
            "top_positive": list(zip(names[top_pos], coef[top_pos])),
            "top_negative": list(zip(names[top_neg], coef[top_neg])),
        }
    return results

res = top_features_per_class(pipeline, top_k=15, only="word")

for cls, info in res.items():
    print("\n=== Kelas:", cls, "===")
    print("Top kata yang MENDORONG kelas ini:")
    for w, v in info["top_positive"]:
        print(f"  {w:25s} {v:+.4f}")



=== Kelas: Donations ===
Top kata yang MENDORONG kelas ini:
  ago                       +0.5714
  id                        +0.5664
  pusat                     +0.5494
  pan                       +0.4819
  a01                       +0.4438
  kitabisa                  +0.4161
  00o                       +0.3961
  osukses                   +0.3936
  ini ago                   +0.3936
  ago jagoo                 +0.3936
  ago jago                  +0.3936
  resi nomor                +0.3936
  ag ada                    +0.3936
  jago osukses              +0.3936
  osukses oge               +0.3936

=== Kelas: Education ===
Top kata yang MENDORONG kelas ini:
  education                 +0.7009
  rp                        +0.6733
  status berhasil           +0.5852
  status                    +0.5065
  bukti pembayaran          +0.4915
  wib                       +0.4619
  2024                      +0.4503
  education merchant        +0.4495
  keterangan                +0.4276
  sekolah     