In [1]:
import pandas as pd

# Per-source OCR exports; every file lives under the relative `datasets/` folder.
csv_list = [
    'OCR Dataset - Bank Jago.csv',
    'OCR Dataset - Shopee Pay.csv',
    'OCR Dataset - BLU.csv',
    'OCR Dataset - Gopay.csv',
    'OCR Dataset - Livin By Mandiri.csv',
    'OCR Dataset - MyBCA.csv',
    'OCR Dataset - MyBCA SS.csv',
    'OCR Dataset - OVO.csv',
    'OCR Dataset - Donasi sintesis.csv',
    'OCR Dataset - Health Sintesis.csv',
    'OCR Dataset - Education Sintesis.csv',
]

# Read each file and tag its rows with the originating file name so the
# provenance of every sample survives the concatenation below.
dfs = [
    pd.read_csv('datasets/' + csv_file).assign(source=csv_file)
    for csv_file in csv_list
]

# Stack all sources into one frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

df


Unnamed: 0,Nama File,Hasil OCR,Kategori,Method,source
0,Jago_01,Dimsum Nat food\nRp13.500\nID Transaksi\n23878...,Food and Beverages,QRIS,OCR Dataset - Bank Jago.csv
1,Jago_02,KIOS TALENTA\nRp2.200\nID Transaksi\n234014770...,Shopping,QRIS,OCR Dataset - Bank Jago.csv
2,Jago_03,Jus Mbak Yuli\nRp11.000\nID Transaksi\n2365882...,Food and Beverages,QRIS,OCR Dataset - Bank Jago.csv
3,Jago_04,KEDAI YO\nRp6.000\nID Transaksi\n2350877O19\nS...,Food and Beverages,QRIS,OCR Dataset - Bank Jago.csv
4,Jago_05,"SUMARNI, Otomotif\nBOGOR\nRp13.000\nID Transak...",Transport,QRIS,OCR Dataset - Bank Jago.csv
...,...,...,...,...,...
599,blu_565,blu\n Transaction Receipt\n 26 Feb 2025 15:55:...,Education,Virtual Account,OCR Dataset - Education Sintesis.csv
600,blu_566,blu\n Transaction Receipt\n 27 Mar 2024 18:02:...,Education,Transfer,OCR Dataset - Education Sintesis.csv
601,blu_567,blu\n Transaction Receipt\n 01 Apr 2025 21:09:...,Education,Qris,OCR Dataset - Education Sintesis.csv
602,blu_568,blu\n Transaction Receipt\n 02 Mei 2024 00:16:...,Education,Transfer,OCR Dataset - Education Sintesis.csv


In [2]:
# Print column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 604 entries, 0 to 603
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Nama File  604 non-null    object
 1   Hasil OCR  604 non-null    object
 2   Kategori   603 non-null    object
 3   Method     603 non-null    object
 4   source     604 non-null    object
dtypes: object(5)
memory usage: 23.7+ KB


In [3]:
# Count the missing values in each column.
df.isna().sum()

Nama File    0
Hasil OCR    0
Kategori     1
Method       1
source       0
dtype: int64

In [4]:
# Show the rows that have at least one missing value.
df.loc[df.isna().any(axis=1)]

Unnamed: 0,Nama File,Hasil OCR,Kategori,Method,source
235,ERROR,ERROR,,,OCR Dataset - Livin By Mandiri.csv


In [5]:
# Drop every row that has a missing value in any column, then renumber the index.
df = df.dropna(axis=0, how='any').reset_index(drop=True)

# Confirm that no nulls remain.
df.isna().sum()

Nama File    0
Hasil OCR    0
Kategori     0
Method       0
source       0
dtype: int64

In [6]:
# Class distribution of the target label before any category filtering.
df['Kategori'].value_counts()

Kategori
Food and Beverages    100
Shopping               98
Transport              81
Other                  79
Donations              77
Education              75
Health                 73
Bills                  11
Entertaiment            9
Name: count, dtype: int64

In [7]:
# Distribution of payment methods.
# NOTE(review): the labels are not normalised -- the output mixes spellings of
# the same method (QRIS/Qris, TRANSFER/Transfer, E-WALLET/E-Wallet,
# VA/Virtual Account). `Method` is not used as a feature in the cells shown
# here, but it should be canonicalised before any use as a feature or target.
df['Method'].value_counts()

Method
QRIS               190
E-WALLET           180
TRANSFER           138
VA                  25
Transfer            23
E-Wallet            21
Qris                14
Virtual Account     12
Name: count, dtype: int64

In [8]:
# Bills and Entertaiment are the two smallest classes, so fold them into a
# single 'Lifestyle/Reccurring' label. Only these two labels are remapped;
# Education and Health keep their own categories.
rare_categories = ['Bills', 'Entertaiment']
df['Kategori'] = df['Kategori'].replace(
    dict.fromkeys(rare_categories, 'Lifestyle/Reccurring')
)

In [9]:
# Remove every row labelled 'Lifestyle/Reccurring' and renumber the index.
keep_mask = df['Kategori'].ne('Lifestyle/Reccurring')
df = df.loc[keep_mask].reset_index(drop=True)

In [10]:
# Re-check the class distribution after removing the 'Lifestyle/Reccurring' rows.
df['Kategori'].value_counts()

Kategori
Food and Beverages    100
Shopping               98
Transport              81
Other                  79
Donations              77
Education              75
Health                 73
Name: count, dtype: int64

In [11]:
import re

def clean_ocr_basic(text):
    """Normalise raw OCR output to lower-case, single-spaced text.

    The input is coerced with ``str()`` first, so non-string values are
    accepted (e.g. ``None`` becomes ``"none"``).

    Args:
        text: Raw OCR value; any object convertible with ``str()``.

    Returns:
        The lower-cased text with newlines turned into spaces, every run of
        whitespace collapsed to one space, and leading/trailing whitespace
        removed.
    """
    lowered = str(text).lower()
    single_line = lowered.replace("\n", " ")
    # Collapse repeated whitespace (double spaces, tabs, ...) into one space.
    collapsed = re.sub(r"\s+", " ", single_line)
    return collapsed.strip()

def remove_ui_noise(text):
    """Remove boilerplate receipt/UI phrases from OCR text.

    Each phrase is matched as a whole word or phrase, anchored on word
    boundaries, so it is never cut out of the middle of a longer word.
    Unanchored matching would turn "successful" into "ful", "unsuccessful"
    into "un ful" and "paid transaksi" into "pa". Inflections of "success"
    ("successful", "successfully") are removed as complete words.
    Matching is case-insensitive, so the function also works on text that has
    not been lower-cased first.

    Args:
        text: OCR text as a ``str``.

    Returns:
        The text with the noise phrases removed, whitespace runs collapsed to
        a single space, and leading/trailing whitespace stripped.
    """
    noise_patterns = [
        r"id transaksi",
        r"transaksi berhasil",
        r"success(?:ful(?:ly)?)?",
        r"ref no",
    ]
    for p in noise_patterns:
        # \b on both sides prevents matching inside words such as
        # "unsuccessful" or "paid transaksi".
        text = re.sub(rf"\b(?:{p})\b", " ", text, flags=re.IGNORECASE)
    return re.sub(r"\s+", " ", text).strip()


def clean_ocr(text):
    """Run the full OCR cleaning chain on one value.

    Applies ``clean_ocr_basic`` first and then passes its result to
    ``remove_ui_noise``.

    Args:
        text: Raw OCR value.

    Returns:
        The value returned by ``remove_ui_noise``.
    """
    return remove_ui_noise(clean_ocr_basic(text))

# Clean every OCR string element-wise; this overwrites the raw text column.
df['Hasil OCR'] = df['Hasil OCR'].map(clean_ocr)
df


Unnamed: 0,Nama File,Hasil OCR,Kategori,Method,source
0,Jago_01,dimsum nat food rp13.500 2387821914 sumber aku...,Food and Beverages,QRIS,OCR Dataset - Bank Jago.csv
1,Jago_02,kios talenta rp2.200 2340147702 sumber akun ke...,Shopping,QRIS,OCR Dataset - Bank Jago.csv
2,Jago_03,jus mbak yuli rp11.000 2365882380 sumber akun ...,Food and Beverages,QRIS,OCR Dataset - Bank Jago.csv
3,Jago_04,kedai yo rp6.000 2350877o19 sumber akun kezia ...,Food and Beverages,QRIS,OCR Dataset - Bank Jago.csv
4,Jago_05,"sumarni, otomotif bogor rp13.000 2574180843 su...",Transport,QRIS,OCR Dataset - Bank Jago.csv
...,...,...,...,...,...
578,blu_565,blu transaction receipt 26 feb 2025 15:55:35 w...,Education,Virtual Account,OCR Dataset - Education Sintesis.csv
579,blu_566,blu transaction receipt 27 mar 2024 18:02:46 w...,Education,Transfer,OCR Dataset - Education Sintesis.csv
580,blu_567,blu transaction receipt 01 apr 2025 21:09:57 w...,Education,Qris,OCR Dataset - Education Sintesis.csv
581,blu_568,blu transaction receipt 02 mei 2024 00:16:08 w...,Education,Transfer,OCR Dataset - Education Sintesis.csv


In [12]:
from sklearn.model_selection import train_test_split

# Features: the cleaned OCR text as a 1-D Series. Target: the category label.
X = df["Hasil OCR"]
y = df["Kategori"]

# Hold out 30% for testing; stratify on the label and fix the seed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


In [13]:
from sklearn.pipeline import Pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression

# TF-IDF feature extractors (word-level and character-level). The active
# `pipeline` built later in this cell uses hashing vectorizers instead, so
# these objects are only needed by the commented-out LogisticRegression
# variant below.
tfidf_shared = dict(min_df=2, sublinear_tf=True)

word_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    **tfidf_shared
)

char_tfidf = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
    **tfidf_shared
)

# Combine word + char features, both read from column 0 (a single text column).
# NOTE(review): selecting column 0 implies a 2-D input such as
# df[["Hasil OCR"]], whereas the split cell builds X as a 1-D Series --
# confirm the input shape before enabling this.
features = ColumnTransformer(
    transformers=[
        ("word", word_tfidf, 0),
        ("char", char_tfidf, 0),
    ],
    remainder="drop",
)

# Disabled alternative: TF-IDF features + LogisticRegression.
# pipeline = Pipeline([
#     ("feat", features),
#     ("clf", LogisticRegression(
#         max_iter=2000,
#         class_weight="balanced",
#         solver="lbfgs"   # or saga
#     ))
# ])

# Alternative: use HashingVectorizer + SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import FeatureUnion
# Settings shared by the word-level and character-level hashing vectorizers.
hashing_shared = dict(
    n_features=2**18,
    alternate_sign=False,
    norm="l2",
    lowercase=True,
)

# Word uni/bi-grams plus character 3-5-grams (within word boundaries),
# concatenated side by side.
hashed_features = FeatureUnion([
    ("word", HashingVectorizer(analyzer="word", ngram_range=(1, 2), **hashing_shared)),
    ("char", HashingVectorizer(analyzer="char_wb", ngram_range=(3, 5), **hashing_shared)),
])

# loss="log_loss" makes the SGD classifier a logistic-regression model;
# the seed is fixed for repeatable training.
sgd_classifier = SGDClassifier(
    loss="log_loss",
    class_weight="balanced",
    alpha=1e-5,
    max_iter=3000,
    tol=1e-3,
    random_state=42,
)

pipeline = Pipeline([
    ("feat", hashed_features),
    ("clf", sgd_classifier),
])


In [14]:
# Fit the feature extraction + classifier pipeline on the training split.
pipeline.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('feat', ...), ('clf', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"transformer_list  transformer_list: list of (str, transformer) tuples List of transformer objects to be applied to the data. The first half of each tuple is the name of the transformer. The transformer can be 'drop' for it to be ignored or can be 'passthrough' for features to be passed unchanged. .. versionadded:: 1.1  Added the option `""passthrough""`. .. versionchanged:: 0.22  Deprecated `None` as a transformer in favor of 'drop'.","[('word', ...), ('char', ...)]"
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. .. versionchanged:: v0.20  `n_jobs` default changed from 1 to None",
,"transformer_weights  transformer_weights: dict, default=None Multiplicative weights for features per transformer. Keys are transformer names, values the weights. Raises ValueError if key not present in ``transformer_list``.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed.",False
,"verbose_feature_names_out  verbose_feature_names_out: bool, default=True If True, :meth:`get_feature_names_out` will prefix all feature names with the name of the transformer that generated that feature. If False, :meth:`get_feature_names_out` will not prefix any feature names and will error if feature names are not unique. .. versionadded:: 1.5",True

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any character. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"stop_words  stop_words: {'english'}, list, default=None If 'english', a built-in stop word list for English is used. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``.",
,"token_pattern  token_pattern: str or None, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'
,"ngram_range  ngram_range: tuple (min_n, max_n), default=(1, 1) The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used. For example an ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means only bigrams. Only applies if ``analyzer`` is not callable.","(1, ...)"

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any character. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"stop_words  stop_words: {'english'}, list, default=None If 'english', a built-in stop word list for English is used. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``.",
,"token_pattern  token_pattern: str or None, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'
,"ngram_range  ngram_range: tuple (min_n, max_n), default=(1, 1) The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used. For example an ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means only bigrams. Only applies if ``analyzer`` is not callable.","(3, ...)"

0,1,2
,"loss  loss: {'hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'}, default='hinge' The loss function to be used. - 'hinge' gives a linear SVM. - 'log_loss' gives logistic regression, a probabilistic classifier. - 'modified_huber' is another smooth loss that brings tolerance to  outliers as well as probability estimates. - 'squared_hinge' is like hinge but is quadratically penalized. - 'perceptron' is the linear loss used by the perceptron algorithm. - The other losses, 'squared_error', 'huber', 'epsilon_insensitive' and  'squared_epsilon_insensitive' are designed for regression but can be useful  in classification as well; see  :class:`~sklearn.linear_model.SGDRegressor` for a description. More details about the losses formulas can be found in the :ref:`User Guide ` and you can find a visualisation of the loss functions in :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_loss_functions.py`.",'log_loss'
,"penalty  penalty: {'l2', 'l1', 'elasticnet', None}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 'l1' and 'elasticnet' might bring sparsity to the model (feature selection) not achievable with 'l2'. No penalty is added when set to `None`. You can see a visualisation of the penalties in :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_penalties.py`.",'l2'
,"alpha  alpha: float, default=0.0001 Constant that multiplies the regularization term. The higher the value, the stronger the regularization. Also used to compute the learning rate when `learning_rate` is set to 'optimal'. Values must be in the range `[0.0, inf)`.",1e-05
,"l1_ratio  l1_ratio: float, default=0.15 The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. Only used if `penalty` is 'elasticnet'. Values must be in the range `[0.0, 1.0]` or can be `None` if `penalty` is not `elasticnet`. .. versionchanged:: 1.7  `l1_ratio` can be `None` when `penalty` is not ""elasticnet"".",0.15
,"fit_intercept  fit_intercept: bool, default=True Whether the intercept should be estimated or not. If False, the data is assumed to be already centered.",True
,"max_iter  max_iter: int, default=1000 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the :meth:`partial_fit` method. Values must be in the range `[1, inf)`. .. versionadded:: 0.19",3000
,"tol  tol: float or None, default=1e-3 The stopping criterion. If it is not None, training will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. Convergence is checked against the training loss or the validation loss depending on the `early_stopping` parameter. Values must be in the range `[0.0, inf)`. .. versionadded:: 0.19",0.001
,"shuffle  shuffle: bool, default=True Whether or not the training data should be shuffled after each epoch.",True
,"verbose  verbose: int, default=0 The verbosity level. Values must be in the range `[0, inf)`.",0
,"epsilon  epsilon: float, default=0.1 Epsilon in the epsilon-insensitive loss functions; only if `loss` is 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. For 'huber', determines the threshold at which it becomes less important to get the prediction exactly right. For epsilon-insensitive, any differences between the current prediction and the correct label are ignored if they are less than this threshold. Values must be in the range `[0.0, inf)`.",0.1


In [15]:
from sklearn.metrics import classification_report

# Predict on the held-out split and print per-class precision/recall/F1.
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


                    precision    recall  f1-score   support

         Donations       0.91      0.91      0.91        23
         Education       0.91      0.91      0.91        23
Food and Beverages       0.74      0.93      0.82        30
            Health       0.90      0.82      0.86        22
             Other       0.95      0.88      0.91        24
          Shopping       0.84      0.72      0.78        29
         Transport       0.75      0.75      0.75        24

          accuracy                           0.85       175
         macro avg       0.86      0.85      0.85       175
      weighted avg       0.85      0.85      0.85       175



In [16]:
import sklearn, inspect
from sklearn.linear_model import LogisticRegression

# Environment check: show the installed scikit-learn version and the
# LogisticRegression class/signature it exposes.
for label, value in (
    ("sklearn version:", sklearn.__version__),
    ("LogisticRegression object:", LogisticRegression),
    ("signature:", inspect.signature(LogisticRegression)),
):
    print(label, value)


sklearn version: 1.8.0
LogisticRegression object: <class 'sklearn.linear_model._logistic.LogisticRegression'>
signature: (penalty='deprecated', *, C=1.0, l1_ratio=0.0, dual=False, tol=0.0001, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, verbose=0, warm_start=False, n_jobs=None)
