# ML Model Implementation and Comparison

## Load Dataset

In [1]:
import pandas as pd

data = pd.read_csv("content/processed_data/preprocessed_dataset_with_clusters.csv")

data.head(10)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,FOKSZAM,NAPLOSZ,PARTNERKOD,FIZMOD,OSSZEG,DEVNEM,ELLENSZAMLA,KTGHKOD,AFA_KOD,NAPLO_JELLEG,CEGKOD,PENZMOZGAS,IRANY_CF,HONAP,NAP,Cluster
0,122,101,0,0,4.063576,0,272,0,0,0,0,1,0,1,1,7
1,491,101,0,0,-58.496764,0,6,0,0,0,0,1,0,1,1,10
2,491,101,0,0,-7.868419,0,25,0,0,0,0,1,0,1,1,10
3,161,101,0,0,4.063576,0,272,0,0,0,0,1,0,1,1,7
4,3614,101,0,0,0.031845,0,272,0,0,0,0,1,0,1,1,7
5,491,101,0,0,0.021079,0,54,0,0,0,0,1,0,1,1,10
6,36411,101,0,0,0.052437,0,272,0,0,0,0,1,0,1,1,7
7,491,101,0,0,0.000487,0,60,0,0,0,0,1,0,1,1,10
8,364111,101,0,0,0.026998,0,272,0,0,0,0,1,0,1,1,7
9,491,101,0,0,0.025926,0,62,0,0,0,0,1,0,1,1,10


## Set Target

In [2]:
target = "FOKSZAM"

X = data.drop(columns=target)
y = data[target]

## Train-Test and K-Fold Validation Split

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 

test_size = 0.33
k_fold_splits = 5

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
kf = KFold(n_splits=k_fold_splits, shuffle=True, random_state=42)

## Models

### Function Definitons

In [8]:
# Accuracy Score
from sklearn.metrics import accuracy_score

def get_accuracy(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print("Train accuracy: ", train_accuracy)
    print("Test accuracy: ", test_accuracy)

In [10]:
# GridSearch CV
from sklearn.model_selection import GridSearchCV  

def gridOptimize(model, params, X_train, y_train, kf, scoring):
  grid_search = GridSearchCV(estimator=model, param_grid=params, cv=kf, scoring=scoring, verbose=1, n_jobs=-1) 
  grid_search.fit(X_train, y_train.ravel())

  best_params = grid_search.best_params_
  best_score = -grid_search.best_score_ * 100

  print("Best parameters:")
  print(best_params)
  print("Best score:")  
  print(best_score)

  return best_params, best_score

In [12]:
# Optuna
def optunaOptimize(model, params, X_train, y_train, kf, scoring):  
  import optuna      
  from sklearn.model_selection import cross_val_score
  import os
  import sys
  from contextlib import contextmanager  
    
  __keys = list(params.keys()) 

  @contextmanager
  def suppress_stdout_stderr():
    with open(os.devnull, "w") as devnull:
      old_stdout = sys.stdout
      old_stderr = sys.stderr
      sys.stdout = devnull
      sys.stderr = devnull
      try:
        yield
      finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr

  def objective(trial, params):
    with suppress_stdout_stderr():
      _params = {
        x: trial.suggest_categorical(x, params[x]) for x in __keys 
      }     
      scores = cross_val_score(model, X_train, y_train.ravel(), cv=kf, scoring=scoring)      
      
      mape_score = -scores.mean()
      
      return mape_score

  study = optuna.create_study(direction="minimize")
  study.optimize(lambda trial: objective(trial, params), n_trials = 50)

  best_params = study.best_params
  best_score = study.best_value 

  return(best_params, best_score)   

### Random Forest

In [13]:
random_forest = True

n_estimators = 110
max_depth = 18

if random_forest:
    from sklearn.ensemble import RandomForestClassifier

    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
    rf.fit(X_train, y_train)

    get_accuracy(rf, X_train, X_test, y_train, y_test)

Train accuracy:  0.9684342219228178
Test accuracy:  0.8344241476995213


In [8]:
if random_forest:
        
    params = {        
        "n_estimators": [100, 110],
        "max_depth": [18,20],       
    }

    gridOptimize(rf, params, X_train, y_train, kf, "accuracy")

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  grid_search.fit(X_train, y_train.ravel())


Best parameters:
{'max_depth': 18, 'n_estimators': 110}
Best score:
-82.03728280023513


### Gradient Boosting

In [9]:
gradient_boosting = False

if gradient_boosting:

    from sklearn.ensemble import GradientBoostingClassifier

    gbc = GradientBoostingClassifier(max_depth=2, n_estimators=20, learning_rate=0.2)
    gbc.fit(X_train, y_train)

    get_accuracy(gbc, X_train, X_test, y_train, y_test)

### Light GBM

In [10]:
light_gbm = False


if light_gbm:

    from lightgbm import LGBMClassifier

    lgbm = LGBMClassifier(max_depth=4, n_estimators=50)
    lgbm.fit(X_train, y_train)

    get_accuracy(lgbm, X_train, X_test, y_train, y_test)

### CAT Boost

In [15]:
catboost = False

max_depth = 7
n_estimators = 5

if catboost:

    from catboost import CatBoostClassifier

    catboost = CatBoostClassifier(max_depth=max_depth, n_estimators=n_estimators, verbose=0)
    catboost.fit(X_train, y_train)

    get_accuracy(catboost, X_train, X_test, y_train, y_test)

In [12]:
if catboost:
 
    params = {
            "max_depth": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
            "n_estimators": [1,2,3,4,5,6,7,8,9,10]
        }

    gridOptimize(catboost, params, X_train, y_train, kf, "accuracy")

Fitting 5 folds for each of 150 candidates, totalling 750 fits


  grid_search.fit(X_train, y_train.ravel())


## Result

Random Forest with 110 estimators and max_depth of 18

In [16]:
predicted = rf.predict(data.drop(columns=target))

predicted

array([ 122,  491,  491, ..., 9699, 9699, 4541], dtype=int64)

In [17]:
data["Predicted"] = predicted

data.head(10)

Unnamed: 0,FOKSZAM,NAPLOSZ,PARTNERKOD,FIZMOD,OSSZEG,DEVNEM,ELLENSZAMLA,KTGHKOD,AFA_KOD,NAPLO_JELLEG,CEGKOD,PENZMOZGAS,IRANY_CF,HONAP,NAP,Cluster,Predicted
0,122,101,0,0,4.063576,0,272,0,0,0,0,1,0,1,1,7,122
1,491,101,0,0,-58.496764,0,6,0,0,0,0,1,0,1,1,10,491
2,491,101,0,0,-7.868419,0,25,0,0,0,0,1,0,1,1,10,491
3,161,101,0,0,4.063576,0,272,0,0,0,0,1,0,1,1,7,122
4,3614,101,0,0,0.031845,0,272,0,0,0,0,1,0,1,1,7,3614
5,491,101,0,0,0.021079,0,54,0,0,0,0,1,0,1,1,10,491
6,36411,101,0,0,0.052437,0,272,0,0,0,0,1,0,1,1,7,4629
7,491,101,0,0,0.000487,0,60,0,0,0,0,1,0,1,1,10,491
8,364111,101,0,0,0.026998,0,272,0,0,0,0,1,0,1,1,7,364111
9,491,101,0,0,0.025926,0,62,0,0,0,0,1,0,1,1,10,491


In [22]:
# Load original dataset

original_dataset = pd.read_csv("content/export/export_2023.csv")

original_dataset

  original_dataset = pd.read_csv("content/export/export_2023.csv")


Unnamed: 0,FOKSZAM,FOKSZAMMEGNEV,NAPLOSZ,KBIZSZ,BIZTET,KELT,TELJ,HATIDO,PARTNERKOD,PARTNERMEGNEV,...,KOCKAZAT_SZINT,Z_SCORE,REPORT_SOR,JELLEG,REPORT_KAT,REPORT_FOKAT,PARTNER,KAPCSOLT,CF_KAT,CF_FOKAT
0,122,"Telek, telkesítés ...",101,F1,1,2023-01-01,0,0,0,0,...,0,0.113107,"Telek, telkesítés",M,2. Tárgyi eszközök,1. BEFEKTETETT ESZKÖZÖK,0,0,0,0
1,491,Nyitómérleg számla ...,101,F1,1,2023-01-01,0,0,0,0,...,0,0.113107,Nyitómérleg számla,M,0,0,0,0,Tárgyi eszköz beszerzés,BEFEKTETÉSI CASH-FLOW
2,491,Nyitómérleg számla ...,101,F1,2,2023-01-01,0,0,0,0,...,0,0.113107,Nyitómérleg számla,M,0,0,0,0,Tárgyi eszköz beszerzés,BEFEKTETÉSI CASH-FLOW
3,161,Befejezetlen beruházások ...,101,F1,2,2023-01-01,0,0,0,0,...,0,0.113107,Beruházások,M,2. Tárgyi eszközök,1. BEFEKTETETT ESZKÖZÖK,0,0,0,0
4,3614,Elszámolási előleg ...,101,F3,1,2023-01-01,0,0,0,0,...,0,0.113107,Egyéb követelések,M,2. Követelések,2. FORGÓESZKÖZÖK,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31096,9699,Kerekítési különbözet elszámolása ...,601,10,6,2023-10-19,2023-10-19,0,0,0,...,0,0.127109,Különféle egyéb bevételek,EK,2. Egyéb bevételek,"7. ÜZLETI, ÜZEMI EREDMÉNY",0,0,Vevő fizetés,OPERATÍV CASH-FLOW
31097,9699,Kerekítési különbözet elszámolása ...,601,11,1,2023-07-03,2023-07-03,0,0,0,...,0,0.127183,Különféle egyéb bevételek,EK,2. Egyéb bevételek,"7. ÜZLETI, ÜZEMI EREDMÉNY",0,0,Szállító fizetés,OPERATÍV CASH-FLOW
31098,4541,Belföldi anyag- és áruszállítók ...,601,11,2,2023-07-03,2023-07-03,0,IND001,Indepack Magyarország Kft,...,0,0.127183,Szállítói tartozás,M,3. Rövid lejáratú kötelezettségek,5. KÖTELEZETTSÉGEK,0,0,Különféle egyéb bevételek,OPERATÍV CASH-FLOW
31099,4543,Bankkártyás szállítók ...,601,12,1,2023-10-04,2023-10-04,0,BIM001,BI Market Kft,...,0,0.127183,Szállítói tartozás,M,3. Rövid lejáratú kötelezettségek,5. KÖTELEZETTSÉGEK,0,0,Különféle egyéb bevételek,OPERATÍV CASH-FLOW


In [27]:


original_dataset["Predicted"] = data["Predicted"]
original_dataset["Match"] = (data["FOKSZAM"] == data["Predicted"])

mistmatch = original_dataset.loc[original_dataset["Match"] == False]

In [28]:
mistmatch.shape

(2351, 79)

In [29]:
# Exporting mismatches to CSV
mistmatch.to_csv("content/processed_data/mismatches.csv", index=False)