## 10/03/2025
### First Baseline using RF
### R@Precision: 10.97%


Todos:
- Create utils.
- Port to scripts.

In [61]:
import pandas as pd
import os
import numpy as np
from sklearn.linear_model import LogisticRegression
import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, average_precision_score, roc_auc_score
import time


def r_at_k(y_true:np.ndarray, y_probas:np.ndarray, k='precision')->float:
    if k == 'precision':
        num_to_check = y_true.sum()
    else:
        if isinstance(k, int):
            num_to_check = k
        elif isinstance(k, float):
            num_to_check = int(k*len(y_test))
    k = min(num_to_check, len(y_test))
            
    sorted_indices = np.argsort(y_probas)[::-1]
    found_labels = y_true[sorted_indices]
    return found_labels[:num_to_check].sum() / num_to_check

columns_to_drop = ['CIK', 'Company', 'Type', 'Date', 'Period of Report', 'Fiscal Year End', "Basename", "gvkey", "" "restatement_year", "Misstatement_AA", "Misstatement", "Misstatement_BAO"]
not_sure_what = ['sich', 'insbnk', 'understatement', 'option']
not_leak = ['State', 'State of Inc', 'SIC']

target_col = "Misstatement"

path_to_files = "../data/"
results = []
for folder in tqdm.tqdm_notebook(os.listdir(path_to_files)):
    
    # LOAD DATA
    df_train = pd.read_csv(os.path.join(path_to_files, folder, "train_instances.csv"))
    df_test = pd.read_csv(os.path.join(path_to_files, folder, "test_instances.csv"))
    
    to_keep = df_train.columns[~df_train.columns.isin(columns_to_drop+not_sure_what+not_leak)]

    X_train = df_train[to_keep].values
    y_train = df_train[target_col].values.astype(int)
    
    X_test = df_test[to_keep].values
    y_test = df_test[target_col].values.astype(int)
    
    
    # FIT + PREDICT
    
    time_s = time.time()
    
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)
    
    y_test_proba = clf.predict_proba(X_test)[:, 1]
    
    time_e = time.time() - time_s
    
    
    # METRICS
    
    y_test_pred = (y_test_proba > 0.5).astype(int)
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_test_pred, average='binary', zero_division=0)
    ap = average_precision_score(y_test, y_test_proba)
    auroc = roc_auc_score(y_test, y_test_proba)
    
    metrics = {'prec_pos':prec, 'rec_pos':rec, 'f1_pos':f1, 'ap':ap, 'auroc':auroc}
    for k in [100, 'precision']:
        metrics[f"rec@{k}"] = r_at_k(y_test, y_test_proba, k=k)
    
    cur_res = {
        "split": int(folder.split('_')[1]),
        "num_train":y_train.shape[0],
        "num_pos_train": y_train.sum(),
        "num_test":y_test.shape[0],
        "num_pos_test": y_test.sum(),
        "time": time_e,
    }
    cur_res.update(metrics)
    results.append(cur_res) 
    #break

df_res = pd.DataFrame(results, columns=cur_res.keys())
df_res.sort_values('split', inplace=True)
df_res


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for folder in tqdm.tqdm_notebook(os.listdir(path_to_files)):


  0%|          | 0/12 [00:00<?, ?it/s]

Unnamed: 0,split,num_train,num_pos_train,num_test,num_pos_test,time,prec_pos,rec_pos,f1_pos,ap,auroc,rec@100,rec@precision
10,2003,8384,184,3192,93,3.94826,0.0,0.0,0.0,0.150283,0.680516,0.2,0.215054
7,2004,8977,228,3169,87,4.268331,0.0,0.0,0.0,0.094847,0.677486,0.15,0.149425
5,2005,9706,239,3101,82,4.571008,1.0,0.012195,0.024096,0.152082,0.670227,0.17,0.182927
9,2006,9462,199,3110,61,4.259512,0.0,0.0,0.0,0.038266,0.639527,0.04,0.04918
3,2007,9380,183,3111,54,4.498247,0.0,0.0,0.0,0.06311,0.634745,0.08,0.111111
1,2008,9322,151,3168,66,4.139749,0.0,0.0,0.0,0.060012,0.616337,0.08,0.090909
11,2009,9389,125,3087,69,3.777676,0.0,0.0,0.0,0.073279,0.570336,0.06,0.086957
0,2010,9366,125,3038,80,3.881507,0.0,0.0,0.0,0.068607,0.562344,0.1,0.1
6,2011,9293,139,2918,75,3.961459,0.0,0.0,0.0,0.048876,0.604887,0.09,0.106667
8,2012,9043,135,2841,86,4.053508,0.0,0.0,0.0,0.057735,0.623543,0.08,0.093023


In [62]:
df_res.mean(axis=0)

split            2008.500000
num_train        9142.583333
num_pos_train     166.750000
num_test         3034.666667
num_pos_test       78.833333
time                4.083638
prec_pos            0.083333
rec_pos             0.001016
f1_pos              0.002008
ap                  0.074134
auroc               0.613035
rec@100             0.097500
rec@precision       0.109147
dtype: float64