In [20]:
## 18-jun-2023
## Author: Iván Andrés Trujillo Abella

In [21]:
import pandas as pd

# For processing data
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from scipy import stats


## To modeling
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Assesment
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, confusion_matrix, precision_score, recall_score, roc_auc_score


def Xy(df,target):
    """
    Split the data in X,y to ML implementations
    """
    X = df.loc[ : , df.columns != target]
    y = df[target].astype('int')
    return X,y


## Models
# Grid search hyperparameters for a logistic regression model
def grid_lr(X_train, y_train):
    model = LogisticRegression(random_state=666, max_iter=1000)
    class_weight =  [{0:0.05, 1:0.95}, {0:0.1, 1:0.9}, {0:0.2, 1:0.8}]
    solvers = ['liblinear']
    penalty = ['l2','l1']
    c_values = [ 10, 1.0, 0.1, 0.01, 0.001, ]
    grid = dict(solver=solvers,penalty=penalty,C=c_values, class_weight= class_weight)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='f1',error_score='raise')
    grid_result = grid_search.fit(X_train, y_train)
    return  grid_result.best_estimator_

In [22]:
url = "https://raw.githubusercontent.com/it-ces/Datasets/main/basesuper.csv"

In [23]:
df  = pd.read_csv(url)

In [24]:
# Build financial ratios...
features = ['Activos corrientes totales', 'Pasivos corrientes totales' , 'Ganancia bruta', 'Patrimonio total', 'Total pasivos', 'Total de patrimonio y pasivos', 'y']
df = df[features]
df.dropna(inplace=True)
df['razon_corriente']=  df['Activos corrientes totales'] / df['Pasivos corrientes totales']
df['rentabilidad_patrimonio'] = df['Ganancia bruta'] / df['Patrimonio total']
df['endeudamiento'] = df['Total pasivos'] / df[ 'Total de patrimonio y pasivos']
df['short_run_apalancamiento'] = df['Pasivos corrientes totales'] / df['Patrimonio total']

featuresy = ['razon_corriente', 'rentabilidad_patrimonio', 'endeudamiento' ,'short_run_apalancamiento' ,'y']
df = df[featuresy]
df.dropna(inplace=True)


import pandas as pd
import numpy as np

def clean(df):
    index = ~df.isin([ np.inf, -np.inf]).any(axis=1)
    return df[index].astype(np.float64)

df = clean(df)

In [25]:
X,y = Xy(df, 'y')

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True, random_state = 666, stratify=y)

In [27]:
## Logistic regression implementation
best_model = grid_lr(X_train, y_train) #entreno el modelo con X_train y y_train
preds = best_model.predict(X_test)
print(classification_report(preds, y_test))  # recall igual a sensibilidad

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2518
           1       0.00      0.00      0.00        24

    accuracy                           0.98      2542
   macro avg       0.50      0.49      0.49      2542
weighted avg       0.98      0.98      0.98      2542



In [28]:
! pip install imbalanced-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [29]:
# Undersampling
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=1234)
X_res, y_res = rus.fit_resample(X, y)

In [30]:
y_res.value_counts()

0    174
1    174
Name: y, dtype: int64

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, shuffle = True, random_state = 666, stratify=y_res)
## Logistic regression implementation
best_model = grid_lr(X_train, y_train) #entreno el modelo con X_train y y_train
preds = best_model.predict(X_test)
print(classification_report(preds, y_test))  # recall igual a sensibilidad

              precision    recall  f1-score   support

           0       0.23      0.80      0.36        10
           1       0.94      0.55      0.69        60

    accuracy                           0.59        70
   macro avg       0.59      0.68      0.53        70
weighted avg       0.84      0.59      0.65        70

