In [1]:
import credit_pipeline.data_exploration as dex
import credit_pipeline.training as tr
import credit_pipeline.reject_inference as ri

In [2]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

In [3]:
#@title Set seed
import secrets


def draw_seed():
    """Return a random three-digit integer (100..999 inclusive).

    Draws directly from the target range instead of sampling 0..999 and
    retrying until the value is at least 100; the resulting distribution is
    the same uniform one, without the open-ended retry loop.
    """
    return 100 + secrets.randbelow(900)


new_seed = False #@param {type:"boolean"}

# The seed is also used to name the results files. When `new_seed` is off the
# notebook falls back to the fixed seed 0 so reruns are repeatable.
seed_number = draw_seed() if new_seed else 0

print(seed_number)

0


In [4]:
# Bind the `params_dict` object exposed by the reject_inference module.
# NOTE(review): this is a name binding, not a copy — item assignments made
# through `params_dict` are also visible via `ri.params_dict`.
params_dict = ri.params_dict

In [5]:
# Store a fixed LightGBM configuration under the 'LightGBM_2' key.
# `random_state` follows the notebook-level `seed_number`.
# NOTE(review): `subsample_freq` is 0; per the LightGBM parameter docs bagging
# is only active when the frequency is positive, so `subsample` probably has
# no effect here — confirm whether that is intended.
params_dict['LightGBM_2'] = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=0.22534977954592625,
    importance_type='split',
    learning_rate=0.052227873762946964,
    max_depth=5,
    min_child_samples=26,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=159,
    n_jobs=-1,
    num_leaves=12,
    objective=None,
    random_state=seed_number,
    reg_alpha=0.7438345471808012,
    reg_lambda=0.46164693905368515,
    verbose=-1,
    subsample=0.8896599304061413,
    subsample_for_bin=200000,
    subsample_freq=0,
    is_unbalance=True,
)

In [6]:
import numpy as np
import pandas as pd

# Fix NumPy's global RNG so the synthetic dataset is the same on every run.
np.random.seed(4)

num_samples = 1000

# The draws below share one global RNG stream, so their order determines the
# generated values — keep the sequence as is.
feature_1 = np.random.normal(50, 10, num_samples)      # Gaussian, mean 50, sd 10
feature_2 = np.random.uniform(0, 1, num_samples)       # uniform on [0, 1)
feature_3 = np.random.randint(20, 60, num_samples)     # integers 20..59
feature_4 = np.random.choice([1, 0], num_samples, p=[0.95, 0.05])  # 95% ones

# Binary label with roughly 5% positives, drawn independently of the features.
binary_target = np.random.choice([0, 1], num_samples, p=[0.95, 0.05])

# Assemble everything into a single frame; column order matches the draws.
data = pd.DataFrame(dict(
    Feature_1=feature_1,
    Feature_2=feature_2,
    Feature_3=feature_3,
    Sad=feature_4,
    Target=binary_target,
))

# Preview the first rows.
data.head()


Unnamed: 0,Feature_1,Feature_2,Feature_3,Sad,Target
0,50.505617,0.296426,34,1,0
1,54.999513,0.485956,46,1,0
2,40.040911,0.434089,27,1,0
3,56.935985,0.336606,22,1,0
4,45.816985,0.752061,39,1,0


In [7]:
# Share of positive labels in the synthetic target.
data['Target'].mean()

0.054

In [10]:
# Separate the label column from the predictors.
y = data['Target']
X = data.drop(columns='Target')

# Hold out 20% of the rows for evaluation with a fixed split seed.
# NOTE(review): the split is not stratified; with roughly 5% positives the
# class balance of the test fold can drift — consider `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Build the LightGBM estimator from the 'LightGBM_2' parameter set, wrap it
# with the project's pipeline factory, and fit on the training fold.
lgbm_model = LGBMClassifier(**params_dict['LightGBM_2'])
clf = tr.create_pipeline(X_train, y_train, lgbm_model)
clf.fit(X_train, y_train)

In [12]:
# Score the fitted pipeline on the held-out fold.
# NOTE(review): the synthetic Target is drawn independently of the features,
# so near-chance ranking metrics are to be expected here.
ri.get_metrics_RI(dict(clf=clf), X_test, y_test)

Unnamed: 0,clf
Overall AUC,0.560366
KS,0.194805
Balanced Accuracy,0.597403
Accuracy,0.32
Precision,0.068966
Recall,0.909091
F1,0.128205


In [20]:
from sklearn.datasets import make_classification

# Generate a synthetic 4-feature classification problem in which every feature
# is informative. `random_state` is tied to the notebook seed so reruns
# reproduce the same sample; without it each execution draws different data.
# NOTE(review): this rebinds `X` and `y`, which earlier held the frame and
# label used for the train/test split.
X, y = make_classification(
    n_samples=1000,
    n_features=4,
    n_informative=4,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=seed_number,
)

In [22]:
# View the generated feature matrix as a labelled table.
pd.DataFrame(data=X, columns=['A', 'B', 'C', 'D'])

Unnamed: 0,A,B,C,D
0,-0.145719,1.309900,2.909096,-0.226288
1,1.773050,-0.347861,0.607769,1.121623
2,-0.305632,-2.138959,1.060536,0.229240
3,0.942067,-1.526582,-0.814822,-0.114385
4,1.537237,0.019745,1.587393,0.495755
...,...,...,...,...
995,-1.462337,-1.966406,0.706333,-0.510807
996,-1.246051,0.475550,0.727790,-2.378382
997,-2.248591,-1.801279,0.067838,-2.350830
998,0.893325,-1.606336,-1.293346,-0.119724


In [14]:
# Display the current contents of `X`.
# NOTE(review): the output recorded under this cell is a 10x2 array and its
# execution count (14) is lower than that of the make_classification cell
# (20), so it looks stale relative to the code above — re-run to refresh.
X

array([[ 1.12031365,  5.75806083],
       [ 2.81630525,  1.01933868],
       [-1.21383631,  2.06378652],
       [ 0.87305123,  4.71438583],
       [-0.66246781,  2.17571724],
       [ 0.74285061,  1.46351659],
       [-4.07989383,  3.57150086],
       [ 3.54934659,  0.6925054 ],
       [ 2.49913075,  1.23133799],
       [ 1.9263585 ,  4.15243012]])