In [2]:
#
# source = inspect.getsource(train)
# parsed = ast.parse(source)

# for node in ast.walk(parsed):
#     if isinstance(node,ast.Call):
#         if isinstance(node.func, ast.Attribute):
#             if (node.func.value.id == 'layer'):
#                     if(node.func.attr == 'get_dataset'):
#                         print(ast.dump(node))
#                         print(node.args[0].value)


# ast.dump(parsed)

import pandas as pd
import inspect
import os
import sys
import inspect
import ast


class Layer:
    """Minimal local stand-in for a Layer project runner.

    It wraps decorated functions into ``Dataset``/``Model`` entities, runs
    them in the order given, and prints log lines prefixed with the name of
    the entity that is currently executing.
    """

    # Class-level defaults. ``entity_context`` is read through the class by the
    # log methods below, so it is shared by every ``Layer`` instance.
    entities = []
    entity_context = None

    def __init__(self, project_name, environment):
        """Store the project name and the path of the environment file.

        Args:
            project_name: Name printed in the run banner.
            environment: Path to a file listing one library per line.
        """
        self.project_name = project_name
        self.environment = environment
        # Per-instance list so instances never share the class-level default.
        self.entities = []

    def setup(self):
        """Print one "Installing" line per entry of the environment file.

        Nothing is installed; the entries are only echoed. Blank lines are
        skipped. If the file does not exist a notice is printed instead.
        """
        if not os.path.exists(self.environment):
            print(f"Environment file not found: {self.environment}")
            return

        # ``with`` guarantees the handle is closed even if printing fails.
        with open(self.environment, 'r') as requirements:
            for line in requirements:
                lib = line.strip()
                if not lib:
                    continue
                print(f"Layer Infra: Installing {lib}...")

    def log_parameter(self, metric, value):
        """Print a parameter name/value pair under the current entity context."""
        print(f"\t{Layer.entity_context} > Parameter > {metric}:{value}")

    def log_metric(self, metric, value):
        """Print a metric name/value pair under the current entity context."""
        print(f"\t{Layer.entity_context} > Metric >{metric}:{value}")

    def log(self, message):
        """Print a free-form message under the current entity context."""
        print(f"\t{Layer.entity_context} > {message}")

    def run(self, entities):
        """Wrap, set up and execute the given decorated functions in order.

        Args:
            entities: Iterable of callables carrying a ``_type`` attribute of
                ``"dataset"`` or ``"model"``. Callables with any other
                ``_type`` value are skipped.
        """
        # Rebuild the registry from scratch on every run.
        self.entities = []
        for entity in entities:
            if entity._type == "dataset":
                self.entities.append(Dataset(entity))
            elif entity._type == "model":
                self.entities.append(Model(entity))

        print(f"--- Layer Infra: Running Project: {self.project_name} ---")

        self.setup()

        for entity in self.entities:
            entity.run()
        print("\n--- Layer Infra: Run Complete! ---")

    def get_dataset(self, name):
        """Return the first registered entity whose ``name`` equals ``name``.

        Raises:
            Exception: If no registered entity has that name.
        """
        for entity in self.entities:
            if entity.name == name:
                return entity
        raise Exception(f"Entity '{name}' not found!")


class Model:
    """Runnable entity built around a callable tagged with ``_name``."""

    # Default seen until ``run`` stores the callable's return value.
    result = None

    def __init__(self, func):
        """Remember ``func`` and its ``_name``; a falsy ``func`` sets nothing."""
        if not func:
            return
        self.name, self.func = func._name, func

    def run(self):
        """Invoke the wrapped callable with no arguments and keep its result."""
        outcome = self.func()
        self.result = outcome


class Dataset:
    """Runnable entity built around a callable tagged with ``_name``."""

    # Default seen until ``run`` stores the callable's return value.
    result = None

    def __init__(self, func):
        """Remember ``func`` and its ``_name``; a falsy ``func`` sets nothing."""
        if not func:
            return
        self.name, self.func = func._name, func

    def run(self):
        """Invoke the wrapped callable with no arguments and keep its result."""
        produced = self.func()
        self.result = produced

    def to_pandas(self):
        """Return whatever the last ``run`` stored (``None`` before any run)."""
        stored = self.result
        return stored


def dataset(name):
    """Decorator factory that tags a function as a Layer dataset builder.

    The returned wrapper sets ``Layer.entity_context`` to the dataset name,
    prints a "Building ..." banner, calls the decorated function and returns
    its result. Both the original function and the wrapper receive ``_type``
    (``"dataset"``) and ``_name`` attributes.

    Args:
        name: Name under which the dataset is registered and logged.

    Returns:
        A decorator producing the wrapper described above.
    """
    # Local import keeps this block usable without touching the import cell.
    import functools

    def inner(func):
        # The undecorated function is tagged as well, as in the original code.
        func._type = "dataset"
        func._name = name

        # ``wraps`` keeps ``__name__``/``__doc__`` of the decorated function.
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            Layer.entity_context = func._name
            print(f'\nBuilding {Layer.entity_context}...')
            # Forward the caller's arguments instead of silently dropping them.
            res = func(*args, **kwargs)
            # TODO save returning entity to catalog
            return res
        wrapped._type = "dataset"
        wrapped._name = name

        return wrapped

    return inner


def model(name):
    """Decorator factory that tags a function as a Layer model trainer.

    The returned wrapper sets ``Layer.entity_context`` to ``name``, prints a
    "Training ..." banner, calls the decorated function and returns its
    result. The wrapper carries ``_type`` (``"model"``) and ``_name``.

    Args:
        name: Name under which the model is registered and logged.

    Returns:
        A decorator producing the wrapper described above.
    """
    # Local import keeps this block usable without touching the import cell.
    import functools

    def inner(func):
        # ``wraps`` keeps ``__name__``/``__doc__`` of the decorated function.
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            Layer.entity_context = name
            print(f'\nTraining {Layer.entity_context}...')
            # Forward the caller's arguments instead of silently dropping them.
            res = func(*args, **kwargs)
            # TODO save returning entity to catalog
            return res
        wrapped._type = "model"
        wrapped._name = name

        return wrapped

    return inner

In [33]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.ensemble import HistGradientBoostingClassifier
import pandas as pd

@dataset("application_dataset")
def read_application_data():
    """Load ``application_train.csv`` and log how many rows were read."""
    applications = pd.read_csv("application_train.csv")
    row_count = len(applications)
    layer.log(f"Total applications: {row_count}")
    return applications

@dataset("installments_payments")
def read_installments_data():
    """Load ``installments_payments.csv`` and log how many rows were read."""
    installments = pd.read_csv("installments_payments.csv")
    row_count = len(installments)
    layer.log(f"Total installments_payments: {row_count}")
    return installments

@dataset("previous_application")
def read_previous_application_data():
    """Load ``previous_application.csv`` and log how many rows were read."""
    previous_applications = pd.read_csv("previous_application.csv")
    row_count = len(previous_applications)
    layer.log(f"Total previous_application: {row_count}")
    return previous_applications

@dataset("application_features")
def extract_application_features():
    """Derive ratio features from the ``application_dataset`` frame.

    Returns a new DataFrame holding every original column plus four ratio
    columns. The frame stored by ``application_dataset`` is left untouched,
    so re-running this step or reading the upstream dataset afterwards sees
    the data as it was loaded.
    """
    applications = layer.get_dataset("application_dataset").to_pandas()

    income = applications['AMT_INCOME_TOTAL']
    credit = applications['AMT_CREDIT']
    annuity = applications['AMT_ANNUITY']

    # ``assign`` returns a copy, so the upstream dataset is not mutated.
    features = applications.assign(
        # credit amount relative to the income of a client
        CREDIT_INCOME_RATIO=credit / income,
        # loan annuity relative to the income of a client
        ANNUITY_INCOME_RATIO=annuity / income,
        # annuity relative to the credit amount. NOTE(review): the previous
        # comment called this "the length of the payment in months", but the
        # formula is annuity / credit, not credit / annuity - confirm intent.
        CREDIT_TERM=annuity / credit,
        # days employed relative to the age of the client
        DAYS_EMPLOYED_RATIO=applications['DAYS_EMPLOYED'] / applications['DAYS_BIRTH'],
    )

    layer.log(f'Features: {list(features.columns)}')
    layer.log(f'Total Count: {len(features)}')
    return features


@model(name="credit-score-model")
def train():
    """Train and evaluate the credit-score classification pipeline.

    Joins installments, previous applications and application features,
    splits the joined rows into train/test sets, fits a one-hot encoder plus
    ``HistGradientBoostingClassifier`` pipeline, and logs ROC AUC and average
    precision computed on the test split.

    Returns:
        The fitted ``Pipeline``.
    """
    application_features = layer.get_dataset("application_features").to_pandas()
    previous_application_features = layer.get_dataset("previous_application").to_pandas()
    installments_payments = layer.get_dataset("installments_payments").to_pandas()
    dff = (
        installments_payments
        .merge(previous_application_features, on=['SK_ID_PREV', 'SK_ID_CURR'])
        .merge(application_features, on=['SK_ID_CURR'])
    )

    layer.log(f"Training data count: {len(dff)}")

    # NOTE(review): SK_ID_PREV stays in X as a feature - confirm this is intended.
    X = dff.drop(["TARGET", "SK_ID_CURR", 'index'], axis=1)
    y = dff["TARGET"]

    # Split parameters. The split seed has its own name so it is no longer
    # overwritten by the model seed further down.
    # NOTE(review): rows are split individually; if one SK_ID_CURR spans
    # several joined rows, the same applicant can appear in both train and
    # test - confirm this is acceptable.
    split_random_state = 13
    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=split_random_state)

    # Detect categorical columns on the feature frame actually being fitted,
    # so a dropped column can never end up in the encoder's column list.
    categories = X.select_dtypes(include=['object']).columns.tolist()

    # sparse_threshold=0 forces a dense result; the classifier below does not
    # accept sparse input.
    transformer = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(handle_unknown='ignore', drop="first"), categories)],
        remainder='passthrough',
        sparse_threshold=0,
    )

    # Model parameters
    learning_rate = 0.01
    max_depth = 6
    min_samples_leaf = 10
    model_random_state = 42
    early_stopping = True

    # Model: a HistGradientBoosting classifier. The local is named
    # ``classifier`` so it does not shadow the ``model`` decorator.
    classifier = HistGradientBoostingClassifier(
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        early_stopping=early_stopping,
        random_state=model_random_state,
    )

    # Pipeline fit
    pipeline = Pipeline(steps=[('transformer', transformer), ('model', classifier)])
    pipeline.fit(X_train, y_train)

    # Predict probabilities of the positive class
    probs = pipeline.predict_proba(X_test)[:, 1]

    # Average precision and area under the receiver operating characteristic
    # curve (ROC AUC) on the test split
    avg_precision = average_precision_score(y_test, probs, pos_label=1)
    auc = roc_auc_score(y_test, probs)
    layer.log_metric("AUC", f'{auc:.4f}')
    layer.log_metric("avg_precision", f'{avg_precision:.4f}')

    return pipeline


# ++ Create the Layer runner for this project
layer = Layer(project_name="credit-score", environment='requirements.txt')

# ++ Run the whole project on Layer Infra.
#    Datasets are listed before the entities that read them.
layer.run([
    read_application_data,
    read_installments_data,
    read_previous_application_data,
    extract_application_features,
    train,
])

# ++ To train only the model on Layer Infra.
#    Note: run() rebuilds layer.entities from its argument, so the datasets
#    train() looks up via get_dataset() would be missing here.
# layer.run([train])

# ++ To debug the code locally, call a decorated function directly
#    (get_dataset() still needs the entities from a previous layer.run()):
# extract_application_features()
# train()

--- Layer Infra: Running Project: credit-score ---
Layer Infra: Installing scikit-learn==1.0.1...

Building application_dataset...
	application_dataset > Total applications: 10000

Building installments_payments...
	installments_payments > Total installments_payments: 370905

Building previous_application...
	previous_application > Total previous_application: 45187

Building application_features...
	application_features > Features: ['index', 'SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'REGION_RATING_C