## Import libraries and data

In [18]:
import pickle
from pathlib import Path

import click
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, recall_score

In [None]:
# Read the processed training split and preview its first five rows.
loan_data = '../data/processed/train_loan_data.csv'
loan = pd.read_csv(filepath_or_buffer=loan_data)
loan.head().style

In [None]:
# Read the processed test split and preview its first five rows.
test_loan_data = '../data/processed/test_loan_data.csv'
test = pd.read_csv(filepath_or_buffer=test_loan_data)
test.head().style

In [None]:
# Sanity check: the train and test frames should expose the same columns,
# in the same order, before a model fitted on one is applied to the other.
columns_df1 = loan.columns
columns_df2 = test.columns

if list(columns_df1) == list(columns_df2):
    print("The datasets have the same columns.")
elif set(columns_df1) == set(columns_df2):
    # Same labels, different positions. Reporting this case separately avoids
    # printing "different columns" followed by two empty set differences.
    print("The datasets have the same columns, but in a different order.")
else:
    print("The datasets have different columns.")
    # Show which labels are missing on each side.
    print("Columns in dataset1 but not in dataset2:", set(columns_df1) - set(columns_df2))
    print("Columns in dataset2 but not in dataset1:", set(columns_df2) - set(columns_df1))


## Train data

In [7]:
# Separate the label column from the remaining predictor columns (training split).
target = loan['approval_status']
features = loan.drop(columns='approval_status')

In [9]:
# Separate the label column from the remaining predictor columns (test split).
test_target = test['approval_status']
test_features = test.drop(columns='approval_status')

In [None]:
# Fit a random forest on the training split and predict the test split.
rf_model = RandomForestClassifier()
rf_model.fit(features, target)
prediction = rf_model.predict(test_features)

# Measure metrics.
# scikit-learn metric functions take (y_true, y_pred). Balanced accuracy is
# the mean of per-class recall over the TRUE labels, so passing the
# predictions first would silently compute a different quantity.
print(balanced_accuracy_score(test_target, prediction))


In [None]:
# Ground truth goes first, per scikit-learn's (y_true, y_pred) convention.
# With average="micro" the value is the same either way, but keeping the
# order right means changing `average` later still yields a recall.
print(recall_score(test_target, prediction, average="micro"))

model_path = '../models/rf_model.pkl'
# Output the fitted model. The context manager closes the file handle even
# if pickling raises, instead of leaving it to the garbage collector.
with open(model_path, "wb") as model_file:
    pickle.dump(rf_model, model_file)


## Complete function

In [None]:
from src.utils import get_logger
from src.config import MODELS_DIR


# Module-level logger; main() uses it to report progress and evaluation metrics.
logger = get_logger("Train Machine Learning Model")


@click.command()
@click.argument("prepocessed_path", type=click.Path(exists=True))
@click.argument("test_preprocessed_path", type=click.Path(exists=True))
def main(
    prepocessed_path,
    test_preprocessed_path,
    model_path: Path = MODELS_DIR / "rf_model.pkl",
):
    """Train a random forest on loan data, log test metrics and save the model.

    PREPOCESSED_PATH is the training CSV and TEST_PREPROCESSED_PATH is the
    evaluation CSV. Both must contain an ``approval_status`` column, which is
    used as the target; every other column is used as a feature.

    The balanced accuracy and micro-averaged recall on the evaluation data
    are logged, and the fitted model is pickled to disk.
    """
    # NOTE: no click argument/option is declared for `model_path`, so a
    # command-line run always writes to the default location.
    logger.info("Training some model...")
    loan = pd.read_csv(prepocessed_path)

    test = pd.read_csv(test_preprocessed_path)

    # Training features / target.
    features = loan.drop("approval_status", axis=1)
    target = loan["approval_status"]

    # Test features / target.
    test_features = test.drop("approval_status", axis=1)
    test_target = test["approval_status"]

    # Fit the random forest classifier and predict the evaluation split.
    rf_model = RandomForestClassifier()
    rf_model.fit(features, target)
    prediction = rf_model.predict(test_features)

    # Measure metrics. scikit-learn expects (y_true, y_pred); balanced
    # accuracy is per-class recall over the TRUE labels, so the ground truth
    # must be the first argument.
    logger.info(balanced_accuracy_score(test_target, prediction))
    logger.info(recall_score(test_target, prediction, average="micro"))

    # Output the model. The context manager closes the file handle even if
    # pickling raises.
    with open(model_path, "wb") as model_file:
        pickle.dump(rf_model, model_file)

    logger.info("Modeling training complete.")


# Invoke the click command only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
