In [1]:
import pandas as pd
import model_engine
import boto3
import  numpy as np
from functions_for_onboarding import *

# LOAD FILES

In [3]:
# S3 bucket that the file listing below is read from.
bucket_name = 'power-client-data-staging'

In [6]:
# List the objects stored under the Equifax cms_6 "trade" table prefix for this pull.
# NOTE(review): list_s3_files is not defined in this notebook — presumably it comes
# from the `functions_for_onboarding` star import; confirm what it returns.
files_trade_data = list_s3_files(
    bucket_name=bucket_name,
    prefix='CLIENT/PARSED/DATA/BUREAU=equifax/FORMAT=cms_6/TABLE=trade/PULL_NAME=20250201_oefcu_orangecounty_orlando_trustone_vantagewest/',
)

In [7]:
# NOTE(review): despite the "inquiry" in its name, this frame is loaded from
# `files_trade_data` (the trade-table listing).
# `number=1` presumably limits how many of the listed files are read — confirm
# against load_df_from_list.
test_inquiry_df = load_df_from_list(
    list=files_trade_data,
    number=1,
)

# Base Features

In [15]:
from __future__ import annotations

from typing import Any, Dict, List, Union
import pandas as pd

from feature_engine_parts.fe_parts_V2.mappers import converters


# --- 1) registry: converter "type" string -> converter class ---
# Every key is the exact attribute name of the class on the `converters` module,
# so the mapping is built by looking each name up there.
CONVERTER_REGISTRY = {
    class_name: getattr(converters, class_name)
    for class_name in (
        "MappingBase",
        "DateConverterV2",
        "NumericConverterV2",
        "StringConverterV2",
    )
}


def _build_converter(spec: Dict[str, Any]):
    """
    Instantiate the converter class named by a spec dict.

    Expected shape of ``spec``::

        {
            "type": "StringConverterV2",
            "params": {"raw_feature": "ZEST_KEY"}
        }

    ``"type"`` must be a key of ``CONVERTER_REGISTRY``. ``"params"`` is
    optional: a missing or falsy value means "no keyword arguments".

    Returns:
        A ``(type_name, converter_instance)`` tuple.

    Raises:
        TypeError: ``spec`` is not a dict, or ``params`` is a truthy non-dict.
        ValueError: ``spec["type"]`` is not a registered converter name.
    """
    if not isinstance(spec, dict):
        raise TypeError(f"spec must be a dict, got {type(spec)}")

    type_name = spec.get("type")
    if type_name not in CONVERTER_REGISTRY:
        known = ", ".join(sorted(CONVERTER_REGISTRY))
        raise ValueError(f"Unknown converter type '{type_name}'. Known types: {known}")

    # The type name is validated before params so an unknown type is always
    # reported first, even when params is also malformed.
    params = spec.get("params", {})
    if not params:
        params = {}
    elif not isinstance(params, dict):
        raise TypeError(f"spec['params'] must be a dict, got {type(params)}")

    converter_cls = CONVERTER_REGISTRY[type_name]
    return type_name, converter_cls(**params)


def apply_converter_spec(df: pd.DataFrame, spec: Dict[str, Any], *, verbose: bool = True) -> pd.DataFrame:
    """
    Build the converter described by ``spec`` and run it on ``df``.

    Args:
        df: frame handed to the converter's ``transform`` method.
        spec: a single converter spec (see ``_build_converter``).
        verbose: when true, print the converter type together with its
            ``raw_feature`` / ``new_feature`` attributes before transforming.

    Returns:
        Whatever ``converter.transform(df)`` returns.
    """
    type_name, converter = _build_converter(spec)

    if not verbose:
        return converter.transform(df)

    # Not every converter is guaranteed to expose these attributes, so fall
    # back to None instead of raising AttributeError.
    source_col, target_col = (
        getattr(converter, attr, None) for attr in ("raw_feature", "new_feature")
    )
    print(
        f"[apply_converter_spec] Using {type_name} | "
        f"raw_feature={source_col!r} -> new_feature={target_col!r}"
    )
    return converter.transform(df)


def apply_converter_specs(
    df: pd.DataFrame,
    specs: Union[Dict[str, Any], List[Dict[str, Any]]],
    *,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Run one converter spec, or several in sequence, starting from a copy of ``df``.

    Args:
        df: input frame; it is copied up front and the copy is what gets
            passed to the first converter.
        specs: a single spec dict, or an iterable of spec dicts applied in order.
        verbose: when true, print a ``Step i/N`` line before each spec and
            forward the flag to ``apply_converter_spec``.

    Returns:
        The result of the last converter (or the untouched copy when ``specs``
        is empty).
    """
    current = df.copy()

    # A bare dict means "one spec"; anything else is materialised as a list so
    # its length is known for the progress message.
    if isinstance(specs, dict):
        ordered_specs = [specs]
    else:
        ordered_specs = list(specs)
    total_steps = len(ordered_specs)

    for step_no, one_spec in enumerate(ordered_specs, start=1):
        if verbose:
            print(f"\n[apply_converter_specs] Step {step_no}/{total_steps}")
        current = apply_converter_spec(current, one_spec, verbose=verbose)

    return current


# --- Optional: if you have a JSON blob with a top-level list field ---
def apply_from_json_obj(df: pd.DataFrame, json_obj: Dict[str, Any], *, key: str = "features", verbose: bool = True):
    """
    Pull a list of converter specs out of ``json_obj[key]`` and apply them to ``df``.

    Example JSON shape:
    {"features": [ {spec1}, {spec2}, ... ]}

    Raises:
        KeyError: ``key`` is absent from ``json_obj`` (or maps to ``None``).
        TypeError: the value under ``key`` is not a list.
    """
    feature_specs = json_obj.get(key)

    if isinstance(feature_specs, list):
        return apply_converter_specs(df, feature_specs, verbose=verbose)

    # Anything below is an error path: distinguish "nothing there" from
    # "wrong container type" so the message points at the real problem.
    if feature_specs is None:
        raise KeyError(f"json_obj missing key '{key}'")
    raise TypeError(f"json_obj['{key}'] must be a list, got {type(feature_specs)}")


## Mapper Functions

## high_credit_amt: The highest reported balance (outstanding debt) you have had so far for that tradeline.
* Installment accounts: If you haven't missed any payments, this should simply be the balance you started with. If you miss payments, the missed amounts can accrue onto the high credit.
* Revolving accounts: This is the highest balance you have had up to that point. Even if you pay it off immediately, it would be the most expensive purchase.
* This [link](https://zestfinance.atlassian.net/wiki/spaces/DS/pages/1710784539/App+Review+Guide) **seems to imply that for installment accounts it is the original balance**. IS THAT TRUE? WHAT ABOUT LATE FEES, ETC.?
* We apply a numeric converter from CREDIT_LIMIT
## Balance: The amount of debt you have right now.
* We apply a numeric converter to it from BALANCE. We fill NA with 0.
## credit_limit: The maximum amount you can borrow.
* We first apply a numeric converter to this to create credit_limit and then we make it NA for non revolving, open, or charge card accounts (see later)
* Only makes sense in the context of revolving accounts
## pastDueAmt: The dollar amount on the tradeline that is past due at the report date (may include the fees and interest that come with it)
* The amount of money you should have paid by now but have not yet
* Revolving: This can be the minimum payments you have missed so far
* Installment: Generally equals the sum of the missed payments
* We create pastDueAmt from PAST_DUE_AMOUNT and we fill NA with 0
## scheduled_payment_amount: Contractual amount due for next payment
* Installment: That fixed payment
* Revolving: The minimum payment amount for next one?
* **EQUIFAX** says this is the monthly amount regardless of the actual payment frequency (Page 271) **Confirm**
**QUESTION: WE DON'T REALLY USE THE ACTUAL PAYMENT AMOUNT MUCH AT ALL? IS IT A SIMILAR SIGNAL, OR NOT RELIABLE? WHY?**
* We create this with numeric converter from SCHEDULED_PAYMENT_AMOUNT
## termDur: How long the payments last
* We create termDur from the TERMS_DURATION. We do not fill na with 0 here.
* The amount of time to repay the loan. (page 271)
## termFreqStr: String version of how often you have to pay
## termFreqMult: How often you have to pay
* we create termFreqMult from TERMS_FREQUENCY and we use a NumericConverter that first has a mapping.
* We create a new variable here, which assumes that a missing term frequency is 1, and then we use the mapping to convert the frequency code into a number **of payments per month** (e.g. weekly "W" is about 4.33, yearly "Y" is 1/12)
```json
{
"M": 1,
"B": 2.1666666666666665,
"W": 4.333333333333333,
"E": 2,
"L": 0.5,
"Q": 0.3333333333333333,
"S": 0.16666666666666666,
"T": 0.25,
"Y": 0.08333333333333333,
"D": 1,
"P": 1,
"0": 1,
"<": 1
}
```
* This can also be seen on page 155 in the Equifax document
## Date_Of_Request: The application date. We use it as the reference date for these features, but we don't have it exactly. For each person, we can assign a random application date between 0 and 3 months after the latest reported date for this tradeline.
## openDate: Date that tradeline was opened
* We create this with DateConverter from Date_Opened
## closedDate: Date that the tradeline was closed
Equifax: "contains the date the account was closed. It will not be populated when Date Major Delinquency 1st Reported is present."
**So this means closed date basically means it ended without a major delinquency??**
* We create this from CLOSED_DATE using a date converter
## majordqDate: Date of First Major Delinquency
* Data Dictionary: IF current rate/status is 6,7,8,9, M, Z or if trade contains narrative code 081 (foreclosure) this data will reflect the first time narrative code was reported. (See Narrative Code Section)
* 6 (collection account), 7 (included in Chapter 13 bankruptcy), 8 (repossession has occurred), 9 (charged off), M (included in Chapter 13), Z (included in bankruptcy); narrative code 081 is foreclosure
* We create this from DMD_REPORTED
## rptDate: Date of application (is this right?) **ASK QUESTION**
* We create this using a date converter from DATE_REPORTED
## lstPmtDate: Date the user made the last payment
* We create this with dateconverter from LAST_PAYMENT_DATE
## accountType: type of loan
* We create accountType from ACCOUNT_TYPE using StringConverter
* We can see all of these on page 150 of the document. For example, 18 is credit card
* "Contains a code that describes the kind of loan (auto, home improvement, credit card, etc.)"
## portfolioType: The type of loan (more general)
* We create it using a string converter from PORTFOLIO_TYPE and we map O to R. This maps open to revolving. Open is typically a **charge card: where you have full payment every cycle**
## PORTFOLIO_TYPE: The type of loan (With open)
* Without the original mapping 

In [26]:
# Frequency of each ACCOUNT_TYPE code in the loaded frame.
test_inquiry_df["ACCOUNT_TYPE"].value_counts()

ACCOUNT_TYPE
18    117583
07    113746
00     76021
12     65638
26     24276
       ...  
9B         5
37         3
72         2
7A         1
67         1
Name: count, Length: 62, dtype: int64

In [25]:
# Frequency of each raw PORTFOLIO_TYPE code (before any O -> R remapping).
test_inquiry_df["PORTFOLIO_TYPE"].value_counts()

PORTFOLIO_TYPE
R    249030
I    186092
M     36231
C     11911
O      8543
*      1599
Name: count, dtype: int64

## ecoa: Relationship of the person to the tradeline
* We create it from ECOA_DESIGNATOR using string converter
* Account Designator Codes

### Account Designator Codes (ECOA)

| CODE | DESCRIPTION |
| :---: | :--- |
| A | Authorized User – This is an authorized user of this account; another individual has contractual responsibility. |
| B | On behalf of another person – The subject has financial responsibility for an account, which is used exclusively by another person. |
| C | Co-maker – The subject has co-signed for a loan, and will be responsible for payment if the borrower should default. |
| I | Individual Account – The subject of the report has contractual responsibility for this account and is primarily responsible for its payment. |
| J | Joint Account – The subject and another person (or persons) are jointly responsible for payment on this account. |
| M | Maker – The subject is responsible for payment of a loan, but a co-maker will be responsible for payment if maker defaults. |
| S | Shared, but otherwise undesignated – This code is an indication that the credit grantor knows that the subject and at least one other person share the account, but not enough information is available to designate the account as “J” or “A”. |
| T | Terminated – The subject’s relationship to this account has ended, although other parties who once shared the account may continue to maintain the account. |
| U | Undesignated |
| X | Deceased (Not returned on Trade Lines) |

In [27]:
# Frequency of each ECOA_DESIGNATOR code in the loaded frame.
test_inquiry_df["ECOA_DESIGNATOR"].value_counts()

ECOA_DESIGNATOR
I    373369
J     96912
A     17416
M      4119
C      3784
T      1519
S      1043
U        12
X         9
B         3
Name: count, dtype: int64

## NARRATIVE_CODE_1
## NARRATIVE_CODE_2
* These are both created by just using string_converter
* According to experian on page 158, the narrative code indicate "certain comments considering the segment information in question"
* **WHAT IS THE DIFFERENCE BETWEEN NARRATIVE_CODE_1 and NARRATIVE_CODE_2?**


## dqDate
* "Contains the date that the highest rate/status occurred outside of the timeframe of the payment history that has been created"
* **Equifax seems to suggest that this is based on the previous high rate, which only tracks delinquency (page 153)**
* **Is this for when it occurred outside of the payment history?**
* Confused how this is dqDate if it is outside of history
* "The date that the highest rate/status occurred outside of the timeframe of the payment history that has been requested."
* We create this from PREVIOUS_HIGH_DATE_1. **So for the example below does that mean 1 is example of people who were DQ at some point but now have paid debts off**?
## Rate_Status_Code: The current rating on the account. 
* Rate codes are numeric and status codes are letters
<table border="0">
  <tr>
    <td valign="top" width="50%">
      <h3>Rate Codes</h3>
      <table>
        <thead>
          <tr>
            <th>CODE</th>
            <th>DESCRIPTION</th>
          </tr>
        </thead>
        <tbody>
          <tr>
            <td align="center">0</td>
            <td>Too new to rate; Approved but not used</td>
          </tr>
          <tr>
            <td align="center">1</td>
            <td>Pays account as agreed</td>
          </tr>
          <tr>
            <td align="center">2</td>
            <td>Not more than two payments past due</td>
          </tr>
          <tr>
            <td align="center">3</td>
            <td>Not more than three payments past due</td>
          </tr>
          <tr>
            <td align="center">4</td>
            <td>Not more than four payments past due</td>
          </tr>
          <tr>
            <td align="center">5</td>
            <td>At least 120 days or more than four payments past due</td>
          </tr>
          <tr>
            <td align="center">6</td>
            <td>Collection account (Enhanced Trade Only)</td>
          </tr>
          <tr>
            <td align="center">7</td>
            <td>Included in Chapter 13</td>
          </tr>
          <tr>
            <td align="center">8</td>
            <td>Repossession</td>
          </tr>
          <tr>
            <td align="center">9</td>
            <td>Charge-off</td>
          </tr>
          <tr>
            <td align="center">Blank</td>
            <td>No rate reported</td>
          </tr>
        </tbody>
      </table>
    </td>
    <td valign="top" width="50%">
      <h3>Status Codes</h3>
      <table>
        <thead>
          <tr>
            <th>CODE</th>
            <th>DESCRIPTION</th>
          </tr>
        </thead>
        <tbody>
          <tr>
            <td align="center">A</td>
            <td>Account is inactive</td>
          </tr>
          <tr>
            <td align="center">B</td>
            <td>Lost or stolen card</td>
          </tr>
          <tr>
            <td align="center">C</td>
            <td>Contact member for status</td>
          </tr>
          <tr>
            <td align="center">D</td>
            <td>Refinanced or renewed</td>
          </tr>
          <tr>
            <td align="center">E</td>
            <td>Consumer deceased</td>
          </tr>
          <tr>
            <td align="center">F</td>
            <td>In financial counseling</td>
          </tr>
          <tr>
            <td align="center">G</td>
            <td>Foreclosure process started</td>
          </tr>
          <tr>
            <td align="center" style="color:red">H</td>
            <td style="color:red">In WEP of other party <i>(retired 2-2-2009)</i></td>
          </tr>
          <tr>
            <td align="center">J</td>
            <td>Adjustment pending</td>
          </tr>
          <tr>
            <td align="center">M</td>
            <td>Included in Chapter 13</td>
          </tr>
          <tr>
            <td align="center">S</td>
            <td>Dispute - resolution pending</td>
          </tr>
          <tr>
            <td align="center">Z</td>
            <td>Included in Bankruptcy</td>
          </tr>
          <tr>
            <td align="center" style="color:red">#</td>
            <td style="color:red">In BK of Another Person <i>(retired 2-2-2009)</i></td>
          </tr>
          <tr>
            <td align="center">$</td>
            <td>Assigned to US Dept of ED</td>
          </tr>
        </tbody>
      </table>
    </td>
  </tr>
</table>

In [21]:
# RATE_STATUS_CODE distribution, restricted to rows where PREVIOUS_HIGH_RATE_1 is populated.
test_inquiry_df.loc[test_inquiry_df["PREVIOUS_HIGH_RATE_1"].notna(), "RATE_STATUS_CODE"].value_counts()

RATE_STATUS_CODE
1    28150
9    15113
5     6171
6     3654
2     1850
3     1275
4      979
7      355
8      224
*      147
Z       14
C        9
Name: count, dtype: int64

In [22]:
# Peek at the raw TERMS_DURATION column (object dtype: zero-padded strings such as "0360", or None).
test_inquiry_df["TERMS_DURATION"]

0         None
1         None
2         None
3         0360
4         0180
          ... 
499995    None
499996    0084
499997    None
499998    None
499999    0075
Name: TERMS_DURATION, Length: 500000, dtype: object

## Payment_History_1_24
* The payment history of the most recent 24 months. Leftmost is most recent
## Payment_History_25_36: Payment history from months 25-36
## Payment_History_37_48: Payment history from months 37-48


In [14]:
# Converter spec: NumericConverterV2 reading raw BALANCE into `balance_amt`,
# with `fillna` and `fillblanks` both set to 0.
# NOTE(review): the variable is called `json_high_credit`, but the spec targets
# BALANCE / balance_amt — confirm which feature this cell is meant to define.
json_high_credit = {
    "type": "NumericConverterV2",
    "params": {
        "raw_feature": "BALANCE",
        "new_feature": "balance_amt",
        "fillna": 0,
        "fillblanks": 0,
    },
}

# Payment History

## Payment_History_1_24: Most recent 24 months?

## ACTIVITY_DESIGNATOR
* We just create it with a string converter. It is the final state of the account.

| Code | Description |
| :---: | :--- |
| **B** | Paid and Closed |
| **C** | Closed |
| **D** | Transfer/Sold/Paid |
| **L** | Lost/Stolen |
| **P** | Paid |
| **R** | Refinanced |
| **T** | Transfer/Sold |

**SO BASED ON EQUIFAX, IT SAYS THAT CLOSED DATE IS BASICALLY ONLY THERE WHEN THE ACCOUNT IS CLOSED AND NOT CHARGED OFF. IS THIS WHY WE DON'T SEE ANY CHARGE-OFF CODES HERE?**

In [30]:
# ACTIVITY_DESIGNATOR distribution for rows that have a CLOSED_DATE.
test_inquiry_df.loc[test_inquiry_df["CLOSED_DATE"].notna(), "ACTIVITY_DESIGNATOR"].value_counts()

ACTIVITY_DESIGNATOR
B    213057
T     28300
C      7746
D      2401
L      1460
P       302
R       125
Name: count, dtype: int64

In [31]:
# ACTIVITY_DESIGNATOR distribution for rows with no CLOSED_DATE.
test_inquiry_df.loc[test_inquiry_df["CLOSED_DATE"].isna(), "ACTIVITY_DESIGNATOR"].value_counts()

ACTIVITY_DESIGNATOR
T    8056
B    5888
C    5363
P    1435
D     258
R     220
L     109
Name: count, dtype: int64

In [None]:
 # ORIGINAL_CHARGE_OFF_AMOUNT distribution for rows with no CLOSED_DATE.
 test_inquiry_df.loc[test_inquiry_df["CLOSED_DATE"].isna(), "ORIGINAL_CHARGE_OFF_AMOUNT"].value_counts()