In [51]:
import pandas as pd
pd.set_option('display.float_format', '{:.2f}'.format)
import model_engine
import boto3
import  numpy as np
from functions_for_onboarding import *

# LOAD FILES

In [6]:
bucket_name = "power-client-data-staging"

In [7]:
files_trade_data = list_s3_files(bucket_name = bucket_name, 
                                prefix = 'CLIENT/PARSED/DATA/BUREAU=equifax/FORMAT=cms_6/TABLE=trade/PULL_NAME=20250201_oefcu_orangecounty_orlando_trustone_vantagewest/')

In [8]:
test_inquiry_df = load_df_from_list(list = files_trade_data, number = 1)

# Base Features

In [282]:
from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Dict, Iterable, List, Set, Union

import numpy as np
import pandas as pd

from feature_engine_parts.fe_parts_V2.mappers import converters

# --------------------------------------------------------------------------------------
# Converter registry + helpers
# --------------------------------------------------------------------------------------

CONVERTER_REGISTRY = {
    "MappingBase": converters.MappingBase,
    "DateConverterV2": converters.DateConverterV2,
    "NumericConverterV2": converters.NumericConverterV2,
    "StringConverterV2": converters.StringConverterV2,
}


def _build_converter(spec: Dict[str, Any]):
    if not isinstance(spec, dict):
        raise TypeError(f"spec must be a dict, got {type(spec)}")

    type_name = spec.get("type")
    params = spec.get("params", {}) or {}

    if type_name not in CONVERTER_REGISTRY:
        known = ", ".join(sorted(CONVERTER_REGISTRY.keys()))
        raise ValueError(f"Unknown converter type '{type_name}'. Known types: {known}")

    if not isinstance(params, dict):
        raise TypeError(f"spec['params'] must be a dict, got {type(params)}")

    cls = CONVERTER_REGISTRY[type_name]
    return type_name, cls(**params)


def apply_converter_spec(df: pd.DataFrame, spec: Dict[str, Any], *, verbose: bool = True) -> pd.DataFrame:
    type_name, converter = _build_converter(spec)

    if verbose:
        raw_feature = getattr(converter, "raw_feature", None)
        new_feature = getattr(converter, "new_feature", None)
        print(f"[apply_converter_spec] Using {type_name} | raw_feature={raw_feature!r} -> new_feature={new_feature!r}")

    return converter.transform(df)


def _output_feature_name_from_spec(spec: Dict[str, Any]) -> str | None:
    params = spec.get("params", {}) or {}
    return params.get("new_feature") or params.get("raw_feature")


def build_output_feature_allowlist(specs: List[Dict[str, Any]]) -> Set[str]:
    out: Set[str] = set()
    for spec in specs:
        name = _output_feature_name_from_spec(spec)
        if name:
            out.add(name)
    return out


# --------------------------------------------------------------------------------------
# 1) Synthetic DATE_OF_REQUEST generator based on DATE_REPORTED grouped by ZEST_KEY
# --------------------------------------------------------------------------------------

def create_synthetic_date_of_request_simple(
    df: pd.DataFrame,
    *,
    zest_key_col: str = "ZEST_KEY",
    date_reported_col: str = "DATE_REPORTED",
    zest_key_out: str = "ZEST_KEY",
    rpt_col: str = "rptDate",
    out_col: str = "date_of_request",
    date_reported_format: str | None = "%m%d%Y",
    seed: int | None = None,
    verbose: bool = False,
) -> pd.DataFrame:
    out = df.copy()
    ## apply zest ke and date fix 
    out = apply_converter_spec(
        out,
        {"type": "StringConverterV2", "params": {"raw_feature": zest_key_col, "new_feature": zest_key_out}},
        verbose=verbose,
    )

    date_spec: Dict[str, Any] = {
        "type": "DateConverterV2",
        "params": {"raw_feature": date_reported_col, "new_feature": rpt_col},
    }
    if date_reported_format:
        date_spec["params"]["format"] = date_reported_format
    out = apply_converter_spec(out, date_spec, verbose=verbose)

    rng = np.random.default_rng(seed)
    # max rpt date
    max_rpt = out.groupby(zest_key_out, dropna=False)[rpt_col].max()

    ## mapping to random date for request
    offsets = pd.Series(rng.integers(1, 4, size=len(max_rpt)), index=max_rpt.index)

    dor_map = (max_rpt + offsets.map(lambda m: pd.DateOffset(months=int(m)))).rename(out_col)
    out[out_col] = out[zest_key_out].map(dor_map)

    return out
def load_mapping_specs_from_json(path: Union[str, Path], *, key: str = "mapping") -> List[Dict[str, Any]]:
    p = Path(path)
    with p.open("r", encoding="utf-8") as f:
        obj = json.load(f)

    if key not in obj:
        raise KeyError(f"JSON missing key '{key}'. Keys found: {list(obj.keys())}")

    specs = obj[key]
    if not isinstance(specs, list):
        raise TypeError(f"json['{key}'] must be a list, got {type(specs)}")

    return specs


# --------------------------------------------------------------------------------------
# 3) Apply mappings from JSON + filter outputs
#    (includes DATE_REPORTED as a feature to keep, per your request)
# --------------------------------------------------------------------------------------

def apply_mapping_json_and_filter(
    df: pd.DataFrame,
    mapping_json_path: Union[str, Path],
    *,
    exclude_raw_features: Iterable[str] = (),
    exclude_new_features: Iterable[str] = (),
    extra_keep: Iterable[str] = (),
    mapping_key: str = "mapping",
    always_keep: Iterable[str] = ("DATE_REPORTED",),
    strict: bool = False,
    verbose: bool = True,
) -> pd.DataFrame:
    specs = load_mapping_specs_from_json(mapping_json_path, key=mapping_key)

    exclude_raw = set(exclude_raw_features)
    exclude_new = set(exclude_new_features)

    def _spec_included(spec: Dict[str, Any]) -> bool:
        params = spec.get("params", {}) or {}
        raw = params.get("raw_feature")
        new = params.get("new_feature")
        if raw and raw in exclude_raw:
            return False
        if new and new in exclude_new:
            return False
        return True

    specs_to_apply = [s for s in specs if _spec_included(s)]

    out = df.copy()
    for i, spec in enumerate(specs_to_apply, start=1):
        if verbose:
            print(f"\n[apply_mapping_json_and_filter] Step {i}/{len(specs_to_apply)}")
        out = apply_converter_spec(out, spec, verbose=verbose)

    keep = build_output_feature_allowlist(specs_to_apply) | set(always_keep) | set(extra_keep)

    missing = [c for c in keep if c not in out.columns]
    if missing and strict:
        raise KeyError(f"Missing expected columns after mapping: {missing}")

    keep_in_df = [c for c in keep if c in out.columns]
    return out.loc[:, keep_in_df].copy()



## Mapper Functions

## high_credit_amt: The highest reported balance (outstanding debt) you have had so far for that tradeline. ,
* Installment Accounts: If you have't missed any payments, this should simple be the balance you started with. If you miss payments, this can acrue on the high credit.
* For revolving, would be the highest balance you had at that point. Even if you pay off immediately, would be the most expensive purchase
* This [link](https://zestfinance.atlassian.net/wiki/spaces/DS/pages/1710784539/App+Review+Guide) **Seems to apply that installment it is the original balance**, is THAT TRUE WHAT ABOUT LATE FEES/ETC??
* We apply a numeric converter from CREDIT_LIMIT
## Balance: The amt you debt you have right now. 
* We apply numeric converter on it from BALANCE. We fill na 0. 
## credit_limit: The maximum amount you can borrow.
* We first apply a numeric converter to this to create credit_limit and then we make it NA for non revolving, open, or charge card accounts (see later)
* Only makes sense the context of revolving accounts
## pastDueAmt: The dolalr amount on the tradeline that is past due at the report date (may include feeds and interest that come with it
* The amount of money you should have paid by now but have not yet
* Revolving: This can be the minimum payments you have missed so far
* Installment: Generally equals the sum of the missed payments
* We create pastDueAmt from PAST_DUE_AMOUNT and we fill NA with 0
## scheduled_payment_amount: Contractual amount due for next payment
* Installment: That fixed payment
* Revolving: The minimum payment amount for next one?
* **EQUIFAX** says this is the monthly amount regardless of the actual payment frequency (Page 271) **Confirm**
**QUESTION: DON'T REALLY USE ACTUAL PAYMENT AMOUNT A LOT AT ALL? SIMIALR SIGNAL NOT RELIABLE?? WHY**
* We create this with numeric converter from SCHEDULED_PAYMENT_AMOUNT
## termDur: How Long the PAyment Last
* We create termDur from the TERMS_DURATION. We do not fill na with 0 here.
* The amount of time to repay the loan. (page 271)
## termFreqStr: String version of how often you have to pay
## termFreqMult: How often you have to pay
* we create termFreqMult from TERMS_FREQUENCY and we use a NumericConverter that first has a mapping.
* We create a new variable here, which assumes missing that the term Frequency is 1, and then we use the mapping to convert the frequency into number **based on how many months there are**
```sh
{
"M": 1,
"B": 2.1666666666666665,
"W": 4.333333333333333,
"E": 2,
"L": 0.5,
"Q": 0.3333333333333333,
"S": 0.16666666666666666,
"T": 0.25,
"Y": 0.08333333333333333,
"D": 1,
"P": 1,
"0": 1,
"<": 1
}
* This can also be seen on page 155 in the equifax document
* 
```
## Date_Of_Request: The application date. We use in reference to these but don't have exactly. For each person, we can give them a random application date between 0 and 3 months after the latest reported date for this tradeline.
## openDate: Date that tradeline was opened
* We create this with DateConverter from Date_Opened
## closedDate: Data that the tradeline was closed
Equifax: "contains the date the account was closed. It will not be populated when Date Major Delinquency 1st Reported is present."
**So this means closed date basically means it ended without a major delinquency??**
* We create this from CLOSED_DATE using data converter
## majordqDate: Date of First Major Delinquency
* Data Dictionary: IF current rate/status is 6,7,8,9, M, Z or if trade contains narrative code 081 (foreclosure) this data will reflect the first time narrative code was reported. (See Narrative Code Section)
* 6 (Collection Account), 7 (Apart of Chapter 13 Bakrupcy), 8 (Repossesion has occured) (9 it has been charged off Z: (Foreclosure)
* * We create this from DMD_REPORTED
## rptDate: Date of application (is this right)**ASK QUESTION**
* we create this using data converter from DATE_REPORTED
## lstPmtDate: Date the user made the last payment
* We create this with dateconverter from LAST_PAYMENT_DATE
## accountType: type of loan
* We create accountType from ACCOUNT_TYPE using StringConverter
* We can see all of these on page 150 on the document. For examplem 18 is credit card
* "Contains a code that describes the kind of lona (auto, home improvement, credit card etc)
## portfolioType: The type of loan (more general)
* We create it using string convert from PORTFOLIO_TYPE and we map O to R. This maps open to revolving. Open is typically a **charge card: where you have full payment every cycle**
## PORTFOLIO_TYPE: The type of loan (With open)
* Without the original mapping 

In [10]:
test_inquiry_df['ACCOUNT_TYPE'].value_counts()

ACCOUNT_TYPE
18    117583
07    113746
00     76021
12     65638
26     24276
       ...  
9B         5
37         3
72         2
7A         1
67         1
Name: count, Length: 62, dtype: int64

In [11]:
test_inquiry_df['PORTFOLIO_TYPE'].value_counts()

PORTFOLIO_TYPE
R    249030
I    186092
M     36231
C     11911
O      8543
*      1599
Name: count, dtype: int64

In [None]:
test_inquiry_d

## ecoa: Relationship of the person to the tradeline
* We create it from ECOA_DESIGNATOR using string converter
* Account Designator Codes

### Account Designator Codes (ECOA)

| CODE | DESCRIPTION |
| :---: | :--- |
| A | Authorized User – This is an authorized user of this account; another individual has contractual responsibility. |
| B | On behalf of another person – The subject has financial responsibility for an account, which is used exclusively by another person. |
| C | Co-maker – The subject has co-signed for a loan, and will be responsible for payment if the borrower should default. |
| I | Individual Account – The subject of the report has contractual responsibility for this account and is primarily responsible for its payment. |
| J | Joint Account – The subject and another person (or persons) are jointly responsible for payment on this account. |
| M | Maker – The subject is responsible for payment of a loan, but a co-maker will be responsible for payment if maker defaults. |
| S | Shared, but otherwise undesignated – This code is an indication that the credit grantor knows that the subject and at least one other person share the account, but not enough information is available to designate the account as “J” or “A”. |
| T | Terminated – The subject’s relationship to this account has ended, although other parties who once shared the account may continue to maintain the account. |
| U | Undesignated |
| X | Deceased (Not returned on Trade Lines) |

In [12]:
test_inquiry_df['ECOA_DESIGNATOR'].value_counts()

ECOA_DESIGNATOR
I    373369
J     96912
A     17416
M      4119
C      3784
T      1519
S      1043
U        12
X         9
B         3
Name: count, dtype: int64

## NARRATIVE_CODE_1
## NARRATIVE_CODE_2
* These are both created by just using string_converter
* According to experian on page 158, the narrative code indicate "certain comments considering the segment information in question"
* **WHAT IS THE DIFFERENCE BETWEEN NARRATIVE_CODE_1 and NARRATIVE_CODE_2?**


## dqDate
* "COntains the date that the highest rate/status occured outside of the timeframe of the payment history that has been created"
* **Equifax seems to suggest that this is based on the previous high rate, which only tracks delinquency (page 153)**
* **IS this for when it occured outside of the payment history**
* Confused how this is dqDate if it is outside of history
* "The date that the highest rate/status occured outside of the timeframe of the payment history that has been requested."
* We create this from PREVIOUS_HIGH_DATE_1. **So for the example below does that mean 1 is example of people who were DQ at some point but now have paid debts off**?
## Rate_Status_Code: The current rating on the account. 
* Rate codes are numeric and status codes are letters
<table border="0">
  <tr>
    <td valign="top" width="50%">
      <h3>Rate Codes</h3>
      <table>
        <thead>
          <tr>
            <th>CODE</th>
            <th>DESCRIPTION</th>
          </tr>
        </thead>
        <tbody>
          <tr>
            <td align="center">0</td>
            <td>Too new to rate; Approved but not used</td>
          </tr>
          <tr>
            <td align="center">1</td>
            <td>Pays account as agreed</td>
          </tr>
          <tr>
            <td align="center">2</td>
            <td>Not more than two payments past due</td>
          </tr>
          <tr>
            <td align="center">3</td>
            <td>Not more than three payments past due</td>
          </tr>
          <tr>
            <td align="center">4</td>
            <td>Not more than four payments past due</td>
          </tr>
          <tr>
            <td align="center">5</td>
            <td>At least 120 days or more than four payments past due</td>
          </tr>
          <tr>
            <td align="center">6</td>
            <td>Collection account (Enhanced Trade Only)</td>
          </tr>
          <tr>
            <td align="center">7</td>
            <td>Included in Chapter 13</td>
          </tr>
          <tr>
            <td align="center">8</td>
            <td>Repossession</td>
          </tr>
          <tr>
            <td align="center">9</td>
            <td>Charge-off</td>
          </tr>
          <tr>
            <td align="center">Blank</td>
            <td>No rate reported</td>
          </tr>
        </tbody>
      </table>
    </td>
    <td valign="top" width="50%">
      <h3>Status Codes</h3>
      <table>
        <thead>
          <tr>
            <th>CODE</th>
            <th>DESCRIPTION</th>
          </tr>
        </thead>
        <tbody>
          <tr>
            <td align="center">A</td>
            <td>Account is inactive</td>
          </tr>
          <tr>
            <td align="center">B</td>
            <td>Lost or stolen card</td>
          </tr>
          <tr>
            <td align="center">C</td>
            <td>Contact member for status</td>
          </tr>
          <tr>
            <td align="center">D</td>
            <td>Refinanced or renewed</td>
          </tr>
          <tr>
            <td align="center">E</td>
            <td>Consumer deceased</td>
          </tr>
          <tr>
            <td align="center">F</td>
            <td>In financial counseling</td>
          </tr>
          <tr>
            <td align="center">G</td>
            <td>Foreclosure process started</td>
          </tr>
          <tr>
            <td align="center" style="color:red">H</td>
            <td style="color:red">In WEP of other party <i>(retired 2-2-2009)</i></td>
          </tr>
          <tr>
            <td align="center">J</td>
            <td>Adjustment pending</td>
          </tr>
          <tr>
            <td align="center">M</td>
            <td>Included in Chapter 13</td>
          </tr>
          <tr>
            <td align="center">S</td>
            <td>Dispute - resolution pending</td>
          </tr>
          <tr>
            <td align="center">Z</td>
            <td>Included in Bankruptcy</td>
          </tr>
          <tr>
            <td align="center" style="color:red">#</td>
            <td style="color:red">In BK of Another Person <i>(retired 2-2-2009)</i></td>
          </tr>
          <tr>
            <td align="center">$</td>
            <td>Assigned to US Dept of ED</td>
          </tr>
        </tbody>
      </table>
    </td>
  </tr>
</table>

In [21]:
test_inquiry_df[~test_inquiry_df['PREVIOUS_HIGH_RATE_1'].isna()]['RATE_STATUS_CODE'].value_counts()

RATE_STATUS_CODE
1    28150
9    15113
5     6171
6     3654
2     1850
3     1275
4      979
7      355
8      224
*      147
Z       14
C        9
Name: count, dtype: int64

In [13]:
test_inquiry_df['TERMS_DURATION']

0         None
1         None
2         None
3         0360
4         0180
          ... 
499995    None
499996    0084
499997    None
499998    None
499999    0075
Name: TERMS_DURATION, Length: 500000, dtype: object

## Payment_History_1_24
* The payment history of the most recent 24 months. Leftmost is most recent
## Payment_History_25_36: Payment history from months 25-36
## Payment_History_37_48: Payment history from months 37-48


# Payment History

## Payment_History_1_24: Most recent 24 months?

## ACTIVITY_DESIGNATOR
* We just create it with string converter. It is the final state of the account.
| Code | Description |
| :---: | :--- |
| **B** | Paid and Closed |
| **C** | Closed |
| **D** | Transfer/Sold/Paid |
| **L** | Lost/Stolen |
| **P** | Paid |
| **R** | Refinanced |
| **T** | Transfer/Sold |

**SO BASED ON EQUIFAX IT SAYS THAT CLOSED DATE BASICALLY IS ONYL THERE WHEN IT I CLSOED AND NOT CHARGED OFF, IS THIS WHY WE DON"T SEE ANY CHARGEOFF CODES HERE?**

## chargeoff_amt: The amount the creditor wrote off when the account was charged off

**Question: Not always the same as how much they defaulted or owe, more of accounting that they wrote off as lost?**

In [15]:
test_inquiry_df[~test_inquiry_df['CLOSED_DATE'].isna()]['ACTIVITY_DESIGNATOR'].value_counts()

ACTIVITY_DESIGNATOR
B    213057
T     28300
C      7746
D      2401
L      1460
P       302
R       125
Name: count, dtype: int64

In [16]:
test_inquiry_df[test_inquiry_df['CLOSED_DATE'].isna()]['ACTIVITY_DESIGNATOR'].value_counts()

ACTIVITY_DESIGNATOR
T    8056
B    5888
C    5363
P    1435
D     258
R     220
L     109
Name: count, dtype: int64

In [17]:
 test_inquiry_df[test_inquiry_df['CLOSED_DATE'].isna()]['ORIGINAL_CHARGE_OFF_AMOUNT'].value_counts()

ORIGINAL_CHARGE_OFF_AMOUNT
000000606    29
000000593    27
000000722    26
000000594    25
000000502    25
             ..
000006433     1
000012058     1
000009643     1
000006006     1
000001639     1
Name: count, Length: 4339, dtype: int64

## First Create a Synthetic Date Reported

* For each person find their max date reporter and then make the date_of_request be randomly 1-3 months after to simulate the quarter pulling.

In [283]:
test_inquiry_df_with_date_of_request = create_synthetic_date_of_request_simple(test_inquiry_df, seed=42)

  dor_map = (max_rpt + offsets.map(lambda m: pd.DateOffset(months=int(m)))).rename(out_col)


In [285]:
test_inquiry_df_with_date_of_request.groupby("ZEST_KEY")["date_of_request"].nunique(dropna=True).value_counts()

date_of_request
1    29895
Name: count, dtype: int64

In [288]:
test_inquiry_df_with_date_of_request['ZEST_KEY'].nunique()

29895

In [286]:
latest = test_inquiry_df_with_date_of_request.loc[test_inquiry_df_with_date_of_request.groupby("ZEST_KEY")["rptDate"].idxmax()]

result = (
    latest[["ZEST_KEY", "rptDate", "date_of_request"]]
    .assign(days_from_date_of_request=lambda x: (x["date_of_request"] - x["rptDate"]).dt.days)
    .sort_values("days_from_date_of_request")
    .reset_index(drop=True)
)


In [287]:
result['days_from_date_of_request'].describe()

count   29895.00
mean       61.08
std        24.88
min        28.00
25%        31.00
50%        61.00
75%        92.00
max        92.00
Name: days_from_date_of_request, dtype: float64

# Apply all of the other ones

In [37]:
ls ../model-engine

CHANGELOG.md  [0m[01;34mhistory[0m/     [01;34mmodel_engine[0m/           setup.py
[01;34mclients[0m/      Jenkinsfile  [01;34mmodel_engine.egg-info[0m/  test-requirements.txt
[01;34mdocs[0m/         LICENSE.txt  README.md               [01;34mtests[0m/
[01;34mexamples[0m/     MANIFEST.in  requirements.txt


In [289]:
test_inquiry_df_mapped['date_of_request']

0        2014-07-29
1        2014-05-23
2        2013-12-08
3        2013-11-30
4        2011-03-01
            ...    
499995   2020-05-14
499996   2020-04-01
499997   2020-05-05
499998   2020-05-03
499999   2020-04-01
Name: date_of_request, Length: 500000, dtype: datetime64[ns]

In [290]:
test_inquiry_df_mapped = apply_mapping_json_and_filter(
    df=test_inquiry_df_with_date_of_request,
    mapping_json_path="../model-engine/model_engine/assets/equifax/cms_6/fe2/trade.json",
    exclude_raw_features={"DATE_OF_REQUEST", "DATE_REPORTED"},
    exclude_new_features={"date_of_request", "rptDate"},
    extra_keep={"DATE_OF_REQUEST", "rptDate", "DMD_REPORTED", 'date_of_request'},
    verbose=True,
)



[apply_mapping_json_and_filter] Step 1/28
[apply_converter_spec] Using StringConverterV2 | raw_feature='ZEST_KEY' -> new_feature='ZEST_KEY'

[apply_mapping_json_and_filter] Step 2/28
[apply_converter_spec] Using NumericConverterV2 | raw_feature='BALANCE' -> new_feature='balance_amt'

[apply_mapping_json_and_filter] Step 3/28
[apply_converter_spec] Using NumericConverterV2 | raw_feature='CREDIT_LIMIT' -> new_feature='credit_limit'

[apply_mapping_json_and_filter] Step 4/28
[apply_converter_spec] Using NumericConverterV2 | raw_feature='HIGH_CREDIT' -> new_feature='high_credit_amt'

[apply_mapping_json_and_filter] Step 5/28
[apply_converter_spec] Using NumericConverterV2 | raw_feature='PAST_DUE_AMOUNT' -> new_feature='pastDueAmt'

[apply_mapping_json_and_filter] Step 6/28
[apply_converter_spec] Using NumericConverterV2 | raw_feature='SCHEDULED_PAYMENT_AMOUNT' -> new_feature='scheduled_payment_amount'

[apply_mapping_json_and_filter] Step 7/28
[apply_converter_spec] Using DateConverterV2 

## Analysis

## Question 1: For High_Credit_AMT, will the 

In [291]:
test_inquiry_df_mapped['PORTFOLIO_TYPE'].value_counts()

PORTFOLIO_TYPE
R    249030
I    186092
M     36231
C     11911
O      8543
*      1599
Name: count, dtype: int64

## BZ: Account PAID FOR less than full balanace. Mostly revolving makes sense. 

In [292]:
test_inquiry_df_mapped[test_inquiry_df_mapped['NARRATIVE_CODE_1']=='BZ']['PORTFOLIO_TYPE'].value_counts()

PORTFOLIO_TYPE
R    1542
I     357
O     142
M      87
C      27
*       6
Name: count, dtype: int64

In [293]:
def summarize_overlap(mask_a, mask_b, name_a="Mask A", name_b="Mask B"):
    sum_a = np.sum(mask_a)
    sum_b = np.sum(mask_b)

    mask_both = mask_a & mask_b
    sum_both = np.sum(mask_both)

    pct_b_given_a = (sum_both / sum_a * 100) if sum_a > 0 else np.nan
    pct_a_given_b = (sum_both / sum_b * 100) if sum_b > 0 else np.nan

    print(f"Number of Tradelines with {name_a}: {sum_a}")
    print(f"Number of Tradelines with {name_b}: {sum_b}")
    print(f"Number of Tradelines with both {name_a} and {name_b}: {sum_both}")
    print(f"Percent of {name_a} that also have {name_b}: {pct_b_given_a:.3f}")
    print(f"Percent of {name_b} that also have {name_a}: {pct_a_given_b:.3f}")

    return mask_both, sum_both

## Question 2: **“closedDate will not be populated when Date Major Delinquency 1st Reported is present”** So does closed Date mean no charge off? And does closed mean no major DQ occured???

For all the tradelines with a charge-off amount greater than 0, the closed date only exists 10 percent of the time

In [294]:
mask_closed = test_inquiry_df_mapped["closedDate"].notna()
mask_charge_off_amount_greater_0 = (test_inquiry_df_mapped['chargeoff_amt']>0)


mask_charge_off_and_closed, sum_charge_off_and_closed =  summarize_overlap(mask_a = mask_closed, mask_b = mask_charge_off_amount_greater_0,
                                                                          name_a = "ClosedDate", name_b = "Pos Charge Off Amount")


Number of Tradelines with ClosedDate: 255249
Number of Tradelines with Pos Charge Off Amount: 14410
Number of Tradelines with both ClosedDate and Pos Charge Off Amount: 1423
Percent of ClosedDate that also have Pos Charge Off Amount: 0.557
Percent of Pos Charge Off Amount that also have ClosedDate: 9.875


In [295]:
mask_first_major_delinquency_exists = test_inquiry_df_mapped['majordqDate'].notna()

masked_closed_date_and_first_major_dq_exists, sum_masked_closed_date_and_first_major_dq_exists =  summarize_overlap(
                                                            mask_a = mask_closed, mask_b = mask_first_major_delinquency_exists,
                                                                          name_a = "ClosedDate", name_b = "First Major DQ Date Exists")

Number of Tradelines with ClosedDate: 255249
Number of Tradelines with First Major DQ Date Exists: 28602
Number of Tradelines with both ClosedDate and First Major DQ Date Exists: 0
Percent of ClosedDate that also have First Major DQ Date Exists: 0.000
Percent of First Major DQ Date Exists that also have ClosedDate: 0.000


In [296]:


masked_pos_charge_off_and_major_dq, sum_masked_pos_charge_off_and_major_dq =  summarize_overlap(
                                                            mask_a = mask_charge_off_amount_greater_0, mask_b = mask_first_major_delinquency_exists,
                                                                          name_a = "Pos Charge Off Amount", name_b = "First Major DQ Date Exists")

Number of Tradelines with Pos Charge Off Amount: 14410
Number of Tradelines with First Major DQ Date Exists: 28602
Number of Tradelines with both Pos Charge Off Amount and First Major DQ Date Exists: 12888
Percent of Pos Charge Off Amount that also have First Major DQ Date Exists: 89.438
Percent of First Major DQ Date Exists that also have Pos Charge Off Amount: 45.060


Our dat dictionary says that the current rate status is 6,7,8,9,M,Z or if it contains narrative code 081, 
this will refect the first time that code reported. **QUESTION: SO BASICALLY THIS MAJOR_DQ_DATE MEANS  IF CURRENTLY IN MAJOR DEREGATORY STATUS, THIS WILL SHOW THE FIRST TIME YOU WERE IN MAJOR DEROGATORY STATUS**

Note, rate code 6 is that this is a collection account, 7 means in chapter 13 bankrupcy, 8 means it is being repossessed, 9 chargedd off, M means in chapter 13, Z means in bankruptcy. 



In [297]:
test_inquiry_df_mapped[mask_first_major_delinquency_exists]['RATE_STATUS_CODE'].value_counts(), np.sum(test_inquiry_df_mapped[mask_first_major_delinquency_exists]['RATE_STATUS_CODE'].value_counts())

(RATE_STATUS_CODE
 9    14316
 6     8588
 Z     4684
 7      771
 8      240
 5        3
 Name: count, dtype: int64,
 28602)

## Conclusion to Question 2: ClosedDate being populated basically means the account is not currently in a major derogatory status. They could still have a positive charge off amount, likely because they were charged off but got paid back?

In [298]:
test_inquiry_df_mapped[masked_pos_charge_off_and_major_dq]['RATE_STATUS_CODE'].value_counts()

RATE_STATUS_CODE
9    12888
Name: count, dtype: int64

In [299]:
mask_no_major_deliquency_status  = ~(test_inquiry_df_mapped['majordqDate'].notna())



mask_pos_charge_off_and_no_major_dq_date, sum_mask_pos_charge_off_and_no_major_dq_date =  summarize_overlap(
                                                            mask_a = mask_charge_off_amount_greater_0, 
                                                        mask_b = mask_no_major_deliquency_status,
                                                              name_a = "Pos Charge Off Amount", 
                                                        name_b = "First Major DQ Date DOES NOT Exist")

Number of Tradelines with Pos Charge Off Amount: 14410
Number of Tradelines with First Major DQ Date DOES NOT Exist: 471398
Number of Tradelines with both Pos Charge Off Amount and First Major DQ Date DOES NOT Exist: 1522
Percent of Pos Charge Off Amount that also have First Major DQ Date DOES NOT Exist: 10.562
Percent of First Major DQ Date DOES NOT Exist that also have Pos Charge Off Amount: 0.323


In [300]:
test_inquiry_df_mapped[mask_pos_charge_off_and_no_major_dq_date]['RATE_STATUS_CODE'].value_counts()

RATE_STATUS_CODE
9    1514
1       5
*       3
Name: count, dtype: int64

## How can you be paying off and still have a rate status of charged off. Is charged off a final accounting action?

## **Key Point: Charged Off is a Final Accounting Action and not a Final Collection/Bureau action**

In [301]:
test_inquiry_df_mapped[mask_pos_charge_off_and_no_major_dq_date]['DMD_REPORTED'].value_counts()

Series([], Name: count, dtype: int64)

In [302]:
test_inquiry_df_mapped.loc[mask_pos_charge_off_and_no_major_dq_date, "DMD_REPORTED"].notna().sum()


0

In [303]:
test_inquiry_df_mapped[mask_pos_charge_off_and_no_major_dq_date]['ecoa'].value_counts()

ecoa
I    1391
A      77
J      51
C       2
T       1
Name: count, dtype: int64

In [304]:
test_inquiry_df_mapped[mask_pos_charge_off_and_no_major_dq_date]['ACTIVITY_DESIGNATOR'].value_counts()

ACTIVITY_DESIGNATOR
C    982
P    262
B    107
T     21
D     12
Name: count, dtype: int64

## We can see that two a good portion of these have paid and closed and paid. That's why not in major dq status. 

In [305]:
test_inquiry_df_mapped[mask_pos_charge_off_and_no_major_dq_date][['NARRATIVE_CODE_1','NARRATIVE_CODE_2']].value_counts()

NARRATIVE_CODE_1  NARRATIVE_CODE_2
DB                CW                  870
BZ                BQ                  243
IR                DB                   79
BQ                CW                   67
IQ                DB                   33
IR                BZ                   26
IP                DB                   22
DB                EP                   14
BZ                DB                    9
IP                BZ                    8
DB                KF                    7
IR                BQ                    6
IQ                BZ                    3
IP                BQ                    3
HX                DB                    2
DB                BX                    2
CJ                DB                    2
IR                IQ                    2
DB                GL                    1
                  EQ                    1
Name: count, dtype: int64

In [306]:
test_inquiry_df_mapped[mask_pos_charge_off_and_no_major_dq_date][['NARRATIVE_CODE_1','NARRATIVE_CODE_2', 'ecoa']].value_counts()

NARRATIVE_CODE_1  NARRATIVE_CODE_2  ecoa
DB                CW                I       808
BZ                BQ                I       213
IR                DB                I        77
BQ                CW                I        62
DB                CW                A        54
IQ                DB                I        32
IR                BZ                I        24
IP                DB                I        22
BZ                BQ                A        15
                                    J        15
DB                EP                I        14
IP                BZ                I         8
BZ                DB                I         7
DB                CW                J         7
                  KF                J         6
IR                BQ                I         6
BQ                CW                A         4
IQ                BZ                I         3
IP                BQ                I         3
IR                IQ                I         2

CW: Account Closed by Credit Grantor, DB: Charged off Account
BZ: Account Paid for Less than Full Balance
BQ: Paid Charge OFF
IQ: Consumer Disputes after Resolution
IR: Account Closed at Consumers Request

## BZ and BQ seem to imply that the borrower is starting to pay back a non-charged off loan, but DB and CW seem to imply they are not. 
## IS DB and CW a data problem???



In [307]:
test_inquiry_df_mapped['High_Credit_GEQ_Balance'] =(
    test_inquiry_df_mapped['high_credit_amt']>=test_inquiry_df_mapped['balance_amt']).astype(int)

In [308]:
test_inquiry_df_mapped[mask_first_major_delinquency_exists]['High_Credit_GEQ_Balance'].value_counts()

High_Credit_GEQ_Balance
0    20988
1     7614
Name: count, dtype: int64

In [309]:
mask_narrative_1_DB_narrative_2_CW = (test_inquiry_df_mapped['NARRATIVE_CODE_1']=='DB') & (test_inquiry_df_mapped['NARRATIVE_CODE_2']=='CW')
mask_not_narrative_1_DB_narrative_2_CW  = ~(mask_narrative_1_DB_narrative_2_CW)

In [310]:
test_inquiry_df_mapped[(mask_pos_charge_off_and_no_major_dq_date) &(mask_narrative_1_DB_narrative_2_CW)]['balance_amt'].describe()

count     870.00
mean     2405.54
std      3274.19
min         0.00
25%       536.00
50%      1269.50
75%      2909.00
max     22649.00
Name: balance_amt, dtype: float64

In [311]:
test_inquiry_df_mapped[(mask_pos_charge_off_and_no_major_dq_date) &(mask_narrative_1_DB_narrative_2_CW)]['pastDueAmt'].describe()

count     870.00
mean     2405.54
std      3274.19
min         0.00
25%       536.00
50%      1269.50
75%      2909.00
max     22649.00
Name: pastDueAmt, dtype: float64

In [312]:
test_inquiry_df_mapped[(mask_pos_charge_off_and_no_major_dq_date) &
(mask_narrative_1_DB_narrative_2_CW)]['High_Credit_GEQ_Balance'].value_counts()

High_Credit_GEQ_Balance
0    870
Name: count, dtype: int64

In [313]:
test_inquiry_df_mapped[(mask_pos_charge_off_and_no_major_dq_date) &(mask_not_narrative_1_DB_narrative_2_CW)]['balance_amt'].describe()

count      652.00
mean      2386.25
std      10005.47
min          0.00
25%          0.00
50%          0.00
75%        716.00
max     126099.00
Name: balance_amt, dtype: float64

In [314]:
test_inquiry_df_mapped[(mask_pos_charge_off_and_no_major_dq_date) &(mask_not_narrative_1_DB_narrative_2_CW)]['pastDueAmt'].describe()

count     652.00
mean     1556.26
std      4924.77
min         0.00
25%         0.00
50%         0.00
75%       653.25
max     63966.00
Name: pastDueAmt, dtype: float64

In [315]:
test_inquiry_df_mapped[(mask_pos_charge_off_and_no_major_dq_date) &(mask_not_narrative_1_DB_narrative_2_CW)]['High_Credit_GEQ_Balance'].value_counts()

High_Credit_GEQ_Balance
0    652
Name: count, dtype: int64

## **Question: Are these data entry issues? If Rate Status Code is 9, shoudn't that mean majordqDate should exist by the definition??**

Think conclusion is that closedDate will not be populated in major derogatory status.

# Preprocessing:

## We create months_since_rptDate, months_since_openDate,months_since_lstPmtDate
* They tell us respectively the difference in months between the application date and the date the tradeline was reported, opened, and the last date of a payment on the tradeline

## Payment History Features



In [316]:
[feature for feature in test_inquiry_df.columns if 'date' in feature.lower()]

['DATE_REPORTED',
 'DATE_OPENED',
 'AUTOMATED_UPDATE_INDICATOR',
 'PREVIOUS_HIGH_DATE_1',
 'PREVIOUS_HIGH_DATE_2',
 'PREVIOUS_HIGH_DATE_3',
 'LAST_PAYMENT_DATE',
 'CLOSED_DATE',
 'DEFERRED_PAYMENT_START_DATE',
 'BALLOON_PAYMENT_DUE_DATE',
 'PREVIOUS_HIGH_DATE_BEFORE_HISTORY']

In [317]:
months_since_rptDate = (
    (test_inquiry_df_mapped["date_of_request"] - test_inquiry_df_mapped["rptDate"])
    / np.timedelta64(1, "M")
)

test_inquiry_df_mapped["months_since_rptDate"] = months_since_rptDate

In [318]:
test_df = test_inquiry_df_mapped[['rptDate',  "RATE_STATUS_CODE",
                                  "PAYMENT_HISTORY_1_24","PAYMENT_HISTORY_25_36","PAYMENT_HISTORY_37_48",'date_of_request', 'months_since_rptDate']]


In [319]:
ppt_name = 'zest_payment_pattern'

payment_patterns = {
                    "patterns": [
                        "RATE_STATUS_CODE",
                        "PAYMENT_HISTORY_1_24",
                        "PAYMENT_HISTORY_25_36",
                        "PAYMENT_HISTORY_37_48"
                    ],
                    "rate": {
                        "paid_as_agreed": ["0", "1"],
                        "DQ30+": ["2", "3", "4", "5", "6", "7", "8", "9", "G", "K", "L", "Z"],
                        "DQ60+": ["3", "4", "5", "6", "7", "8", "9", "G", "K", "L", "Z"],
                        "DQ90+": ["4", "5", "6", "7", "8", "9", "G", "K", "L", "Z"],
                        "DQ120+": ["5", "6", "7", "8", "9", "G", "K", "L", "Z"],
                        "CO": ["6", "8", "9", "G", "K", "L", "Z"],
                        "DQ30": ["2"],
                        "DQ60": ["3"],
                        "DQ90": ["4"],
                        "DQ120": ["5"]
                    },
                    "trim": 48,
                    "placeholder": "/",
                    "keep": ["DQ30+", "DQ60+", "DQ90+", "DQ120+", "CO", "DQ30", "DQ60", "DQ90"]
                }


In [374]:
patterns = payment_patterns['patterns']
rate = payment_patterns['rate']
trim = payment_patterns['trim']
placeholder = payment_patterns['placeholder']
name = ""
zest_ppt_name = f"zest_payment_pattern{name}"
ppt_len_name = f"payment_history_length{name}"
return_ppt_len = True
return_zest_ppt = True
report_date = 'rptDate'
date_delta_col = f'months_since_{report_date}'
print(f'''
patterns: {patterns}
rate: {rate}
trim: {trim}
placeholder: {placeholder}
zest_ppt_name: {zest_ppt_name}
ppt_len_name: {ppt_len_name}
return_ppt_len: {return_ppt_len}
return_zest_ppt: {return_zest_ppt}
report_date: {report_date}
date_delta_col: {date_delta_col}
''')


patterns: ['RATE_STATUS_CODE', 'PAYMENT_HISTORY_1_24', 'PAYMENT_HISTORY_25_36', 'PAYMENT_HISTORY_37_48']
rate: {'paid_as_agreed': ['0', '1'], 'DQ30+': ['2', '3', '4', '5', '6', '7', '8', '9', 'G', 'K', 'L', 'Z'], 'DQ60+': ['3', '4', '5', '6', '7', '8', '9', 'G', 'K', 'L', 'Z'], 'DQ90+': ['4', '5', '6', '7', '8', '9', 'G', 'K', 'L', 'Z'], 'DQ120+': ['5', '6', '7', '8', '9', 'G', 'K', 'L', 'Z'], 'CO': ['6', '8', '9', 'G', 'K', 'L', 'Z'], 'DQ30': ['2'], 'DQ60': ['3'], 'DQ90': ['4'], 'DQ120': ['5']}
trim: 48
placeholder: /
zest_ppt_name: zest_payment_pattern
ppt_len_name: payment_history_length
return_ppt_len: True
return_zest_ppt: True
report_date: rptDate
date_delta_col: months_since_rptDate



### Construction of zest_payment_pattern (_construct_payment_pattern_cols)

#### 1)  We append on the left of all the history (_combine_payment_patterns)

In [375]:
new_ppt = None
first_ppt_name = patterns[0]
print(f'Make payment collumn be the most recent 24')
new_ppt = test_df[first_ppt_name].copy(deep=True)
print(new_ppt.head(1))
for ppt in patterns[1:]:
    ppt_data_ = test_df[ppt].copy(deep=True)
    ppt_data_.loc[ppt_data_.isna()] = ""
    new_ppt += ppt_data_
    print(f'append')
    print(new_ppt.head(1))

Make payment collumn be the most recent 24
0    1
Name: RATE_STATUS_CODE, dtype: object
append
0    1************/************
Name: RATE_STATUS_CODE, dtype: object
append
0    1************/************/************
Name: RATE_STATUS_CODE, dtype: object
append
0    1************/************/************/******...
Name: RATE_STATUS_CODE, dtype: object


#### 2) We remove any placeholder values (Equifax uses / as a placeholder for every 12 months)

In [376]:
## Construct payment pattern column First we combine all the payment pattenns such that left most is most recent


print(f'remove placeholder / in equifax is a placeholder for every 12 months')
new_ppt = new_ppt.str.replace(placeholder, "")
print(new_ppt.head(1))


remove placeholder / in equifax is a placeholder for every 12 months
0    1************************************************
Name: RATE_STATUS_CODE, dtype: object


#### 3) We only keep the most recent 48 months (_trim)

In [377]:
print(f'only keep most recent 48 months')
new_ppt = new_ppt.str[:trim]
print(new_ppt.head(1))

only keep most recent 48 months
0    1***********************************************
Name: RATE_STATUS_CODE, dtype: object


#### 4) We add # to the left to account for the time that took place between report date and application date (_add_filers)

In [378]:
## now we add fillers

## always round up months so we are more conservative for dates since. If it has been 1.1 months-> will count as 2 months

months_since_ppt = (
                np.ceil(test_df[date_delta_col]).fillna(0).astype(int)) 
## for each value of months_Since_ppt give it # value
print(f'zeries where each row is #')
z = pd.Series(["#"] * len(months_since_ppt), index=test_df.index, dtype="str")
## make it be the lenght of monthsSince 
filler = pd.Series(z).str.repeat(repeats=months_since_ppt.astype(int)).astype("str")
print(f'Series where number of # is equal to number of months since app date')
print(new_ppt.head(1))
print(f'Put this to the left of your history')
new_ppt = filler.str.cat(new_ppt, join="left")  # append the filler
print(new_ppt.head(1))
print(f'that will be the val of {zest_ppt_name}')
test_df[zest_ppt_name]= new_ppt

test_df[zest_ppt_name].head(1)

zeries where each row is #
Series where number of # is equal to number of months since app date
0    1***********************************************
Name: RATE_STATUS_CODE, dtype: object
Put this to the left of your history
0    ###1******************************************...
dtype: object
that will be the val of zest_payment_pattern


0    ###1******************************************...
Name: zest_payment_pattern, dtype: object

#### We now have the zest_ppt_name: zest_payment_pattern

#### Construction of the Effective Month History  (ppt_len_name: payment_history_length) (_get_effective_month_range)
##### This gives for each persoon, the number of months they actually have valid history in

In [379]:
def _count_trailing_matches(input_str, char_list):
        count = 0
        for char in reversed(input_str.strip()):
            if char in char_list:
                count += 1
            else:
                break
        return count

#### 1) We first count the number of months since it application date (number of #s) to create z_count

In [380]:
print(f''' {ppt_len_name} col''')
trimmed  = test_df[zest_ppt_name]
month_range = test_df[zest_ppt_name].str.len()
exclude_trailing = ["*"]

z_count = trimmed.str.count("#")


 payment_history_length col


#### 2) We then add the * to this count. This is because * uses for blanks. We do not want to count this as valid payment history

In [381]:
print(f'counting num of #')
z_count = trimmed.str.count("#")
print(z_count.head(1).values)
z_count = z_count + trimmed.apply(lambda x: _count_trailing_matches(str(x), exclude_trailing))
print(f'after counting trailing matches')
print(z_count.head(1).values)

counting num of #
[3]
after counting trailing matches
[50]


#### 3) Finally, we take the leength of the entire payment history and substract from the z_count to get the actual effective number of months in payment history

In [382]:


effective_month_range = month_range - z_count

print(f'''
month range: Number of characters you have in sequence {month_range.head(1).values}
z_count: number of characters that don't mean anything (appended # for past time and * for missing): {z_count.head(1).values}
Effective_month_range: number of actual month history: {effective_month_range.head(1).values}
''')


month range: Number of characters you have in sequence [51]
z_count: number of characters that don't mean anything (appended # for past time and * for missing): [50]
Effective_month_range: number of actual month history: [1]



#### 4) We now have the payment_history_length

In [383]:
month_ranges = [3, 6, 12, 24]

#### Trended Features Construction (_construct_trended_features

#### For each set of month range: [3,6,12,24], we grab the most recent months for that range , get the actual number of effective months in there and then for each type of right code, we count the number of that code in that range 

#### 1) For each of these we first strim the string to only the left most characters for that month range
```python
trimmed = new_ppt.str[:month_range]
```
#### 2) We then grab the effective_month_range again (see above for code)
```python
effective_month_count = _get_effective_month_range(trimmed, month_range)
```
#### 3)Then we iterate through each rate key and value pair

```python
rate_items = {  
"paid_as_agreed": ["0", "1"],
"DQ30+": ["2", "3", "4", "5", "6", "7", "8", "9", "G", "K", "L", "Z"],
"DQ60+": ["3", "4", "5", "6", "7", "8", "9", "G", "K", "L", "Z"],
"DQ90+": ["4", "5", "6", "7", "8", "9", "G", "K", "L", "Z"],
"DQ120+": ["5", "6", "7", "8", "9", "G", "K", "L", "Z"],
"CO": ["6", "8", "9", "G", "K", "L", "Z"],
"DQ30": ["2"],
"DQ60": ["3"],
"DQ90": ["4"],
"DQ120": ["5"]}

for rate, values in rate_items():
```
#### 3a) For each of these we first grab the count
```python
count =  get_count(trimmed, values)
```
##### where we have get_count as basically if there is only one value we just count it, and if multiple we use | to count for any of them

```python
if len(values)==1:
    return trimmed.str.count(values[0])
else:
    pattern = "|".join(values)
    return trimmed.str.count(pattern)
```

#### 3b) We then create a feature called "number_{rate}_{month_range}{self.name}" which is the count of that rate code, and then a percent which we divide by the effective month range
* So will this enforce that we get NAN when

### We now have that for every rate a count and a percentage for each month range

### We now for each rate construct a feature that counts the months since that last occured



#### 1) We create a contiguous array of the payment history. 

In [386]:
col = test_df['zest_payment_pattern'].astype("string").to_numpy(dtype="str")

In [402]:
rate = 'paid_as_agreed'
values = ["0", "1"]



#### 2) For each rate we

##### 2a) We first create a matrix with a row for each value and where the position in each row says for that person what was the earliest index that value was there (last time we saw that value). It will be -1 if None found



In [391]:
col[None,:].shape

(1, 500000)

In [396]:
col[None,:].shape

(1, 500000)

In [398]:
col.shape

(500000,)

In [405]:
col[None,:], col[None,:].shape

(array([['###1***********************************************',
         '######1***********************************************',
         '###########1***********************************************',
         ..., '##11111111111*************************************',
         '##1EEEEEEEEEEEEEEEEEEEEEE111111EEEEEEEEEEEEEEEEEEE',
         '###11**********************************************']],
       dtype='<U168'),
 (1, 500000))

In [404]:
 np.array(values, dtype="str")[:, None],  np.array(values, dtype="str")[:, None].shape

(array([['0'],
        ['1']], dtype='<U1'),
 (2, 1))

In [415]:
positions = np.char.find(col[None, :], np.array(values, dtype="str")[:, None]).astype(float)

In [416]:
positions, positions.shape

(array([[-1., -1., -1., ..., -1., -1., -1.],
        [ 3.,  6., 11., ...,  2.,  2.,  3.]]),
 (2, 500000))

In [421]:
np.unique(positions[0])

array([-1.])

In [422]:
np.unique(positions[1])

array([ -1.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
        11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
        22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
        33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
        44.,  45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,
        55.,  56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,
        66.,  67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,
        77.,  78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,
        88.,  89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,
        99., 100., 101., 102., 103., 104., 105., 106., 107., 108., 109.,
       110., 111., 112., 113., 114., 115., 116., 117., 118., 119., 120.])

In [410]:
np.char.find(col[None, :], np.array(values, dtype="str")[:, None]).astype(float)[0].shape

(500000,)

In [412]:
np.char.find(col[None, :], np.array(values, dtype="str")[:, None]).astype(float)[1].shape

(500000,)

##### 2B) we replace not found (-1) with np.nan)

In [425]:
positions[positions==-1] = np.nan

##### 2C) We then find the earliest position among each row for each person

earliest_pos is an array of 50000 now 

In [426]:
earliest_pos = np.nanmin(positions, axis =0)

  earliest_pos = np.nanmin(positions, axis =0)


#### That will create the months since that rate column

In [432]:
len(earliest_pos[np.isnan(earliest_pos)]), len(earliest_pos[~np.isnan(earliest_pos)])

(24864, 475136)

#### We update the high_credit_amt to be the chargeoff amount when the hight_credit_amt is na

Intuition, if we don't have their highest debt they have taken on yet, we assume it is the charge off amount if that exists.
**The borrower must have reached at least that chargeoff amount in debt or the charge off could not have happened** 

**Note, the high_credit_amt is supposed to be the highest exposure**

### We update the termDur to be null if the PORTFOLIO_TYPE is in R, O, C or the accountType is in 07, 0G, 18 or 2a
Open: No fixed balance, no fixed term (paid in full (Charge Card)
C: Line of credit, YOu draw and repay repeadtely. HELOC: drw up against credit limit and pay back

* 07 is charge account, 18 is credit card and 2a is secured credit card, 0G is flexible spending

* Loan Duration only has meaning when the account has a fixed repayment schedule with a known maturity date
* Maturity Date: The date at which the loan is schedueld to be paid off, assuming normal amortization

* Amortization: The process of paying down a debt through scheduled payments over time. Where each payment covers interest and principle.

* Basically, for each month a fixed monthly payment is due

    * We first calcualte interest = balance * (annual rate/12)
    * Payment = INterest + (monthly payment -interest)
    * What is left over is called the principal and the next balance will be balance-principal

#### We then update the credit limit when the portfolio type or account type is not in these

#### We then update the credit_limit to be null when the credit limit is 0 and the account type is in 7, 0G, 18, 2a
**Question: HOW COME FOR THis one we don't use portfolio type**
**LIKEWISE: HOW COME WE DON'T JUST SET IT EQUAL TO NULL IF EQUAL TO 0, SHOUDN'T ALL OF THE ONES NOT IN PORTFOLIO TYPE ROC OR ACCOUNT TYPE ABOVE ALREADY BE NULL**

#### We then compute blnc_to_hc by taking the balance amount and dividing by the high_credit_amt

* This gives us a sense of how their current exposure or debt compares to the max exposure and debt we have seen

#### We then compute hc_to_cl tells us a person's max utilization
* Basically the max exposure divided by how much they can borrow.
* Max utilization we have seen on this account.
* For credit cards, when you do not pay full balance, your balance can go up even if you don't utilize more because of interest
* Likewise if you miss payment you can also get feeds and penatly APR

#### We take the balance_amt and divide by the creidt limit to get the utilization: crdUtl

#### We take their pastDueAmt and divide by the credit_limit

* This gives us the severity of the delinquency.
* Tells us how big is their failure to pay relative to the trust extended to them?
