In [1]:
import pandas as pd

In [2]:
contract_snapshot_df = pd.read_csv("ContractsSnapshotData.csv")

In [3]:
contract_snapshot_df.head()

Unnamed: 0,Contract_ID,Snapshot Date,Outstanding Amount,Due Amount,Number of Days Past Due,Contract Status
0,CONT109375,2018-12-31,0.0,0.0,,Closed
1,CONT109376,2006-08-31,215.08,0.0,,Active
2,CONT109376,2006-09-30,215.08,0.0,,Active
3,CONT109376,2006-10-31,215.08,0.0,,Active
4,CONT109376,2006-11-30,215.08,0.0,,Active


In [4]:
contract_snapshot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274371 entries, 0 to 274370
Data columns (total 6 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Contract_ID              274371 non-null  object 
 1   Snapshot Date            274371 non-null  object 
 2   Outstanding Amount       274371 non-null  float64
 3   Due Amount               274371 non-null  float64
 4   Number of Days Past Due  79158 non-null   float64
 5   Contract Status          274371 non-null  object 
dtypes: float64(3), object(3)
memory usage: 12.6+ MB


In [5]:
contract_snapshot_df.shape

(274371, 6)

In [6]:
contract_snapshot_df.describe()

Unnamed: 0,Outstanding Amount,Due Amount,Number of Days Past Due
count,274371.0,274371.0,79158.0
mean,25161.23,384.217923,65.570606
std,40848.94,2431.451256,141.958344
min,-70873.29,0.0,0.0
25%,3697.2,0.0,0.0
50%,11234.87,0.0,5.0
75%,30287.38,1.76,60.0
max,1059941.0,211534.63,1765.0


In [7]:
contract_snapshot_df.value_counts()

Contract_ID  Snapshot Date  Outstanding Amount  Due Amount  Number of Days Past Due  Contract Status
CONT113921   2015-02-29     447.15              447.15      3.0                      Active             2
CONT113904   2015-02-29     0.00                0.00        0.0                      Active             2
CONT113917   2015-02-29     1979.93             0.00        0.0                      Active             2
CONT113911   2015-02-29     2343.73             122.23      0.0                      Active             2
CONT113878   2015-02-29     170.50              79.84       0.0                      Active             2
                                                                                                       ..
CONT108634   202-1-06-30    1336.76             138.26      12.0                     Active             1
             202-1-05-31    1453.97             136.37      13.0                     Active             1
             202-1-03-31    1685.25             151

In [8]:
contract_snapshot_df.duplicated().sum()

84

In [9]:
contract_snapshot_df.isnull().sum() / len(contract_snapshot_df) * 100

Contract_ID                 0.000000
Snapshot Date               0.000000
Outstanding Amount          0.000000
Due Amount                  0.000000
Number of Days Past Due    71.149283
Contract Status             0.000000
dtype: float64

# Cleaning & Preparation

In [10]:
clean_contract_snapshot_df = contract_snapshot_df.copy()

In [11]:
clean_contract_snapshot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274371 entries, 0 to 274370
Data columns (total 6 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Contract_ID              274371 non-null  object 
 1   Snapshot Date            274371 non-null  object 
 2   Outstanding Amount       274371 non-null  float64
 3   Due Amount               274371 non-null  float64
 4   Number of Days Past Due  79158 non-null   float64
 5   Contract Status          274371 non-null  object 
dtypes: float64(3), object(3)
memory usage: 12.6+ MB


# Drop Duplicates

In [12]:
clean_contract_snapshot_df.drop_duplicates(inplace=True)

# Fix Malformed Snapshot Date

In [13]:
# Fix common malformed year format like '202-1' → '2021'
clean_contract_snapshot_df['Snapshot Date'] = clean_contract_snapshot_df['Snapshot Date'].str.replace(
    r'^202-1', '2021', regex=True
)

# Convert again
clean_contract_snapshot_df['Snapshot Date'] = pd.to_datetime(
    clean_contract_snapshot_df['Snapshot Date'], errors='coerce'
)

# Recheck invalid dates
print("Invalid snapshot dates after fix:", clean_contract_snapshot_df['Snapshot Date'].isna().sum())

Invalid snapshot dates after fix: 13598


In [14]:
clean_contract_snapshot_df = clean_contract_snapshot_df[clean_contract_snapshot_df['Snapshot Date'].notna()]

In [15]:
# Count bad dates
print("Invalid snapshot dates:", clean_contract_snapshot_df['Snapshot Date'].isna().sum())

Invalid snapshot dates: 0


# Inspect and Handle Number of Days Past Due
## 71% missing, This means most loans are not past due

In [16]:
clean_contract_snapshot_df['Number of Days Past Due'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clean_contract_snapshot_df['Number of Days Past Due'].fillna(0, inplace=True)


# Handle Negative Outstanding Amount

In [17]:
# Count negatives
negatives = clean_contract_snapshot_df[clean_contract_snapshot_df['Outstanding Amount'] < 0]
print(f"Negative outstanding amounts: {len(negatives)}")

Negative outstanding amounts: 1430


In [20]:
clean_contract_snapshot_df['Outstanding Amount'] = clean_contract_snapshot_df['Outstanding Amount'].clip(lower=0)

# Feature Engineering (Snapshot-Based)
## Because this is time-series data, we need to reduce it to one row per contract for merging.

# Aggregate Snapshot Data by Contract_ID
### We'll calculate summary stats that describe loan performance history up to the latest snapshot.

In [22]:
agg_snapshot_df = clean_contract_snapshot_df.groupby('Contract_ID').agg({
    'Outstanding Amount': ['max', 'mean', 'last'],
    'Due Amount': ['max', 'mean'],
    'Number of Days Past Due': ['max', 'mean', 'count'],
    'Contract Status': lambda x: x.iloc[-1]  # status at latest snapshot
})

# Flatten column names
agg_snapshot_df.columns = ['_'.join(col).strip() for col in agg_snapshot_df.columns.values]
agg_snapshot_df.reset_index(inplace=True)


# New Feature Names Created:
## Outstanding Amount_max, _mean, _last

## Due Amount_max, _mean

## Number of Days Past Due_max, _mean, _count

## Contract_Status_last

In [23]:
agg_snapshot_df.to_csv('clean_snapshot_summary.csv', index=False)