# Table of Contents
* [Import Data from SQLite Database](#Import-Data-from-SQLite-Database)
* [Aggregate columns by SK_ID_CURR](#Aggregate-columns-by-SK_ID_CURR)
    * [Part 1 -](#Part-1--)
    * [Part 2 -](#Part-2--)
    * [Part 3 -](#Part-3--)
    * [Merge and save all 3 parts](#Merge-and-save-all-3-parts)
    * [Aggregate the merged dataset at SK_ID_CURR level](#Aggregate-the-merged-dataset-at-SK_ID_CURR-level)

## Import Data from SQLite Database

In [1]:
import pandas as pd
import numpy as np
from numpy.random import seed
import matplotlib.pyplot as plt
%matplotlib inline  
import statistics
from scipy import stats
from scipy.stats import t
from scipy.stats import norm
import seaborn as sns

In [2]:
import sqlite3
from sqlite3 import Error
import csv
# Open a connection to the local SQLite database file so the source table can be read.
# The connection is closed again once the raw table has been loaded into memory.
con = sqlite3.connect(r"pythonsqlite.db")
# NOTE(review): `cur` is not referenced by any of the cells visible in this notebook.
cur = con.cursor()

In [3]:
# Load every row of the `card_bal_sql` table into the `credit_bal` DataFrame.
sql_stmt = '''SELECT A.* FROM card_bal_sql as A '''
credit_bal = pd.read_sql(sql_stmt, con=con, coerce_float=True)
# Cells that are empty or contain only whitespace are treated as missing (NaN).
credit_bal = credit_bal.replace(to_replace=r'^\s*$', value=np.nan, regex=True)
# The table is now held in memory, so the connection can be released.
con.close()

## Aggregate columns by SK_ID_CURR

Use the following aggregation for each variable: for each (SK_ID_CURR, SK_ID_PREV) pair, summarize all of the previous months' records with a summary statistic such as max, min, average, or sum.
* Part 1 - Get average credit card balances, maximum credit limit, total ATM drawings etc. over the past months before the current application
<p>
    1. AMT_BALANCE: average<br>
    2. AMT_CREDIT_LIMIT_ACTUAL: max<br>
    3. AMT_DRAWINGS_ATM_CURRENT: sum<br>
    4. AMT_DRAWINGS_CURRENT: sum<br>
    5. AMT_DRAWINGS_OTHER_CURRENT: sum<br>
    6. AMT_DRAWINGS_POS_CURRENT: sum<br>
<p>  
* Part 2 - Get maximum of min installment, average payment, average receivable etc. over the past months before the current application
<p>
    7. AMT_INST_MIN_REGULARITY: max<br>
    8. AMT_PAYMENT_CURRENT: avg<br>
    9. AMT_PAYMENT_TOTAL_CURRENT: avg<br>
    10. AMT_RECEIVABLE_PRINCIPAL: avg<br>
    11. AMT_RECIVABLE: avg<br>
    12. AMT_TOTAL_RECEIVABLE: avg<br>
<p>
* Part 3 - Get the total number of drawings over the past months before the current application. The variable NAME_CONTRACT_STATUS has 7 categories, many of which have very few counts. To reduce the number of categories, create a new dummy variable "STATUS_ACTIVE" with only 2 categories: 1 meaning active and 0 meaning inactive. The variable SK_DPD_DEF is the number of days past due during each month before the current application. Since over 97.5% of the values are 0 and less than 0.5% of the values are >= 1, create a new dummy variable that groups all values > 0 into one category. Apply a similar treatment to the variable SK_DPD.
<p>
    13. CNT_DRAWINGS_ATM_CURRENT: sum<br>
    14. CNT_DRAWINGS_CURRENT: sum<br>
    15. CNT_DRAWINGS_OTHER_CURRENT: sum<br>
    16. CNT_DRAWINGS_POS_CURRENT: sum<br>
    17. CNT_INSTALMENT_MATURE_CUM: sum<br>
    18. NAME_CONTRACT_STATUS: create a new variable 'STATUS_ACTIVE'<br>
    19. SK_DPD: create a new variable 'COUNT_DPD_GE1'<br> 
    20. SK_DPD_DEF: create a new variable 'COUNT_DPD_DEF_GE0'<br> 
<p>

## Part 1 - 

In [4]:
# Part 1 subset: the two ID keys, the month index, and the balance,
# credit-limit and drawing-amount columns.
card_bal_p1 = credit_bal[[
    'SK_ID_CURR',
    'SK_ID_PREV',
    'MONTHS_BALANCE',
    'AMT_BALANCE',
    'AMT_CREDIT_LIMIT_ACTUAL',
    'AMT_DRAWINGS_ATM_CURRENT',
    'AMT_DRAWINGS_CURRENT',
    'AMT_DRAWINGS_OTHER_CURRENT',
    'AMT_DRAWINGS_POS_CURRENT',
]]

In [5]:
# Re-open the connection to the SQLite database file: the connection used to load
# `credit_bal` was closed right after that read, and the cells below write the
# intermediate tables to the database and query them back.
con = sqlite3.connect(r"pythonsqlite.db")
# NOTE(review): `cur` is not referenced by any of the cells visible in this notebook.
cur = con.cursor()

In [6]:
# Write the Part 1 column subset to the database as table `card_bal_p1` so it can be
# aggregated with SQL.
#   * index=False keeps the DataFrame index from being stored as an extra column.
#   * if_exists='replace' makes this cell re-runnable: with the default ('fail'),
#     to_sql raises ValueError when the table is already present in the database file.
card_bal_p1.to_sql(name='card_bal_p1', con=con, index=False, if_exists='replace')

# One output row per (SK_ID_CURR, SK_ID_PREV) pair: average balance, maximum credit
# limit and total drawing amounts across all monthly records of that pair.
# A SUM over a group whose values are all NULL comes back as NULL (shown as NaN).
sql_sm = '''SELECT SK_ID_CURR,
                SK_ID_PREV,
                AVG(AMT_BALANCE) AS AVG_AMT_BALANCE,
                MAX(AMT_CREDIT_LIMIT_ACTUAL) AS MAX_AMT_CREDIT_LIMIT_ACTUAL,
                SUM(AMT_DRAWINGS_ATM_CURRENT) AS SUM_AMT_DRAWINGS_ATM_CURRENT,
                SUM(AMT_DRAWINGS_CURRENT) AS SUM_AMT_DRAWINGS_CURRENT,
                SUM(AMT_DRAWINGS_OTHER_CURRENT) AS SUM_AMT_DRAWINGS_OTHER_CURRENT,
                SUM(AMT_DRAWINGS_POS_CURRENT) AS SUM_AMT_DRAWINGS_POS_CURRENT
            FROM card_bal_p1
            GROUP BY SK_ID_CURR, SK_ID_PREV'''
card_bal_p1_gp = pd.read_sql(sql_sm, coerce_float=True, con=con)

# Preview the first rows and report (rows, columns) of the aggregated table.
display(card_bal_p1_gp.head())
print(card_bal_p1_gp.shape)


Unnamed: 0,SK_ID_CURR,SK_ID_PREV,AVG_AMT_BALANCE,MAX_AMT_CREDIT_LIMIT_ACTUAL,SUM_AMT_DRAWINGS_ATM_CURRENT,SUM_AMT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_OTHER_CURRENT,SUM_AMT_DRAWINGS_POS_CURRENT
0,100006,1489396,0.0,270000,,0.0,,
1,100011,1843384,54482.111149,180000,180000.0,180000.0,0.0,0.0
2,100013,2038692,18159.919219,157500,571500.0,571500.0,0.0,0.0
3,100021,2594025,0.0,675000,,0.0,,
4,100023,1499902,0.0,225000,,0.0,,


(104307, 8)


## Part 2 - 

In [7]:
# Part 2 subset: the two ID keys, the month index, and the minimum-installment,
# payment and receivable amount columns.
card_bal_p2 = credit_bal[[
    'SK_ID_CURR',
    'SK_ID_PREV',
    'MONTHS_BALANCE',
    'AMT_INST_MIN_REGULARITY',
    'AMT_PAYMENT_CURRENT',
    'AMT_PAYMENT_TOTAL_CURRENT',
    'AMT_RECEIVABLE_PRINCIPAL',
    'AMT_RECIVABLE',
    'AMT_TOTAL_RECEIVABLE',
]]

In [8]:
# Write the Part 2 column subset to the database as table `card_bal_p2` so it can be
# aggregated with SQL.
#   * index=False keeps the DataFrame index from being stored as an extra column.
#   * if_exists='replace' makes this cell re-runnable: with the default ('fail'),
#     to_sql raises ValueError when the table is already present in the database file.
card_bal_p2.to_sql(name='card_bal_p2', con=con, index=False, if_exists='replace')

# One output row per (SK_ID_CURR, SK_ID_PREV) pair: maximum minimum-installment and
# average payment / receivable amounts across all monthly records of that pair.
sql_sm = '''SELECT SK_ID_CURR,
                SK_ID_PREV,
                MAX(AMT_INST_MIN_REGULARITY) AS MAX_AMT_INST_MIN_REGULARITY,
                AVG(AMT_PAYMENT_CURRENT) AS AVG_AMT_PAYMENT_CURRENT,
                AVG(AMT_PAYMENT_TOTAL_CURRENT) AS AVG_AMT_PAYMENT_TOTAL_CURRENT,
                AVG(AMT_RECEIVABLE_PRINCIPAL) AS AVG_AMT_RECEIVABLE_PRINCIPAL,
                AVG(AMT_RECIVABLE) AS AVG_AMT_RECIVABLE,
                AVG(AMT_TOTAL_RECEIVABLE) AS AVG_AMT_TOTAL_RECEIVABLE
            FROM card_bal_p2
            GROUP BY SK_ID_CURR, SK_ID_PREV'''
card_bal_p2_gp = pd.read_sql(sql_sm, coerce_float=True, con=con)

# Preview the first rows and report (rows, columns) of the aggregated table.
display(card_bal_p2_gp.head())
print(card_bal_p2_gp.shape)

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,MAX_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT,AVG_AMT_PAYMENT_TOTAL_CURRENT,AVG_AMT_RECEIVABLE_PRINCIPAL,AVG_AMT_RECIVABLE,AVG_AMT_TOTAL_RECEIVABLE
0,100006,1489396,0.0,,0.0,0.0,0.0,0.0
1,100011,1843384,9000.0,4843.064189,4520.067568,52402.088919,54433.179122,54433.179122
2,100013,2038692,7875.0,7168.34625,6817.172344,17255.559844,18101.079844,18101.079844
3,100021,2594025,0.0,,0.0,0.0,0.0,0.0
4,100023,1499902,0.0,,0.0,0.0,0.0,0.0


(104307, 8)


## Part 3 - 

In [9]:
# Part 3 subset: the two ID keys, the month index, the drawing-count columns,
# the contract status and the two days-past-due columns.
card_bal_p3 = credit_bal[[
    'SK_ID_CURR',
    'SK_ID_PREV',
    'MONTHS_BALANCE',
    'CNT_DRAWINGS_ATM_CURRENT',
    'CNT_DRAWINGS_CURRENT',
    'CNT_DRAWINGS_OTHER_CURRENT',
    'CNT_DRAWINGS_POS_CURRENT',
    'CNT_INSTALMENT_MATURE_CUM',
    'NAME_CONTRACT_STATUS',
    'SK_DPD',
    'SK_DPD_DEF',
]]

In [10]:
# Write the Part 3 column subset to the database as table `card_bal_p3` so it can be
# aggregated with SQL.
#   * index=False keeps the DataFrame index from being stored as an extra column.
#   * if_exists='replace' makes this cell re-runnable: with the default ('fail'),
#     to_sql raises ValueError when the table is already present in the database file.
card_bal_p3.to_sql(name='card_bal_p3', con=con, index=False, if_exists='replace')

# One output row per (SK_ID_CURR, SK_ID_PREV) pair:
#   * SUM_CNT_*          - total of each drawing / installment count over all months
#   * STATUS_ACTIVE      - 1 if any monthly record has NAME_CONTRACT_STATUS = 'Active', else 0
#   * COUNT_DPD_GE1      - number of monthly records with SK_DPD >= 1
#   * COUNT_DPD_DEF_GE0  - number of monthly records with SK_DPD_DEF > 0
# The SK_DPD test is `>= 1` (not `> 1`) so that months with exactly one day past due
# are counted, matching the column name and the treatment of SK_DPD_DEF.
sql_sm = '''SELECT SK_ID_CURR,
                SK_ID_PREV,
                SUM(CNT_DRAWINGS_ATM_CURRENT) AS SUM_CNT_DRAWINGS_ATM_CURRENT,
                SUM(CNT_DRAWINGS_CURRENT) AS SUM_CNT_DRAWINGS_CURRENT,
                SUM(CNT_DRAWINGS_OTHER_CURRENT) AS SUM_CNT_DRAWINGS_OTHER_CURRENT,
                SUM(CNT_DRAWINGS_POS_CURRENT) AS SUM_CNT_DRAWINGS_POS_CURRENT,
                SUM(CNT_INSTALMENT_MATURE_CUM) AS SUM_CNT_INSTALMENT_MATURE_CUM,
                MAX(CASE WHEN NAME_CONTRACT_STATUS = 'Active' THEN 1 ELSE 0 END) AS STATUS_ACTIVE,
                SUM(CASE WHEN SK_DPD >= 1 THEN 1 ELSE 0 END) AS COUNT_DPD_GE1,
                SUM(CASE WHEN SK_DPD_DEF > 0 THEN 1 ELSE 0 END) AS COUNT_DPD_DEF_GE0
            FROM card_bal_p3
            GROUP BY SK_ID_CURR, SK_ID_PREV'''
card_bal_p3_gp = pd.read_sql(sql_sm, coerce_float=True, con=con)

# Preview the first rows and report (rows, columns) of the aggregated table.
display(card_bal_p3_gp.head())
print(card_bal_p3_gp.shape)

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,SUM_CNT_DRAWINGS_ATM_CURRENT,SUM_CNT_DRAWINGS_CURRENT,SUM_CNT_DRAWINGS_OTHER_CURRENT,SUM_CNT_DRAWINGS_POS_CURRENT,SUM_CNT_INSTALMENT_MATURE_CUM,STATUS_ACTIVE,COUNT_DPD_GE1,COUNT_DPD_DEF_GE0
0,100006,1489396,,0,,,0.0,1,0,0
1,100011,1843384,4.0,4,0.0,0.0,1881.0,1,0,0
2,100013,2038692,23.0,23,0.0,0.0,1666.0,1,0,1
3,100021,2594025,,0,,,0.0,1,0,0
4,100023,1499902,,0,,,0.0,1,0,0


(104307, 10)


## Merge and save all 3 parts 

In [11]:
# Store the three per-(SK_ID_CURR, SK_ID_PREV) aggregates as tables so they can be
# joined in SQL. index=False avoids writing the DataFrame index as a column;
# if_exists='replace' makes the cell re-runnable (the default, 'fail', raises
# ValueError when a table of the same name already exists in the database file).
card_bal_p1_gp.to_sql(name='card_bal_p1_gp', con=con, index=False, if_exists='replace')
card_bal_p2_gp.to_sql(name='card_bal_p2_gp', con=con, index=False, if_exists='replace')
card_bal_p3_gp.to_sql(name='card_bal_p3_gp', con=con, index=False, if_exists='replace')

In [12]:
# Join the three grouped tables on (SK_ID_CURR, SK_ID_PREV): every column of Part 1 (A)
# plus the aggregate columns of Part 2 (B) and Part 3 (C). The ID keys are taken from
# A only, so they are not duplicated in the result.
sql_sm = '''SELECT A.*,
                B.MAX_AMT_INST_MIN_REGULARITY,
                B.AVG_AMT_PAYMENT_CURRENT,
                B.AVG_AMT_PAYMENT_TOTAL_CURRENT,
                B.AVG_AMT_RECEIVABLE_PRINCIPAL,
                B.AVG_AMT_RECIVABLE,
                B.AVG_AMT_TOTAL_RECEIVABLE,
                C.SUM_CNT_DRAWINGS_ATM_CURRENT,
                C.SUM_CNT_DRAWINGS_CURRENT,
                C.SUM_CNT_DRAWINGS_OTHER_CURRENT,
                C.SUM_CNT_DRAWINGS_POS_CURRENT,
                C.SUM_CNT_INSTALMENT_MATURE_CUM,
                C.STATUS_ACTIVE,
                C.COUNT_DPD_GE1,
                C.COUNT_DPD_DEF_GE0
            FROM card_bal_p1_gp AS A
            INNER JOIN
                 card_bal_p2_gp AS B ON A.SK_ID_CURR = B.SK_ID_CURR AND A.SK_ID_PREV = B.SK_ID_PREV
            INNER JOIN
                 card_bal_p3_gp AS C ON A.SK_ID_CURR = C.SK_ID_CURR AND A.SK_ID_PREV = C.SK_ID_PREV
            '''
card_bal_cur_prev_gp = pd.read_sql(sql_sm, coerce_float=True, con=con)

# Preview the first rows and report (rows, columns) of the merged table.
display(card_bal_cur_prev_gp.head())
print(card_bal_cur_prev_gp.shape)

# Save the merged table for the SK_ID_CURR-level aggregation. index=False avoids an
# extra index column; if_exists='replace' makes the cell re-runnable (the default,
# 'fail', raises ValueError when the table already exists in the database file).
card_bal_cur_prev_gp.to_sql(name='card_bal_cur_prev_gp', con=con, index=False,
                            if_exists='replace')

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,AVG_AMT_BALANCE,MAX_AMT_CREDIT_LIMIT_ACTUAL,SUM_AMT_DRAWINGS_ATM_CURRENT,SUM_AMT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_OTHER_CURRENT,SUM_AMT_DRAWINGS_POS_CURRENT,MAX_AMT_INST_MIN_REGULARITY,AVG_AMT_PAYMENT_CURRENT,...,AVG_AMT_RECIVABLE,AVG_AMT_TOTAL_RECEIVABLE,SUM_CNT_DRAWINGS_ATM_CURRENT,SUM_CNT_DRAWINGS_CURRENT,SUM_CNT_DRAWINGS_OTHER_CURRENT,SUM_CNT_DRAWINGS_POS_CURRENT,SUM_CNT_INSTALMENT_MATURE_CUM,STATUS_ACTIVE,COUNT_DPD_GE1,COUNT_DPD_DEF_GE0
0,100006,1489396,0.0,270000,,0.0,,,0.0,,...,0.0,0.0,,0,,,0.0,1,0,0
1,100011,1843384,54482.111149,180000,180000.0,180000.0,0.0,0.0,9000.0,4843.064189,...,54433.179122,54433.179122,4.0,4,0.0,0.0,1881.0,1,0,0
2,100013,2038692,18159.919219,157500,571500.0,571500.0,0.0,0.0,7875.0,7168.34625,...,18101.079844,18101.079844,23.0,23,0.0,0.0,1666.0,1,0,1
3,100021,2594025,0.0,675000,,0.0,,,0.0,,...,0.0,0.0,,0,,,0.0,1,0,0
4,100023,1499902,0.0,225000,,0.0,,,0.0,,...,0.0,0.0,,0,,,0.0,1,0,0


(104307, 22)


## Aggregate the merged dataset at SK_ID_CURR level

In [13]:
# Collapse the merged table to one row per SK_ID_CURR.
#   * TOTAL_NUM_PREV_APPS counts the SK_ID_PREV rows grouped under each SK_ID_CURR.
#   * AVG_* columns are the unweighted mean of the per-SK_ID_PREV averages.
#   * SUM_* columns add up the per-SK_ID_PREV maxima, sums and counts.
sql_sm = '''SELECT SK_ID_CURR,
                COUNT(SK_ID_PREV) AS TOTAL_NUM_PREV_APPS,
                SUM(STATUS_ACTIVE) AS SUM_STATUS_ACTIVE,
                AVG(AVG_AMT_BALANCE) AS AVG_AMT_BALANCE,
                SUM(MAX_AMT_CREDIT_LIMIT_ACTUAL) AS SUM_MAX_AMT_CREDIT_LIMIT_ACTUAL,
                SUM(SUM_AMT_DRAWINGS_ATM_CURRENT) AS SUM_AMT_DRAWINGS_ATM_CURRENT,
                SUM(SUM_AMT_DRAWINGS_CURRENT) AS SUM_AMT_DRAWINGS_CURRENT,
                SUM(SUM_AMT_DRAWINGS_OTHER_CURRENT) AS SUM_AMT_DRAWINGS_OTHER_CURRENT,
                SUM(SUM_AMT_DRAWINGS_POS_CURRENT) AS SUM_AMT_DRAWINGS_POS_CURRENT,
                SUM(MAX_AMT_INST_MIN_REGULARITY) AS SUM_MAX_AMT_INST_MIN_REGULARITY,
                AVG(AVG_AMT_PAYMENT_CURRENT) AS AVG_AMT_PAYMENT_CURRENT,
                AVG(AVG_AMT_PAYMENT_TOTAL_CURRENT) AS AVG_AMT_PAYMENT_TOTAL_CURRENT,
                AVG(AVG_AMT_RECEIVABLE_PRINCIPAL) AS AVG_AMT_RECEIVABLE_PRINCIPAL,
                AVG(AVG_AMT_RECIVABLE) AS AVG_AMT_RECIVABLE,
                AVG(AVG_AMT_TOTAL_RECEIVABLE) AS AVG_AMT_TOTAL_RECEIVABLE,
                SUM(SUM_CNT_DRAWINGS_ATM_CURRENT) AS SUM_CNT_DRAWINGS_ATM_CURRENT,
                SUM(SUM_CNT_DRAWINGS_CURRENT) AS SUM_CNT_DRAWINGS_CURRENT,
                SUM(SUM_CNT_DRAWINGS_OTHER_CURRENT) AS SUM_CNT_DRAWINGS_OTHER_CURRENT,
                SUM(SUM_CNT_DRAWINGS_POS_CURRENT) AS SUM_CNT_DRAWINGS_POS_CURRENT,
                SUM(SUM_CNT_INSTALMENT_MATURE_CUM) AS SUM_CNT_INSTALMENT_MATURE_CUM,
                SUM(COUNT_DPD_GE1) AS SUM_DPD_GE1,
                SUM(COUNT_DPD_DEF_GE0) AS SUM_DPD_DEF_GE0
            FROM card_bal_cur_prev_gp
            GROUP BY SK_ID_CURR
            '''
card_bal_cur_gp = pd.read_sql(sql_sm, coerce_float=True, con=con)

# Preview the first rows and report (rows, columns) of the final table.
display(card_bal_cur_gp.head())
print(card_bal_cur_gp.shape)

# Save the SK_ID_CURR-level table. index=False avoids an extra index column;
# if_exists='replace' makes the cell re-runnable (the default, 'fail', raises
# ValueError when the table already exists in the database file).
card_bal_cur_gp.to_sql(name='card_bal_cur_gp', con=con, index=False, if_exists='replace')

# Commit any pending writes and release the database connection.
con.commit()
con.close()

Unnamed: 0,SK_ID_CURR,TOTAL_NUM_PREV_APPS,SUM_STATUS_ACTIVE,AVG_AMT_BALANCE,SUM_MAX_AMT_CREDIT_LIMIT_ACTUAL,SUM_AMT_DRAWINGS_ATM_CURRENT,SUM_AMT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_OTHER_CURRENT,SUM_AMT_DRAWINGS_POS_CURRENT,SUM_MAX_AMT_INST_MIN_REGULARITY,...,AVG_AMT_RECEIVABLE_PRINCIPAL,AVG_AMT_RECIVABLE,AVG_AMT_TOTAL_RECEIVABLE,SUM_CNT_DRAWINGS_ATM_CURRENT,SUM_CNT_DRAWINGS_CURRENT,SUM_CNT_DRAWINGS_OTHER_CURRENT,SUM_CNT_DRAWINGS_POS_CURRENT,SUM_CNT_INSTALMENT_MATURE_CUM,SUM_DPD_GE1,SUM_DPD_DEF_GE0
0,100006,1,1,0.0,270000,,0.0,,,0.0,...,0.0,0.0,0.0,,0,,,0.0,0,0
1,100011,1,1,54482.111149,180000,180000.0,180000.0,0.0,0.0,9000.0,...,52402.088919,54433.179122,54433.179122,4.0,4,0.0,0.0,1881.0,0,0
2,100013,1,1,18159.919219,157500,571500.0,571500.0,0.0,0.0,7875.0,...,17255.559844,18101.079844,18101.079844,23.0,23,0.0,0.0,1666.0,0,1
3,100021,1,1,0.0,675000,,0.0,,,0.0,...,0.0,0.0,0.0,,0,,,0.0,0,0
4,100023,1,1,0.0,225000,,0.0,,,0.0,...,0.0,0.0,0.0,,0,,,0.0,0,0


(103558, 22)
