# Classifying Sticky, Active, Inactive, and Dormant Accounts

Classify Sticky, Active, Inactive and Dormant accounts


## Code Explanation
1. Import necessary libraries:
```python
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta
```


In [0]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta


# Percent-encoding helper for the credentials embedded in the connection URL.
from urllib.parse import quote

# Load the database credentials from the workspace JSON file.
with open('/Workspace/Credentials/db_data.json', 'r') as fp:
    credentials = json.load(fp)

# Connection details are read from the 'redshift' entry; a missing key raises
# KeyError here rather than failing later with a malformed URL.
redshift_credentials = credentials['redshift']
host = redshift_credentials['host']
user = redshift_credentials['user']
passwd = redshift_credentials['passwd']
database = redshift_credentials['database']

# Percent-encode the user name and password before placing them in the URL:
# characters such as '@', ':', '/' or '%' would otherwise be read as URL
# delimiters and the engine would connect with the wrong credentials or host.
# safe='' makes sure '/' is encoded as well.
conn_string = (
    f"postgresql+psycopg2://{quote(str(user), safe='')}:{quote(str(passwd), safe='')}"
    f"@{host}:5439/{database}"
)
conn = create_engine(conn_string)

# Show floats with two decimal places when DataFrames are displayed.
pd.set_option('display.float_format', '{:.2f}'.format)

# Take ONE snapshot of the current time and derive every date string from it,
# so the values cannot disagree when the cell runs across a second/midnight
# boundary.
run_ts = pd.Timestamp.today()
one_day_ago = run_ts - pd.Timedelta(days=1)

# Calendar dates (YYYY-MM-DD) for today, yesterday and 14 days ago.
today = run_ts.strftime('%Y-%m-%d')
yesterday = one_day_ago.strftime('%Y-%m-%d')
last_2_weeks = (run_ts - pd.Timedelta(days=14)).strftime('%Y-%m-%d')

print('------------------------------------')
print(last_2_weeks)

print('\n')

# Full timestamps for the current run.
now = run_ts.strftime('%Y-%m-%d %H:%M:%S')
# NOTE(review): despite their names, these two values are the timestamp from
# ONE DAY ago (Timedelta(days=1)), not 30 minutes ago. The existing window is
# kept as-is; confirm which one is intended.
last_30_mins = one_day_ago.strftime('%Y-%m-%d %H:%M:%S')
trunc_last_30_mins = one_day_ago.strftime('%Y-%m-%d %H:%M')

print(f"{last_30_mins} to {now}")


## Classifying Sticky, Active, Inactive and Dormant Accounts

The code snippet below defines the conditions for two calculated fields: `sticky` and `account_status`.

1. `sticky`: It categorizes clients based on recent transaction activity. If a client has had a transaction within the last 30 days, they are labeled as 'sticky'; otherwise, they are 'non sticky'.

2. `account_status`: It classifies clients based on their transaction history. If a client's most recent transaction was within the last 90 days, they are 'active'. If it was between 91 and 180 days ago, they are 'inactive'. Otherwise, they are labeled as 'dormant'. Only transactions from the last 12 months are considered by the query.

In [0]:

# Per-client classification query.
#
# For every client in dwh_all_clients that is not 'closed', aggregate the
# transactions of type '1' or '2' from the last 12 months and derive:
#   * sticky          - 'sticky' when the latest transaction is <= 30 days old
#   * account_status  - 'active' (<= 90 days), 'inactive' (91-180 days),
#                       otherwise 'dormant'
#   * recency, age in months since activation, first/last transaction dates,
#     transaction count, total amount, average running balance and run date.
#
# The former "LEFT JOIN dwh_all_accounts" was removed: none of its columns were
# used, and because it was joined only on client_id it repeated every
# transaction row once per account, inflating COUNT(transaction_id) and
# SUM(amount) for clients holding more than one account.
#
# NOTE(review): the WHERE clause filters on dat.* columns, which discards the
# NULL-extended rows of the LEFT JOIN. Clients with no qualifying transaction
# in the last 12 months are therefore absent from the result instead of being
# reported as 'dormant'. If they should be included, move the two dat.*
# predicates into the JOIN's ON clause - confirm the intended behaviour.
query = '''
SELECT
    dac.client_id,
    CASE
        WHEN COUNT(dat.transaction_id) > 0 AND DATEDIFF(DAY, MAX(dat.transaction_date), CURRENT_DATE) <= 30 THEN 'sticky'
        ELSE 'non sticky'
    END AS sticky,
    CASE
        WHEN DATEDIFF(DAY, MAX(dat.transaction_date), CURRENT_DATE) <= 90 THEN 'active'
        WHEN DATEDIFF(DAY, MAX(dat.transaction_date), CURRENT_DATE) BETWEEN 91 AND 180 THEN 'inactive'
        ELSE 'dormant'
    END AS account_status,
    DATEDIFF(DAY, MAX(dat.transaction_date), CURRENT_DATE) AS days_since_last_transaction,
    DATEDIFF(MONTH, MIN(dac.activation_date), CURRENT_DATE) AS age_banked_in_months,
    MIN(dat.transaction_date) AS first_transaction_date,
    MAX(dat.transaction_date) AS last_transaction_date,
    COUNT(dat.transaction_id) AS transaction_volume,
    SUM(dat.amount) AS total_transaction_value,
    AVG(dat.running_balance) AS avg_bal,
    CURRENT_DATE AS run_date
FROM
    dwh_all_clients dac
LEFT JOIN dwh_all_transactions dat ON dac.client_id = dat.client_id
WHERE
    dat.transaction_type_enum IN ('1', '2')
    AND dac.client_status != 'closed'
    AND dat.transaction_date >= DATEADD(MONTH, -12, CURRENT_DATE)
GROUP BY
    dac.client_id

'''

# Run the query through the SQLAlchemy engine and load the result set into a
# pandas DataFrame (one row per client_id).
query_data = pd.read_sql_query(query, conn)

# Preview the first rows in the notebook output.
query_data.head()



The code snippet below uses PySpark to convert the pandas DataFrame returned by the query into a Spark DataFrame and save it as a table in Databricks.



In [0]:

from pyspark.sql import SparkSession

# Render the query result in the notebook before it is written anywhere.
df = query_data
display(df)

# Reuse the running SparkSession, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Build a Spark DataFrame from the pandas DataFrame.
spark_df = spark.createDataFrame(df)

# Save the Spark DataFrame as a table; mode "overwrite" replaces whatever the
# table currently holds.
(
    spark_df.write
    .mode("overwrite")
    .saveAsTable("vfd_databricks.default.active_dormancy")
)


In [0]:

%%time

# Append the query result to the "dwh_active_dormancy" table through `conn`.
# Rows are sent in batches of 5000 using multi-row INSERT statements
# (method="multi"); the DataFrame index is not written.
query_data.to_sql(
    name="dwh_active_dormancy",
    con=conn,
    index=False,
    if_exists="append",
    chunksize=5000,
    method="multi",
)



In [0]:
# Report completion. The f-prefix is required so that the run timestamp held in
# `now` is interpolated; without it the literal text "{now}" was printed.
print(f"run completed successfully on {now} \n")