Dependencies
--

In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', None)  # Display full content of each column
pd.set_option('display.max_columns', None)   # Display all columns
pd.set_option('display.width', 5000)         # Set display width


Pdf-Csv
--

In [57]:
import pdfplumber

def pdf_to_csv(pdf_path):
    pdf = pdfplumber.open(pdf_path)

    df = pd.DataFrame()

    for i in range(len(pdf.pages)):
        table = pdf.pages[i].extract_table()
        local_df = pd.DataFrame(table[1:], columns=table[0])
        df = pd.concat([df, local_df], ignore_index=True)
    return df

    


Merge ALL CSV'S
--

In [58]:
# df1=pd.read_csv("24-25.csv")
# df2=pd.read_csv("23-24.csv")
# df3=pd.read_csv("22-23.csv")

# df = pd.concat([df1, df2, df3], ignore_index=True)

# df.to_csv("Merge.csv",index=False)
# df.columns

PATHS
--

In [62]:
pdf_path="pratham2.pdf"
Processed_output_path="Merge_Proccessed_Pratham.csv"

Segregate Columns with Mode/Name/ID/Note
--

In [77]:
import re
import pandas as pd

# List of all transaction modes
transaction_modes = [
    "UPI", "INB", "IMP", "NEFT", "RTGS", "Cheque", "Cash Deposit", "Cash Withdrawal", 
    "POS", "DD", "SWIFT", "Wire Transfer", "ECS", "Bill Pay", "M-wallet", "EMI", "EFT", "ACH"
]


pattern = re.compile(
    r"(?P<type>TO|BY) TRANSFER-(?P<method>[A-Z0-9]+)"
    r"/(?P<drcr>DR|CR)/(?P<id>\d+)/(?P<name>[^/]+)/(?P<bank>[A-Z]+)/(?P<upi_id>[^/]+)(?:/(?P<optional>[^-]+))?"
)

def extract_details(description):
    match = pattern.search(description)
    if match:
        return pd.Series({
            "Transaction_Type": match.group("type"),
            "Transaction_Mode": match.group("method"),
            "DR/CR_Indicator": match.group("drcr"),
            "Transaction_ID": match.group("id"),
            "Recipient_Name": match.group("name"),
            "Bank": match.group("bank"),
            "UPI_ID": match.group("upi_id"),
            "Note": match.group("optional") if match.group("optional") else "N/A"
        })
    else:
        # Handle unstructured descriptions by splitting on '/'
        split_data = description.split('/')
        
        # Extract Transaction Mode by checking against the list
        transaction_mode = "N/A"
        for mode in transaction_modes:
            if mode in description:
                transaction_mode = mode
                break

        return pd.Series({
            "Transaction_Type": "BY" if "BY" in description else "TO",
            "Transaction_Mode": transaction_mode,
            "DR/CR_Indicator": "N/A",
            "Transaction_ID": split_data[2] if len(split_data) > 1 else 'N/A',
            "Recipient_Name": split_data[1] if len(split_data) > 2 else 'N/A',
            "Bank": "N/A",
            "UPI_ID": split_data[1] if len(split_data) > 2 else 'N/A',
            "Note": split_data[3] if len(split_data) > 3 else 'N/A'
        })

# text = "BY TRANSFER-INBIMPS409519772968/9890160567/XX8237/Son-"
# text = "TO TRANSFER-UPI/DR/409378221768/JULFIKAR/YESB/paytmqr1jc/baker-"
text = "TRANSFER FROM 4897736162097 -UPI/CR/420866358273/SURENDRA/SBIN/surendrash/UPI"

print(extract_details(text))


Transaction_Type              TO
Transaction_Mode             UPI
DR/CR_Indicator               DR
Transaction_ID      409378221768
Recipient_Name          JULFIKAR
Bank                        YESB
UPI_ID                paytmqr1jc
Note                       baker
dtype: object


Csv-Organized
--

In [75]:
df=pdf_to_csv(pdf_path)
df = df.replace('\n', '', regex=True)

df = df.rename(columns={
    'Txn Date': 'Transaction_Date',
    # 'Date': 'Transaction_Date',
    
    'Value\nDate': 'Value Date',
    'Description': 'Description',
    # 'Details': 'Description',
    
    'Ref No./Cheque\nNo.': 'Reference No./Cheque No.',
    'Debit': 'Debit',
    'Credit': 'Credit',
    'Balance': 'Balance'
})

df=df.drop(columns=['Value Date'])
df_extracted = df['Description'].apply(extract_details)

df = pd.concat([df, df_extracted], axis=1)
df


Unnamed: 0,Transaction_Date,Description,Ref No./Cheque\nNo,Debit,Credit,Balance
0,26 Jul 2024,TRANSFER FROM 4897736162097 -UPI/CR/420866358273/SURENDRA/SBIN/surendrash/UPI,,-,2200.00,2275.60
1,26 Jul 2024,TRANSFER TO 4897694162092 -UPI/DR/420825709170/IRCTC_AP/YESB/paytm-7620/NA,,1094.05,-,1181.55
2,26 Jul 2024,TRANSFER TO 4897694162092 -UPI/DR/420844303878/IRCTC_AP/YESB/paytm-7620/NA,,1084.05,-,97.50
3,29 Jul 2024,TRANSFER FROM 4698314162098 -INB IMPS/421111414631/CNB-XX919-SHIV KUM/IMPS -MAK000132691034,,-,2500.00,2597.50
4,29 Jul 2024,TRANSFER TO 4897690162095 -UPI/DR/421120542966/NAMRATA/AIRP/pinelabs.1/Gener,,764.00,-,1833.50
...,...,...,...,...,...,...
144,21 Nov 2024,TRANSFER TO 4897694162092 -UPI/DR/432626555998/SANTOSH/FDRL/bharatpe.9/Pay t,,80.00,-,1602.33
145,21 Nov 2024,TRANSFER TO 4897694162092 -UPI/DR/432629300337/MumbaiM/YESB/paytm-8736/NA,,10.00,-,1592.33
146,21 Nov 2024,TRANSFER TO 4897694162092 -UPI/DR/432629467353/MahaMum/ICIC/mmmocl.zkp/NA,,20.00,-,1572.33
147,22 Nov 2024,TRANSFER TO 4897695162091 -UPI/DR/432736503212/Chalo/ICIC/chalo920.r/PayviaRa,,13.00,-,1559.33


Converting Dtypes
--

In [None]:
df['Transaction_Date'] = df['Transaction_Date'].astype(str)

# Fix spacing issue in "27 Mar2023" format
df['Transaction_Date'] = df['Transaction_Date'].str.replace(
    r'(\d{1,2})\s*([A-Za-z]{3})(\d{4})', r'\1 \2 \3', regex=True
)

# Convert to datetime, handling mixed formats
df['Transaction_Date'] = pd.to_datetime(df['Transaction_Date'], format='mixed', dayfirst=True)


df['Debit'] = pd.to_numeric(df['Debit'].replace({',': '', 'nan': None, ' ': ''}, regex=True), errors='coerce')
df['Credit'] = pd.to_numeric(df['Credit'].replace({',': '', 'nan': None, ' ': ''}, regex=True), errors='coerce')
df['Balance'] = pd.to_numeric(df['Balance'].replace({',': '', 'nan': None, ' ': ''}, regex=True), errors='coerce')

categorical_cols = ['Transaction_Type', 'Transaction_Mode', 'DR/CR_Indicator', 
                    'Recipient_Name', 'Bank', 'UPI_ID', 'Note']

for col in categorical_cols:
    df[col] = df[col].astype('category')


df['Reference No./Cheque No.'] = df['Reference No./Cheque No.'].astype(str)
df['Transaction_ID'] = df['Transaction_ID'].astype(str)


In [68]:
df

Unnamed: 0,Transaction_Date,Description,Ref No./Cheque\nNo,Debit,Credit,Balance,Transaction_Type,Transaction_Mode,DR/CR_Indicator,Transaction_ID,Recipient_Name,Bank,UPI_ID,Note
0,26 Jul 2024,TRANSFER FROM 4897736162097 -UPI/CR/420866358273/SURENDRA/SBIN/surendrash/UPI,,-,2200.00,2275.60,TO,UPI,,420866358273,CR,,CR,SURENDRA
1,26 Jul 2024,TRANSFER TO 4897694162092 -UPI/DR/420825709170/IRCTC_AP/YESB/paytm-7620/NA,,1094.05,-,1181.55,TO,UPI,,420825709170,DR,,DR,IRCTC_AP
2,26 Jul 2024,TRANSFER TO 4897694162092 -UPI/DR/420844303878/IRCTC_AP/YESB/paytm-7620/NA,,1084.05,-,97.50,TO,UPI,,420844303878,DR,,DR,IRCTC_AP
3,29 Jul 2024,TRANSFER FROM 4698314162098 -INB IMPS/421111414631/CNB-XX919-SHIV KUM/IMPS -MAK000132691034,,-,2500.00,2597.50,TO,INB,,CNB-XX919-SHIV KUM,421111414631,,421111414631,IMPS -MAK000132691034
4,29 Jul 2024,TRANSFER TO 4897690162095 -UPI/DR/421120542966/NAMRATA/AIRP/pinelabs.1/Gener,,764.00,-,1833.50,TO,UPI,,421120542966,DR,,DR,NAMRATA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,21 Nov 2024,TRANSFER TO 4897694162092 -UPI/DR/432626555998/SANTOSH/FDRL/bharatpe.9/Pay t,,80.00,-,1602.33,TO,UPI,,432626555998,DR,,DR,SANTOSH
145,21 Nov 2024,TRANSFER TO 4897694162092 -UPI/DR/432629300337/MumbaiM/YESB/paytm-8736/NA,,10.00,-,1592.33,TO,UPI,,432629300337,DR,,DR,MumbaiM
146,21 Nov 2024,TRANSFER TO 4897694162092 -UPI/DR/432629467353/MahaMum/ICIC/mmmocl.zkp/NA,,20.00,-,1572.33,TO,UPI,,432629467353,DR,,DR,MahaMum
147,22 Nov 2024,TRANSFER TO 4897695162091 -UPI/DR/432736503212/Chalo/ICIC/chalo920.r/PayviaRa,,13.00,-,1559.33,TO,UPI,,432736503212,DR,,DR,Chalo


Handling NAN
--

In [None]:
df['Debit'] = pd.to_numeric(df['Debit'], errors='coerce').fillna(0)
df['Credit'] = pd.to_numeric(df['Credit'], errors='coerce').fillna(0)

def determine_dr_cr(df):
    for i in range(1,len(df)):  
        
        balance_diff = df.loc[i, 'Balance'] - df.loc[i-1, 'Balance']
        
        if balance_diff > 0:
            df.loc[i, 'DR/CR_Indicator'] = 'CR'
        else:
            df.loc[i, 'DR/CR_Indicator'] = 'DR'
    return df

def update_transaction_mode(row):
    if 'ATM' in row['Description']:
        row['Transaction_Mode'] = 'ATM'
    return row

df = df.apply(update_transaction_mode, axis=1)

df = determine_dr_cr(df)


In [4]:

df=pd.read_csv("Merge_Proccessed.csv")
# df.columns
# df.to_csv("Merge_Proccessed.csv",index=False)


Features
--

In [6]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype

def feature_engineering(df):
    # Replace spaces with underscores for all column names,
    # but ensure that the date column becomes 'Transaction_Date'
    new_cols = {}
    for col in df.columns:
        if col.strip().lower() == 'transaction date':
            new_cols[col] = 'Transaction_Date'
        else:
            new_cols[col] = col.strip().replace(" ", "_")
    df.rename(columns=new_cols, inplace=True)
    
    # Ensure 'Transaction_Date' is in datetime format
    df['Transaction_Date'] = pd.to_datetime(df['Transaction_Date'])
    
    # Define an ordered categorical type for weekdays (Monday to Sunday)
    cat_type = CategoricalDtype(categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], ordered=True)
    
    # Extract time-based features from Transaction_Date
    df['dayofweek'] = df['Transaction_Date'].dt.dayofweek  # 0=Monday, 6=Sunday
    df['weekday'] = df['Transaction_Date'].dt.day_name().astype(cat_type)
    df['quarter'] = df['Transaction_Date'].dt.quarter
    df['month'] = df['Transaction_Date'].dt.month
    df['year'] = df['Transaction_Date'].dt.year
    df['dayofyear'] = df['Transaction_Date'].dt.dayofyear
    df['dayofmonth'] = df['Transaction_Date'].dt.day
    df['weekofyear'] = df['Transaction_Date'].dt.isocalendar().week
    df['weekofmonth'] = df['Transaction_Date'].apply(lambda x: (x.day - 1) // 7 + 1)
    
    # Create a custom date offset column for seasonal analysis
    df['date_offset'] = (df['Transaction_Date'].dt.month * 100 + df['Transaction_Date'].dt.day - 320) % 1300
    
    # Flag weekend transactions (1 if Saturday or Sunday, else 0)
    df['is_weekend'] = df['dayofweek'].apply(lambda x: 1 if x >= 5 else 0)
    
    # Flags for start/end of month and quarter
    df['is_month_start'] = df['Transaction_Date'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['Transaction_Date'].dt.is_month_end.astype(int)
    df['is_quarter_start'] = df['Transaction_Date'].dt.is_quarter_start.astype(int)
    df['is_quarter_end'] = df['Transaction_Date'].dt.is_quarter_end.astype(int)
    
    # Create unified transaction amount column using 'Debit' if available; otherwise, use 'Credit'
    df['transaction_amount'] = df['Debit'].fillna(df['Credit'])
    
    # Flag large transactions (threshold > 1000)
    df['is_large_transaction'] = df['transaction_amount'].apply(lambda x: 1 if x > 1000 else 0)
    
    # Calculate transaction counts per day, week, and month using Transaction_Date and Transaction_ID
    df['transaction_count_per_day'] = df.groupby(df['Transaction_Date'].dt.date)['Transaction_ID'].transform('count')
    df['transaction_count_per_week'] = df.groupby(df['weekofyear'])['Transaction_ID'].transform('count')
    df['transaction_count_per_month'] = df.groupby(df['month'])['Transaction_ID'].transform('count')
    
    # Compute average transaction amount per day
    df['average_transaction_per_day'] = df.groupby(df['Transaction_Date'].dt.date)['transaction_amount'].transform('mean')
    
    # Compute cumulative spending per day (summing Debit amounts)
    df['cumulative_spent_per_day'] = df.groupby(df['Transaction_Date'].dt.date)['Debit'].cumsum()
    
    # Sort by Transaction_Date and calculate time since the last transaction in hours
    df = df.sort_values(by=['Transaction_Date'])
    df['time_since_last_transaction'] = df['Transaction_Date'].diff().dt.total_seconds() / 3600
    df['time_since_last_transaction'].fillna(0, inplace=True)
    
    # Rolling averages for transaction_amount over the past 7 and 30 days
    df['rolling_avg_transaction_7d'] = df['transaction_amount'].rolling(window=7, min_periods=1).mean()
    df['rolling_avg_transaction_30d'] = df['transaction_amount'].rolling(window=30, min_periods=1).mean()
    
    return df

# Example Usage:
# df = pd.read_csv("Merge_Proccessed.csv")
df = feature_engineering(df)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['time_since_last_transaction'].fillna(0, inplace=True)


In [7]:
import dtale

# dtale.show(df)

In [11]:
# df

Save Processed CSV
--

In [9]:
# df.to_csv("Features.csv", index=False)


In [10]:
df.columns

Index(['Transaction_Date', 'Description', 'Reference_No./Cheque_No.', 'Debit', 'Credit', 'Balance', 'Transaction_Type', 'Transaction_Mode', 'DR/CR_Indicator', 'Transaction_ID', 'Recipient_Name', 'Bank', 'UPI_ID', 'Note', 'dayofweek', 'weekday', 'quarter', 'month', 'year', 'dayofyear', 'dayofmonth', 'weekofyear', 'weekofmonth', 'date_offset', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'transaction_amount', 'is_large_transaction', 'transaction_count_per_day', 'transaction_count_per_week', 'transaction_count_per_month', 'average_transaction_per_day', 'cumulative_spent_per_day', 'time_since_last_transaction', 'rolling_avg_transaction_7d', 'rolling_avg_transaction_30d'], dtype='object')

Print N/A
--

In [26]:
na_count_per_column = df.applymap(lambda x: x == "N/A").sum()

# Print the result
print("Number of 'N/A' values per column:")
# print(na_count_per_column)

Number of 'N/A' values per column:



DataFrame.applymap has been deprecated. Use DataFrame.map instead.



Print Everthing
--

In [11]:
def print_transaction_details(df):
    """
    Prints every row with all the extracted details from the DataFrame.
    """
    for index, row in df.iterrows():
        print(f"Row {index}:")
        for col in df.columns:
            print(f"  {col}: {row[col]}")
        print("-" * 50)  # Separator for better readability
# print_transaction_details(df)
        

Holidays
--

In [None]:
import holidays
import pandas as pd

indian_holidays = holidays.India(years=2024)

df = pd.DataFrame(indian_holidays.items(), columns=["Date", "Holiday"])
df["Date"] = pd.to_datetime(df["Date"])

df
