In [1]:
from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
import requests
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import io
from collections import Counter
import os
import re
import time
import json

In [2]:
# Flask application with cross-origin requests enabled for every route.
app = Flask(__name__)
CORS(app)

# Uploaded PDFs are written here; the directory is created at import time.
app.config['UPLOAD_FOLDER'] = '/tmp/uploads'
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

# SECURITY: credentials should not live in source control. The environment
# variables GPT4V_KEY / GPT4V_ENDPOINT take precedence when set (and
# non-empty); the literals below remain only as a backward-compatible
# fallback and the key should be rotated and removed from the source.
GPT4V_KEY = os.environ.get("GPT4V_KEY") or "32a17b4382f644b48aac5d0ede6f0ac0"
GPT4V_ENDPOINT = os.environ.get("GPT4V_ENDPOINT") or (
    "https://finease.openai.azure.com/openai/deployments/finease-2/"
    "chat/completions?api-version=2024-02-15-preview"
)

# Global variables (module-level state shared by the request handlers)
transactions = []        # raw text lines extracted from the uploaded PDFs
split_transactions = []  # structured transactions returned by the model
uploaded_files = []      # sanitized names of the files saved on upload
text = ""                # statement text read by extract_format()


In [None]:
# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    A page whose ``extract_text()`` result is falsy (``None`` or an empty
    string) contributes an empty string, so the result is always a ``str``.
    """
    pages = PdfReader(pdf_path).pages
    return "".join(page.extract_text() or "" for page in pages)


# Extract format of bank statement
def extract_format():
    """Return the first line of the module-level ``text`` containing "Date".

    Lines are delimited by ``'\\n'`` only. The matching line is presumably
    the statement's column-header row (assumption - verify against real
    statements). Returns an empty string when no line contains "Date".
    """
    global text
    for line in text.split('\n'):
        if 'Date' in line:
            return line
    return ""

# Parse transactions from the extracted statement text
def parse_transactions(text):
    """Split ``text`` on ``'\\n'`` and return every resulting line.

    NOTE: no keyword filtering is applied. A filter that kept only lines
    containing 'UPI/', 'POS/', 'IMPS/', 'NEFT/' or 'RTGS/' is currently
    disabled, so all lines (including empty ones) are returned.
    """
    return text.split('\n')

# Upload PDF and extract transactions
@app.route('/upload_pdf', methods=['POST'])
def upload_pdf():
    """Save the uploaded PDFs, extract their text and return split transactions.

    Expects a multipart request with one or more files under the ``files``
    field. Each file whose name ends in ``.pdf`` (case-insensitive) is saved
    to ``UPLOAD_FOLDER`` and its text lines are appended to the module-level
    ``transactions``. The first ten lines are then sent once to
    ``get_split_transactions`` and the result is stored in
    ``split_transactions``.

    Returns:
        ``({'transactions': [...]}, 200)`` on success,
        ``({'error': ...}, 400)`` when the ``files`` field is missing or empty,
        ``({'error': ...}, 500)`` on any other failure.
    """
    # ``text`` must be declared global: extract_format() reads the
    # module-level value, so assigning a local here would leave it empty.
    global transactions, split_transactions, uploaded_files, text
    try:
        if 'files' not in request.files:
            return jsonify({'error': 'No files part in the request'}), 400

        files = request.files.getlist('files')
        if not files:
            return jsonify({'error': 'No file selected'}), 400

        # Reset the shared state so each upload starts from scratch.
        transactions = []
        split_transactions = []
        uploaded_files = []
        extracted_texts = []

        for file in files:
            if not file or not file.filename:
                continue
            if not file.filename.lower().endswith('.pdf'):
                continue

            filename = secure_filename(file.filename)
            if not filename:
                # secure_filename() can strip a name down to nothing.
                continue

            file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(file_path)
            uploaded_files.append(filename)

            pdf_text = extract_text_from_pdf(file_path)
            extracted_texts.append(pdf_text)
            transactions.extend(parse_transactions(pdf_text))

        # Publish the statement text before the model call so that
        # extract_format() can find the header line.
        text = "\n".join(extracted_texts)

        # Call the remote model once, after all files are processed,
        # instead of once per uploaded file.
        if transactions:
            first_few_transactions = transactions[:10]
            split_transactions = get_split_transactions(first_few_transactions)

        return jsonify({'transactions': split_transactions}), 200

    except Exception as e:
        # Top-level boundary of the request handler: report as a 500.
        return jsonify({'error': str(e)}), 500
    


def get_split_transactions(transaction_texts):
    """Ask the chat-completions endpoint to structure raw transaction lines.

    Args:
        transaction_texts: iterable of strings, one raw statement line each.

    Returns:
        The list parsed from the model's reply. The prompt requests one dict
        per transaction with the keys Date, Transaction, Amount and Balance;
        the reply's contents are not validated here.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        ValueError: if the reply contains no bracketed list or the list
            cannot be parsed.
    """
    import ast  # local import: only needed for the Python-literal fallback

    headers = {
        "Content-Type": "application/json",
        "api-key": GPT4V_KEY,
    }

    transactions_text = "\n".join(transaction_texts)
    statement_format = extract_format()

    payload = {
        "messages": [
            {
                "role": "user",
                "content": f"""Return me a python list of dictionaries for each of the transaction, 
                for each transaction the dictionary keys should be Date(STRING), Transaction(STRING), Amount(FLOAT), Balance(FLOAT), 
                Make amount as negative float if it is withdrawal (JUST GIVE THE LIST, NO EXTRA TEXT):\n\n{transactions_text}, 
                the transaction key should display the transaction information
                the format of the text is: {statement_format}"""
            }
        ],
        "temperature": 0.5,
        "top_p": 1,
        "max_tokens": 1000
    }

    response = requests.post(GPT4V_ENDPOINT, headers=headers, json=payload, timeout=30)
    response.raise_for_status()
    result = response.json()['choices'][0]['message']['content'].strip()

    # Take everything from the first '[' to the LAST ']' so that brackets
    # inside a transaction description do not truncate the list.
    start = result.find('[')
    end = result.rfind(']')
    if start == -1 or end < start:
        raise ValueError("Failed to extract the list from response")
    list_text = result[start:end + 1].strip()

    try:
        return json.loads(list_text)
    except json.JSONDecodeError:
        # The prompt asks for a *Python* list, so the reply may use single
        # quotes or other literal syntax that is not valid JSON.
        pass

    try:
        parsed = ast.literal_eval(list_text)
    except (ValueError, SyntaxError, TypeError) as exc:
        raise ValueError("Failed to parse the list from response") from exc
    if not isinstance(parsed, list):
        raise ValueError("Failed to extract the list from response")
    return parsed

