In [None]:
import os
import whisper

# Directory (under the current working directory) where model files are kept
cwd_models = f"{os.getcwd()}/models"

In [None]:
# Load Whisper's "medium.en" model on the CPU, using <cwd>/models/transcribe
# as the download_root for the model files.
medium_en_model = whisper.load_model(
    "medium.en",
    device="cpu",
    download_root=f"{cwd_models}/transcribe",
)

In [None]:
# Transcribe the local audio file "test.wav" with medium_en_model and keep the result.
medium_en_result = medium_en_model.transcribe("test.wav")

In [None]:
# Show the "text" entry of the transcription result with surrounding whitespace stripped.
display(medium_en_result["text"].strip())

In [1]:
import getpass
import os
import sys

import pandas as pd
import tabula


def pdf_to_df(filename: str, pdf_password: str, root_dir: str) -> str:
    """Extract every table from a password-protected PDF and return it as CSV text.

    Despite the name, the return value is a CSV-formatted string, not a DataFrame.

    Args:
        filename: Name of the PDF inside ``<root_dir>/data/bankstatements``.
        pdf_password: Password handed to tabula to open the PDF.
        root_dir: Project root directory; it is also appended to ``sys.path``
            when not already present.

    Returns:
        CSV text (header row included, no index column) of all extracted tables
        stacked in extraction order, with entirely empty columns removed.
    """
    # Kept for backward compatibility: the function registers root_dir on sys.path.
    if root_dir not in sys.path:
        sys.path.append(root_dir)

    filepath = f"{root_dir}/data/bankstatements/{filename}"

    # Generic names for up to 20 columns; unused ones are dropped further down.
    column_list = [f"Column{i}" for i in range(20)]

    tables = tabula.read_pdf(
        input_path=filepath,
        output_format="dataframe",
        encoding="utf-8",
        password=pdf_password,
        pages="all",
        multiple_tables=True,
        lattice=True,
        guess=False,
        pandas_options={
            "names": column_list,
            "header": None,
        },
    )

    # Per table: turn carriage returns inside cells into spaces, remove commas
    # that sit between two digits ("15,000.00" -> "15000.00"), then replace any
    # remaining comma with a space so no cell contains a comma in the CSV output.
    cleaned_tables = [
        table.replace("\r", " ", regex=True)
        .replace(r"(\d),(\d)", r"\1\2", regex=True)
        .replace(",", " ", regex=True)
        for table in tables
    ]

    if cleaned_tables:
        # Concatenate once instead of growing a frame inside the loop: this is
        # linear rather than quadratic and avoids concatenating with an empty
        # seed frame.
        combined = pd.concat(cleaned_tables, axis=0, ignore_index=True)
        # Keep the ColumnN names first, followed by any unexpected extras.
        extra_columns = [c for c in combined.columns if c not in column_list]
        combined = combined.reindex(columns=column_list + extra_columns)
    else:
        # No tables were extracted: fall back to an empty frame.
        combined = pd.DataFrame(columns=column_list)

    # Drop columns with no data at all, then serialise to a CSV string.
    return combined.dropna(axis=1, how="all").to_csv(index=False)


# Specify the project root directory
root_dir = os.getcwd()
filename = "2020 - 02 - February.pdf"
# The PDF password is a secret and must not be stored in the notebook: read it
# from the BANK_PDF_PASSWORD environment variable, or prompt for it with hidden
# input when the variable is unset or empty.
pdf_password = os.environ.get("BANK_PDF_PASSWORD") or getpass.getpass(
    "Password for the bank statement PDF: "
)
csv_string = pdf_to_df(filename, pdf_password, root_dir)

In [2]:
# Build the LLM prompt: fixed instructions, then the extracted CSV text, then
# the closing formatting requirements.
promt_str = f"""
Using the data provided from a bank statement, transform the transaction details into a standardized list format.
Each transaction entry should be structured as following 3 columns: 'date, description, amount'.
Ensure description does not contain the values of amount as it would be redundant.
Here's the raw data extracted for processing:

{csv_string}

Transform this data into a clean, readable list of transactions, adhering to the specified format.
Ensure to include the negative sign for deductions from the account.
Ensure you provide the data nothing else in response"""
print(promt_str)


Using the data provided from a bank statement, transform the transaction details into a standardized list format.
Each transaction entry should be structured as following 3 columns: 'date, description, amount'.
Ensure description does not contain the values of amount as it would be redundant.
Here's the raw data extracted for processing:

Column0,Column1,Column2,Column3,Column4,Column5,Column6
Account Number 11323650810001,,,,,,
11323650920001,Current Account Personal,AED,19709.28,19709.28,,
,Total,,,19709.28,,
Accounts 475,,,,,,
05/02/2020,1304054767 MBTRF,08430239,05/02/2020,15000.00,,94461.15
11/02/2020,PG-100000000527-51712.25- UAEEXCHANGEBET,276296,11/02/2020,51712.25,,42748.90
18/02/2020,PG-100000003671-40000.00- UAEEXCHANGEBET,282391,18/02/2020,40000.00,,2748.90
22/02/2020,CREDIT CARD PAYMNT XXXXXXXXXXXX3118,45761085,22/02/2020,1201.36,,1547.54
22/02/2020,1345764824 MBTRF,88113985,22/02/2020,1547.54,,0.00
,Total,,,109461.15,0.00,
03/02/2020,I/W CLEARING CHEQUE,000004,03/02/2020

In [None]:
from langchain_community.llms import Ollama

In [None]:
# Create the LangChain Ollama LLM wrapper configured for the "gemma" model.
# NOTE(review): presumably needs a reachable Ollama server with "gemma" available — confirm.
llm = Ollama(model="gemma")

In [None]:
# Optional smoke test: uncomment to check that the model responds before sending the full prompt.
# display(llm.invoke("Tell me a joke"))

In [None]:
# Send the assembled prompt (promt_str) to the model and keep its reply in gemma_str.
gemma_str = llm.invoke(promt_str)
# Show the reply in the cell output.
display(gemma_str)

In [None]:
import pandas as pd

def _split_transaction_line(line: str):
    """Split one 'date, description, amount' line into its three fields.

    The date is the text before the first comma and the amount is the text after
    the last comma; everything in between is the description, so commas inside
    the description do not break the row. Returns a list of three stripped
    strings, or None when the line has fewer than two commas or is just the
    'date, description, amount' header.
    """
    date, first_sep, rest = line.partition(",")
    if not first_sep:
        return None
    description, last_sep, amount = rest.rpartition(",")
    if not last_sep:
        return None
    row = [date.strip(), description.strip(), amount.strip()]
    if [field.lower() for field in row] == ["date", "description", "amount"]:
        return None
    return row


# Parse the model's reply (gemma_str) rather than a hard-coded placeholder string.
lines = gemma_str.splitlines()

# Keep only lines that split into three fields. Lines with fewer than two commas
# (blank lines, code fences) are skipped; any other line with two or more commas
# is treated as a row, so a chatty preamble sentence could slip through.
data = []
for line in lines:
    row = _split_transaction_line(line)
    if row is not None:
        data.append(row)

# Create the DataFrame
df = pd.DataFrame(data, columns=["Date", "Description", "Amount"])

display(df)