In [6]:
# Report the version of the interpreter that backs this kernel. A bare
# `python` shell call is resolved through PATH and can name a different
# installation than the one executing the notebook.
import platform
print(f"Python {platform.python_version()}")

Python 3.9.13


In [8]:
import sys
!{sys.executable} -m pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
     -------------------------------------- 42.0/42.0 kB 169.8 kB/s eta 0:00:00
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl.metadata (48 kB)
     -------------------------------------- 48.2/48.2 kB 243.5 kB/s eta 0:00:00
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
   ---------------------------------------- 59.2/59.2 kB 313.7 kB/s eta 0:00:00
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 5.6/5.6 MB 568.4 kB/s eta 0:00:00
Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl (3.0 MB)
   ---------------------------------------- 3.0/3.0 MB 552.0 kB/s eta 0:00:00
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
Successfully installed pdfminer.



In [10]:
!{sys.executable} -m pip install read_pdf 

ERROR: Could not find a version that satisfies the requirement read_pdf (from versions: none)
ERROR: No matching distribution found for read_pdf


In [None]:
# Import necessary libraries
import pandas as pd
from tabula import read_pdf
import matplotlib.pyplot as plt
import seaborn as sns

# Load the PDF file into a dataframe
file_path = "MPESA_Statement_2024-01-01_to_2024-12-24_2547xxxxxx374_unlocked.pdf"

# Column layout of the detailed-transactions table, in left-to-right order.
transaction_columns = [
    "Receipt No.",
    "Completion Time",
    "Details",
    "Transaction Status",
    "Paid In",
    "Withdrawn",
    "Balance",
]

# Extract tables from the PDF
try:
    tables = read_pdf(file_path, pages="all", multiple_tables=True)
except Exception as e:
    print("Error reading the PDF file:", e)
    # Nothing below can run without `tables`; stop here with the real cause
    # instead of failing later with an unrelated NameError.
    raise
print("PDF tables successfully extracted.")

# With multiple_tables=True the result is a list of DataFrames, so a statement
# that covers many pages arrives in many pieces. Keep every piece whose width
# matches the transaction layout rather than a single hard-coded index.
# NOTE(review): assumes no other table in the PDF has exactly seven columns -
# confirm against the actual statement.
detail_tables = [table for table in tables if table.shape[1] == len(transaction_columns)]
if not detail_tables:
    raise ValueError(
        f"No table with {len(transaction_columns)} columns was found in {file_path!r}; "
        f"extracted table widths: {[table.shape[1] for table in tables]}"
    )

pieces = []
for table in detail_tables:
    piece = table.copy()  # leave the raw extraction in `tables` untouched
    piece.columns = transaction_columns
    pieces.append(piece)
transactions = pd.concat(pieces, ignore_index=True)

# Data Cleaning
# Amounts are parsed from text. Thousands separators such as "1,250.00" are
# removed first, because pd.to_numeric(..., errors="coerce") would otherwise
# turn those values into NaN and silently lose them.
for amount_column in ["Paid In", "Withdrawn", "Balance"]:
    amount_text = (
        transactions[amount_column]
        .astype(str)
        .str.replace(",", "", regex=False)
        .str.strip()
    )
    transactions[amount_column] = pd.to_numeric(amount_text, errors="coerce")
transactions["Completion Time"] = pd.to_datetime(transactions["Completion Time"], errors="coerce")

# Remove rows with missing values in critical columns.
# A row is kept when it has a timestamp and at least one of the two amounts.
# Requiring both amounts would discard every row that records money moving in
# only one direction.
# NOTE(review): presumably each statement row fills only one of
# "Paid In" / "Withdrawn" - confirm against the extracted table.
transactions = (
    transactions
    .dropna(subset=["Completion Time"])
    .dropna(subset=["Paid In", "Withdrawn"], how="all")
    .copy()  # independent frame, so later column assignments do not warn
)

# Basic Insights: overall totals of the two amount columns.
total_paid_in, total_withdrawn = (
    transactions[amount_col].sum() for amount_col in ("Paid In", "Withdrawn")
)
print(f"Total Paid In: {total_paid_in}")
print(f"Total Withdrawn: {total_withdrawn}")

# Visualizations
# 1. Transaction Trends Over Time
# Both amount columns are drawn as lines on one shared axes, against the
# completion timestamp.
trend_fig, trend_ax = plt.subplots(figsize=(12, 6))
for amount_col in ("Paid In", "Withdrawn"):
    sns.lineplot(
        data=transactions,
        x="Completion Time",
        y=amount_col,
        label=amount_col,
        ax=trend_ax,
    )
trend_ax.set_title("Transaction Trends Over Time")
trend_ax.set_xlabel("Time")
trend_ax.set_ylabel("Amount")
trend_ax.legend()
plt.show()

# 2. Distribution of Paid In and Withdrawn Amounts
# Overlaid 20-bin histograms with a KDE curve, one colour per amount column.
dist_fig, dist_ax = plt.subplots(figsize=(12, 6))
for amount_col, shade in (("Paid In", "blue"), ("Withdrawn", "red")):
    sns.histplot(
        transactions[amount_col],
        kde=True,
        label=amount_col,
        color=shade,
        bins=20,
        ax=dist_ax,
    )
dist_ax.set_title("Distribution of Paid In and Withdrawn Amounts")
dist_ax.set_xlabel("Amount")
dist_ax.set_ylabel("Frequency")
dist_ax.legend()
plt.show()

# 3. Monthly Summary
# Tag every transaction with its calendar month, then total both amount
# columns per month.
transactions["Month"] = transactions["Completion Time"].dt.to_period("M")
monthly_aggregations = dict.fromkeys(["Paid In", "Withdrawn"], "sum")
monthly_totals = transactions.groupby("Month").agg(monthly_aggregations)
monthly_summary = monthly_totals.reset_index()  # "Month" becomes a regular column again
print(monthly_summary)

# Bar Plot for Monthly Summary
# DataFrame.plot opens its own figure unless it is handed an axes. Creating
# the axes here and passing it in applies the 12x6 size to the chart and
# avoids leaving an extra empty figure behind.
fig, ax = plt.subplots(figsize=(12, 6))
monthly_summary.plot(
    x="Month",
    kind="bar",
    stacked=True,
    title="Monthly Paid In and Withdrawn Summary",
    ax=ax,
)
ax.set_ylabel("Amount")
ax.set_xlabel("Month")
plt.show()
