In [37]:
!pip install easyocr
!pip install opencv-python
!pip install python-Levenshtein



In [38]:
import json
import os
import re
import shutil
import subprocess
import zipfile
from difflib import SequenceMatcher

import cv2
import easyocr
import numpy as np
from google.colab import files

In [39]:
# Open the Colab upload dialog; the returned mapping is keyed by the uploaded file names.
uploaded = files.upload()

# Only the first uploaded name is reported here, even when several files were sent.
uploaded_names = list(uploaded.keys())
zip_path = uploaded_names[0]
print("Uploaded ZIP:", zip_path)

Saving aadhar data.rar to aadhar data (1).rar
Saving bank data.zip to bank data (1).zip
Saving pan data.zip to pan data (1).zip
Uploaded ZIP: aadhar data (1).rar


In [40]:
# Ensure one working directory per document category exists
# (exist_ok=True makes re-running this cell harmless).
for folder_name in ("aadhar_data", "pan_data", "bank_data"):
    os.makedirs(folder_name, exist_ok=True)

print("Folders created:")
print(os.listdir())

Folders created:
['.config', 'bank_data', 'bank data.zip', 'aadhar data.rar', 'aadhar data (1).rar', 'pan data (1).zip', 'pan data.zip', 'pan_data', 'bank data (1).zip', 'aadhar_data', 'sample_data']


In [41]:
# Route every uploaded archive to the folder of its document category and unpack it there.
# The category is inferred from a keyword in the (case-insensitive) file name.
for fname in uploaded.keys():
    lowered = fname.lower()
    if "aadhar" in lowered:
        target = "aadhar_data"
    elif "pan" in lowered:
        target = "pan_data"
    elif "bank" in lowered:
        target = "bank_data"
    else:
        print("Skipping:", fname)
        continue

    print(f"Extracting {fname} → {target}/")
    os.makedirs(target, exist_ok=True)

    if zipfile.is_zipfile(fname):
        with zipfile.ZipFile(fname) as archive:
            archive.extractall(target)
    elif lowered.endswith(".rar") and shutil.which("unrar"):
        # zipfile cannot read RAR archives, so delegate to the `unrar` command-line tool.
        # "x" keeps the stored directory layout, "-o+" overwrites existing files, and the
        # destination directory must end with a path separator.
        subprocess.run(["unrar", "x", "-o+", fname, target + os.sep], check=True)
    else:
        print(f"Cannot extract {fname}: unsupported archive format or missing extraction tool.")

print("Extraction complete.")

Extracting aadhar data (1).rar → aadhar_data/
Extracting bank data (1).zip → bank_data/
Extracting pan data (1).zip → pan_data/
Extraction complete.


In [42]:
# Preview at most ten entries from each extraction folder.
for label, folder in (
    ("Aadhaar files:", "aadhar_data"),
    ("PAN files:", "pan_data"),
    ("Bank files:", "bank_data"),
):
    print(label, os.listdir(folder)[:10])

Aadhaar files: ['new_generated_aadharcard_images']
PAN files: ['pan_from_internet.png', 'pan_from_internet2.png', 'pan3.png', 'pan2.png', 'pan1.png', 'pan_from_internet3.png']
Bank files: ['Bank Statement', 'Salary Slip', 'Utility', 'Check', 'ITR_Form 16']


In [43]:
# File-name suffixes (compared in lower case) that are treated as images.
valid_ext = (".jpg", ".jpeg", ".png", ".webp", ".jfif")

def collect_images(folder):
    """Return the paths of all image files found anywhere under ``folder``.

    The directory tree is walked recursively; a file is kept when its
    lower-cased name ends with one of the suffixes in ``valid_ext``.
    Each returned path is the walked directory joined with the file name.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder)
        for name in names
        if name.lower().endswith(valid_ext)
    ]

# Gather the image paths of every document category in one pass.
aadhar_images, pan_images, bank_images = [
    collect_images(folder) for folder in ("aadhar_data", "pan_data", "bank_data")
]

# Report how many images were found per category.
for label, paths in (
    ("Aadhaar images:", aadhar_images),
    ("PAN images:", pan_images),
    ("Bank images:", bank_images),
):
    print(label, len(paths))

Aadhaar images: 1000
PAN images: 6
Bank images: 426


In [44]:
def preprocess_image(img_path, size=(700, 700), threshold=150):
    """Load an image and convert it to a fixed-size black/white image for OCR.

    Args:
        img_path: Path of the image file to read.
        size: Target ``(width, height)`` handed to ``cv2.resize``. The image is
            stretched to this size, so the aspect ratio is not preserved.
        threshold: Gray level used for binarisation; pixels brighter than this
            become 255, all others 0.

    Returns:
        The thresholded single-channel image, or ``None`` when OpenCV could not
        read the file.
    """
    img = cv2.imread(img_path)

    # cv2.imread signals an unreadable/missing file by returning None instead of raising.
    if img is None:
        return None

    resized = cv2.resize(img, size)
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)

    # cv2.threshold returns (used_threshold, image); only the image is needed.
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)

    return binary


In [45]:
# Build the English OCR reader once so every document reuses the same loaded models.
# gpu=True asks EasyOCR to use a GPU when one is available.
reader = easyocr.Reader(['en'], gpu=True)

# NOTE(review): Reader stores the device it actually selected in `.device`; it does not
# appear to expose a `.gpu` attribute, so the flag is derived from the device name.
ocr_device = getattr(reader, "device", "cpu")
print("EasyOCR GPU Enabled:", str(ocr_device) != "cpu")

EasyOCR initialized. Attempting to use GPU mode.


In [46]:
def extract_text(img_path):
    """Run OCR on one image file and return the recognised text as a single string.

    The image is first passed through ``preprocess_image``; when that yields
    ``None`` an empty string is returned. Otherwise the fragments returned by
    the OCR reader are joined with single spaces.
    """
    binary_img = preprocess_image(img_path)
    if binary_img is not None:
        # detail=0 is passed so the result can be joined directly as strings.
        fragments = reader.readtext(binary_img, detail=0)
        return " ".join(fragments)
    return ""

In [47]:
def extract_pan_fields(text):
    """Extract the date of birth and PAN number from OCR text of a PAN card.

    Returns a dict with the keys ``name``, ``dob``, ``pan``, ``aadhaar`` and
    ``address``. Only ``dob`` (first ``dd/mm/yyyy`` or ``dd-mm-yyyy`` style
    match) and ``pan`` (first run of five capitals, four digits, one capital)
    are ever filled in; everything else stays ``None``.
    """
    dob_match = re.search(r"\d{2}[/-]\d{2}[/-]\d{4}", text)
    pan_match = re.search(r"[A-Z]{5}[0-9]{4}[A-Z]", text)

    # Start with every field unset, then fill in whatever was found.
    fields = dict.fromkeys(("name", "dob", "pan", "aadhaar", "address"))
    if dob_match:
        fields["dob"] = dob_match.group(0)
    if pan_match:
        fields["pan"] = pan_match.group(0)
    return fields

def extract_aadhaar_fields(text):
    aa = re.search(r"\d{4} \d{4} \d{4}", text)
    return {
        "name": None,
        "dob": re.search(r"\d{2}[/-]\d{2}[/-]\d{4}", text).group(0) if re.search(r"\d{2}[/-]\d{2}[/-]\d{4}", text) else None,
        "pan": None,
        "aadhaar": aa.group(0) if aa else None,
        "address": None
    }

def extract_bank_fields(text):
    """Extract an account number candidate from OCR text of a bank document.

    Returns a dict with the keys ``name``, ``dob``, ``pan``, ``aadhaar``,
    ``account_no`` and ``address``. Only ``account_no`` is ever filled in: it
    is the first run of 6 to 18 consecutive digits in ``text``, or ``None``
    when no such run exists.
    """
    fields = {
        key: None
        for key in ("name", "dob", "pan", "aadhaar", "account_no", "address")
    }
    account_match = re.search(r"\d{6,18}", text)
    if account_match is not None:
        fields["account_no"] = account_match.group(0)
    return fields

In [48]:
def normalize(t):
    """Canonicalise a string for fuzzy comparison.

    Falsy input (``None``, ``""``) becomes ``""``. Otherwise the text is
    lower-cased, every character other than ``a-z``, ``0-9`` and space is
    removed, and surrounding whitespace is stripped.
    """
    if t:
        lowered = t.lower()
        return re.sub(r"[^a-z0-9 ]", "", lowered).strip()
    return ""

def similarity(a, b):
    """Return how alike two values are after normalisation, from 0 to 1.

    Both inputs go through ``normalize`` first. When either normalised string
    is empty the result is 0; otherwise it is the ``SequenceMatcher`` ratio
    of the two normalised strings.
    """
    left = normalize(a)
    right = normalize(b)
    if not (left and right):
        return 0
    return SequenceMatcher(None, left, right).ratio()

In [49]:
# Default contribution of each field to the overall KYC score (the values sum to 1.0).
WEIGHTS = {
    "name": 0.30,
    "dob": 0.15,
    "pan": 0.25,
    "aadhaar": 0.20,
    "address": 0.10
}

def compute_kyc_score(extracted, form_data, weights=None):
    """Score how well OCR-extracted fields agree with the user-entered form data.

    For every field in ``weights`` the similarity between the extracted value
    and the form value is multiplied by the field's weight; the products are
    summed and rounded to three decimals.

    Args:
        extracted: Dict of fields pulled from the document (missing keys are
            treated as ``None``).
        form_data: Dict of the values the user entered (missing keys are
            treated as ``None``).
        weights: Optional mapping of field name to weight. Defaults to the
            module-level ``WEIGHTS`` so existing callers are unaffected; pass a
            custom mapping to weight a document type differently.

    Returns:
        The weighted similarity score, rounded to three decimals.
    """
    if weights is None:
        weights = WEIGHTS

    score = sum(
        weight * similarity(extracted.get(field), form_data.get(field))
        for field, weight in weights.items()
    )
    return round(score, 3)

In [50]:
# Reference values entered on the KYC form, one dict per document type.
# Fields that do not apply to a document type are left as empty strings.

# -------- Aadhaar form data --------
form_data_aadhaar = {
    "name": "john loyal",
    "dob": "01/01/1995",
    "pan": "",
    "aadhaar": "110022003300",
    "address": ""
}

# -------- PAN form data --------
form_data_pan = {
    "name": "D MANIKANDAN DURAISAMY",
    "dob": "16/07/1986",
    "pan": "BNZPM2501F",
    "aadhaar": "",
    "address": ""
}

# -------- Bank Statement form data --------
form_data_bank = {
    "name": "INDIRA SAPARE",
    "dob": "",
    "pan": "",
    "aadhaar": "",
    "address": "PLOT NO 23 RENUKA HOUSING SOCIETY YASHODA NAGAR HINGNA NAGPUR"
}

print("Form data sections created.")


Form data sections created.


In [55]:
# ============================================
# CELL 15 — PROCESS SINGLE DOCUMENT
# ============================================

def process_document(img_path, doc_type, form_data):
    """OCR one document image, extract its fields and score them against the form data.

    ``doc_type`` selects the field extractor: ``"PAN"``, ``"AADHAAR"`` or
    ``"BANK"``. Any other value yields an empty field dict.

    Returns a dict with the keys ``file`` (the image path), ``raw_text`` (the
    OCR output), ``extracted`` (the field dict) and ``score`` (the KYC score).
    """
    raw = extract_text(img_path)

    # Map each supported document type to the function that parses its text.
    extractors = {
        "PAN": extract_pan_fields,
        "AADHAAR": extract_aadhaar_fields,
        "BANK": extract_bank_fields,
    }
    extractor = extractors.get(doc_type)
    extracted = extractor(raw) if extractor is not None else {}

    return {
        "file": img_path,
        "raw_text": raw,
        "extracted": extracted,
        "score": compute_kyc_score(extracted, form_data),
    }

In [53]:
# Per-category list of process_document() outputs.
results = {"aadhar": [], "pan": [], "bank": []}

# One row per category: (progress message, results key, doc_type, image paths, form data).
batches = [
    ("Processing Aadhaar images...", "aadhar", "AADHAAR", aadhar_images, form_data_aadhaar),
    ("Processing PAN images...", "pan", "PAN", pan_images, form_data_pan),
    ("Processing Bank images...", "bank", "BANK", bank_images, form_data_bank),
]

for banner, key, doc_type, paths, form in batches:
    print(banner)
    for img in paths:
        out = process_document(img, doc_type, form)
        results[key].append(out)

print("All processing finished.")

Processing Aadhaar images...
Processing PAN images...
Processing Bank images...
All processing finished.


In [54]:
# Write one JSON file per document category, plus a combined summary of all three.
outputs = {
    "aadhar_results.json": results["aadhar"],
    "pan_results.json": results["pan"],
    "bank_results.json": results["bank"],
    "full_kyc_summary.json": results,
}

for out_name, payload in outputs.items():
    with open(out_name, "w") as f:
        json.dump(payload, f, indent=4)

print("Saved all JSON files successfully.")

# Hand the combined summary to the browser as a download (Colab helper).
files.download("full_kyc_summary.json")

Saved all JSON files successfully.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>