<a href="https://colab.research.google.com/github/joshiayush/cardmatch/blob/main/notebooks/card_doc_chain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain >/dev/null
!pip install langchain-groq >/dev/null
!pip install langchain-community >/dev/null
!pip install langchain-openai >/dev/null

In [None]:
#@title Extraction Prompt
# Prompt template sent to the LLM for every scraped credit-card page.
#
# * ``{document}`` is the only template variable; it is filled in with the
#   cleaned page text when the chain is invoked.
# * The JSON example uses doubled braces (``{{`` / ``}}``) so that they are
#   presumably rendered as literal braces by the prompt template rather than
#   being parsed as variables -- confirm against the template format in use.
# * Missing fields are requested as the *string* ``"null"`` (not JSON null),
#   so downstream code has to compare against that string.
# * The field names in the example are effectively the output schema; changing
#   any of them changes what the model returns.
credit_cards_details_extractor_prompt = """### Instructions for Large Language Model (LLM) to Extract Credit Card Data

1. **Input**:
   The input will be a scraped document about a credit card. It may contain structured or unstructured text with details about the card, its features, fees, and rewards.

2. **Output**:
   The model should return a JSON object. **If any specific detail is not found in the document, mark that field as `"null"`**.

### Example Output
If the document contains only partial information, such as card name, issuer, joining fees, and online shopping benefits, the output JSON should look like:

```json
{{
  "card_name": "SBI Card ELITE",
  "card_category": "Lifestyle",
  "card_issuer": "SBI",
  "card_network": "Visa",
  "card_co_brand_name": "null",
  "joining_fees": 4999,
  "annual_fees": 4999,
  "annual_fees_waived_off": "Spend INR 10 Lakh or more in a year to get a fee waiver",
  "card_replacement_fees": "null",
  "minimum_repayment_amount": "null",
  "cash_withdrawal_fee": "2.5% of the withdrawn amount or INR 500, whichever is higher",
  "cash_advance_fee": "null",
  "over_limit_fee": "2.5% of over-limit amount or INR 600, whichever is higher",
  "late_payment_charges": [
    "Nil for total amount due from INR 0 to INR 500",
    "INR 400 for total amount due greater than INR 500 & up to INR 1,000",
    "INR 750 for total amount due greater than INR 1,000 & up to INR 10,000",
    "INR 950 for total amount due greater than INR 10,000 & up to INR 25,000",
    "INR 1,100 for total amount due greater than INR 25,000 & up to INR 50,000",
    "INR 1,300 for total amount due greater than INR 50,000"
  ],
  "return_of_cheque_charges": "INR 500",
  "auto_debit_return_charges": [
    "INR 500 per returned transaction"
  ],
  "foreign_transaction_fee": "3.5% of the transaction amount",
  "rent_pay_transaction_fee": "null",
  "education_transaction_fee": "null",
  "utility_transaction_fee": "null",
  "fuel_transaction_fee": "null",
  "fuel_transaction_surcharge": "1% surcharge waiver on fuel transactions across all petrol pumps for transactions between INR 500 to INR 4,000, maximum waiver of INR 250 per statement cycle",
  "welcome_benefits": [
    "INR 5,000 worth of e-vouchers from Pantaloons, Hush Puppies/Bata, Aditya Birla Fashion, Shoppers Stop, and Yatra"
  ],
  "benefits_on_amazon": "null",
  "benefits_on_flipkart": "null",
  "benefits_on_myntra": "null",
  "benefits_on_messho": "null",
  "benefits_on_ola": "null",
  "benefits_on_uber": "null",
  "benefits_on_rapido": "null",
  "benefits_on_zomato": "null",
  "benefits_on_swiggy": "null",
  "benefits_on_blinkit": "null",
  "benefits_on_flipkart_grocery_minutes": "null",
  "benefits_on_zepto": "null",
  "benefits_on_big_basket": "null",
  "benefits_on_dmart": "null",
  "benefits_on_reliance_fresh": "null",
  "benefits_on_tata_star_bazzar": "null",
  "benefits_on_vishal_mega_mart": "null",
  "benefits_on_dominos": "null",
  "benefits_on_kfc": "null",
  "benefits_on_burger_king": "null",
  "benefits_on_wow_momo": "null",
  "benefits_on_mc_donalds": "null",
  "benefits_on_other_online_spends": [
    "5x Reward Points on Departmental Stores, Dining, Movies, Grocery, and International Transactions"
  ],
  "benefits_on_other_offline_spends": "null",
  "benefits_on_mobile_recharge": "null",
  "benefits_on_water_bill": "null",
  "benefits_on_gas_bill": "null",
  "benefits_on_electricity_bill": "null",
  "benefits_on_upi": "null",
  "benefits_on_insurance": "null",
  "benefits_on_govt_payments": "null",
  "benefits_on_rent_payments": "null",
  "benefits_on_fuel": [
    "1% fuel surcharge waiver"
  ],
  "benefits_on_travel": [
    "Complimentary Club Vistara Silver Membership",
    "6 Complimentary Domestic Lounge Access visits per year"
  ],
  "benefits_on_lounge_access": [
    "Complimentary Priority Pass Membership for the primary cardholder",
    "2 Complimentary International Lounge Visits per quarter through Priority Pass Program"
  ],
  "other_rewards": "null"
}}
```

### Document

{document}
"""

In [None]:
import os
import getpass

In [None]:
# Ask for each secret interactively (input is not echoed) and export it as an
# environment variable. Keyword arguments are evaluated left to right, so the
# prompts appear in the order written here.
os.environ.update(
    LANGCHAIN_API_KEY=getpass.getpass("LANGCHAIN_API_KEY: "),
    OPENAI_API_KEY=getpass.getpass("OPENAI_API_KEY: "),
)

GROQ_API_KEY: ··········
LANGCHAIN_API_KEY: ··········
OPENAI_API_KEY: ··········


In [None]:
import os
import logging
import json
import re
from typing import Dict, List

import numpy as np
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.pydantic_v1 import BaseModel, Field


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Worksheets of ``cc_urls.xlsx`` to read (presumably one per card issuer).
_cc_sheets = ["sbi", "axis", "hdfc"]


def load_cc_info() -> Dict:
    """Load per-card metadata from every configured sheet of ``cc_urls.xlsx``.

    Each sheet must have a ``card_name`` column; the remaining columns of a row
    become that card's attributes.

    Duplicate card names inside a sheet are resolved explicitly by keeping the
    last row, and the dropped names are logged. Relying on ``.T.to_dict()`` for
    this collapses duplicates silently apart from a generic pandas warning,
    which makes lost rows hard to notice.

    Returns:
        A dict mapping card name to a dict of ``{column: value}``. A card name
        that appears in several sheets takes its values from the sheet listed
        last in ``_cc_sheets``.
    """
    cc_info = {}
    for sheet in _cc_sheets:
        df = pd.read_excel("cc_urls.xlsx", sheet_name=sheet)

        # Mark every row that is superseded by a later row with the same name.
        superseded = df["card_name"].duplicated(keep="last")
        if superseded.any():
            logging.warning(
                'Sheet "%s" has duplicate card names; keeping the last row of each: %s',
                sheet,
                sorted(set(df.loc[superseded, "card_name"].astype(str))),
            )
            df = df[~superseded]

        # Transposing turns each card into a column, so to_dict() yields
        # {card_name: {column: value}}.
        cc_info.update(df.set_index("card_name").T.to_dict())
    return cc_info

In [None]:
# Read the spreadsheet once and keep the result at module level; it is the
# input to load_docs_from_urls() further down.
credit_cards_info = load_cc_info()

  cc_info.update(df.set_index("card_name").T.to_dict())
  cc_info.update(df.set_index("card_name").T.to_dict())


In [None]:
def llm_response_to_json(message: str) -> Dict:
    """Structures the model response into a uniform ``{"type", "data"}`` dict.

    Parsing is attempted in two steps:

    1. The whole message (with newlines removed) is parsed as JSON.
    2. If that fails, the contents of the first ```` ```json ```` fenced code
       block are parsed as JSON.

    Args:
        message: The raw text returned by the model.

    Returns:
        ``{"type": "json", "data": <parsed value>}`` when parsing succeeds,
        including for empty objects and arrays. Otherwise
        ``{"type": "text", "data": message}``. A message that parses to JSON
        ``null`` is also reported as text, since it carries no usable data.
    """
    data = {
        "type": None,
        "data": None,
    }

    try:
        # Attempt to parse the entire response as JSON.
        data["data"] = json.loads(message.replace("\n", ""))
        data["type"] = "json"
    except json.JSONDecodeError:
        # logging.warn is deprecated and removed in Python 3.13.
        logging.warning("Parsing failed, attempting to extract JSON from a code block.")
        try:
            fenced = message.split("```json")[1].split("```")[0].strip()
            data["data"] = json.loads(fenced)
            data["type"] = "json"
        except (IndexError, json.JSONDecodeError):
            # IndexError: no ```json fence present; JSONDecodeError: the fenced
            # text is not valid JSON either.
            logging.exception('Cannot parse JSON string "%s"', message)

    # Test for None rather than truthiness so that a valid but empty JSON
    # value ({} or []) is not misreported as plain text.
    if data["data"] is None:
        data["type"] = "text"
        data["data"] = message

    return data

In [None]:
def get_credit_card_unique_name(url: str) -> str:
    """Derive a card identifier from the final path segment of a URL.

    Any trailing slashes are ignored, so ``.../cards/foo/`` and
    ``.../cards/foo`` give the same result. A string without any slash is
    returned unchanged.

    Args:
        url (str): The input URL.

    Returns:
        str: Everything after the last '/' once trailing slashes are removed.
    """
    trimmed = url.rstrip("/")
    _, _, last_segment = trimmed.rpartition("/")
    return last_segment

In [None]:
def load_docs_from_urls(cc_info: Dict, /, *, revised: bool = False) -> List[Dict]:
    """Scrapes each card's page and extracts a structured JSON doc with the LLM.

    For every card that has a ``card_link``, the page is downloaded, its
    whitespace is normalised, and the text is sent through the extraction
    prompt. The parsed result is then enriched with metadata from ``cc_info``.

    Cards are skipped (with a warning logged) rather than aborting the whole
    batch when the page yields no document or when the model response cannot
    be parsed into a JSON object.

    Args:
        cc_info: Mapping of card name to a dict that provides at least the
            keys ``card_link``, ``card_image`` and ``tnc``.
        revised: Currently unused; kept for interface compatibility.

    Returns:
        One dict per successfully processed card. On top of the fields
        extracted by the model it carries ``card_name`` (overwritten with the
        key from ``cc_info``), ``source``, ``card_image``, ``tnc`` and
        ``unique_name``.
    """
    json_docs = []

    llm = ChatOpenAI()
    prompt_template = ChatPromptTemplate.from_template(credit_cards_details_extractor_prompt)
    chain = prompt_template | llm | StrOutputParser()

    for key, info in cc_info.items():
        card_link = info["card_link"]
        # pd.isna catches every missing-value flavour (None, any float NaN);
        # an identity check against np.nan only matches that one singleton.
        if pd.isna(card_link):
            continue

        scraped_documents = WebBaseLoader(card_link).load()
        if not scraped_documents:
            logging.warning('No document could be loaded for "%s" (%s)', key, card_link)
            continue
        page = scraped_documents[0]
        source = page.metadata["source"]

        # replace multiple new lines and multiple spaces with a single one
        document = re.sub(r'(\r\n|\r|\n){2,}', r'\n', page.page_content)
        document = re.sub(r'[ \t]+', ' ', document)

        res = chain.invoke({"document": document})

        json_doc = llm_response_to_json(res)["data"]
        if not isinstance(json_doc, dict):
            # The parser falls back to the raw text when the response is not
            # valid JSON; item assignment below would fail on anything but a
            # dict, so skip this card instead of losing the whole batch.
            logging.warning('Model response for "%s" is not a JSON object; skipping.', key)
            continue

        json_doc["card_name"] = key
        json_doc["source"] = source
        json_doc["card_image"] = info["card_image"]
        json_doc["tnc"] = info["tnc"]
        json_doc["unique_name"] = get_credit_card_unique_name(source)

        json_docs.append(json_doc)
    return json_docs

In [None]:
# Run the full scrape-and-extract pipeline. The result is not assigned; as the
# last expression of the cell it is only displayed in the notebook output.
load_docs_from_urls(credit_cards_info)

  logging.warn("Parsing failed, attempting to extract JSON from a code block.")


[{'card_name': 'SBI Card ELITE',
  'card_category': 'Lifestyle',
  'card_issuer': 'SBI',
  'card_network': 'Visa',
  'card_co_brand_name': 'null',
  'joining_fees': 4999,
  'annual_fees': 4999,
  'annual_fees_waived_off': 'Spend INR 10 Lakh or more in a year to get a fee waiver',
  'card_replacement_fees': 'null',
  'minimum_repayment_amount': 'null',
  'cash_withdrawal_fee': '2.5% of the withdrawn amount or INR 500, whichever is higher',
  'cash_advance_fee': 'null',
  'over_limit_fee': '2.5% of over-limit amount or INR 600, whichever is higher',
  'late_payment_charges': ['Nil for total amount due from INR 0 to INR 500',
   'INR 400 for total amount due greater than INR 500 & up to INR 1,000',
   'INR 750 for total amount due greater than INR 1,000 & up to INR 10,000',
   'INR 950 for total amount due greater than INR 10,000 & up to INR 25,000',
   'INR 1,100 for total amount due greater than INR 25,000 & up to INR 50,000',
   'INR 1,300 for total amount due greater than INR 50,000']