In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [5]:
def scrape_case_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the relevant section containing case details (this may vary based on website structure)
    cases = []
    
    # Example: Assuming cases are listed in <div class="case"> tags (you need to inspect the actual HTML)
    for case_div in soup.find_all('div', class_='case'):
        case_name = case_div.find('h2').text.strip()  # Adjust based on actual structure
        citation = case_div.find('span', class_='citation').text.strip()  # Adjust based on actual structure
        year = case_div.find('span', class_='year').text.strip()  # Adjust based on actual structure
        facts = case_div.find('p', class_='facts').text.strip()  # Adjust based on actual structure
        issues = case_div.find('p', class_='issues').text.strip()  # Adjust based on actual structure
        judgment_summary = case_div.find('p', class_='summary').text.strip()  # Adjust based on actual structure
        
        cases.append({
            'case_name': case_name,
            'citation': citation,
            'year': year,
            'facts': facts,
            'issues': issues,
            'judgment_summary': judgment_summary
        })
    
    return cases


In [6]:
def main():
    url = "https://www.indiankanoon.org/search/?formInput=constitution"  # Example URL (modify as needed)
    all_cases = scrape_case_data(url)

    # Create DataFrame and save to CSV
    df = pd.DataFrame(all_cases)
    df.to_csv('constitutional_cases_dataset.csv', index=False)
    print("Dataset saved as constitutional_cases_dataset.csv")

if __name__ == "__main__":
    main()


Dataset saved as constitutional_cases_dataset.csv


In [11]:
import requests
def get_cases(keyword, max_cases=10):
    url = f"https://api.indiankanoon.org/search/?formInput={keyword}"
    response = requests.get(url)
    # Parse response to extract case details
    return response


print(get_cases("Article 14"))

<Response [401]>


In [13]:
import requests
import pandas as pd
import time

def get_cases(keyword="constitution", max_cases=200):
    API_KEY = "eb9ac60185dc449a177c16a3f881476b16999a01"  # Replace with your API key
    headers = {"Authorization": f"Token {API_KEY}"}
    
    cases = []
    page = 1
    
    try:
        while len(cases) < max_cases:
            url = f"https://api.indiankanoon.org/search/?formInput={keyword}&pn={page}"
            response = requests.get(url, headers=headers)
            
            if response.status_code != 200:
                print(f"Error: API returned status code {response.status_code}")
                break
                
            data = response.json()
            
            if not data.get("results"):
                print("No more results found")
                break
                
            for result in data["results"]:
                cases.append({
                    "case_name": result.get("title", "N/A"),
                    "citation": result.get("citation", "N/A"),
                    "year": result.get("date", "N/A").split("-")[0] if result.get("date") else "N/A",
                    "judgment_summary": result.get("doc", "N/A")[:200] + "...",  # First 200 characters
                    "url": f"https://indiankanoon.org/doc/{result['tid']}/"
                })
                
                if len(cases) >= max_cases:
                    break
                    
            page += 1
            time.sleep(1)  # Rate limiting
            
        return pd.DataFrame(cases)
    
    except Exception as e:
        print(f"Error: {str(e)}")
        return pd.DataFrame(cases)

# Example usage
if __name__ == "__main__":
    df = get_cases(keyword="article 14", max_cases=200)
    df.to_csv("indian_constitutional_cases.csv", index=False)
    print(f"Saved {len(df)} cases to CSV")


Error: API returned status code 405
Saved 0 cases to CSV


In [15]:
import requests
import pandas as pd
import time

def get_cases(keyword="constitution", max_cases=200):
    API_KEY = "eb9ac60185dc449a177c16a3f881476b16999a01"  # Replace with your actual API key
    headers = {"Authorization": f"Token {API_KEY}"}  # Using Token Auth
    
    cases = []
    page = 0  # pagenum starts at 0
    
    try:
        while len(cases) < max_cases:
            url = f"https://api.indiankanoon.org/search/?formInput={keyword}&pagenum={page}"  # SEARCH endpoint
            response = requests.post(url, headers=headers)  # POST request
            
            if response.status_code == 404:
                print("Error 404: Check URL, parameters, or API endpoint.")
                break
            
            if response.status_code == 403:
                print("Error 403: Authentication failed. Check API key format or status.")
                break
            
            if response.status_code != 200:
                print(f"Error {response.status_code}: {response.text}")
                break
            
            data = response.json()
            
            if not data.get("docs"):  # "docs" instead of "results"
                print("No more results found.")
                break
            
            for doc in data["docs"]:  # Use "docs" key
                cases.append({
                    "case_name": doc.get("title", "N/A"),
                    "citation": doc.get("citation", "N/A"),
                    "year": doc.get("publishdate", "N/A").split("-")[0] if doc.get("publishdate") else "N/A",
                    "url": f"https://indiankanoon.org/doc/{doc['tid']}/"
                })
                
            page += 1
            time.sleep(2)  # Rate limiting
            
        return pd.DataFrame(cases)
    
    except Exception as e:
        print(f"Fatal error: {str(e)}")
        return pd.DataFrame(cases)

# Example
df = get_cases(keyword="constitution")
df.to_csv("cases.csv", index=False)


In [7]:
import mlcroissant as mlc
import pandas as pd

# Load the Croissant Dataset
croissant_dataset = mlc.Dataset('https://www.kaggle.com/datasets/kanishhkaa/legal-analysis-using-ipc-dataset/croissant/download')

# List all record sets
record_sets = croissant_dataset.metadata.record_sets
for i, rs in enumerate(record_sets):
    print(f"{i}: {rs.name}")

# Assuming you're using the first record set
record_set = record_sets[0]
print(f"\nUsing record set: {record_set.name}")

# Convert records to DataFrame
records = croissant_dataset.records(record_set=record_set.uuid)
df = pd.DataFrame(records)

# Preview the dataframe
print(df.head())

# Save to CSV
csv_filename = f"{record_set.name.replace(' ', '_')}.csv"
df.to_csv(csv_filename, index=False)
print(f"\nSaved dataset to '{csv_filename}'")


  -  [Metadata(Legal Analysis using IPC Dataset)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.


0: ipc_sections.csv

Using record set: ipc_sections.csv
                        ipc_sections.csv/Description  \
0  b'Description of IPC Section 140\nAccording to...   
1  b'Description of IPC Section 127\nAccording to...   
2  b'Description of IPC Section 128\nAccording to...   
3  b'Description of IPC Section 129\nAccording to...   
4  b'Description of IPC Section 130\nAccording to...   

                            ipc_sections.csv/Offense  \
0  b'Wearing the dress or carrying any token used...   
1  b'Receiving property taken by war or depredati...   
2  b'Public servant voluntarily allowing prisoner...   
3  b'Public servant negligently suffering prisone...   
4  b'Aiding escape of, rescuing or harbouring, su...   

                   ipc_sections.csv/Punishment ipc_sections.csv/Section  
0                  b'3 Months or Fine or Both'               b'IPC_140'  
1   b'7 Years + Fine + forfeiture of property'               b'IPC_127'  
2  b'Imprisonment for Life or 10 Years + Fine'  

In [8]:
import mlcroissant as mlc
import pandas as pd

# Load the Croissant Dataset
croissant_dataset = mlc.Dataset('https://www.kaggle.com/datasets/vyomrohila/aricles-of-indian-constitution/croissant/download')

# List all record sets
record_sets = croissant_dataset.metadata.record_sets
for i, rs in enumerate(record_sets):
    print(f"{i}: {rs.name}")

# Assuming you're using the first record set
record_set = record_sets[0]
print(f"\nUsing record set: {record_set.name}")

# Convert records to DataFrame
records = croissant_dataset.records(record_set=record_set.uuid)
df = pd.DataFrame(records)

# Preview the dataframe
print(df.head())

# Save to CSV
csv_filename = f"{record_set.name.replace(' ', '_')}.csv"
df.to_csv(csv_filename, index=False)
print(f"\nSaved dataset to '{csv_filename}'")


  -  [Metadata(Aricles of Indian Constitution)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.
  -  [Metadata(Aricles of Indian Constitution)] Property "https://schema.org/datePublished" is recommended, but does not exist.


0: indian_constitution.csv

Using record set: indian_constitution.csv


Downloading https://www.kaggle.com/api/v1/datasets/download/vyomrohila/aricles-of-indian-constitution?datasetVersionNumber=1...: 100%|██████████| 105k/105k [00:00<00:00, 224kiB/s] 


  indian_constitution.csv/Part+No. indian_constitution.csv/Article+No.  \
0                        b'Part I'                   b'\nArticle\n1\n'   
1                        b'Part I'                   b'\nArticle\n2\n'   
2                        b'Part I'                   b'\nArticle\n3\n'   
3                        b'Part I'                   b'\nArticle\n4\n'   
4                       b'Part II'                   b'\nArticle\n5\n'   

             indian_constitution.csv/Article+Heading  \
0                 b'Name and Territory of the Union'   
1        b'Admission or establishment of new States'   
2  b'Formation of new States and alteration of ar...   
3  b'Laws made under articles 2 and 3 to provide ...   
4  b'Citizenship at the commencement of the Const...   

         indian_constitution.csv/Article+Description  
0  b'\n(1) India, that is Bharat, shall be a Unio...  
1  b'\nParliament may by law admit into the Union...  
2  b'\nParliament may by law\xe2\x80\x94\n(a) for... 