In [1]:
import requests
import re
from bs4 import BeautifulSoup
import json
import csv
import pandas as pd

In [2]:
url = 'https://zambialii.org/legislation/all'
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')
script = soup.find('script', attrs={'type': 'application/json'})
json_data = json.loads(script.string)

In [3]:
print(json_data)

[{'title': 'Brands Act, 1913', 'children': [], 'citation': 'Chapter 244', 'work_frbr_uri': '/akn/zm/act/1913/12', 'repealed': False, 'year': '1913', 'taxonomies': [], 'languages': ['eng']}, {'title': 'Authentication of Documents Act, 1914', 'children': [], 'citation': 'Chapter 75', 'work_frbr_uri': '/akn/zm/act/1914/1', 'repealed': False, 'year': '1914', 'taxonomies': [], 'languages': ['eng']}, {'title': 'Lands and Deeds Registry Act, 1914', 'children': [], 'citation': 'Chapter 185', 'work_frbr_uri': '/akn/zm/act/1914/15', 'repealed': False, 'year': '1914', 'taxonomies': [], 'languages': ['eng']}, {'title': 'Cotton Act, 1914', 'children': [], 'citation': 'Chapter 227', 'work_frbr_uri': '/akn/zm/act/1914/4', 'repealed': False, 'year': '1914', 'taxonomies': [], 'languages': ['eng']}, {'title': 'Witchcraft Act, 1914', 'children': [], 'citation': 'Chapter 90', 'work_frbr_uri': '/akn/zm/act/1914/5', 'repealed': False, 'year': '1914', 'taxonomies': [], 'languages': ['eng']}, {'title': 'Pluma

In [4]:
print(json.dumps(json_data, indent=2))


[
  {
    "title": "Brands Act, 1913",
    "children": [],
    "citation": "Chapter 244",
    "work_frbr_uri": "/akn/zm/act/1913/12",
    "repealed": false,
    "year": "1913",
    "taxonomies": [],
    "languages": [
      "eng"
    ]
  },
  {
    "title": "Authentication of Documents Act, 1914",
    "children": [],
    "citation": "Chapter 75",
    "work_frbr_uri": "/akn/zm/act/1914/1",
    "repealed": false,
    "year": "1914",
    "taxonomies": [],
    "languages": [
      "eng"
    ]
  },
  {
    "title": "Lands and Deeds Registry Act, 1914",
    "children": [],
    "citation": "Chapter 185",
    "work_frbr_uri": "/akn/zm/act/1914/15",
    "repealed": false,
    "year": "1914",
    "taxonomies": [],
    "languages": [
      "eng"
    ]
  },
  {
    "title": "Cotton Act, 1914",
    "children": [],
    "citation": "Chapter 227",
    "work_frbr_uri": "/akn/zm/act/1914/4",
    "repealed": false,
    "year": "1914",
    "taxonomies": [],
    "languages": [
      "eng"
    ]
  },
  {
  

In [8]:

# Define a set of rules for filtering out headers and footers
header_rules = ["ACT", "REGULATION", "ENACTED", "Gazette", "Page", "Printed by the Government Printer", "Lusaka", "Republic of Zambia", "CHRONOLOGICAL TABLE OF THE LAWS", "TABLE OF CONTENTS","ZambiaLII","Judgments","Legislation","Document detail"]
footer_rules = ["____________________", "FIRST SCHEDULE", "SECOND SCHEDULE", "THIRD SCHEDULE", "FOURTH SCHEDULE", "FIFTH SCHEDULE", "SIXTH SCHEDULE", "SEVENTH SCHEDULE", "EIGHTH SCHEDULE", "NINTH SCHEDULE", "TENTH SCHEDULE"]

# Set the base URL
base_url = "https://zambialii.org"

# Create a list to hold the data
data = []
 
# Loop through the first 10 JSON objects
for i in range(1250):
    obj = json_data[i]
    # Build the URL
    url = base_url + obj["work_frbr_uri"]
    # Get the PDF content
    response = requests.get(url)
    # Parse the PDF content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    # Extract the text content
    content = soup.get_text().strip()  # Remove excess spaces
    # Filter out headers and footers
    lines = content.split('\n')
    content = '\n'.join(line for line in lines if not any(rule in line.upper() for rule in header_rules + footer_rules))
    # Remove excess white space
    content = ' '.join(content.split())
    # Add the content to the JSON object
    
    original_string = content # Slice the string to exclude the first 87 characters and the last 33 characters
    content = original_string[250:-1000]
    
    obj["content"] = content
    # Add the object to the list
    data.append(obj)


In [9]:
# Convert data to DataFrame
df = pd.DataFrame(json_data, columns=['title', 'citation','content'])

print(df.head(2))

                                   title     citation  \
0                       Brands Act, 1913  Chapter 244   
1  Authentication of Documents Act, 1914   Chapter 75   

                                             content  
0  e document Zambia Chapter 244 Commenced on 1 N...  
1   document Zambia Chapter 75 Commenced on 21 Fe...  


In [10]:
df.to_excel('df2.xlsx', index=False)