# Converting PDF to Txt
This codebook converts all judgment PDFs into TXT format.

Judgments on the [eCourts portal](https://ecourts.gov.in/ecourts_home/) are available as PDFs, which are often not machine-readable. They therefore have to be converted into TXT first (here, using OCR) before they can be fed into NLP models.

In [1]:
#Import required libraries
import glob
import os
import shutil
import tempfile

import pandas as pd
import pytesseract
from natsort import natsorted
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance, ImageFilter
from tqdm import tqdm

In [5]:
# Walk the data directory recursively and collect the path of every file
# whose name ends with ".pdf" (case-sensitive).
path = "Data"
judgement_paths_assam = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(path)
    for name in names
    if name.endswith(".pdf")
]

# Keep only the PDFs that are judgements: those with "judgment" mentioned
# (in any letter case) in their path. Note the check runs against the full
# path, so a matching directory name also selects the PDFs inside it.
judgements = [
    pdf_path
    for pdf_path in judgement_paths_assam
    if 'judgment' in pdf_path.lower()
]

In [6]:
# Number of judgment PDFs selected for conversion
len(judgements)

1162

In [None]:
# Convert Judgment PDFs into txts
#
# For every judgment PDF: render its pages to images, OCR each page with
# Tesseract, and write the concatenated text to "<case_id>.txt" in the same
# folder as the PDF. All intermediate images live in a per-PDF temporary
# directory, so nothing in the current working directory is read or deleted.

## Case ids that have already been converted in this run; a PDF whose
## case id is already in this list is skipped.
converted_cases = []

for judgement in tqdm(judgements):
    # Split the path portably into its folder and file name, then take the
    # case_id as the part of the file name before the first underscore.
    folder_path, file_name = os.path.split(judgement)
    case_id = file_name.split('_')[0]

    if case_id in converted_cases:
        continue

    page_texts = []
    # The temporary directory is created fresh for each PDF and removed
    # automatically when the block exits, even if conversion or OCR raises.
    with tempfile.TemporaryDirectory() as work_dir:
        # Render the pages at 350 DPI; output_folder makes pdf2image keep the
        # rendered pages on disk instead of holding them all in memory.
        pages = convert_from_path(judgement, 350, output_folder=work_dir)

        for page_number, page in enumerate(pages, start=1):
            # Save the page as a JPEG and OCR that file.
            image_path = os.path.join(work_dir, "Page_{}.jpg".format(page_number))
            page.save(image_path, "JPEG")
            # Release the handle on the rendered page so the temporary
            # directory can be removed cleanly afterwards.
            page.close()

            with Image.open(image_path) as page_image:
                page_texts.append(pytesseract.image_to_string(page_image))

    # Save the entire text extracted, in page order, next to the source PDF.
    # An explicit encoding keeps non-ASCII OCR output from failing on
    # platforms whose default encoding is not UTF-8.
    txt_path = os.path.join(folder_path, case_id + '.txt')
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(''.join(page_texts))

    converted_cases.append(case_id)

All 1162 POCSO judgment PDFs have been converted into TXT files and saved in the same directories as the source PDFs.