In [1]:
import os
import pandas as pd
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Extract all text from a PDF file using fitz."""
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
    except Exception as e:
        # Since error handling isn't necessary, we simply return empty text
        text = ""
    return text

def create_dataframe_from_folder(base_folder):
    # List to store each case as a row dictionary
    rows = []
    
    # Iterate over all items in the base_folder
    for case in os.listdir(base_folder):
        case_folder = os.path.join(base_folder, case)
        
        # Check if it is indeed a folder
        if os.path.isdir(case_folder):
            # Initialize the row with case name
            row = {"Case": case, "Answer": None, "Complaint": None}
            
            # Build the expected PDF paths
            answer_pdf_path = os.path.join(case_folder, "Answer.pdf")
            complaint_pdf_path = os.path.join(case_folder, "Complaint.pdf")
            
            # Extract and store text if file exists, otherwise assign NaN
            if os.path.exists(answer_pdf_path):
                row["Answer"] = extract_text_from_pdf(answer_pdf_path)
            else:
                row["Answer"] = None
                
            if os.path.exists(complaint_pdf_path):
                row["Complaint"] = extract_text_from_pdf(complaint_pdf_path)
            else:
                row["Complaint"] = None
                
            rows.append(row)
            
    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(rows)
    return df

# Example usage:
if __name__ == "__main__":
    base_folder = "Complaints/"  # change this to your folder's path
    df = create_dataframe_from_folder(base_folder)
    print(df)
    # Optionally, save the DataFrame to a CSV file
    df.to_csv("case_complaints.csv", index=False)


                                           Case  \
0       Martinka v. New Tang Dynasty Television   
1           National Fire Protection v. Upcodes   
2                              Mackie v. Hipple   
3                    August Image v. RunwayRiot   
4                   Campinha-Baconte v. Rearden   
5                    Emmerich v. Particle Media   
6                      Backgrid v. Fashion Nova   
7                              Bolano v. Pamart   
8                      Sands v. What_s Trending   
9                              Newegg v. Sutton   
10  Innovative Habitat v. Fleischman and Garcia   
11                          Boost Beauty v. Woo   
12  New Tradition Media v. Rittersbacher Sunset   
13                      Hirsch v. Complex Media   
14         North Jersey Media Group v. Fox News   
15                          Campbell v. Gannett   
16                   Pickersgill v. The Egotist   

                                               Answer  \
0   UNITED STATES DISTRI

In [2]:
df = pd.read_csv("case_complaints.csv")

In [3]:
df.head()

Unnamed: 0,Case,Answer,Complaint
0,Martinka v. New Tang Dynasty Television,UNITED STATES DISTRICT COURT \nSOUTHERN DISTRI...,\n \n \n \nUNITED STATES DISTRICT COURT \nSO...
1,National Fire Protection v. Upcodes,1 \n2 \n3 \n4 \n5 \n6 \n7 \n8 \n9 \n10 \n11 \n...,1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\...
2,Mackie v. Hipple,\n \nAnswer and 3rd Party Complaint – Page 1 ...,Case 2:09-cv-00164-RSL Document 1 Filed 02...
3,August Image v. RunwayRiot,\n1 \nIN THE UNITED STATES DISTRICT COURT \nF...,1 \n \nIN THE UNITED STATES DISTRICT COURT \nF...
4,Campinha-Baconte v. Rearden,\nAnswer of Annette Rearden \nITMO: Josepha A...,Case 3:10-cv-00139-TMB Document 1 Filed 0...


In [5]:
df.describe()

Unnamed: 0,Case,Answer,Complaint
count,17,15,17
unique,17,14,17
top,Martinka v. New Tang Dynasty Television,1 William Litvak (SBN 90533) \nEric P. Markus ...,\n \n \n \nUNITED STATES DISTRICT COURT \nSO...
freq,1,2,1
