In [None]:
def preprocess_file(input_path, output_path):
    with open(input_path, 'r', encoding='ISO-8859-1') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        current_record = ''
        for line in infile:
            if line.count('|') == 10:  # Identifies a new record
                if current_record:  # If there's an ongoing record, write it before starting a new one
                    outfile.write(current_record + '\n')
                current_record = line.rstrip('\n')  # Start a new record
            else:
                # Part of the ongoing BillText; append including a space to avoid word merging
                current_record += ' ' + line.rstrip('\n')
        # Write the last record if it exists
        if current_record:
            outfile.write(current_record)

input_path = "../data_storage/legislation/complete_full_text.txt"
output_path = "../data_storage/legislation/preprocessed_full_text.txt"
preprocess_file(input_path, output_path)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws

spark = SparkSession.builder \
    .appName("Parse Legislation") \
    .getOrCreate()

file_path = "../data_storage/legislation/preprocessed_full_text.txt"

schema = "BillID STRING, StateCode STRING, StateBillID STRING, ShortBillName STRING, Created STRING, SponsorParty STRING, billtype STRING, status STRING, CommitteeCategories STRING, statesummary STRING, BillText STRING"
column_names = ['BillID', 'StateCode', 'StateBillID', 'ShortBillName', 'Created', 'SponsorParty', 'billtype', 'status', 'CommitteeCategories', 'statesummary', 'BillText']

# Load data
data = spark.read.option("delimiter", "|").csv(file_path, schema=schema)

# Combine remaining columns into a single column if there are more than 11 columns
remaining_columns = data.columns[10:]
data = data.withColumn("BillText", concat_ws("|", *[data[col] for col in remaining_columns]))

# Select only the first 11 columns
data = data.select(column_names)

data.show()

csv_file_path = "../data_storage/legislation/complete_full_text.csv"
data.write.mode("overwrite").option("header", "true").csv(csv_file_path)

# csv_directory_path = "../data_storage/legislation/complete_full_text.csv"

# # Load all part-files from the directory into a DataFrame
# df = spark.read.csv(path=csv_directory_path, header=True, inferSchema=True)
# df.toPandas().to_csv("../data_storage/legislation/complete_full_text_pdf.csv", index=False)

# # Stop Spark session
# spark.stop()



In [5]:
import pandas as pd
import glob

# Define the path and get all CSV files
csv_file_path = "../data_storage/legislation/complete_full_text.csv/*.csv"
csv_files = glob.glob(csv_file_path)

# Define your column names here based on your dataset's requirements
column_names = ['BillID', 'StateCode', 'StateBillID', 'ShortBillName', 'Created', 'SponsorParty', 'billtype', 'status', 'CommitteeCategories', 'statesummary', 'BillText']

dfs = []

for file in csv_files:
    processed_rows = []
    with open(file, 'r', encoding='utf-8') as f:
        next(f)
        for line in f:
            # Split the line into fields
            fields = line.strip().split(',')

            # Handle extra fields for the 4th column (ShortBillName)
            if len(fields) > 4:
                # Combine fields into the 4th column until what would be the start of the 5th column
                fields[3:10] = [' '.join(fields[3:10])]
                # Ensure no more than 11 fields before processing the 10th column
                fields = fields[:11]

            # Now, handle extra fields for the 10th column (statesummary)
            if len(fields) > 11:
                # Combine fields beyond the 11th into the 10th field
                fields[10:] = [' '.join(fields[10:])]
                fields = fields[:11]
            
            processed_rows.append(fields)
            
    # Create a DataFrame for each file's processed rows with the defined column names
    df = pd.DataFrame(processed_rows, columns=column_names)
    dfs.append(df)

# Concatenate all DataFrames from each file into one
combined_df = pd.concat(dfs, ignore_index=True)


In [6]:
combined_df.head(330)

Unnamed: 0,BillID,StateCode,StateBillID,ShortBillName,Created,SponsorParty,billtype,status,CommitteeCategories,statesummary,BillText
0,1623584,MI,SR0039,"""A resolution to recognize April 27 2023 as ...",Signed/Enacted/Adopted,,"""A RESOLUTION TO RECOGNIZE APRIL 27",2023,AS SURVIVORSSPEAK MICHIGAN DAY,"A PART OF NATIONAL CRIME VICTIMS RIGHTS WEEK""","""Michigan MI SR 0039 MI SR0039 MISR0039 MI SR ..."
1,1623654,AK,HB181,State Commission For Civil Rights 2023-04-26 1...,"""Alaska AK HB 181 AK HB181 AKHB181 AK HB 181 A...",cause includes incompetence,neglect of duty,and misconduct in office,and public statements and public or pr...,the governor shall provide a copy of the char...,or by counsel
2,1623695,MI,HB4474,Crime victims: other; elements for commission ...,"entitled\""The Michigan penal code","\""by amending section 147b (MCL 750.147b)","as added by 1988 PA 371.""","""Michigan MI HB 4474 MI HB4474 MIHB4474 MI HB ...","entitled\""The Michigan penal code","\""by amending section 147b (MCL 750.147b)",as added by 1988 PA 371. \t \t\t \t\...
3,1623696,ME,LD1833,"""An Act to Amend the Definition of \""Education...","""Maine ME LD 1833 ME LD1833 MELD1833 ME LD 183...",sub-§2-A,as amended by PL 1995,c. 393,§4,is further amended to read: 2-A. Educatio...,any public post-secondarypostsecondary inst...
4,1623715,IL,SR0225,SIKH HERITAGE MONTH 2023-04-26 14:31:05.063000...,"""Illinois IL SR 0225 IL SR0225 ILSR0225 IL SR ...",The United States is enriched by the diversit...,The Sikh community,which originated in Punjab,India and began immigrating into the United S...,has played an important role in developing Il...,Sikhism is the world's fifth-largest religion
...,...,...,...,...,...,...,...,...,...,...,...
325,1521088,NY,S01584,"""Prohibits certain student organizations which...","""AN ACT to amend the education law",in relation to certain student organizations ...,"""New York NY S 01584 NY S01584 NYS01584 NY S 1...",CUNY or community colleges. AN ACT to amend ...,in relation to certain student organizations ...,represented in Senate and Assembly,do enact as follows: \t\t Section 1. Sect...
326,1521119,NY,S01532,Requires the board of education and the truste...,in relation to requiring the board of educati...,"""New York NY S 01532 NY S01532 NYS01532 NY S 1...",in relation to requiring the board of educati...,represented in Senate and Assembly,do enact as follows: \t\t Section 1. Sect...,but are not limited to policies which: a. ens...,including using pronouns and names consi...
327,1521185,NY,S01565,Requires pet dealers and pet shops to provide ...,in relation to requiring pet dealers and pet ...,"""New York NY S 01565 NY S01565 NYS01565 NY S 1...",in relation to requiring pet dealers and pet ...,represented in Senate and Assembly,do enact as follows: \t\t Section 1. This...,as added by chapter 259 of the laws of 2000,subdivision 1 as amended by chapter 110 of th...
328,1521196,NY,S01556,"""Relates to an increase in punishment for cert...","""AN ACT to amend the penal law",in relation to on duty auxiliary police offic...,"""New York NY S 01556 NY S01556 NYS01556 NY S 1...",assault or menacing of such officer. AN ACT ...,in relation to on duty auxiliary police offic...,represented in Senate and Assembly,do enact as follows: \t\t Section 1. Sect...


In [7]:
import pandas as pd
import glob

# Define the path and get all CSV files
csv_file_path = "../data_storage/legislation/complete_full_text.csv/*.csv"
csv_files = glob.glob(csv_file_path)

# Define your column names here based on your dataset's requirements
column_names = ['BillID', 'StateCode', 'StateBillID', 'ShortBillName', 'Created', 'SponsorParty', 'billtype', 'status', 'CommitteeCategories', 'statesummary', 'BillText']

dfs = []

for file in csv_files:
    processed_rows = []
    with open(file, 'r', encoding='utf-8') as f:
        next(f)  # Skip the first line if it's headers
        for line in f:
            # Initialize an empty list to collect the split fields
            fields = []
            field_start = 0
            in_quotes = False
            for i, char in enumerate(line):
                # Toggle in_quotes status on quote
                if char == '"':
                    in_quotes = not in_quotes
                # Split on commas not within quotes
                elif char == ',' and not in_quotes:
                    fields.append(line[field_start:i])
                    field_start = i + 1
            # Add the last field
            fields.append(line[field_start:].strip())

            # Handle excess fields by combining them into the appropriate columns
            if len(fields) > 11:
                fields[3] = ','.join(fields[3:10])  # Combine into the 4th field
                fields[10] = ','.join(fields[10:])  # Combine remaining fields into the 11th
                fields = fields[:11]  # Keep only the first 11 fields
            
            processed_rows.append(fields)

    # Create a DataFrame for the processed rows for this file
    df = pd.DataFrame(processed_rows, columns=column_names)
    dfs.append(df)

# Concatenate all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)


In [14]:
combined_df = pd.read_csv("../data_storage/legislation/complete_cleaned_full_text.csv")
combined_df.to_csv("../data_storage/legislation/complete_cleaned_full_text.csv", index=False)