In [1]:
import re
import typing as t
import numpy as np

In [5]:
class TableCsv:
    """Plain container for one tabular block: a header row plus data rows."""

    def __init__(self, columns: t.List[str], values: t.List[t.List[t.Any]]):
        """Keep references to the given header list and list of rows.

        Args:
            columns: Column names, one entry per column.
            values: Data rows; each row is itself a list of cell values.
        """
        # Both arguments are stored as-is (no copy is made).
        self.columns: t.List[str] = columns
        self.values: t.List[t.List[t.Any]] = values


def remove_commas_in_quotes(line):
    """Return ``line`` with every comma inside a double-quoted span removed.

    The quote characters themselves are kept; text outside balanced
    double quotes is left untouched.
    """
    def _drop_commas(quoted_match):
        # The match covers the whole quoted span, including both quotes.
        quoted_text = quoted_match.group(0)
        return quoted_text.replace(',', '')

    return re.sub(r'"[^"]*"', _drop_commas, line)

class Table:
    """A report CSV parsed into a title and a dictionary of named attributes.

    Each attribute value is either a ``TableCsv`` (for a block recognised as
    tabular data) or a ``'|'``-joined string (for free-form key/value lines).
    """

    def __init__(self,
                 table_name: str,
                 csv_location: str,
                 attributes: t.Dict[str, t.Any],
                 ):
        """Store the table title, the source path and the parsed attributes."""
        self.table_name = table_name
        self.csv_location = csv_location
        self.attributes = attributes

    @staticmethod
    def _store_block(block: t.List[str],
                     is_first_block: bool,
                     table_name: str,
                     attributes: t.Dict[str, t.Any]) -> None:
        """Interpret one non-empty block of cleaned lines and record it in ``attributes``.

        A block is stored as a ``TableCsv`` when it has at least three lines
        and is either the first block of the file or starts with a line that
        holds exactly one non-empty cell (taken as the sub-table title).
        Any other block is stored line by line as ``first cell -> rest``.
        """
        title_cells = [cell.strip() for cell in block[0].split(',') if cell.strip() != '']
        looks_tabular = (is_first_block or len(title_cells) == 1) and len(block) >= 3
        if looks_tabular:
            if is_first_block:
                # The first block has no title line of its own: it is named
                # after the file's title and its first line is the header.
                attr_key = table_name
                header_idx = 0
            else:
                # Later tables: line 0 is the title, line 1 is the header.
                attr_key = title_cells[0]
                header_idx = 1
            columns = [col.strip() for col in block[header_idx].split(',')]
            values = [
                [val.strip() for val in row_line.split(',')]
                for row_line in block[header_idx + 1:]
            ]
            attributes[attr_key] = TableCsv(columns, values)
        else:
            # Unstructured text: the first cell is the key, the remaining
            # non-empty cells are joined with '|'.
            for text_line in block:
                cells = text_line.split(',')
                attributes[cells[0]] = '|'.join(
                    cell.strip() for cell in cells[1:] if cell.strip() != ''
                )

    @staticmethod
    def parse_table(csv_path: str) -> 'Table':
        """Parse the CSV at ``csv_path`` into a ``Table``.

        The first line of the file is the table title (commas removed). The
        remaining lines are grouped into blocks separated by blank or
        comma-only lines; each block is interpreted by ``_store_block``.
        Before grouping, commas inside double-quoted fields are removed and
        the quote characters are dropped.

        A block that runs to the end of the file without a trailing
        separator line is stored as well, so the last block is not lost.

        Args:
            csv_path: Path of the CSV file to read (decoded as UTF-8, with
                undecodable bytes replaced).

        Returns:
            A ``Table`` whose ``attributes`` map block names to parsed content.

        Raises:
            IndexError: If the file is empty (there is no title line).
        """
        with open(csv_path, 'r', encoding='utf-8', errors='replace') as f:
            raw_contents = f.readlines()
        table_name = raw_contents[0].replace(',', '').strip()
        attributes: t.Dict[str, t.Any] = {}
        current_block: t.List[str] = []
        first_table = True
        for line in raw_contents[1:]:
            cleaned_line = remove_commas_in_quotes(line)
            cleaned_line = cleaned_line.replace('"', '').strip()
            if cleaned_line.replace(',', '').strip() != '':
                current_block.append(cleaned_line)
                continue
            # Separator line: close the pending block, if there is one.
            if not current_block:
                continue
            Table._store_block(current_block, first_table, table_name, attributes)
            current_block = []
            first_table = False
        if current_block:
            # The file ended without a separator after the last block.
            Table._store_block(current_block, first_table, table_name, attributes)
        return Table(table_name, csv_path, attributes)

In [6]:
# Parse the two state-ranking CSVs; the relative paths are resolved against
# the kernel's current working directory.
table1 = Table.parse_table("raw/CSVs/2024_CSN_State_Rankings_Identity_Theft_Reports.csv")
table2 = Table.parse_table("raw/CSVs/2024_CSN_State_Rankings_Fraud_and_Other_Reports.csv")

In [8]:
def solve_q6(identity_table: t.Optional['Table'] = None,
             fraud_table: t.Optional['Table'] = None) -> t.Optional[str]:
    """Return the state with the highest combined report density.

    For every row of the identity-theft ranking, a population figure is
    derived as ``row[3] / row[2]`` and the density is the sum of ``row[3]``
    from both rankings divided by that population. Rows are matched between
    the two rankings by state name (``row[1]``); states missing from the
    fraud ranking are ignored. On ties the state that appears first in the
    identity-theft ranking wins.

    NOTE(review): columns 2 and 3 are presumably "reports per 100K
    population" and "number of reports" - confirm against the CSV header.

    Args:
        identity_table: Parsed identity-theft ranking; defaults to the
            notebook-level ``table1``.
        fraud_table: Parsed fraud-and-other ranking; defaults to the
            notebook-level ``table2``.

    Returns:
        The name of the best state, or ``None`` when no state appears in
        both rankings.

    Raises:
        ValueError: If a numeric cell cannot be converted with ``int``.
        ZeroDivisionError: If ``row[2]`` is zero, or if the derived
            population is zero for a matched state.
    """
    if identity_table is None:
        identity_table = table1
    if fraud_table is None:
        fraud_table = table2
    identity_rows = identity_table.attributes['State Rankings: Identity Theft Reports'].values
    fraud_rows = fraud_table.attributes['State Rankings: Fraud and Other Reports'].values

    # Index the fraud ranking once instead of rescanning it for every state.
    # setdefault keeps the first row seen for a name (first match wins).
    fraud_by_state: t.Dict[str, t.List[str]] = {}
    for fraud_row in fraud_rows:
        fraud_by_state.setdefault(fraud_row[1], fraud_row)

    best_density = -1
    best_name = None
    for row in identity_rows:
        state_name = row[1]
        population = int(row[3]) / int(row[2])
        fraud_row = fraud_by_state.get(state_name)
        if fraud_row is None:
            continue
        density = (int(row[3]) + int(fraud_row[3])) / population
        if density > best_density:
            best_density = density
            best_name = state_name
    return best_name
# Evaluate Q6; as the last expression in the cell, the returned state name is displayed.
solve_q6()

'District of Columbia'

In [7]:
# Show the parsed rows of the identity-theft ranking table as the cell output.
table1.attributes['State Rankings: Identity Theft Reports'].values

[['1', 'Florida', '528', '115840'],
 ['2', 'Georgia', '517', '55955'],
 ['3', 'Nevada', '466', '14631'],
 ['4', 'Texas', '393', '116484'],
 ['5', 'Delaware', '392', '3942'],
 ['6', 'Massachusetts', '388', '27141'],
 ['7', 'California', '356', '139665'],
 ['8', 'Louisiana', '346', '15991'],
 ['9', 'Illinois', '339', '43028'],
 ['10', 'Maryland', '324', '19990'],
 ['11', 'New Jersey', '313', '29037'],
 ['12', 'New York', '295', '58692'],
 ['13', 'South Carolina', '291', '15146'],
 ['14', 'Pennsylvania', '290', '37717'],
 ['15', 'Arizona', '287', '20863'],
 ['16', 'Alabama', '269', '13596'],
 ['17', 'Mississippi', '268', '7903'],
 ['18', 'Rhode Island', '264', '2896'],
 ['19', 'North Carolina', '259', '27466'],
 ['20', 'Michigan', '237', '23783'],
 ['21', 'Connecticut', '236', '8502'],
 ['22', 'Ohio', '236', '27766'],
 ['23', 'Virginia', '213', '18460'],
 ['24', 'Tennessee', '212', '14829'],
 ['25', 'Arkansas', '209', '6340'],
 ['26', 'Colorado', '208', '12087'],
 ['27', 'Indiana', '198',

Q1:
