This notebook reads the datasets that we've downloaded from the LA City website, trims each one down to the columns we care about, and combines them into one big file.

In [1]:
import os
import pandas as pd

# Directory that holds the downloaded input CSVs and receives every output CSV;
# all read/write paths in this notebook are built by joining a file name onto it.
# NOTE(review): this is a user-specific home-directory path — adjust for your machine.
DATA_DIR = '~/GitHub/la_mayors_office/data'

In [2]:
def make_APN(df):
    """Build the full Assessor Parcel Number (APN) for every row of ``df``.

    The APN is the text of the 'Assessor Book', 'Assessor Page' and
    'Assessor Parcel' columns concatenated in that order, with no separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three 'Assessor ...' columns; a missing column
        raises ``KeyError``.

    Returns
    -------
    pandas.Series
        One APN per row, sharing ``df``'s index. Rows where any of the three
        parts is missing get a missing value rather than a made-up string
        such as 'nannannan'.
    """
    apn_cols = ['Assessor Book', 'Assessor Page', 'Assessor Parcel']
    parts = df[apn_cols]

    # Record completeness BEFORE casting: astype(str) turns NaN into 'nan',
    # which would otherwise be glued into the APN as if it were real data.
    is_complete = parts.notna().all(axis=1)

    # Column-wise string concatenation instead of a Python-level join per row.
    series_apn = parts[apn_cols[0]].astype(str)
    for col in apn_cols[1:]:
        series_apn = series_apn + parts[col].astype(str)

    # Blank out APNs that could not be fully assembled.
    return series_apn.where(is_complete)

# Process certificate of occupancy

Read it in, drop columns we don't care about and output a 'trimmed' version

Notes:
    PCIS = "Plan Check and Inspection System"
    For additions/remodeling, "# of Residential Dwelling Units" may just measure the number of units affected

In [3]:
# Input and output file names, both resolved against DATA_DIR
fname_input = "Building_and_Safety_Certificate_of_Occupancy.csv"
fname_output = "Trimmed_Occupancy.csv"

# Columns to keep, grouped by what they describe
location_columns = ["APN", "Latitude/Longitude", "Census Tract"]

address_columns = [
    "Address Start", "Address Fraction Start",
    "Address End", "Address Fraction End",
    "Street Direction", "Street Name", "Street Suffix", "Suffix Direction",
    "Unit Range Start", "Unit Range End",
    "Zip Code",
]

permit_columns = [
    "Status",
    "# of Residential Dwelling Units",
    "PCIS Permit #",
    "Permit Type", "Permit Sub-Type", "Permit Issue Date",
    "Applicant First Name", "Applicant Last Name", "Applicant Business Name",
]

extra_columns = [
    "Valuation",
    "Work Description",
    "Contractor's Business Name",
    "# of Stories",
]

# Load every column as text
input_path = os.path.join(DATA_DIR, fname_input)
df_CofO = pd.read_csv(input_path, dtype=str)

# Add the APN column produced by make_APN
df_CofO["APN"] = make_APN(df_CofO)

# Keep only the columns of interest, in group order
keep_columns = location_columns + address_columns + permit_columns + extra_columns
df_CofO_trimmed = df_CofO.loc[:, keep_columns]

# Write the trimmed file without the row index
output_path = os.path.join(DATA_DIR, fname_output)
df_CofO_trimmed.to_csv(output_path, index=False)

# Process demolition permits

Read it in, drop columns we don't care about, drop rows that are not demolition permits, and output a 'trimmed' version

Notes:


In [4]:
# Columns to keep, grouped by what they describe
location_columns = ["APN", "Latitude/Longitude", "Census Tract"]

address_columns = [
    "Address Start", "Address Fraction Start",
    "Address End", "Address Fraction End",
    "Street Direction", "Street Name", "Street Suffix", "Suffix Direction",
    "Unit Range Start", "Unit Range End",
    "Zip Code",
]

permit_columns = [
    "Status",
    "PCIS Permit #",
    "Reference # (Old Permit #)",
    "Permit Type", "Permit Sub-Type", "Permit Issue Date",
    "Applicant First Name", "Applicant Last Name", "Applicant Business Name",
]

extra_columns = [
    "Valuation",
    "Work Description",
    "Contractor's Business Name",
    "Council District",
]

# Input and output file names, both resolved against DATA_DIR
fname_input = "Building_and_Safety_Permit_Information.csv"
fname_output = "Trimmed_Demolition.csv"

# Load every column as text
input_path = os.path.join(DATA_DIR, fname_input)
df_demolition = pd.read_csv(input_path, dtype=str)

# Add the APN column produced by make_APN
df_demolition["APN"] = make_APN(df_demolition)

# Boolean mask: True for rows whose permit type is one of the demolition types
demolition_types = ['Bldg-Demolition', 'NonBldg-Demolition']
rows_is_demolition = df_demolition['Permit Type'].isin(demolition_types)

# Keep the demolition rows only, restricted to the columns of interest
keep_columns = location_columns + address_columns + permit_columns + extra_columns
df_demolition = df_demolition.loc[rows_is_demolition, keep_columns]

# Write the trimmed file without the row index
output_path = os.path.join(DATA_DIR, fname_output)
df_demolition.to_csv(output_path, index=False)

# Process building permits

Read it in, drop columns we don't care about, drop rows that are demolition permits, and output a 'trimmed' version

Notes:


In [5]:
# Same input file as the demolition step; only the row filter and output differ
fname_input = "Building_and_Safety_Permit_Information.csv"
fname_output = "Trimmed_Building.csv"

# Load every column as text
input_path = os.path.join(DATA_DIR, fname_input)
df_building = pd.read_csv(input_path, dtype=str)

# Add the APN column produced by make_APN
df_building["APN"] = make_APN(df_building)

# Boolean mask: True for rows whose permit type is one of the demolition types
demolition_types = ['Bldg-Demolition', 'NonBldg-Demolition']
rows_is_demolition = df_building['Permit Type'].isin(demolition_types)

# NOTE: location_columns, address_columns, permit_columns and extra_columns are
# not defined in this cell; it uses whatever values are currently in the kernel.
keep_columns = location_columns + address_columns + permit_columns + extra_columns

# Keep everything that is NOT a demolition permit
df_building = df_building.loc[~rows_is_demolition, keep_columns]

# Write the trimmed file without the row index
output_path = os.path.join(DATA_DIR, fname_output)
df_building.to_csv(output_path, index=False)

# Concatenate into a huge file

Read all the trimmed files back in and combine them into a single mega-file

In [6]:
# Each entry is (label stored in "General Category", trimmed file to read)
fname_list = [
    ("Demolition Permit", "Trimmed_Demolition.csv"),
    ("Building Permit", "Trimmed_Building.csv"),
    ("Occupancy Inspection", "Trimmed_Occupancy.csv")
]

fname_output = 'la_housing_dataset.csv'

# Collect the per-file frames in their own list so that df_full is only ever
# a DataFrame, never a list.
frames = []
for category, fname in fname_list:
    # Read as text, consistent with how the raw files are loaded. Letting
    # pandas infer dtypes here can rewrite values (integer-like columns with
    # blanks become floats) and can leave columns with mixed int/str values.
    df = pd.read_csv(os.path.join(DATA_DIR, fname), dtype=str)
    # Tag every row with the dataset it came from
    df["General Category"] = category
    frames.append(df)

# Stack the rows; columns missing from a given file are filled with NaN
df_full = pd.concat(frames, axis=0)

In [7]:
# Order rows by parcel, then by permit issue date, and renumber the index.
# NOTE(review): "Permit Issue Date" is sorted as whatever the column currently
# holds; if it is plain text this is only chronological for ISO-style dates —
# confirm the date format.
sort_keys = ["APN", "Permit Issue Date"]
df_full = df_full.sort_values(sort_keys).reset_index(drop=True)

# Re-ordering columns to be easier to read: the known groups first,
# then every remaining column in its current order.
leading = location_columns + address_columns + ["General Category"] + permit_columns
trailing = [col for col in df_full.columns if col not in leading]
new_order = leading + trailing
df_full = df_full[new_order]

# Write the combined dataset without the row index
output_path = os.path.join(DATA_DIR, fname_output)
df_full.to_csv(output_path, index=False)