In [1]:
# Import libraries
import pandas as pd
import numpy as np
import glob
import os

In [2]:
# Csv files named "permits_2021", "permits_2022", etc in permit-data folder. Load each one to a dictionary of dataframes, and use the year in the file name as the key

# Create empty dictionary
permits = {}

# Loop through files in permit-data folder, and print the number of files loaded, as well as the number of rows in each file. Sum the number of rows in each file to get the total number of rows in the dataset.
for file in glob.glob("permit-data/*.csv"):
    year = os.path.basename(file).split("_")[1].split(".")[0]
    permits[year] = pd.read_csv(file)
    print("Loaded {} rows from {}".format(len(permits[year]), file))
    print("Total rows: {}".format(sum([len(permits[year]) for year in permits])))

Loaded 10274 rows from permit-data\permits_2016.csv
Total rows: 10274
Loaded 10839 rows from permit-data\permits_2017.csv
Total rows: 21113
Loaded 11001 rows from permit-data\permits_2018.csv
Total rows: 32114
Loaded 11897 rows from permit-data\permits_2019.csv
Total rows: 44011
Loaded 12548 rows from permit-data\permits_2020.csv
Total rows: 56559
Loaded 14076 rows from permit-data\permits_2021.csv
Total rows: 70635
Loaded 11827 rows from permit-data\permits_2022.csv
Total rows: 82462
Loaded 8817 rows from permit-data\permits_2023.csv
Total rows: 91279
Loaded 565 rows from permit-data\permits_2024.csv
Total rows: 91844


In [3]:
# Concatenate all of the dataframes in the permits dictionary into a single dataframe. Check if the rows are in the same order as the original dataframes

# Print the total number of rows in the permits dictionary
total_rows = 0
for year, df in permits.items():
    total_rows += len(df)
print(f'There are {total_rows} in the permits dictionary')

# Concatenate all of the dataframes in the permits dictionary into a single dataframe
permits_df = pd.concat(permits.values(), ignore_index=True)

# Print the number of rows in the concatenated dataframe
print(f"There are {len(permits_df)} rows in the concatenated dataframe")

There are 91844 in the permits dictionary
There are 91844 rows in the concatenated dataframe


In [4]:
# Clean the permits_df

# clean_headers function
# Write a function that removes whitespace and periods in the headers of a dataframe, and replaces spaces with underscores.
def clean_headers(df):
    df.columns = df.columns.str.strip().str.replace(" ", "_").str.replace(".", "")
    return df

# strip_whitespace function
# Write a function that removes whitespace from the values in each column of a dataframe
def strip_whitespace(df):
    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = df[col].str.strip()
    return df

# uppercase_headers function
# Write a function that converts all of the headers to uppercase
def uppercase_headers(df):
    df.columns = df.columns.str.upper()
    return df

# remove_duplicates function
# Write a function that detects duplicates in the PERMIT# column of a dataframe, removes the duplicates, and prints the number of duplicates removed
def remove_duplicates(df):
    duplicates = df.duplicated(subset=["PERMIT#"])
    print("Number of duplicates removed: ", duplicates.sum())
    return df.drop_duplicates(subset=["PERMIT#"])

# remove_du function
# Write a function that removes the rows containing a value less than or equal to 0 in the D.U. column, and print the number of rows removed
def remove_du(df):
    du = df["DU"] <= 0
    print("Number of permits with <= 0 DU removed: ", du.sum())
    return df[~du]

# sum_du function
# Write a function that sums the D.U. column, and prints the total number of D.U.
def sum_du(df):
    print("Total D.U.: ", df["DU"].sum())
    return df

# convert_date function
# Write a function that converts the ISSUED_DATE column to datetime format
def convert_date(df):
    df["ISSUED_DATE"] = pd.to_datetime(df["ISSUED_DATE"])
    return df

# remove_time function
# Write a function that removes the time from the ISSUED_DATE column
def remove_time(df):
    df["ISSUED_DATE"] = df["ISSUED_DATE"].dt.date
    return df

# reset_index function
# Write a function that resets the index of a dataframe
def reset_index(df):
    df = df.reset_index(drop=True)
    return df

# Write a function that concatenates the ST_# and ROAD columns into a single column named ADDRESS
def concat_address(df):
    df["ADDRESS"] = df["ST_#"] + " " + df["ROAD"]
    return df

# Write a function that gets the first four digits from the ISSUED_DATE column and places them in a new column named YEAR. Then, convert the YEAR column to integer format
def get_year(df):
    try:
        df["YEAR"] = df["ISSUED_DATE"].astype(str).str[:4]
        df["YEAR"] = df["YEAR"].astype(int)
    except ValueError as e:
        print(f"Error converting 'YEAR' column to integers: {e}")
    return df

# Write a function that extracts the month from the ISSUED_DATE column and places it in a new column named MONTH.
def get_month(df):
    try:
        df["MONTH"] = df["ISSUED_DATE"].dt.month_name()
    except ValueError as e:
        print(f"Error extracting month from 'ISSUED_DATE' column: {e}")
    return df

In [5]:
# Using pd.pipe, apply the functions in the order they were defined
permits_df = (permits_df
    .pipe(clean_headers)
    .pipe(uppercase_headers)
    .pipe(remove_duplicates)
    .pipe(strip_whitespace)
    .pipe(sum_du)
    .pipe(concat_address)
    .pipe(get_year)
    .pipe(convert_date)
    .pipe(get_month)
    .pipe(remove_time)
    .pipe(reset_index)
)

permits_df

Number of duplicates removed:  20481


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].str.strip()


Total D.U.:  71991.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ADDRESS"] = df["ST_#"] + " " + df["ROAD"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["YEAR"] = df["ISSUED_DATE"].astype(str).str[:4]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["YEAR"] = df["YEAR"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try u

Unnamed: 0,ST_#,ROAD,PC,WARD,PLAN,LOT,CONTRACTOR,BLG_TYPE,MUNICIPALITY,DESCRIPTION,DU,VALUE,SQ_FT,PERMIT#,APPL_TYPE,ISSUED_DATE,FT2,ADDRESS,YEAR,MONTH
0,1355,BANK ST,K1H8K7,Ward 18,,pt lot 18,ALTI CONSTRUCTION LTD,Mixed,Old Ottawa,Repairs to the underground parking garage of a...,0.0,,0.00,,Construction,2016-01-04,,1355 BANK ST,2016,January
1,1105,MARCH RD,K2K1X7,Ward 5,3,13 14,NRB INC,Institutional,Kanata,Construct 1 new portable classroom onsite (St....,0.0,,768.00,,Construction,2016-01-04,,1105 MARCH RD,2016,January
2,14,NAIRN ST,K2V1B6,Ward 23,4M-1081,25,PROTECTIVE PLUMBING CANADA,Single,Kanata,PLUMBING ONLY- Install a Protective Plumbing D...,0.0,,0.00,,Construction,2016-01-04,,14 NAIRN ST,2016,January
3,107,MELANIE CRES,K2L2E4,Ward 23,M220,1311,PROTECTIVE PLUMBING CANADA,Single,Kanata,PLUMBING ONLY - Install a Protective Plumbing ...,0.0,,0.00,,Construction,2016-01-04,,107 MELANIE CRES,2016,January
4,20,COLONNADE RD,K2E7M6,Ward 9,,30,GRAEBECK CONSTRUCTION LTD.,Office,Nepean,Interior alterations on the 1st floor of a 2 s...,0.0,,2841.67,,Construction,2016-01-04,,20 COLONNADE RD,2016,January
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71358,336 B,TULIP CRES,K1E2B1,Ward 1,50M15,100,CONTRACTOR UNKNOWN,Semi - Detached,Cumberland,Construct an additional dwelling unit in the b...,1.0,25000,,,Construction,2024-01-31,705.0,336 B TULIP CRES,2024,January
71359,113,IBER RD,K2S1E7,Ward 6,4M658,04-Feb,CONTRACTOR UNKNOWN,Commercial,Goulbourn,Tenant fit-up on the 2nd floor of a 2 storey c...,0.0,5000,,,Construction,2024-01-31,100.0,113 IBER RD,2024,January
71360,698,BEATRICE DR,K2J0H2,Ward 24,4M1347,17,FENCED OTTAWA,Single,Nepean,Finish the basement in a 2 storey detached dwe...,0.0,34853,,,Construction,2024-01-31,750.0,698 BEATRICE DR,2024,January
71361,2469,JUNCTION AVE,K1V8G9,Ward 17,301,33,MR. ROOTER PLUMBING,Single,Old Ottawa,PLUMBING ONLY: Install a clean-out on the sani...,0.0,2500,,,Construction,2024-01-31,0.0,2469 JUNCTION AVE,2024,January


In [6]:
# Export the cleaned dataframe to a csv file
permits_df.to_csv("permits_cleaned.csv", index=False)