In [50]:
# Import libraries
import pandas as pd
import numpy as np
import glob
import os

In [51]:
# The permit-data folder holds one CSV file per year, named "permits_2021.csv",
# "permits_2022.csv", etc. Load each file into a dictionary of dataframes, keyed by
# the year taken from the file name (kept as a string, e.g. "2021").

# Create empty dictionary
permits = {}

# glob.glob returns its matches in arbitrary, OS-dependent order. Sort them so the
# load order (and everything downstream that depends on row order) is reproducible.
permit_files = sorted(glob.glob("permit-data/*.csv"))
if not permit_files:
    print("No CSV files found in permit-data")

# Load every file, printing the number of rows in that file and the running total
# of rows held in the dictionary so far.
for file in permit_files:
    # "permits_2021.csv" -> "2021"
    year = os.path.basename(file).split("_")[1].split(".")[0]
    permits[year] = pd.read_csv(file)
    print("Loaded {} rows from {}".format(len(permits[year]), file))
    print("Total rows: {}".format(sum(len(frame) for frame in permits.values())))

# Print the number of files loaded
print("Loaded {} files".format(len(permits)))

Loaded 10274 rows from permit-data\permits_2016.csv
Total rows: 10274
Loaded 10839 rows from permit-data\permits_2017.csv
Total rows: 21113
Loaded 11001 rows from permit-data\permits_2018.csv
Total rows: 32114
Loaded 11897 rows from permit-data\permits_2019.csv
Total rows: 44011
Loaded 12548 rows from permit-data\permits_2020.csv
Total rows: 56559
Loaded 14076 rows from permit-data\permits_2021.csv
Total rows: 70635
Loaded 11827 rows from permit-data\permits_2022.csv
Total rows: 82462
Loaded 8817 rows from permit-data\permits_2023.csv
Total rows: 91279
Loaded 565 rows from permit-data\permits_2024.csv
Total rows: 91844


In [52]:
# Concatenate all of the dataframes in the permits dictionary into a single dataframe,
# and check that the concatenation neither lost nor added rows.

# Print the total number of rows in the permits dictionary
total_rows = sum(len(df) for df in permits.values())
print(f'There are {total_rows} rows in the permits dictionary')

# Concatenate the dataframes in ascending year order, so the row order does not
# depend on the order in which the files happened to be loaded.
permits_df = pd.concat([permits[year] for year in sorted(permits)], ignore_index=True)

# Print the number of rows in the concatenated dataframe
print(f"There are {len(permits_df)} rows in the concatenated dataframe")

# Fail loudly instead of relying on someone comparing the two printed numbers
assert len(permits_df) == total_rows, "Row count changed during concatenation"

There are 91844 in the permits dictionary
There are 91844 rows in the concatenated dataframe


In [53]:
# Clean the permits_df

# clean_headers function
def clean_headers(df):
    """Tidy the column headers of ``df`` in place and return ``df``.

    Each header has its leading/trailing whitespace stripped, its remaining
    spaces replaced with underscores, and its periods removed
    (e.g. " D.U. " becomes "DU").
    """
    # regex=False makes both replacements literal. Read as a regular expression,
    # "." matches every character, and the default for `regex` has differed
    # between pandas versions, so it is spelled out here.
    df.columns = (
        df.columns.str.strip()
        .str.replace(" ", "_", regex=False)
        .str.replace(".", "", regex=False)
    )
    return df

# strip_whitespace function
def strip_whitespace(df):
    """Strip leading/trailing whitespace from the text values of ``df``.

    Only object/string columns are touched, and within them only values that
    are actually ``str``. Numbers, missing values and any other objects are
    left exactly as they are. The columns are updated in place and ``df`` is
    returned.
    """
    def _strip_if_text(value):
        # Non-string values pass through untouched
        return value.strip() if isinstance(value, str) else value

    for col in df.columns:
        dtype = df[col].dtype
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            # Not `.str.strip()`: on a column mixing text with other values
            # (e.g. numeric permit numbers) the .str accessor returns NaN for
            # every non-string value, silently wiping that data.
            df[col] = df[col].map(_strip_if_text)
    return df

# uppercase_headers function
def uppercase_headers(df):
    """Upper-case every column header of ``df`` in place and return ``df``."""
    upper_cased = df.columns.str.upper()
    df.columns = upper_cased
    return df

# remove_duplicates function
def remove_duplicates(df):
    """Drop rows that repeat an earlier PERMIT# and print how many were dropped.

    The first row carrying each permit number is kept. Rows whose PERMIT# is
    missing are never counted as duplicates, so permits without a number are
    all kept. Returns a new dataframe (original index labels preserved);
    ``df`` itself is not modified.
    """
    # DataFrame.duplicated treats missing values as equal to one another, which
    # would collapse every row without a permit number into a single row.
    duplicates = df["PERMIT#"].notna() & df.duplicated(subset=["PERMIT#"])
    print("Number of duplicates removed: ", duplicates.sum())
    return df[~duplicates].copy()

# remove_du function
def remove_du(df):
    """Drop the rows whose DU value is <= 0 and print how many were dropped.

    Rows with a missing DU are kept, because a missing value does not compare
    as <= 0. Returns a new, independent dataframe; ``df`` is not modified.
    """
    du = df["DU"] <= 0
    print("Number of permits with <= 0 DU removed: ", du.sum())
    # Return an explicit copy: later steps assign columns in place, and doing
    # that on a boolean-mask slice raises SettingWithCopyWarning in pandas
    # versions without copy-on-write.
    return df[~du].copy()

# sum_du function
def sum_du(df):
    """Print the sum of the DU column and hand ``df`` back unchanged."""
    total_units = df["DU"].sum()
    print("Total D.U.: ", total_units)
    return df

# convert_date function
def convert_date(df):
    """Parse the ISSUED_DATE column with ``pd.to_datetime``, in place, and return ``df``."""
    parsed = pd.to_datetime(df["ISSUED_DATE"])
    df["ISSUED_DATE"] = parsed
    return df

# remove_time function
def remove_time(df):
    """Replace each ISSUED_DATE value with its date part, in place, and return ``df``.

    ISSUED_DATE must already hold datetime values, since the ``.dt`` accessor
    is used.
    """
    issued = df["ISSUED_DATE"]
    df["ISSUED_DATE"] = issued.dt.date
    return df

# reset_index function
def reset_index(df):
    """Return ``df`` renumbered with a fresh 0..n-1 index; the old index is discarded."""
    return df.reset_index(drop=True)

# concat_address function
def concat_address(df):
    """Add an ADDRESS column made of ST_# and ROAD joined by a single space.

    Street numbers (or road values) that are numeric are converted to text
    first, with whole-number floats written without the trailing ".0". When
    either part is missing, ADDRESS is missing for that row. The column is
    added in place and ``df`` is returned.
    """
    def _as_text(value):
        if pd.isna(value):
            return value  # keep missing values missing
        if isinstance(value, float) and value.is_integer():
            return str(int(value))  # 249.0 -> "249"
        return str(value)

    # astype(object) keeps the "+" below an element-wise string join even when a
    # column holds nothing but missing values.
    street_number = df["ST_#"].map(_as_text).astype(object)
    road = df["ROAD"].map(_as_text).astype(object)
    df["ADDRESS"] = street_number + " " + road
    return df

# get_year function
def get_year(df):
    """Add a YEAR column holding the first four characters of ISSUED_DATE as an integer.

    Values whose text form does not start with exactly four digits (missing
    dates, for example) get a missing YEAR, and the number of such rows is
    printed. In that case YEAR uses the nullable "Int64" dtype; otherwise it is
    a plain integer column. The column is added in place and ``df`` is returned.
    """
    year_text = df["ISSUED_DATE"].astype(str).str[:4]
    # Keep only genuine four-digit prefixes. Text such as "nan" or "NaT" becomes
    # missing instead of making the conversion of the whole column fail.
    year_text = year_text.where(year_text.str.fullmatch(r"\d{4}", na=False))
    years = pd.to_numeric(year_text, errors="coerce")

    unparsed = int(years.isna().sum())
    if unparsed:
        print(
            "Error converting 'YEAR' column to integers: "
            f"{unparsed} value(s) did not start with a four-digit year"
        )
        df["YEAR"] = years.astype("Int64")
    else:
        df["YEAR"] = years.astype(int)
    return df

# get_month function
def get_month(df):
    """Add a MONTH column holding the month name (e.g. "January") of ISSUED_DATE.

    ISSUED_DATE is parsed with ``pd.to_datetime`` first, so it may hold
    datetimes, dates, or date strings; the ISSUED_DATE column itself is left
    as it was. If parsing raises ValueError, the error is printed and no MONTH
    column is added. ``df`` is returned either way.
    """
    try:
        # Parse inside the try block: the .dt accessor alone raises
        # AttributeError on a non-datetime column, which the handler below
        # would not catch.
        issued = pd.to_datetime(df["ISSUED_DATE"])
        df["MONTH"] = issued.dt.month_name()
    except ValueError as e:
        print(f"Error extracting month from 'ISSUED_DATE' column: {e}")
    return df

In [54]:
# Run the cleaning functions over permits_df, feeding each step's result into the
# next. The steps run in the order listed here, which is not the order in which
# the functions were defined.
# NOTE(review): strip_whitespace runs after remove_duplicates, so permit numbers
# that differ only by surrounding whitespace are presumably not treated as
# duplicates - confirm whether that is intended.
cleaning_steps = (
    clean_headers,
    uppercase_headers,
    remove_duplicates,
    remove_du,
    strip_whitespace,
    sum_du,
    concat_address,
    get_year,
    convert_date,
    get_month,
    remove_time,
    reset_index,
)

# Rebind permits_df only once every step has succeeded
cleaned = permits_df
for step in cleaning_steps:
    cleaned = step(cleaned)
permits_df = cleaned

permits_df

Number of duplicates removed:  20481
Number of permits with <= 0 DU removed:  47245
Total D.U.:  74790.0


Unnamed: 0,ST_#,ROAD,PC,WARD,PLAN,LOT,CONTRACTOR,BLG_TYPE,MUNICIPALITY,DESCRIPTION,DU,VALUE,SQ_FT,PERMIT#,APPL_TYPE,ISSUED_DATE,FT2,ADDRESS,YEAR,MONTH
0,249,TRAILGATE ST,K1V0Z9,Ward 22,4M1349,54,RICHCRAFT HOMES LIMITED,Single,Gloucester,Construct a 2 Storey Single with attached gara...,1.0,,3757.0,,Construction,2016-01-04,,249 TRAILGATE ST,2016,January
1,215,ESCARPMENT CRES,K2T0L6,Ward 4,,73,CARDEL HOMES INC. OTTAWA DIVISION,Single,Kanata,Construct a 2 Storey Single with attached gara...,1.0,,2939.0,,Construction,2016-01-04,,215 ESCARPMENT CRES,2016,January
2,517,KILSPINDIE RIDGE,,Ward 22,4M-1518,,UNIFORM URBAN DEVELOPMENTS LTD.,Rowhouse,Nepean,Construct a 2 Storey Rowhouse with attached ga...,4.0,,6758.0,,Construction,2016-01-04,,517 KILSPINDIE RIDGE,2016,January
3,106,LOCHHOUSE WALK,K2J6A3,Ward 22,4M-1518,35,UNIFORM URBAN DEVELOPMENTS LTD.,Single,Nepean,Construct a 1 Storey Single with attached gara...,1.0,,2810.0,,Construction,2016-01-05,,106 LOCHHOUSE WALK,2016,January
4,228,ESCARPMENT CRES,K2T0L6,Ward 4,,50,UNIFORM URBAN DEVELOPMENTS LTD.,Single,Kanata,Construct a 2 Storey Single with attached gara...,1.0,,3191.0,,Construction,2016-01-05,,228 ESCARPMENT CRES,2016,January
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24113,673,HONEYDEW ST,K4M1B2,Ward 22,4M-1728,,HN HOMES LIMITED PARTNERSHIP,Rowhouse,Ottawa,Construct a 2 Storey Rowhouse with attached ga...,4.0,918482,,,Construction,2024-01-30,7062.0,673 HONEYDEW ST,2024,January
24114,665,HONEYDEW ST,K4M1B2,Ward 22,4M-1728,,HN HOMES LIMITED PARTNERSHIP,Rowhouse,Ottawa,Construct a 2 Storey Rowhouse with attached ga...,4.0,856720,,,Construction,2024-01-30,7387.0,665 HONEYDEW ST,2024,January
24115,400,PARNIAN PRIV,K2J6H3,Ward 3,4M-,,MATTAMY (HALF MOON BAY 4) LIMITED,Stacked Rowhouse,Ottawa,Construct a 3 storey 12 unit stacked rowhouse ...,12.0,2032040,,,Construction,2024-01-30,14576.0,400 PARNIAN PRIV,2024,January
24116,7033 B,"NOTRE-DAME, RUE",K1C1H8,Ward 2,86,23 and 24,LAVOIE DESIGN & DRAFTING INC.,Semi - Detached,Gloucester,Construct a 2 storey semi-detached dwelling wi...,4.0,740428,,,Construction,2024-01-30,5102.0,"7033 B NOTRE-DAME, RUE",2024,January


In [55]:
# Write the cleaned permits to permits_cleaned.csv, leaving out the dataframe index
permits_df.to_csv(path_or_buf="permits_cleaned.csv", index=False)