In [130]:
# Import libraries
import pandas as pd
import numpy as np
import spacy
import glob
import os

In [131]:
# Load every yearly permit CSV (named "permits_2021.csv", "permits_2022.csv", etc.) from the
# permit-data folder into a dictionary of dataframes, keyed by the year taken from the file name.

# Create empty dictionary
permits = {}

# glob.glob returns paths in an arbitrary, file-system-dependent order; sorting them makes the
# dictionary (and anything built from it afterwards) come out in the same order on every run.
csv_files = sorted(glob.glob("permit-data/*.csv"))
if not csv_files:
    # Stop here with a clear message instead of carrying on with an empty dictionary.
    raise FileNotFoundError("No CSV files found in permit-data/")

# Loop through the files, printing the number of rows in each file and the running total of
# rows held in the dictionary so far.
for file in csv_files:
    # "permit-data/permits_2021.csv" -> "permits_2021" -> "2021"
    year = os.path.splitext(os.path.basename(file))[0].split("_")[-1]
    permits[year] = pd.read_csv(file)
    print("Loaded {} rows from {}".format(len(permits[year]), file))
    print("Total rows: {}".format(sum(len(frame) for frame in permits.values())))

Loaded 10274 rows from permit-data\permits_2016.csv
Total rows: 10274
Loaded 10839 rows from permit-data\permits_2017.csv
Total rows: 21113
Loaded 11001 rows from permit-data\permits_2018.csv
Total rows: 32114
Loaded 11897 rows from permit-data\permits_2019.csv
Total rows: 44011
Loaded 12548 rows from permit-data\permits_2020.csv
Total rows: 56559
Loaded 14076 rows from permit-data\permits_2021.csv
Total rows: 70635
Loaded 11827 rows from permit-data\permits_2022.csv
Total rows: 82462
Loaded 932 rows from permit-data\permits_2023.csv
Total rows: 83394


In [132]:
# Concatenate all of the dataframes in the permits dictionary into a single dataframe, and
# check that the concatenated dataframe has exactly as many rows as the dictionary holds.

# Print the total number of rows in the permits dictionary
total_rows = sum(len(frame) for frame in permits.values())
print(f'There are {total_rows} rows in the permits dictionary')

# Concatenate all of the dataframes in the permits dictionary into a single dataframe.
# ignore_index=True replaces the per-file row labels with one continuous 0..n-1 index.
permits_df = pd.concat(permits.values(), ignore_index=True)

# Print the number of rows in the concatenated dataframe
print(f"There are {len(permits_df)} rows in the concatenated dataframe")

# Fail loudly if the two counts ever disagree
assert len(permits_df) == total_rows, "Row count changed during concatenation"

There are 83394 in the permits dictionary
There are 83394 rows in the concatenated dataframe


In [133]:
# Clean the permits_df


# clean_headers function
# Write a function that removes whitespace and periods in the headers of a dataframe, and replaces spaces with underscores.
def clean_headers(df):
    """Normalize the column headers of ``df`` in place and return ``df``.

    Each header has its leading and trailing whitespace stripped, every space
    replaced with an underscore, and every period removed
    (for example ``" D.U. "`` becomes ``"DU"``).

    Args:
        df: DataFrame whose column labels are strings (the ``.str`` accessor
            is used on ``df.columns``).

    Returns:
        The same DataFrame object, with ``df.columns`` reassigned.
    """
    # regex=False makes both replacements literal on every pandas version.
    # Left implicit, the behaviour depends on the version's default, and "."
    # is a wildcard that matches any character when treated as a regex.
    df.columns = (
        df.columns.str.strip()
        .str.replace(" ", "_", regex=False)
        .str.replace(".", "", regex=False)
    )
    return df

# uppercase_headers function
# Write a function that converts all of the headers to uppercase
def uppercase_headers(df):
    """Upper-case every column header of ``df`` in place and return ``df``."""
    shouted = df.columns.str.upper()
    df.columns = shouted
    return df

# remove_duplicates function
# Write a function that detects duplicates in the PERMIT# column of a dataframe, removes the duplicates, and prints the number of duplicates removed
def remove_duplicates(df):
    """Drop rows whose ``PERMIT#`` repeats an earlier row and report the count.

    Uses the pandas defaults, so the first row seen for each ``PERMIT#`` is the
    one kept. Prints how many rows were flagged as repeats, then returns a new
    DataFrame without them; the surviving rows keep their original index labels.
    """
    key_columns = ["PERMIT#"]
    repeat_mask = df.duplicated(subset=key_columns)
    print("Number of duplicates removed: ", repeat_mask.sum())
    deduplicated = df.drop_duplicates(subset=key_columns)
    return deduplicated

# remove_du function
# Write a function that removes the rows containing a value less than or equal to 0 in the DU column ("D.U." before the headers are cleaned), and prints the number of rows removed
def remove_du(df):
    """Drop rows whose ``DU`` value is less than or equal to 0 and report the count.

    Prints how many rows matched, then returns the remaining rows with their
    original index labels.
    """
    non_positive = df["DU"].le(0)
    print("Number of permits with <= 0 DU removed: ", non_positive.sum())
    kept_rows = df[~non_positive]
    return kept_rows

# sum_du function
# Write a function that sums the DU column ("D.U." before the headers are cleaned), and prints the total number of D.U.
def sum_du(df):
    """Print the sum of the ``DU`` column and return ``df`` unchanged.

    Returning the same object lets this reporting step sit inside a ``.pipe`` chain.
    """
    total_units = df["DU"].sum()
    print("Total D.U.: ", total_units)
    return df

# convert_date function
# Write a function that converts the ISSUED_DATE column to datetime format
def convert_date(df):
    """Parse the ``ISSUED_DATE`` column with ``pd.to_datetime``, in place.

    The column on ``df`` is overwritten with the parsed values and the same
    DataFrame object is returned.
    """
    parsed_dates = pd.to_datetime(df["ISSUED_DATE"])
    df["ISSUED_DATE"] = parsed_dates
    return df

# remove_time function
# Write a function that removes the time from the ISSUED_DATE column
def remove_time(df):
    df["ISSUED_DATE"] = df["ISSUED_DATE"].dt.date
    return df

# reset_index function
# Write a function that resets the index of a dataframe
def reset_index(df):
    """Return ``df`` with a fresh 0..n-1 index; the old index is discarded.

    The DataFrame passed in is not modified.
    """
    return df.reset_index(drop=True)

In [134]:
# Using DataFrame.pipe, apply the cleaning functions in the order they were defined.
# Each step receives the dataframe returned by the previous one, so the order matters:
# the header steps run first because later steps look columns up by their cleaned names,
# and remove_time needs the datetime column that convert_date produces.
permits_df = (permits_df
    .pipe(clean_headers)  # strip whitespace, spaces -> underscores, drop periods
    .pipe(uppercase_headers)  # upper-case the headers
    .pipe(remove_duplicates)  # drop repeated PERMIT# rows and print the count
    .pipe(remove_du)  # drop rows with DU <= 0 and print the count
    .pipe(sum_du)  # print the DU total; the dataframe passes through unchanged
    .pipe(convert_date)  # parse ISSUED_DATE with pd.to_datetime
    .pipe(remove_time)  # keep only the date part of ISSUED_DATE
    .pipe(reset_index)  # renumber the remaining rows from 0
)

# Display the cleaned dataframe as the cell output
permits_df

  df.columns = df.columns.str.strip().str.replace(" ", "_").str.replace(".", "")


Number of duplicates removed:  18455
Number of permits with <= 0 DU removed:  42483
Total D.U.:  67337


Unnamed: 0,ST_#,ROAD,PC,WARD,PLAN,LOT,CONTRACTOR,BLG_TYPE,MUNICIPALITY,DESCRIPTION,DU,VALUE,SQ_FT,PERMIT#,APPL_TYPE,ISSUED_DATE,FT2
0,249,TRAILGATE ST,K1V0Z9,Ward 22,4M1349,54,RICHCRAFT HOMES LIMITED,Single,Gloucester,Construct a 2 Storey Single with attached gara...,1,558455.0,3757.0,1600007,Construction,2016-01-04,
1,215,ESCARPMENT CRES,K2T0L6,Ward 4,,73,CARDEL HOMES INC. OTTAWA DIVISION,Single,Kanata,Construct a 2 Storey Single with attached gara...,1,436852.0,2939.0,1600013,Construction,2016-01-04,
2,517,KILSPINDIE RIDGE,,Ward 22,4M-1518,,UNIFORM URBAN DEVELOPMENTS LTD.,Rowhouse,Nepean,Construct a 2 Storey Rowhouse with attached ga...,4,753380.0,6758.0,1600015,Construction,2016-01-04,
3,106,LOCHHOUSE WALK,K2J6A3,Ward 22,4M-1518,35,UNIFORM URBAN DEVELOPMENTS LTD.,Single,Nepean,Construct a 1 Storey Single with attached gara...,1,417678.0,2810.0,1600016,Construction,2016-01-05,
4,228,ESCARPMENT CRES,K2T0L6,Ward 4,,50,UNIFORM URBAN DEVELOPMENTS LTD.,Single,Kanata,Construct a 2 Storey Single with attached gara...,1,474310.0,3191.0,1600017,Construction,2016-01-05,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22451,110,CASTING WAY,K0A2E0,Ward 21,,,MINTO COMMUNITIES INC.,Rowhouse,Rideau,Construct a 1 Storey Rowhouse with attached ga...,4,685416.0,,2300816,Construction,2023-02-28,5270.0
22452,814,COMPANION CRES,K0A2E0,Ward 21,,,MINTO COMMUNITIES INC.,Rowhouse,Rideau,Construct a 1 Storey Rowhouse with attached ga...,5,879205.0,,2300817,Construction,2023-02-28,6760.0
22453,912 B,SMYTH RD,,Ward 18,627,321,CONTRACTOR UNKNOWN,Single,Old Ottawa,Construct a secondary dwelling unit in a 1 sto...,1,50000.0,,2300822,Construction,2023-02-28,894.0
22454,31 B,ACONITUM WAY,,Ward 22,4M-1640,16,CONTRACTOR UNKNOWN,Single,Gloucester,Construct a secondary dwelling unit in the bas...,1,39818.0,,2300829,Construction,2023-02-28,0.0


In [135]:
# Export the cleaned dataframe to "permits_cleaned.csv" (a path relative to the working
# directory); index=False keeps the row index out of the file.
permits_df.to_csv("permits_cleaned.csv", index=False)