In [21]:
# Read the building-permits CSV into the DataFrame `df` used by the later cells.
import pandas as pd

file_path = '/content/Building_Permits.csv'

# low_memory=False: per the pandas docs this avoids chunked parsing, which can
# otherwise lead to mixed-type inference within a column.
df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

In [22]:
# 1. Look at a couple of rows from the sf_permits dataset. Do you notice any missing data?
print(df.head(n=5))  # n=5 is the default: show the first five rows

# Answer: yes. Several columns show NaN (Not a Number), which is how pandas
# represents a missing value.


# 2. Find out what percent of the sf_permits dataset is missing.
# Total cell count = rows x columns; missing count = per-column NaN tallies summed.
total_cells = df.shape[0] * df.shape[1]
missing_values_count = df.isna().sum().sum()
percent_missing = missing_values_count / total_cells * 100
print(f"\nPercentage of missing data: {percent_missing:.2f}%")


# 3. Look at the columns Street Number Suffix and Zipcode from the sf_permits datasets.
#    Both of these contain missing values.
#
#    a. Which, if either, of these are missing because they don't exist?
#    b. Which, if either, are missing because they were not recorded?

# Count the NaN entries in each of the two columns.
suffix_missing = df['Street Number Suffix'].isna().sum()
print("\nStreet Number Suffix Missing Values:", suffix_missing)
zipcode_missing = df['Zipcode'].isna().sum()
print("Zipcode Missing Values:", zipcode_missing)

# Answer (judging from context):
# a. Street Number Suffix is most likely missing because the value does not
#    exist: many addresses simply have no suffix such as "A" or "B".
# b. Zipcode is most likely missing because it was not recorded (or not
#    recorded accurately) during data collection.
#
# Examining the specific rows that have missing values could provide more insight.


  Permit Number  Permit Type            Permit Type Definition  \
0  201505065519            4                      sign - erect   
1  201604195146            4                      sign - erect   
2  201605278609            3  additions alterations or repairs   
3  201611072166            8            otc alterations permit   
4  201611283529            6                       demolitions   

  Permit Creation Date Block  Lot  Street Number Street Number Suffix  \
0           05/06/2015  0326  023            140                  NaN   
1           04/19/2016  0306  007            440                  NaN   
2           05/27/2016  0595  203           1647                  NaN   
3           11/07/2016  0156  011           1230                  NaN   
4           11/28/2016  0342  001            950                  NaN   

  Street Name Street Suffix  ...  Existing Construction Type  \
0       Ellis            St  ...                         3.0   
1       Geary            St  ...    

In [23]:
# 4. Removing rows and columns with missing values

# Keep only the rows that have no NaN in any column.
df_rows_dropped = df.dropna(axis='index', how='any')
print(f"\nNumber of rows remaining after dropping rows with missing values: {df_rows_dropped.shape[0]}")

# Keep only the columns that have no NaN in any row.
df_cols_dropped = df.dropna(axis='columns', how='any')
print(f"\nNumber of columns remaining after dropping columns with missing values: {df_cols_dropped.shape[1]}")

# 5. Replacing NaN values

# a. Backward fill: within each column, a NaN takes the value from the next
#    row below it that has one. NaNs with no later value in their column
#    cannot be filled this way, so the trailing fillna replaces them with 0.
df_backward_fill = (
    df
    .bfill(axis='index')
    .fillna(value=0)
)
print("\nDataFrame after backward filling and replacing remaining NaNs with 0:")
print(df_backward_fill.head())

# b. Forward fill: within each column, a NaN takes the most recent value from
#    a row above it. NaNs with no earlier value in their column are then
#    replaced with 0.
df_forward_fill = (
    df
    .ffill(axis='index')
    .fillna(value=0)
)
print("\nDataFrame after forward filling and replacing remaining NaNs with 0:")
print(df_forward_fill.head())



Number of rows remaining after dropping rows with missing values: 0

Number of columns remaining after dropping columns with missing values: 12

DataFrame after backward filling and replacing remaining NaNs with 0:
  Permit Number  Permit Type            Permit Type Definition  \
0  201505065519            4                      sign - erect   
1  201604195146            4                      sign - erect   
2  201605278609            3  additions alterations or repairs   
3  201611072166            8            otc alterations permit   
4  201611283529            6                       demolitions   

  Permit Creation Date Block  Lot  Street Number Street Number Suffix  \
0           05/06/2015  0326  023            140                    A   
1           04/19/2016  0306  007            440                    A   
2           05/27/2016  0595  203           1647                    A   
3           11/07/2016  0156  011           1230                    A   
4           11/28/2016