Goal of notebook: carry out some sort of linear or logistic regression (to be used as a benchmark; feel free to use sklearn).  Details are left to you, but explain what you are doing in text cells in the notebook.


Our approach: We have decided to predict one of three inspection outcomes: "Passed", "Rat Activity", or "Failed for Other R" (i.e. failed for other reasons).

Since we are categorizing a result into one of several classes, multiclass logistic regression seems like the best approach.

Loading pickled data from Milestone 1 into notebook

In [1]:
import pandas as pd
import pickle
import numpy as np
from google.colab import drive
import gdown
# Mount Google Drive inside the Colab runtime.
# NOTE(review): the visible cells download their data with gdown into the working
# directory and never read from this mount point -- confirm the mount is still needed.
mount_point = '/content/drive'
drive.mount(mount_point)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# RUN THIS CELL TO IMPORT 200 ENTRY DATASET ONLY

In [2]:
# Small debugging dataset: fetch the one-hot-encoded pickle from Google Drive.
# `destination` is the local filename that the loading cell opens afterwards.
file_id = '1PZ0G_2Ou7RWSEc6J_zGZIdfnNtBt_JJa'
destination = 'debugging_dataset_one_hot_encoded.pkl'

# gdown expects the direct-download ("uc?id=") form of the Drive URL.
download_url = f'https://drive.google.com/uc?id={file_id}'
gdown.download(download_url, destination, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1PZ0G_2Ou7RWSEc6J_zGZIdfnNtBt_JJa
To: /content/debugging_dataset_one_hot_encoded.pkl
100%|██████████| 54.9k/54.9k [00:00<00:00, 36.6MB/s]


'debugging_dataset_one_hot_encoded.pkl'

# RUN THIS CELL TO IMPORT FULL DATASET

In [3]:
# Full dataset: fetch the one-hot-encoded pickle from Google Drive.
# Share link: https://drive.google.com/file/d/1pEK8HoH-V0xHgHTN5HMUcrOs16fkAXuj/view?usp=drive_link
# `destination` is the local filename that the loading cell opens afterwards, so
# running this cell makes the full dataset the one that gets loaded.
file_id = '1pEK8HoH-V0xHgHTN5HMUcrOs16fkAXuj'
destination = 'full_dataset_one_hot_encoded (1).pkl'

# gdown expects the direct-download ("uc?id=") form of the Drive URL.
download_url = f'https://drive.google.com/uc?id={file_id}'
gdown.download(download_url, destination, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1pEK8HoH-V0xHgHTN5HMUcrOs16fkAXuj
From (redirected): https://drive.google.com/uc?id=1pEK8HoH-V0xHgHTN5HMUcrOs16fkAXuj&confirm=t&uuid=2dfa332b-f5a8-4f60-b7ee-51c16b2b0491
To: /content/full_dataset_one_hot_encoded (1).pkl
100%|██████████| 1.37G/1.37G [00:16<00:00, 83.1MB/s]


'full_dataset_one_hot_encoded (1).pkl'

In [4]:
# Read the pickled DataFrame from whichever file `destination` currently names.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a
# source you trust.
with open(destination, mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

# Quick sanity check on the load: list the available columns.
print(df.columns)

Index(['INSPECTION_TYPE', 'BBL', 'BORO_CODE', 'BLOCK', 'LOT', 'HOUSE_NUMBER',
       'STREET_NAME', 'ZIP_CODE', 'X_COORD', 'Y_COORD', 'LATITUDE',
       'LONGITUDE', 'BOROUGH', 'INSPECTION_DATE', 'RESULT', 'LOCATION',
       'COMMUNITY BOARD', 'COUNCIL DISTRICT', 'CENSUS TRACT', 'NTA', 'MONTH',
       'REFUSETONSCOLLECTED', 'PAPERTONSCOLLECTED', 'MGPTONSCOLLECTED',
       'Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'month_1',
       'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7',
       'month_8', 'month_9', 'month_10', 'month_11', 'month_12'],
      dtype='object')


In [5]:
# Print a summary of the frame (row count, column names and dtypes) to see which
# columns will need conversion before they can be fed to a model.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7480726 entries, 0 to 7480725
Data columns (total 41 columns):
 #   Column               Dtype         
---  ------               -----         
 0   INSPECTION_TYPE      object        
 1   BBL                  float64       
 2   BORO_CODE            int64         
 3   BLOCK                int64         
 4   LOT                  int64         
 5   HOUSE_NUMBER         object        
 6   STREET_NAME          object        
 7   ZIP_CODE             float64       
 8   X_COORD              float64       
 9   Y_COORD              float64       
 10  LATITUDE             float64       
 11  LONGITUDE            float64       
 12  BOROUGH              object        
 13  INSPECTION_DATE      datetime64[ns]
 14  RESULT               object        
 15  LOCATION             object        
 16  COMMUNITY BOARD      float64       
 17  COUNCIL DISTRICT     float64       
 18  CENSUS TRACT         float64       
 19  NTA                  

In [6]:
# Work on a subset so the Colab runtime does not run out of memory.
# This keeps the FIRST n rows (not a random sample) and then discards any row
# that contains a missing value, so the result can hold fewer than n rows.
n = 1_000_000
df = df.iloc[:n].dropna()

# Sklearn does not like strings so we need to drop columns with strings or one hot encode them. We also need our data to be the same data type so we are converting to the same data type.

In [7]:
# Identify the object-dtype columns (pandas' usual storage for strings); these
# have to be dropped or encoded before the data can be passed to sklearn.
string_columns = df.select_dtypes(include='object').columns
print("Columns with string data:", string_columns)


Columns with string data: Index(['INSPECTION_TYPE', 'HOUSE_NUMBER', 'STREET_NAME', 'BOROUGH', 'RESULT',
       'LOCATION', 'NTA'],
      dtype='object')


In [8]:
# One-hot encode INSPECTION_TYPE, because sklearn cannot consume string features.
inspection_type_coded = pd.get_dummies(df['INSPECTION_TYPE'])

# Drop any indicator columns left over from a previous run of this cell before
# appending the fresh ones. Without this, re-running the cell would append a second
# copy of every indicator column and leave `df` with duplicate column names.
# On a first run nothing matches, so errors='ignore' makes the drop a no-op.
df = pd.concat(
    [df.drop(columns=inspection_type_coded.columns, errors='ignore'), inspection_type_coded],
    axis=1,
)

print(df.columns)

Index(['INSPECTION_TYPE', 'BBL', 'BORO_CODE', 'BLOCK', 'LOT', 'HOUSE_NUMBER',
       'STREET_NAME', 'ZIP_CODE', 'X_COORD', 'Y_COORD', 'LATITUDE',
       'LONGITUDE', 'BOROUGH', 'INSPECTION_DATE', 'RESULT', 'LOCATION',
       'COMMUNITY BOARD', 'COUNCIL DISTRICT', 'CENSUS TRACT', 'NTA', 'MONTH',
       'REFUSETONSCOLLECTED', 'PAPERTONSCOLLECTED', 'MGPTONSCOLLECTED',
       'Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'month_1',
       'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7',
       'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'Compliance',
       'Initial'],
      dtype='object')


In [24]:
# Group the columns by dtype so each group can be converted to plain numbers.
datetime_columns = df.select_dtypes(include=['datetime']).columns
numeric_columns = df.select_dtypes(include=['number']).columns
bool_columns = df.select_dtypes(include=['bool']).columns

# Datetimes -> integer seconds since the Unix epoch: the int64 view of a
# datetime64[ns] column is in nanoseconds, hence the floor division by 10**9.
for col in datetime_columns:
    df[col] = df[col].astype('int64').floordiv(10**9)

# Booleans -> 0/1 integers.
for col in bool_columns:
    df[col] = df[col].astype(int)

# Three independent copies, one per modelling experiment below, so that an
# experiment which edits its frame cannot affect the others.
df1, df2, df3 = (df.copy() for _ in range(3))

# Training with SKLEARN


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder,  StandardScaler
from sklearn.metrics import classification_report


# 1. Build the feature matrix and the target.
# The dropped columns are the string (object-dtype) columns, which sklearn cannot
# consume; INSPECTION_TYPE is already represented by its one-hot columns.
string_cols = ['RESULT', 'STREET_NAME', 'BOROUGH', 'INSPECTION_TYPE', 'HOUSE_NUMBER', 'NTA', 'LOCATION']
X = df1.drop(columns=string_cols)
y = df1['RESULT']

# 2. Encode the string class labels as integers. label_encoder.classes_ keeps the
# original label names in encoded order, which the report below uses.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 3. Hold out 20% of the rows for testing, then standardize the features.
# The scaler is fitted on the training split only and merely applied to the test
# split, so no test-set statistics leak into training.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 4. Fit a multiclass logistic regression.
# `multi_class` is deliberately not passed: it is deprecated in scikit-learn >= 1.5,
# and with the lbfgs solver and three classes the default is already the
# multinomial (softmax) formulation, so the fitted model is the same.
# class_weight='balanced' reweights classes to counter the dominance of 'Passed'.
model = LogisticRegression(solver='lbfgs', max_iter=10000, class_weight='balanced')
model.fit(X_train, y_train)

# 5. Predict on the held-out rows.
y_pred = model.predict(X_test)

# 6. Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))





                    precision    recall  f1-score   support

Failed for Other R       0.23      0.55      0.32     22460
            Passed       0.81      0.51      0.63    142571
      Rat Activity       0.32      0.51      0.39     34969

          accuracy                           0.51    200000
         macro avg       0.45      0.52      0.45    200000
      weighted avg       0.66      0.51      0.55    200000



Beyond fitting a simple logistic model, we had to scale our data in order to help it converge. The classification report shows a few interesting things. First, the prediction for rat activity was not very accurate: it had an F1 score of only 0.39, primarily due to a low precision (0.32), but the recall (0.51) was also poor. This suggests that logistic regression alone does not have the ability to capture the complexity of the data. When the model predicts a passing inspection it is right 81% of the time (precision 0.81), but it recovers only about half of the actual passes (recall 0.51) and predicts nothing else well. In addition, the data is inherently biased (imbalanced), since there are many more passing results than rat-activity or other-reason failures.



In [11]:
# List the distinct outcome labels present in the RESULT column.
print(pd.unique(df1['RESULT']))


['Passed' 'Failed for Other R' 'Rat Activity']


# Let's try with just two classes: failed or passed

In [19]:

# 1. Build the feature matrix and a binary target.
# Both failure outcomes ('Rat Activity' and 'Failed for Other R') collapse into a
# single class: 1 = 'Passed', 0 = failed for any reason.
# The target is derived WITHOUT writing back into df2. The earlier version
# overwrote df2['RESULT'] with 1/0, so running the cell a second time mapped those
# integers through {'Passed': 1, 'Failed': 0} and produced an all-NaN target.
y = (df2['RESULT'] == 'Passed').astype(int)
X = df2.drop(columns=['RESULT', 'STREET_NAME', 'BOROUGH', 'INSPECTION_TYPE', 'HOUSE_NUMBER', 'NTA', 'LOCATION'])  # string columns; sklearn cannot consume strings

# 2. Hold out 20% of the rows for testing, then standardize the features.
# The scaler is fitted on the training split only and merely applied to the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 3. Fit a binary logistic regression; class_weight='balanced' reweights the
# classes to counter the dominance of 'Passed'.
model = LogisticRegression(solver='lbfgs', max_iter=10000, class_weight='balanced')
model.fit(X_train, y_train)

# 4. Predict on the held-out rows.
y_pred = model.predict(X_test)

# 5. Evaluate. Names are listed in label order: 0 -> 'Failed', 1 -> 'Passed'.
target_names = ['Failed', 'Passed']
print(classification_report(y_test, y_pred, target_names=target_names))



              precision    recall  f1-score   support

      Failed       0.43      0.53      0.48     57429
      Passed       0.79      0.72      0.75    142571

    accuracy                           0.67    200000
   macro avg       0.61      0.63      0.62    200000
weighted avg       0.69      0.67      0.68    200000



# This seems to improve the model slightly, as the F1-scores are slightly higher. Let's try without 'failed for other reasons'

In [25]:
# Exclude the 'Failed for Other R' inspections so only two outcomes remain.
keep_rows = df3['RESULT'] != 'Failed for Other R'
df_filtered = df3.loc[keep_rows].copy()

# Relabel 'Rat Activity' as 'Failed'; 'Passed' stays as it is.
df_filtered['RESULT'] = df_filtered['RESULT'].replace({'Rat Activity': 'Failed'})

# Features are everything except the target and the remaining string columns.
string_cols = ['RESULT', 'STREET_NAME', 'BOROUGH', 'INSPECTION_TYPE', 'HOUSE_NUMBER', 'NTA', 'LOCATION']
X = df_filtered.drop(columns=string_cols)
y = df_filtered['RESULT']

# Integer-encode the labels; label_encoder.classes_ keeps the names in encoded
# order for the report.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Standardize using statistics from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Class-weighted binary logistic regression.
model = LogisticRegression(solver='lbfgs', max_iter=10000, class_weight='balanced')
model.fit(X_train, y_train)

# Predict on the held-out rows and report per-class precision / recall / F1.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Unique values in RESULT column after filtering: ['Passed' 'Failed']
              precision    recall  f1-score   support

      Failed       0.33      0.58      0.42     34772
      Passed       0.87      0.72      0.79    142755

    accuracy                           0.69    177527
   macro avg       0.60      0.65      0.61    177527
weighted avg       0.77      0.69      0.72    177527



This does not change the result significantly.

['Passed']
