In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import math

In [2]:
# Load the pre-cleaned dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='cleaned_dataset.csv')

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,damage,street_type,fire_unit,incident_start_date,structure_type,structure_category,roof_material,eaves,exterior_siding,window_pane,attached_patio_material,attached_fence_material,year_built
0,No Damage,Road,LNU,6/6/2020 12:00:00 AM,Single Family Residence Multi Story,Single Residence,Asphalt,Unenclosed,Wood,Single Pane,No Patio Cover/Carport,No Fence,1997.0
1,Affected (1-9%),Road,LNU,6/6/2020 12:00:00 AM,Single Family Residence Single Story,Single Residence,Asphalt,Unenclosed,Wood,Multi Pane,No Patio Cover/Carport,Combustible,1980.0
2,No Damage,Road,LNU,6/6/2020 12:00:00 AM,Single Family Residence Single Story,Single Residence,Asphalt,Enclosed,Wood,Single Pane,No Patio Cover/Carport,No Fence,2004.0
3,No Damage,Road,LNU,6/6/2020 12:00:00 AM,Single Family Residence Single Story,Single Residence,Asphalt,Unenclosed,Wood,Single Pane,Combustible,No Fence,1981.0
4,No Damage,Road,LNU,6/6/2020 12:00:00 AM,Single Family Residence Single Story,Single Residence,Tile,Enclosed,Wood,Multi Pane,Combustible,No Fence,1980.0


In [4]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100230 entries, 0 to 100229
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   damage                   100230 non-null  object 
 1   street_type              93525 non-null   object 
 2   fire_unit                100230 non-null  object 
 3   incident_start_date      100230 non-null  object 
 4   structure_type           100230 non-null  object 
 5   structure_category       100230 non-null  object 
 6   roof_material            99602 non-null   object 
 7   eaves                    99118 non-null   object 
 8   exterior_siding          99321 non-null   object 
 9   window_pane              99254 non-null   object 
 10  attached_patio_material  100230 non-null  object 
 11  attached_fence_material  78635 non-null   object 
 12  year_built               69812 non-null   float64
dtypes: float64(1), object(12)
memory usage: 9.9+ MB


In [5]:
# List the distinct fire unit codes in first-appearance order.
pd.unique(df['fire_unit'])

array(['LNU', 'AEU', 'BTU', 'SLU', 'SKU', 'SCU', 'BEU', 'LMU', 'RRU',
       'BDU', 'KRN', 'NEU', 'SHU', 'TGU', 'LAC', 'MEU', 'MVU', 'HUU',
       'TUU', 'FKU', 'MMU', 'CZU', 'ORC', 'VNC', 'TCU', 'SBC', 'SDU'],
      dtype=object)

In [6]:
# Parse the incident timestamp using its exact 12-hour format
# (e.g. '6/6/2020 12:00:00 AM'); an explicit format avoids per-row inference.
df['incident_start_date'] = pd.to_datetime(df['incident_start_date'], format='%m/%d/%Y %I:%M:%S %p')

# Store the calendar year as an integer. The previous strftime('%Y-%m-%d')
# produced full date strings, which contradicted the column name.
df['incident_year'] = df['incident_start_date'].dt.year


In [7]:
# Categorical columns that get replaced by integer codes.
columns_to_bin = ['damage','street_type','fire_unit','structure_type','structure_category','roof_material','eaves','exterior_siding','window_pane','attached_patio_material','attached_fence_material']

# column name -> {original label: integer code}; kept so codes can be decoded later.
bins_map = {}

# Walk the frame's columns in their stored order, touching only the listed ones.
for column in (name for name in df.columns if name in columns_to_bin):
    # Codes follow first-appearance order; a missing value (NaN), when present,
    # is treated as one more label and receives its own code.
    labels = df[column].unique()
    mapping = dict(zip(labels, range(len(labels))))

    bins_map[column] = mapping
    df[column] = df[column].map(mapping)

print(bins_map)


{'damage': {'No Damage': 0, 'Affected (1-9%)': 1, 'Minor (10-25%)': 2, 'Destroyed (>50%)': 3, 'Major (26-50%)': 4, 'Inaccessible': 5}, 'street_type': {'Road': 0, 'Lane': 1, 'Other': 2, 'Trail': 3, 'Drive': 4, 'Street': 5, 'None': 6, 'Way': 7, 'Avenue': 8, 'Court': 9, 'Loop': 10, 'Route': 11, 'Place': 12, 'Alley': 13, 'Terrace': 14, 'Circle': 15, 'Boulevard': 16, 'Parkway': 17, nan: 18, ' ': 19, 'Hwy': 20, 'Dirt road': 21, '-': 22, 'not given': 23, 'Grade': 24, 'Ext.': 25, 'not noted': 26, 'Unk': 27, 'Pass': 28}, 'fire_unit': {'LNU': 0, 'AEU': 1, 'BTU': 2, 'SLU': 3, 'SKU': 4, 'SCU': 5, 'BEU': 6, 'LMU': 7, 'RRU': 8, 'BDU': 9, 'KRN': 10, 'NEU': 11, 'SHU': 12, 'TGU': 13, 'LAC': 14, 'MEU': 15, 'MVU': 16, 'HUU': 17, 'TUU': 18, 'FKU': 19, 'MMU': 20, 'CZU': 21, 'ORC': 22, 'VNC': 23, 'TCU': 24, 'SBC': 25, 'SDU': 26}, 'structure_type': {'Single Family Residence Multi Story': 0, 'Single Family Residence Single Story': 1, 'Utility Misc Structure': 2, 'Mobile Home Double Wide': 3, 'Motor Home': 4, 

In [8]:
# Coerce the construction year to a numeric dtype; unparseable entries become NaN.
built = pd.to_numeric(df['year_built'], errors='coerce')

# Impute missing construction years with the median of the known ones.
# NOTE(review): the median is taken over every row, i.e. before any train/test split.
df['year_built'] = built.fillna(built.median())

# Numeric calendar year in which the incident started.
df['incident_year'] = pd.to_datetime(df['incident_start_date']).dt.year

# Building age at the time of the incident, stored as an integer.
df['age'] = df['incident_year'].sub(df['year_built']).astype(int)

In [9]:
# Drop the columns that were only needed to derive 'age':
# incident_start_date, year_built and incident_year.
# One non-inplace call replaces three inplace drops; errors='ignore' keeps the
# cell re-runnable, where the inplace version raised KeyError on a second run.
df = df.drop(
    columns=['incident_start_date', 'year_built', 'incident_year'],
    errors='ignore',
)

In [10]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

2025-02-08 21:42:22.888522: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-02-08 21:42:23.001170: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-08 21:42:23.006058: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-02-08 21:42:23.006071: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [11]:
# Target (y) is the integer-coded damage level; features (X) are every other column.
y = df['damage']
X = df.drop(columns='damage')

In [12]:
# One-hot encode the integer damage codes for use with categorical cross-entropy.
# Both values are derived from df['damage'] rather than from `y`: this cell
# rebinds `y` to a NumPy array, so the earlier `y.unique()` failed on re-run.
num_classes = df['damage'].nunique()
y = to_categorical(df['damage'], num_classes=num_classes)

In [13]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Assemble the Keras classifier one layer at a time.
model = Sequential()

# First dense layer; input_shape ties the expected feature count to X_train's column count.
model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(BatchNormalization())
model.add(Dropout(0.3))  # dropout as regularisation against overfitting

# Tapering stack of fully connected ReLU hidden layers.
for units in (128, 64, 32, 16):
    model.add(Dense(units, activation='relu'))

# Output layer: one softmax unit per damage class.
model.add(Dense(num_classes, activation='softmax'))

2025-02-08 21:42:24.146193: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2025-02-08 21:42:24.146211: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2025-02-08 21:42:24.146221: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (adamo-Surface-Pro-7): /proc/driver/nvidia/version does not exist
2025-02-08 21:42:24.146464: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Compile the model: Adam optimiser with categorical cross-entropy, which
# matches the one-hot encoded target and the softmax output layer.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# The notebook imports these callbacks but never used them, so training always
# ran the full 100 epochs. Stop once validation loss stalls (restoring the best
# weights seen) and halve the learning rate on shorter plateaus.
training_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6),
]

# Train the model; 20% of the training rows are held out for validation,
# which supplies the val_loss the callbacks monitor. epochs is an upper bound.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=training_callbacks,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

In [None]:
# Measure loss and accuracy on the held-out test split.
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# Score the held-out rows; each row of y_pred holds the softmax output per class.
y_pred = model.predict(X_test)

# Collapse the one-hot truth and the predicted probabilities back to integer class ids.
y_test_classes = y_test.argmax(axis=-1)
y_pred_classes = y_pred.argmax(axis=-1)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true=y_test_classes, y_pred=y_pred_classes))

In [None]:

# Persist the trained model to disk; the .h5 extension selects the HDF5 format.
model.save(filepath='sequential-model-2.h5')