In [None]:

# Mount Google Drive (Colab only) so the project's files are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory: the 'data/...' and 'models/...'
# paths used further down are all relative to it.
%cd /content/drive/MyDrive/Earthquake\ Shelter\ Project

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Earthquake Shelter Project


In [None]:
!pip install "pandas<2.0.0"



In [None]:
import pickle
import tempfile
import folium
import os
from joblib import dump, load

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import networkx as nx

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, roc_curve, classification_report, recall_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE

from scipy.sparse import csr_matrix

import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [None]:
!pip install boruta

Collecting boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.6/56.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: boruta
Successfully installed boruta-0.3


In [None]:
from boruta import BorutaPy
import xgboost as xgb

In [None]:
def read_pickle(dataset_name):
    """Load and return the object pickled at ``data/<dataset_name>.pkl``.

    ``dataset_name`` is the path below ``data/`` without the ``.pkl`` suffix;
    the path is resolved against the current working directory.
    """
    # NOTE: unpickling can execute arbitrary code -- only use on trusted files.
    pickle_path = 'data/' + dataset_name + '.pkl'
    with open(pickle_path, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

In [None]:
# Feature matrix (X) and labels (y) from the chiba_more array folder, cast to float32.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can execute
# arbitrary code -- only load files you trust, and drop the flag if the arrays are plain numeric.
X = np.load('data/arrays/chiba_more/x.npy', allow_pickle=True).astype(np.float32)
y = np.load('data/arrays/chiba_more/y.npy', allow_pickle=True).astype(np.float32)

In [None]:
# Sanity check: X and y should have the same number of rows.
X.shape, y.shape

((3632, 26), (3632, 1))

At the very bottom of `Data_Visualization3.ipynb`, `X_df` is the dataframe of the shelters, but it doesn't have latitude or longitude columns.
That's why we're using `df1.loc[X_df.index][['latitude', 'longitude', 'shelter']]`.



In [None]:
# `facilities` is the frame that gets filtered and mapped below (it is read for its
# 'latitude', 'longitude' and 'shelter' columns, plus the feature columns in `cols`).
facilities = read_pickle('arrays/chiba_more/X_df')
# `facility_details` is only inspected here; nothing further down in this notebook reads it.
facility_details = read_pickle('arrays/chiba_more/df1')
# Show both shapes as a quick check of what was loaded.
facilities.shape, facility_details.shape

((3632, 41), (4011, 22))

In [None]:
def plot_shelter_map(df, by='shelter'):
    """Draw every facility in ``df`` on a folium map, colored by the ``by`` column.

    Every row gets a small circle marker. Rows with ``shelter == 1`` additionally
    get a borderless, translucent circle of radius 500 around them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude``, ``longitude``, ``shelter`` and the ``by`` column.
    by : str, default 'shelter'
        Column whose distinct values choose the marker color. Values may be any
        sortable, hashable labels; they no longer have to be the integers 0..n-1.

    Returns
    -------
    folium.Map
        Map centered on the mean latitude/longitude of ``df``.
    """
    m = folium.Map(location=[df['latitude'].mean(), df['longitude'].mean()], zoom_start=12, tiles='CartoDB positron')

    # One color per distinct value, keyed by the value itself. Sorting keeps the
    # value -> color assignment identical to plain positional indexing whenever the
    # values are exactly 0..n-1, and avoids IndexError/TypeError when they are not.
    categories = sorted(df[by].unique())
    # plt.get_cmap replaces the deprecated plt.cm.get_cmap (same arguments).
    palette = plt.get_cmap('tab10', max(len(categories), 1))
    color_of = {value: mcolors.rgb2hex(palette(i)) for i, value in enumerate(categories)}

    for lat, lon, is_shelter, category in zip(df['latitude'], df['longitude'], df['shelter'], df[by]):
        color = color_of[category]
        if is_shelter == 1:
            # Coverage disc drawn only around existing shelters.
            folium.Circle(
                location=(lat, lon),
                radius=500,
                color=None,  # No border color
                weight=0,    # No border weight
                fill=True,
                fill_color=color,
                fill_opacity=0.3
            ).add_to(m)
        # Point marker, drawn for shelters and non-shelters alike.
        folium.CircleMarker(
            location=(lat, lon),
            radius=2,  # Radius in pixels
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.6
        ).add_to(m)

    return m

In [None]:
def plot_shelter_levels(df):
    """Draw every facility in ``df`` on a folium map, colored by its ``level``.

    Rows at level -1 get only a small, faint marker. Every other row gets a
    borderless, translucent circle of radius 500 plus a small marker.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude``, ``longitude`` and ``level`` columns.

    Returns
    -------
    folium.Map
        Map centered on the mean latitude/longitude of ``df``.
    """
    # Fixed colors for the levels that have one.
    level_color_map = {
        -1: '#34495e',  # Color for level -1
        0: '#e74c3c',   # Color for level 0
        1: '#e67e22',   # Color for level 1
        2: '#f1c40f'    # Color for level 2
    }
    # Repeated get_preds passes can create levels above 2; give them a shared
    # fallback color instead of failing with KeyError.
    fallback_color = '#2ecc71'

    # Create a Folium map centered on the average latitude and longitude
    m = folium.Map(location=[df['latitude'].mean(), df['longitude'].mean()], zoom_start=11, tiles='CartoDB positron')

    for lat, lon, level in zip(df['latitude'], df['longitude'], df['level']):
        color = level_color_map.get(level, fallback_color)
        if level == -1:
            folium.CircleMarker(
                location=(lat, lon),
                radius=1,
                color=color,
                fill=True,
                fill_color=color,
                fill_opacity=0.3
            ).add_to(m)
        else:
            folium.Circle(
                location=(lat, lon),
                radius=500,
                color=None,  # No border color
                weight=0,    # No border weight
                fill=True,
                fill_color=color,
                fill_opacity=0.3
            ).add_to(m)
            folium.CircleMarker(
                location=(lat, lon),
                radius=1,
                color=color,
                fill=True,
                fill_color=color,
            ).add_to(m)

    return m

Use test data for a very specific location in Chiba

In [None]:
# Corners of the test area as (latitude, longitude) pairs.
bottom_left = (35.7843004, 139.8980858)  # Matsudo station
top_right = (35.9293258, 140.0115354)    # Togashira station (Toride)

In [None]:
# Keep only facilities inside the bounding box; both bounds are inclusive.
c1 = facilities['latitude'].between(bottom_left[0], top_right[0])
c2 = facilities['longitude'].between(bottom_left[1], top_right[1])
# .copy() makes the selection an independent frame, so adding columns to it later
# neither raises SettingWithCopyWarning nor risks touching `facilities`.
selected_facilities = facilities[c1 & c2].copy()

In [None]:
# Show the selected area, colored by whether each facility is already a shelter.
plot_shelter_map(selected_facilities, by='shelter')

  colors = plt.cm.get_cmap('tab10', num_colors)


In [None]:
# Class balance inside the bounding box ('shelter' is summed as a 0/1 flag).
print('Number of facilities:', selected_facilities.shape[0])
print('Number of shelters:', selected_facilities['shelter'].sum())
print('Number of non-shelter facilities:', selected_facilities.shape[0] - selected_facilities['shelter'].sum())

Number of facilities: 474
Number of shelters: 20
Number of non-shelter facilities: 454


In [None]:
# Percentage of the selected facilities that are already shelters.
(selected_facilities['shelter'].sum() / selected_facilities.shape[0]) * 100

4.219409282700422

Create the `level` column, starting from 0 to represent shelters that already exist (facilities that are not shelters get -1).


In [None]:
# level = shelter - 1, so existing shelters (shelter == 1) get level 0 and every
# other facility gets -1. assign() returns a new frame instead of writing into
# what may be a slice of `facilities`, which avoids SettingWithCopyWarning.
selected_facilities = selected_facilities.assign(level=selected_facilities['shelter'] - 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_facilities['level'] = selected_facilities['shelter'] - 1


In [None]:
selected_facilities['level']

10     -1
25     -1
40      0
82     -1
83     -1
       ..
4091   -1
4092    0
4108    0
4110    0
4115   -1
Name: level, Length: 474, dtype: int64

In [None]:
# Save the starting map (before any predictions) as an HTML file.
plot_shelter_levels(selected_facilities).save('data/maps/level0.html')

In [None]:
# Feature columns handed to the preprocessing pipeline, in this exact order.
# The group labels below are inferred from the column names -- confirm against
# the data dictionary.
cols = [
    # building structure
    'stories_above_ground', 'stories_below_ground',
    'steel_frame', 'reinforced_concrete', 'lightweight', 'block', 'wood',
    # land use
    'rice_paddy', 'other_agriculture', 'forest', 'wasteland', 'building',
    'road_transport', 'rail_transport', 'other_site', 'river_wetland',
    'seashore', 'sea', 'golf_course', 'unknown',
    # distance to water
    'dist_to_water',
    # population by age bracket
    '0-14', '15-64', '65+', '75+', '80+',
]

## Get the model

In [None]:
# Compatibility shim: recent NumPy releases removed the np.float / np.int / np.bool
# aliases (presumably still referenced by the pinned boruta release -- confirm).
# Those aliases were the Python builtins, so restore exactly that. Binding them to
# np.float32 / np.int32 would silently lower precision in any code that uses them.
# Only fill in names that are actually missing, so an existing NumPy attribute is
# never overwritten.
if not hasattr(np, 'float'):
    np.float = float
if not hasattr(np, 'int'):
    np.int = int
if not hasattr(np, 'bool'):
    np.bool = bool

In [None]:
# Load the fitted classifier and the three fitted preprocessing steps that
# preprocess_test_data applies in the order scaler -> pca -> boruta.
# NOTE(review): joblib.load unpickles its input -- only load trusted files.
model = load('models/tokyo_brf.joblib')
scaler = load('models/tokyo_brf_scaler.joblib')
pca = load('models/tokyo_brf_pca.joblib')
boruta_br = load('models/tokyo_brf_boruta.joblib')

In [None]:
def preprocess_test_data(X, scaler, pca, boruta):
    """Run raw features through the fitted scaler, PCA and Boruta selector.

    ``scaler``, ``pca`` and ``boruta`` each only need a ``transform`` method.
    They are applied in that order, each to the previous step's output, and
    the final result is returned.
    """
    features = X
    for fitted_step in (scaler, pca, boruta):
        features = fitted_step.transform(features)
    return features

In [None]:
# (Re)initialise the levels before predicting: existing shelters -> 0, all other
# facilities -> -1. assign() returns a new frame, which avoids the
# SettingWithCopyWarning raised by writing a column into a sliced frame.
selected_facilities = selected_facilities.assign(level=selected_facilities['shelter'] - 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_facilities['level'] = selected_facilities['shelter'] - 1


In [None]:
def get_preds(df):
    """Promote unassigned facilities that the model predicts as shelters.

    Rows with ``level == -1`` are run through the notebook-level ``scaler`` ->
    ``pca`` -> ``boruta_br`` pipeline and classified by ``model``, using the
    feature columns listed in ``cols``. Rows predicted 1 receive the level
    ``df['level'].max() + 1``; rows predicted 0 stay at -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``level`` column and every column in ``cols``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``level`` updated. The input frame is left
        untouched, so the result of each pass can be kept and compared.
    """
    result = df.copy()
    unassigned = result['level'] == -1

    # Nothing left to classify: skip the pipeline, which cannot take zero rows.
    if not unassigned.any():
        return result

    X = result.loc[unassigned, cols].values
    X = preprocess_test_data(X, scaler, pca, boruta_br)
    y_pred = model.predict(X).astype(np.int64)

    # With predictions in {0, 1}: 1 -> max + 1 (the next level), 0 -> -1 (unchanged).
    current_level = (result['level'].max() + 2)
    result.loc[unassigned, 'level'] = (y_pred * current_level) - 1

    return result

Note that the model is biased toward predicting more shelters than usual (see the Chiba confusion matrix).
- This is a good thing, because it's better to have more facilities prepared to be shelters in case of an emergency.

In [None]:
# First prediction pass: facilities predicted as shelters get the next level after
# the existing shelters. Note the naming: the frame is called `level2` but its map
# is saved as level1.html.
level2 = get_preds(selected_facilities)
plot_shelter_levels(level2).save('data/maps/level1.html')

In [None]:
# Second prediction pass over the facilities that are still at level -1;
# the resulting map is displayed inline rather than saved.
level3 = get_preds(level2)
plot_shelter_levels(level3)

In [None]:
# Blank-slate run: treat every facility as unassigned (level -1) and let the model
# choose shelters from scratch. assign() builds a separate frame, so
# `selected_facilities` is not overwritten and `facilities_blank` really is the
# blank frame (the chained `a = b['level'] = -1` form bound it to the integer -1).
facilities_blank = selected_facilities.assign(level=-1)
level_blank = get_preds(facilities_blank)
plot_shelter_levels(level_blank).save('data/maps/level_blank.html')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facilities_blank = selected_facilities['level'] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['level'] == -1, 'level'] = (y_pred * current_level) - 1


The biggest improvement to be made would be giving the model insight into which shelters already exist nearby. Because it isn't aware of any shelters in the area, there are clumps of orange circles.

Because of the way this model is designed (it doesn't consider any nearby shelters in its predictions), it can only be used to predict shelters once, not repeatedly. For example, for a given region it picks a fixed set of shelters regardless of how many shelters already exist.

### Conclusion
Because the model doesn't depend on shelters that already exist:
- it predicts shelters really close together (which makes sense because the model is trained on physical features which will be similar for neighboring plots of land)
- it always predicts the same shelters regardless of how many exist

Shower thought: **If the model predicts shelters that are close together because of their land features, then you could generate a heatmap of where the shelters are recommended to be placed**

Shower thought #2: **For shelters that don't have data published on them, you could extrapolate this data using something similar to what SMOTE does.**


### Why do most of the shelters not have data published on them? What does that actually mean?

We only use shelters that are also public facilities, so each shelter needs to correspond to a public facility in the separate public facilities dataset. That dataset contains some of the features we're training on. If a shelter doesn't have data published on it, it means that either the shelter's address couldn't be converted into latitude/longitude coordinates or there wasn't a corresponding public facility.