In [24]:
# Initial imports
import pandas as pd
from path import Path
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
import plotly.express as px
import hvplot.pandas

In [15]:
# Read in incident_declation file
file_path = Path("./Resources/incident_declaration.csv")
id_df = pd.read_csv(file_path)
id_df.tail(20)

Unnamed: 0,disaster_number,incident_type,incident_month,incident_begin_year,incident_duration
30287,4563,Hurricane,9,2020,2 days 00:00:00
30288,4563,Hurricane,9,2020,2 days 00:00:00
30289,4563,Hurricane,9,2020,2 days 00:00:00
30290,4563,Hurricane,9,2020,2 days 00:00:00
30291,4563,Hurricane,9,2020,2 days 00:00:00
30292,4563,Hurricane,9,2020,2 days 00:00:00
30293,4564,Hurricane,9,2020,14 days 00:00:00
30294,4564,Hurricane,9,2020,14 days 00:00:00
30295,4564,Hurricane,9,2020,14 days 00:00:00
30296,4564,Hurricane,9,2020,14 days 00:00:00


In [14]:
# Create scatterplot from incident duration file.
id_df.hvplot.scatter(x="disaster_number", y="incident_duration")

In [42]:
# View types of data
id_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30307 entries, 0 to 30306
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   disaster_number      30307 non-null  int64 
 1   incident_type        30307 non-null  object
 2   incident_month       30307 non-null  int64 
 3   incident_begin_year  30307 non-null  int64 
 4   incident_duration    30307 non-null  object
dtypes: int64(3), object(2)
memory usage: 1.2+ MB


In [41]:
# Generate our categorical variable list
id_cat = id_df.dtypes[id_df.dtypes == "object"].index.tolist()
id_cat

['incident_type', 'incident_duration']

In [44]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(id_df[id_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(id_cat)
encode_df.head()



Unnamed: 0,incident_type_Coastal Storm,incident_type_Earthquake,incident_type_Fire,incident_type_Flood,incident_type_Freezing,incident_type_Hurricane,incident_type_Mud/Landslide,incident_type_Severe Ice Storm,incident_type_Severe Storm(s),incident_type_Snow,...,incident_duration_87 days 00:00:00,incident_duration_89 days 00:00:00,incident_duration_9 days 00:00:00,incident_duration_90 days 00:00:00,incident_duration_91 days 00:00:00,incident_duration_92 days 00:00:00,incident_duration_94 days 00:00:00,incident_duration_95 days 00:00:00,incident_duration_96 days 00:00:00,incident_duration_99 days 00:00:00
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Merge one-hot encoded features and drop the originals
id_df = id_df.merge(encode_df,left_index=True, right_index=True)
id_df = id_df.drop(id_cat,1)
id_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,disaster_number,incident_month,incident_begin_year,incident_type_Coastal Storm,incident_type_Earthquake,incident_type_Fire,incident_type_Flood,incident_type_Freezing,incident_type_Hurricane,incident_type_Mud/Landslide,...,incident_duration_87 days 00:00:00,incident_duration_89 days 00:00:00,incident_duration_9 days 00:00:00,incident_duration_90 days 00:00:00,incident_duration_91 days 00:00:00,incident_duration_92 days 00:00:00,incident_duration_94 days 00:00:00,incident_duration_95 days 00:00:00,incident_duration_96 days 00:00:00,incident_duration_99 days 00:00:00
0,608,10,1979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,609,10,1979,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,610,11,1979,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,610,11,1979,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,610,11,1979,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Use K-Means to create Elbow Curve and clusters
Classifying incident durations of natural disasters.

In [47]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(id_df)
    inertia.append(km.inertia_)

In [48]:
# Create the Elbow Curve using hvPlot
elbow_data = {'k': k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [49]:
def get_clusters(k, data):
    # Create a copy of the DataFrame
    data = data.copy()

    # Initialize the K-Means model
    model = KMeans(n_clusters=k, random_state=0)

    # Fit the model
    model.fit(data)

    # Predict clusters
    predictions = model.predict(data)

    # Create return DataFrame with predicted clusters
    data["class"] = model.labels_

    return data

In [50]:
two_clusters = get_clusters(2,id_df)
two_clusters

Unnamed: 0,disaster_number,incident_month,incident_begin_year,incident_type_Coastal Storm,incident_type_Earthquake,incident_type_Fire,incident_type_Flood,incident_type_Freezing,incident_type_Hurricane,incident_type_Mud/Landslide,...,incident_duration_89 days 00:00:00,incident_duration_9 days 00:00:00,incident_duration_90 days 00:00:00,incident_duration_91 days 00:00:00,incident_duration_92 days 00:00:00,incident_duration_94 days 00:00:00,incident_duration_95 days 00:00:00,incident_duration_96 days 00:00:00,incident_duration_99 days 00:00:00,class
0,608,10,1979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,609,10,1979,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,610,11,1979,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,610,11,1979,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,610,11,1979,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30302,4564,9,2020,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
30303,4564,9,2020,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
30304,4564,9,2020,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
30305,4564,9,2020,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
# Plotting the 2D-Scatter with x="Annual Income" and y="Spending Score (1-100)"
two_clusters.hvplot.scatter(x="disaster_number",y="incident_duration", by="class")

In [None]:
# Plot the 3D-scatter with x="Annual Income", y="Spending Score (1-100)" and z="Age"
fig = px.scatter_3d(
    two_clusters,
    x="disaster_number",
    y="Spending Score (1-100)",
    z="Annual Income",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:
plt.scatter(id_df.incident_month, id_df.incident_duration)
plt.xlabel('Incident Month')
plt.ylabel('Incident Duration')
plt.show()

# Linear Regression
Create linear regression of months vs incident duration to see any possible trends.

In [None]:
# The data in the df column must be reshaped into an array with shape (num_samples, num_features)
X = id_df.incident_month.values.reshape(-1, 1)
y = id_df.incident_duration

In [None]:
X.shape

In [None]:
plt.scatter(id_df.incident_begin_year, id_df.incident_duration)
plt.xlabel('Incident Begin Year')
plt.ylabel('Incident Duration')
plt.show()

# Train and test data for KNN(Nearest Neighbors) model
Classify incident duration because it uses proximity to make classifications or predictions about the grouping of an individual data point. 

In [None]:
# Split our preprocessed data into our features and target arrays
y = id_df["incident_duration"].values
X = id_df.drop(["incident_duration"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

print(confusion_matrix(y_test,pred))

In [None]:
print(classification_report(y_test,pred))

# KNN(nearest neighbors) model

In [None]:
# Start with k=1
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train,y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')
           

pred = knn.predict(X_test)

#Predicting and evavluations 

print(confusion_matrix(y_test,pred))

In [None]:
# Use the elbow method to pick a good K Value:

error_rate = []

for i in range(1,40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(10,6))

plt.plot(range(1,40),error_rate,color='blue', linestyle='dashed', marker='o',markerfacecolor='red', markersize=10)

plt.title('Error Rate vs. K Value')

plt.xlabel('K')

plt.ylabel('Error Rate')

In [None]:
# Retrain the model with that and check the classification report!
# Compare K=1 with K=23
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train,y_train)
pred = knn.predict(X_test)

print('WITH K=1')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))

In [None]:
knn = KNeighborsClassifier(n_neighbors=23)

knn.fit(X_train,y_train)
pred = knn.predict(X_test)

print('WITH K=23')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))

In [None]:
# Predict amount of natural disasters in the future - regression

In [None]:
# Cluster states with similar disasters

In [1]:
# Cluster states with similar amounts of disasters

In [None]:
# Cluster counties with similar disasters

In [None]:
# Cluster counties with similar amounts of disasters

In [None]:
# Logistic regression safe or not