In [1]:

import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv('/Users/wangfan/Documents/wukm/Uncertainty Projects/NEO/NEO.csv')
df.head()

Unnamed: 0,neo_id,name,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,orbiting_body,relative_velocity,miss_distance,is_hazardous
0,2162117,162117 (1998 SD15),19.14,0.394962,0.883161,Earth,71745.401048,58143620.0,False
1,2349507,349507 (2008 QY),18.5,0.530341,1.185878,Earth,109949.757148,55801050.0,True
2,2455415,455415 (2003 GA),21.45,0.136319,0.304818,Earth,24865.506798,67206890.0,False
3,3132126,(2002 PB),20.63,0.198863,0.444672,Earth,78890.076805,30396440.0,False
4,3557844,(2011 DW),22.7,0.076658,0.171412,Earth,56036.519484,63118630.0,False


In [3]:
df.shape

(338199, 9)

In [4]:
df.columns

Index(['neo_id', 'name', 'absolute_magnitude', 'estimated_diameter_min',
       'estimated_diameter_max', 'orbiting_body', 'relative_velocity',
       'miss_distance', 'is_hazardous'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338199 entries, 0 to 338198
Data columns (total 9 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   neo_id                  338199 non-null  int64  
 1   name                    338199 non-null  object 
 2   absolute_magnitude      338171 non-null  float64
 3   estimated_diameter_min  338171 non-null  float64
 4   estimated_diameter_max  338171 non-null  float64
 5   orbiting_body           338199 non-null  object 
 6   relative_velocity       338199 non-null  float64
 7   miss_distance           338199 non-null  float64
 8   is_hazardous            338199 non-null  bool   
dtypes: bool(1), float64(5), int64(1), object(2)
memory usage: 21.0+ MB


In [6]:
df.describe

<bound method NDFrame.describe of           neo_id                name  absolute_magnitude  \
0        2162117  162117 (1998 SD15)              19.140   
1        2349507    349507 (2008 QY)              18.500   
2        2455415    455415 (2003 GA)              21.450   
3        3132126           (2002 PB)              20.630   
4        3557844           (2011 DW)              22.700   
...          ...                 ...                 ...   
338194  54403809          (2023 VS4)              28.580   
338195  54415298          (2023 XW5)              28.690   
338196  54454871          (2024 KJ7)              21.919   
338197  54456245           (2024 NE)              23.887   
338198  54460573          (2024 NH3)              22.951   

        estimated_diameter_min  estimated_diameter_max orbiting_body  \
0                     0.394962                0.883161         Earth   
1                     0.530341                1.185878         Earth   
2                     0.13631

In [7]:
df['is_hazardous'].value_counts()

is_hazardous
False    295037
True      43162
Name: count, dtype: int64

In [8]:
df = df.drop(['neo_id', 'name', 'orbiting_body'], axis=1)
df.head()

Unnamed: 0,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,relative_velocity,miss_distance,is_hazardous
0,19.14,0.394962,0.883161,71745.401048,58143620.0,False
1,18.5,0.530341,1.185878,109949.757148,55801050.0,True
2,21.45,0.136319,0.304818,24865.506798,67206890.0,False
3,20.63,0.198863,0.444672,78890.076805,30396440.0,False
4,22.7,0.076658,0.171412,56036.519484,63118630.0,False


In [9]:
# Check for missing values
print(df.isnull().sum())

absolute_magnitude        28
estimated_diameter_min    28
estimated_diameter_max    28
relative_velocity          0
miss_distance              0
is_hazardous               0
dtype: int64


In [10]:
# Show rows with null values
null_rows = df[df.isnull().any(axis=1)]
print(null_rows)

        absolute_magnitude  estimated_diameter_min  estimated_diameter_max  \
107508                 NaN                     NaN                     NaN   
111671                 NaN                     NaN                     NaN   
114171                 NaN                     NaN                     NaN   
116688                 NaN                     NaN                     NaN   
146059                 NaN                     NaN                     NaN   
148836                 NaN                     NaN                     NaN   
150446                 NaN                     NaN                     NaN   
153108                 NaN                     NaN                     NaN   
155973                 NaN                     NaN                     NaN   
186857                 NaN                     NaN                     NaN   
190088                 NaN                     NaN                     NaN   
191131                 NaN                     NaN              

In [11]:
from sklearn.impute import KNNImputer

# Select the columns for imputation
columns_for_imputation = ['absolute_magnitude', 'estimated_diameter_min', 'estimated_diameter_max']

# Initialize the KNNImputer
imputer = KNNImputer(n_neighbors=5)

# Fit and transform the data
df[columns_for_imputation] = imputer.fit_transform(df[columns_for_imputation])

# Verify the imputation
print(df.isnull().sum())

absolute_magnitude        0
estimated_diameter_min    0
estimated_diameter_max    0
relative_velocity         0
miss_distance             0
is_hazardous              0
dtype: int64


In [12]:
df['average_diameter'] = (df['estimated_diameter_min'] + df['estimated_diameter_max']) / 2

df['diameter_range'] = df['estimated_diameter_max'] - df['estimated_diameter_min']

df['scaled_relative_velocity'] = (df['relative_velocity'] - df['relative_velocity'].min()) / (df['relative_velocity'].max() - df['relative_velocity'].min())

df['log_miss_distance'] = np.log(df['miss_distance'])

df['velocity_diameter_interaction'] = df['relative_velocity'] * df['average_diameter']

df['velocity_distance_ratio'] = df['relative_velocity'] / df['miss_distance']

df['diameter_magnitude_ratio'] = df['average_diameter'] / df['absolute_magnitude']

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Select numerical columns for normalization
numerical_cols = ['absolute_magnitude', 'estimated_diameter_min', 'estimated_diameter_max','relative_velocity','miss_distance']

# Initialize the scaler
scaler = MinMaxScaler()

# Fit and transform the numerical columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Display the normalized data
print(df.head())


   absolute_magnitude  estimated_diameter_min  estimated_diameter_max  \
0            0.406494                0.010506                0.010506   
1            0.380189                0.014112                0.014112   
2            0.501439                0.003617                0.003617   
3            0.467735                0.005283                0.005283   
4            0.552815                0.002028                0.002028   

   relative_velocity  miss_distance  is_hazardous  average_diameter  \
0           0.245362       0.777315         False          0.639061   
1           0.376388       0.745994          True          0.858109   
2           0.084582       0.898495         False          0.220568   
3           0.269865       0.406323         False          0.321768   
4           0.191486       0.843833         False          0.124035   

   diameter_range  scaled_relative_velocity  log_miss_distance  \
0        0.488200                  0.245362          17.878427   
1 

In [14]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Assuming 'is_hazardous' is the target variable
X = df.drop('is_hazardous', axis=1)
y = df['is_hazardous']



In [15]:
"""

from imblearn.under_sampling import NearMiss

# Set up the undersampling method
undersampler = NearMiss(version=1, n_neighbors=3)

# Apply the transformation to the dataset
X, y = undersampler.fit_resample(X, y)
"""

'\n\nfrom imblearn.under_sampling import NearMiss\n\n# Set up the undersampling method\nundersampler = NearMiss(version=1, n_neighbors=3)\n\n# Apply the transformation to the dataset\nX, y = undersampler.fit_resample(X, y)\n'

In [16]:
print(y)

0         False
1          True
2         False
3         False
4         False
          ...  
338194    False
338195    False
338196    False
338197    False
338198    False
Name: is_hazardous, Length: 338199, dtype: bool


In [17]:
print(X.shape)
print(y.shape)

(338199, 12)
(338199,)


In [18]:
# Label Onehot-encoding 
y_Onehot = to_categorical(y)

# Split the data into train and test sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y_Onehot, test_size=0.25, random_state=42)

np.save('/Users/wangfan/Documents/wukm/Uncertainty Projects/NEO/X_test_NEO.npy', X_test)
np.save('/Users/wangfan/Documents/wukm/Uncertainty Projects/NEO/y_test_NEO.npy', y_test)

# Print the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (253649, 12)
X_test shape: (84550, 12)
y_train shape: (253649, 2)
y_test shape: (84550, 2)


In [19]:
print(y_Onehot)

[[1. 0.]
 [0. 1.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [20]:
from tensorflow import keras
from tensorflow.keras import layers

model1 = keras.Sequential([
    layers.BatchNormalization(input_shape=[X_train.shape[1]]),
    layers.Dense(16,activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(16,activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(2,activation='softmax')
])

model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stopping = keras.callbacks.EarlyStopping(
    patience=5,
    min_delta=0.001,
    restore_best_weights=True,
)
history = model1.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=512,
    epochs=200,
    callbacks=[early_stopping],
)
# Evaluate the model on the test set
accuracy = model1.evaluate(X_test, y_test)
print('Accuracy: {}'.format(accuracy))


Epoch 1/200


  super().__init__(**kwargs)


[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7133 - loss: 0.6625 - val_accuracy: 0.8729 - val_loss: 0.2977
Epoch 2/200
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 678us/step - accuracy: 0.8643 - loss: 0.3264 - val_accuracy: 0.8725 - val_loss: 0.2668
Epoch 3/200
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 685us/step - accuracy: 0.8726 - loss: 0.2854 - val_accuracy: 0.8733 - val_loss: 0.2577
Epoch 4/200
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 760us/step - accuracy: 0.8733 - loss: 0.2732 - val_accuracy: 0.8759 - val_loss: 0.2526
Epoch 5/200
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 664us/step - accuracy: 0.8747 - loss: 0.2671 - val_accuracy: 0.8764 - val_loss: 0.2512
Epoch 6/200
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 665us/step - accuracy: 0.8759 - loss: 0.2632 - val_accuracy: 0.8809 - val_loss: 0.2485
Epoch 7/200
[1m496/496

In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Make predictions on the test set
y_pred = model1.predict(X_test)

# Convert predictions and true labels from one-hot encoding to class indices
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test_classes, y_pred_classes)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

[1m2643/2643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 251us/step
Confusion Matrix:
[[73415   349]
 [ 9610  1176]]


In [22]:
model1.save('/Users/wangfan/Documents/wukm/Uncertainty Projects/NEO/NEO.keras')