In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report


In [2]:
import sys
import sklearn

# Record the interpreter and core library versions alongside the results so
# the environment that produced this notebook can be reconstructed later.
print(
    f"Python version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    f"Scikit-learn version: {sklearn.__version__}",
    sep="\n",
)


Python version: 3.13.9 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 19:09:58) [MSC v.1929 64 bit (AMD64)]
Pandas version: 2.3.3
NumPy version: 2.3.5
Scikit-learn version: 1.7.2


In [5]:
import os

# Show the kernel's current working directory: the relative path
# "sbdb_raw.csv" passed to pd.read_csv is resolved against this location.
os.getcwd()


'C:\\Users\\joydi'

In [12]:
import pandas as pd

# Load the raw CSV export in a single pass (low_memory=False) instead of
# chunk by chunk, then preview the first rows as a quick sanity check.
df = pd.read_csv("sbdb_raw.csv", low_memory=False)
df.head()

## Data Import and Initial Validation

# The NASA JPL Small-Body Database CSV file was successfully imported using pandas. A DtypeWarning was
# observed due to mixed data types in several columns, which is expected given the heterogeneous nature
# of astronomical data. To ensure consistent type inference, the dataset was reloaded using
# `low_memory=False`. Detailed data type cleaning and missing value handling are planned in subsequent
# preprocessing steps.



Unnamed: 0,spkid,full_name,pdes,name,prefix,neo,pha,sats,H,G,...,rms,two_body,A1,A1_sigma,A2,A2_sigma,A3,A3_sigma,DT,DT_sigma
0,20000433,433 Eros (A898 PA),433,Eros,,Y,N,0,10.38,0.46,...,0.29796,,,,,,,,,
1,20000719,719 Albert (A911 TB),719,Albert,,Y,N,0,15.59,,...,0.40517,,,,,,,,,
2,20000887,887 Alinda (A918 AA),887,Alinda,,Y,N,0,13.81,-0.12,...,0.25971,,,,,,,,,
3,20001036,1036 Ganymed (A924 UB),1036,Ganymed,,Y,N,0,9.18,0.3,...,0.31161,,,,,,,,,
4,20001221,1221 Amor (1932 EA1),1221,Amor,,Y,N,0,17.37,,...,0.40562,,,,-7.2e-15,2.1e-15,,,,


In [13]:
# List every column label of the loaded frame; these labels are the rows of
# the data dictionary assembled from df.columns.
df.columns


Index(['spkid', 'full_name', 'pdes', 'name', 'prefix', 'neo', 'pha', 'sats',
       'H', 'G', 'M1', 'M2', 'K1', 'K2', 'PC', 'diameter', 'extent', 'albedo',
       'rot_per', 'GM', 'BV', 'UB', 'IR', 'spec_B', 'spec_T', 'H_sigma',
       'diameter_sigma', 'orbit_id', 'epoch', 'epoch_mjd', 'epoch_cal',
       'equinox', 'e', 'a', 'q', 'i', 'om', 'w', 'ma', 'ad', 'n', 'tp',
       'tp_cal', 'per', 'per_y', 'moid', 'moid_ld', 'moid_jup', 't_jup',
       'sigma_e', 'sigma_a', 'sigma_q', 'sigma_i', 'sigma_om', 'sigma_w',
       'sigma_ma', 'sigma_ad', 'sigma_n', 'sigma_tp', 'sigma_per', 'class',
       'producer', 'data_arc', 'first_obs', 'last_obs', 'n_obs_used',
       'n_del_obs_used', 'n_dop_obs_used', 'condition_code', 'rms', 'two_body',
       'A1', 'A1_sigma', 'A2', 'A2_sigma', 'A3', 'A3_sigma', 'DT', 'DT_sigma'],
      dtype='object')

In [15]:
# Plain-language meaning of the columns documented so far. Columns without an
# entry here keep an empty description in the resulting table.
descriptions = {
    "spkid": "Unique SPK-ID identifier for the small body",
    "full_name": "Full object designation",
    "pdes": "Primary designation",
    "name": "IAU-approved object name",
    "neo": "Near-Earth Object flag (Y/N)",
    "pha": "Potentially Hazardous Asteroid flag (Y/N), target variable",
    "H": "Absolute magnitude, proxy for asteroid size",
    "diameter": "Estimated asteroid diameter",
    "albedo": "Geometric albedo",
    "rot_per": "Rotation period",
    "e": "Orbital eccentricity",
    "a": "Semi-major axis of orbit",
    "q": "Perihelion distance",
    "i": "Orbital inclination",
    "moid": "Minimum Orbit Intersection Distance with Earth",
    "moid_ld": "MOID expressed in Lunar Distances",
    "t_jup": "Tisserand parameter with respect to Jupiter",
    "condition_code": "Orbit uncertainty condition code (0 = well determined)",
    "data_arc": "Time span of observations",
    "n_obs_used": "Number of observations used in orbit determination",
    "rms": "Root mean square residual of orbital fit",
}

# Measurement units for the columns that have one; everything else is left blank.
units = {
    "diameter": "km",
    "rot_per": "hours",
    "a": "AU",
    "q": "AU",
    "i": "degrees",
    "moid": "AU",
    "moid_ld": "Lunar Distances",
    "data_arc": "days",
    "rms": "arcseconds",
}

# One row per column of `df`, in the frame's own column order. Each lookup
# falls back to an empty string, so undocumented columns still get a row.
data_dict = pd.DataFrame({
    "Column Name": df.columns,
    "Description": [descriptions.get(label, "") for label in df.columns],
    "Units / Notes": [units.get(label, "") for label in df.columns],
})

# Lift the row display limit (this setting stays in effect for the rest of the
# session) so the whole dictionary is rendered rather than a truncated preview.
pd.set_option("display.max_rows", None)
data_dict



Unnamed: 0,Column Name,Description,Units / Notes
0,spkid,Unique SPK-ID identifier for the small body,
1,full_name,Full object designation,
2,pdes,Primary designation,
3,name,IAU-approved object name,
4,prefix,,
5,neo,Near-Earth Object flag (Y/N),
6,pha,"Potentially Hazardous Asteroid flag (Y/N), tar...",
7,sats,,
8,H,"Absolute magnitude, proxy for asteroid size",
9,G,,
