In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.metrics import mean_squared_error
import joblib

In [None]:
# Read the raw Nearest Earth Objects table from the Kaggle input directory
# and display it as the cell output.
csv_path = '/kaggle/input/nearest-earth-objects(1910-2024).csv'
df = pd.read_csv(csv_path)
df

In [None]:
# Count the missing entries in each column and print the per-column totals.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Discard every row that has at least one missing value (pandas defaults,
# spelled out for the reader).
df = df.dropna(axis=0, how='any')

In [None]:
# Show the dtype of each column once rows with missing values have been removed.
df.dtypes

In [None]:
# Report how many distinct values each identifier-like column holds; these
# are the columns dropped in the following cell.
for column in ('neo_id', 'name', 'orbiting_body'):
    print(f'Number of unique values in {column} is ', df[column].nunique())

In [None]:
# Remove the identifier / label columns so they are not used as model inputs.
# NOTE: re-running this cell without reloading `df` raises KeyError because
# the columns are already gone.
df = df.drop(columns=['neo_id', 'name', 'orbiting_body'])

In [None]:
# Re-check the column dtypes now that neo_id, name and orbiting_body are dropped.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); by default
# describe() covers the numeric columns only.
df.describe()

In [None]:
# Derive the engineered features in one chained `assign`. Keyword order is
# the resulting column order, and later entries may read columns created by
# earlier ones (e.g. `average_diameter`).
df = df.assign(
    # Midpoint of the estimated diameter interval.
    average_diameter=lambda d: (d['estimated_diameter_min'] + d['estimated_diameter_max']) / 2,
    # Width of the estimated diameter interval.
    diameter_range=lambda d: d['estimated_diameter_max'] - d['estimated_diameter_min'],
    # Min-max scaling of the velocity, using the min/max of the whole frame.
    scaled_relative_velocity=lambda d: (
        (d['relative_velocity'] - d['relative_velocity'].min())
        / (d['relative_velocity'].max() - d['relative_velocity'].min())
    ),
    # Natural log of the miss distance (a zero distance would yield -inf).
    log_miss_distance=lambda d: np.log(d['miss_distance']),
    # Product of velocity and the average diameter computed above.
    velocity_diameter_interaction=lambda d: d['relative_velocity'] * d['average_diameter'],
    # Velocity per unit of miss distance.
    velocity_distance_ratio=lambda d: d['relative_velocity'] / d['miss_distance'],
    # Average diameter per unit of absolute magnitude.
    diameter_magnitude_ratio=lambda d: d['average_diameter'] / d['absolute_magnitude'],
)

In [None]:
# Print every column name, original and engineered, as a plain Python list.
column_names = list(df.columns)
print(column_names)

In [None]:
# Separate the model inputs from the target: `features` is every column
# except `is_hazardous`, and `y` is that column on its own.
features = df.drop(columns='is_hazardous')
y = df['is_hazardous']

In [None]:
# Inspect the raw `is_hazardous` target column before it is label-encoded.
y

In [None]:
# Learn the label -> integer mapping from the target, then replace `y` with
# the encoded NumPy array. `encoder` is kept so the mapping can be inverted.
encoder = LabelEncoder().fit(y)
y = encoder.transform(y)

In [None]:
# 5-fold stratified cross-validation of an ExtraTreesClassifier.
#
# After the loop, `model`, `cv_train`, `cv_test`, `cv_y_train`, `cv_y_val`
# and `y_pred` refer to the FINAL fold only, and `mean_importances` holds the
# per-fold SUM of feature importances (it still has to be divided by the
# number of folds to obtain the mean).
num_bins = 10
# `y` holds the LabelEncoder class ids; the binned copy is used only as the
# stratification key for the splitter, never as a training target.
y_binned = pd.cut(y, bins=num_bins, labels=False)

# Initialize StratifiedKFold
str_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_metrics = []
mean_importances = np.zeros(features.shape[1])  # running sum, one slot per feature

for fold, (train_idx, test_idx) in enumerate(str_kf.split(features, y_binned)):
    cv_train, cv_test = features.iloc[train_idx], features.iloc[test_idx]
    cv_y_train, cv_y_val = y[train_idx], y[test_idx]
    print('Fold: {}'.format(fold))
    print('CV train shape: {}'.format(cv_train.shape))

    # Seed the forest so metrics and importances are reproducible across runs.
    model = ExtraTreesClassifier(random_state=42)
    model.fit(cv_train, cv_y_train)

    y_pred = model.predict(cv_test)
    # Take the square root explicitly: the `squared=False` keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(cv_y_val, y_pred))
    print(f'RMSE: {rmse}')
    fold_metrics.append(rmse)

    # Add feature importances to the mean_importances array
    mean_importances += model.feature_importances_

print(f'Average RMSE: {np.mean(fold_metrics)}')

In [None]:
# Re-score the model left over from the final CV fold on that fold's
# validation split (`model`, `cv_test`, `cv_y_val` come from the loop above).
y_pred = model.predict(cv_test)
# Take the square root so the value stored in `rmse` really is an RMSE (the
# bare mean_squared_error is the MSE); ground truth goes first to follow the
# (y_true, y_pred) argument convention.
rmse = np.sqrt(mean_squared_error(cv_y_val, y_pred))
print(rmse)

In [None]:
# Mean accuracy of the final fold's model on the same rows it was fitted on.
model.score(cv_train, cv_y_train)

In [None]:
# Mean accuracy of the final fold's model on that fold's held-out rows.
model.score(cv_test, cv_y_val)

In [None]:
# Average the summed per-fold importances into a NEW array. Dividing
# `mean_importances` in place would shrink it by another factor of n_splits
# every time this cell is re-run; leaving the accumulator untouched makes the
# cell safe to execute repeatedly.
avg_importances = mean_importances / str_kf.get_n_splits()

# Print average RMSE across all folds
print(f'Average RMSE: {np.mean(fold_metrics)}')

# Print feature importances, most important first
feature_names = features.columns
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': avg_importances
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(importance_df)