In [3]:
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import math
import os
import warnings
import lime
from lime import lime_tabular, submodular_pick
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from IPython.display import display_html


from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Never truncate wide frames: outputs such as head() display every column.
# To restore the default later: pd.reset_option('display.max_columns')
pd.options.display.max_columns = None
# Silence every warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

# function to remove outliers
# defined to remove skewness from our plots
def rem_out(df, columns):
    """
    Remove outlier rows with the 1.5 * IQR rule, one column after another.

    For every label in ``columns`` the quartiles are computed on the rows that
    survived the previous columns, and only rows whose value lies strictly
    inside (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR) are kept. Rows holding NaN in a
    listed column fail both comparisons and are therefore dropped too.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    columns : iterable of column labels
        Columns to filter on, processed in the given order.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, keeping their original index labels. When
        ``columns`` is empty, ``df`` is returned unchanged.
    """
    for column in columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        df = df[(df[column] > lower_bound) & (df[column] < upper_bound)]
    # Return only after the loop so that every requested column is filtered
    # (returning inside the loop would stop after the first column).
    return df

def add_spines(colour = '#425169', linewidth = 2):
    """
    Draw a full frame around the current matplotlib axes.

    The top and right spines are switched on, then all four spines are given
    the colour ``colour`` and the line width ``linewidth``.
    """
    axes = plt.gca()
    # Make sure the top and right spines are drawn.
    for side in ('top', 'right'):
        axes.spines[side].set_visible(True)
    # Style all four sides at once through the grouped spine accessor.
    frame = axes.spines[['bottom', 'left', 'top', 'right']]
    frame.set_color(colour)
    frame.set_linewidth(linewidth)

In [4]:
# Load the raw asteroid dataset from a path relative to the notebook's working directory.
# low_memory=False is passed so pandas does not parse the file in chunks
# (chunked parsing can give mixed-type columns, per the pandas read_csv docs).
df = pd.read_csv('data/astroid/dataset.csv', low_memory=False)


In [6]:
# Feature groups kept from the raw dataset: numeric columns and categorical flags.
num_cols = ['a', 'e', 'i', 'om', 'w', 'q', 'ad', 'per_y', 'H', 'diameter', 'albedo', 'moid','n', 'per', 'ma']
cat_cols = ['pha', 'neo']
columns = num_cols + cat_cols
df = df[columns]
# Columns dropped from the original dataset:
# [extent, GM, IR, BV, UB, G, rot_per] (because many values were missing)
# [spec_B, spec_T] (too many classes (34 each!) and too little data)
# [class, data_arc] (factors that don't affect diameter)

# Keep only rows whose target (diameter) is present and parses as a number.
df = df.dropna(subset=["diameter"])
df = df[pd.to_numeric(df['diameter'], errors='coerce').notnull()]
# Missing feature values are deliberately NOT interpolated.

# Convert to float first, round second: DataFrame.round leaves non-numeric
# (object) columns untouched, so rounding before the conversion would skip any
# column that was read as text - presumably diameter, given the filter above.
dfnum = df[num_cols].astype(float).round(decimals=5)
dfcat = df[cat_cols]
# Re-assemble the frame and renumber the rows 0..n-1, discarding the old index.
df = pd.concat([dfnum, dfcat], axis=1).reset_index(drop=True)

# One-hot encode the categorical flags (creates pha_* / neo_* columns).
df = pd.get_dummies(df, columns=cat_cols)

# H and albedo are important for diameter prediction: drop rows missing either.
df = df.dropna(subset=["H", "albedo"])

# Outliers are removed next with the help of an Isolation Forest 🌲🌳
def iso_forest(df, contamination=0.05, n_estimators=1000, random_state=21):
    """
    Drop the rows that an Isolation Forest flags as outliers.

    An ``IsolationForest`` is fitted on ``df`` and every row it labels ``-1``
    is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to fit the forest on and to filter. It is passed to the model
        as-is and is not modified.
    contamination : float, default 0.05
        Forwarded to ``IsolationForest(contamination=...)``.
    n_estimators : int, default 1000
        Forwarded to ``IsolationForest(n_estimators=...)``.
    random_state : int, default 21
        Forwarded to ``IsolationForest(random_state=...)``; a fixed value
        keeps the selection reproducible.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` not labelled ``-1``, with their original index
        labels.
    """
    iso_model = IsolationForest(
        contamination=contamination,
        n_estimators=n_estimators,
        random_state=random_state,
    )
    labels = iso_model.fit_predict(df)

    # Keep every row that is not flagged as an outlier (label -1).
    inlier_mask = labels != -1
    return df.iloc[inlier_mask]

# Discard the rows the Isolation Forest marks as outliers.
df = iso_forest(df)
# Plotting revealed a single very large diameter that distorted the
# distributions, so only rows with diameter below 200 are kept.
df = df[df['diameter'] < 200]

# None of the remaining rows has a true neo or pha flag, so the one-hot
# columns carry no information and are removed.
df = df.drop(columns=['neo_Y', 'neo_N', 'pha_Y', 'pha_N'])

In [None]:
# Keep an untouched copy of the raw dataset alongside the processed frame.
df_orig = pd.read_csv('data/astroid/dataset.csv', low_memory=False)
# NOTE(review): this rebinds `df` to a CSV read from an absolute Kaggle input path,
# replacing whatever `df` held before - presumably a saved copy of the preprocessed
# data; confirm, and consider making the path configurable.
df = pd.read_csv('/kaggle/input/asteroid-preprocessed/asteroid_final (2).csv', low_memory=False)
# IQR-filtered copy based on the diameter column; `df` itself is left as loaded.
df_diameter = rem_out(df, ['diameter'])
# Last expression of the cell: styled preview of the first rows of `df` (not of df_diameter).
df.head().style.background_gradient(cmap='Blues')