- Environment Setup

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")
# Wrangling
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Statistical Tests
import scipy.stats as stats
from scipy.stats import norm
# Visualizing
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.dates as dates
import seaborn as sns
from sklearn.model_selection import learning_curve
import datetime
# render floats with thousands separators and two decimals, right-aligned in 20 characters
pd.set_option('display.float_format', '{:20,.2f}'.format)

In [2]:
# lift pandas' display limits so frames are shown with every row and column
pd.set_option("display.max_rows", None, "display.max_columns", None)

# plotting defaults
plt.rc('figure', figsize=(13, 7))
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; try the new name first and fall back
# to the legacy one so the cell runs on both old and new Matplotlib.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')
plt.rc('font', size=16)

# Acquire

In [3]:
# read the raw asteroid table; the relative path is resolved against the current working directory
df = pd.read_csv('asteroid.csv')

In [4]:
df.head()

Unnamed: 0,id,spkid,full_name,pdes,name,prefix,neo,pha,H,G,M1,M2,K1,K2,PC,diameter,extent,albedo,rot_per,GM,BV,UB,IR,spec_B,spec_T,H_sigma,diameter_sigma
0,a0000001,2000001,1 Ceres (A801 AA),1,Ceres,,N,N,3.53,0.12,,,,,,939.4,964.4 x 964.2 x 891.8,0.09,9.07,62.63,0.71,0.43,,C,G,,0.2
1,a0000002,2000002,2 Pallas (A802 FA),2,Pallas,,N,N,4.21,0.11,,,,,,545.0,582x556x500,0.1,7.81,14.3,0.64,0.28,,B,B,,18.0
2,a0000003,2000003,3 Juno (A804 RA),3,Juno,,N,N,5.27,0.32,,,,,,246.6,,0.21,7.21,,0.82,0.43,,Sk,S,,10.59
3,a0000004,2000004,4 Vesta (A807 FA),4,Vesta,,N,N,3.31,0.32,,,,,,525.4,572.6 x 557.2 x 446.4,0.42,5.34,17.8,0.78,0.49,,V,V,,0.2
4,a0000005,2000005,5 Astraea (A845 XA),5,Astraea,,N,N,6.98,,,,,,,106.7,,0.27,16.81,,0.83,0.41,,S,S,,3.14


In [5]:
df.shape

(1100500, 27)

In [6]:
df.describe()

Unnamed: 0,spkid,H,G,M1,M2,K1,K2,PC,diameter,albedo,rot_per,GM,BV,UB,IR,H_sigma,diameter_sigma
count,1100500.0,1092961.0,119.0,1643.0,811.0,1643.0,811.0,788.0,140082.0,138846.0,32598.0,14.0,1021.0,979.0,1.0,325968.0,139864.0
mean,10106063.55,17.09,0.18,11.32,14.49,11.26,5.07,0.03,5.45,0.13,29.64,7.82,0.77,0.36,-0.33,0.31,0.49
std,17869786.53,1.78,0.13,3.97,2.83,7.01,0.63,0.0,9.31,0.11,81.67,16.79,0.09,0.1,,0.11,0.8
min,1000001.0,-1.11,-0.25,4.0,7.5,4.0,5.0,0.03,0.0,0.0,0.0,0.0,0.58,0.12,-0.33,0.0,0.0
25%,2271388.75,16.27,0.1,7.75,12.4,5.88,5.0,0.03,2.76,0.05,4.45,0.0,0.7,0.29,-0.33,0.24,0.18
50%,2546513.5,17.1,0.19,11.2,14.5,10.0,5.0,0.03,3.94,0.08,7.42,0.62,0.74,0.36,-0.33,0.3,0.34
75%,3871900.25,17.9,0.25,14.2,16.4,13.75,5.0,0.03,5.72,0.19,18.49,6.5,0.85,0.44,-0.33,0.37,0.63
max,54161860.0,99.99,0.6,24.3,24.5,50.0,15.0,0.03,939.4,1.0,3240.0,62.63,1.08,0.66,-0.33,1.79,140.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100500 entries, 0 to 1100499
Data columns (total 27 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   id              1100500 non-null  object 
 1   spkid           1100500 non-null  int64  
 2   full_name       1100500 non-null  object 
 3   pdes            1100500 non-null  object 
 4   name            25806 non-null    object 
 5   prefix          3753 non-null     object 
 6   neo             1096949 non-null  object 
 7   pha             1083124 non-null  object 
 8   H               1092961 non-null  float64
 9   G               119 non-null      float64
 10  M1              1643 non-null     float64
 11  M2              811 non-null      float64
 12  K1              1643 non-null     float64
 13  K2              811 non-null      float64
 14  PC              788 non-null      float64
 15  diameter        140082 non-null   float64
 16  extent          20 non-null       ob

In [8]:
df.isnull().sum()

id                      0
spkid                   0
full_name               0
pdes                    0
name              1074694
prefix            1096747
neo                  3551
pha                 17376
H                    7539
G                 1100381
M1                1098857
M2                1099689
K1                1098857
K2                1099689
PC                1099712
diameter           960418
extent            1100480
albedo             961654
rot_per           1067902
GM                1100486
BV                1099479
UB                1099521
IR                1100499
spec_B            1098834
spec_T            1099520
H_sigma            774532
diameter_sigma     960636
dtype: int64

In [9]:
def acquire_asteroid(path='asteroid.csv'):
    '''
    Read an asteroid CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'asteroid.csv'
        Location of the CSV file. The default keeps the original behaviour of
        reading asteroid.csv from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file's contents as parsed by pd.read_csv with its default options.
    '''
    return pd.read_csv(path)

In [10]:
# second load of the same asteroid.csv, this time through the helper; `df` above was read directly
# NOTE(review): this holds two copies of the table in memory — consider reusing one frame under a descriptive name
df2 = acquire_asteroid()

In [11]:
df2.head()

Unnamed: 0,id,spkid,full_name,pdes,name,prefix,neo,pha,H,G,M1,M2,K1,K2,PC,diameter,extent,albedo,rot_per,GM,BV,UB,IR,spec_B,spec_T,H_sigma,diameter_sigma
0,a0000001,2000001,1 Ceres (A801 AA),1,Ceres,,N,N,3.53,0.12,,,,,,939.4,964.4 x 964.2 x 891.8,0.09,9.07,62.63,0.71,0.43,,C,G,,0.2
1,a0000002,2000002,2 Pallas (A802 FA),2,Pallas,,N,N,4.21,0.11,,,,,,545.0,582x556x500,0.1,7.81,14.3,0.64,0.28,,B,B,,18.0
2,a0000003,2000003,3 Juno (A804 RA),3,Juno,,N,N,5.27,0.32,,,,,,246.6,,0.21,7.21,,0.82,0.43,,Sk,S,,10.59
3,a0000004,2000004,4 Vesta (A807 FA),4,Vesta,,N,N,3.31,0.32,,,,,,525.4,572.6 x 557.2 x 446.4,0.42,5.34,17.8,0.78,0.49,,V,V,,0.2
4,a0000005,2000005,5 Astraea (A845 XA),5,Astraea,,N,N,6.98,,,,,,,106.7,,0.27,16.81,,0.83,0.41,,S,S,,3.14


In [12]:
df2.shape

(1100500, 27)

# Prepare

## - Rename Columns
For the sake of readability, the column names will be adjusted.

In [13]:
list(df.columns)

['id',
 'spkid',
 'full_name',
 'pdes',
 'name',
 'prefix',
 'neo',
 'pha',
 'H',
 'G',
 'M1',
 'M2',
 'K1',
 'K2',
 'PC',
 'diameter',
 'extent',
 'albedo',
 'rot_per',
 'GM',
 'BV',
 'UB',
 'IR',
 'spec_B',
 'spec_T',
 'H_sigma',
 'diameter_sigma']

In [14]:
# NOTE(review): the rename is left disabled, so the column names stay unchanged.
# Of the keys below only 'H', 'neo' and 'pha' appear in this frame's column list;
# the others ('a', 'e', 'i', 'om', ...) are absent. 'per_y' and 'per' both map to
# 'orbital_period', which would give duplicate column names if both were present.
# df = df.rename(columns={'a': 'axis', 'e': 'eccentricity', 'i': 'Inclination', 'om': 'longitude', 'w': 'perihelion_argument', 'q': 'perihelion_distance', 'ad': 'aphelion_distance', 'per_y': 'orbital_period', 'H': 'absolute_magnitude', 'neo': 'near_earth', 'pha': 'hazardous', 'moid': 'orbit_intersection', 'n': 'mean_motion', 'per': 'orbital_period', 'ma': 'mean_anomaly'})

In [15]:
list(df.columns)

['id',
 'spkid',
 'full_name',
 'pdes',
 'name',
 'prefix',
 'neo',
 'pha',
 'H',
 'G',
 'M1',
 'M2',
 'K1',
 'K2',
 'PC',
 'diameter',
 'extent',
 'albedo',
 'rot_per',
 'GM',
 'BV',
 'UB',
 'IR',
 'spec_B',
 'spec_T',
 'H_sigma',
 'diameter_sigma']