- Environment Setup

In [1]:
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides useful diagnostics (e.g. pandas'
# DtypeWarning when a CSV column has mixed types) — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")
# Wrangling
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Statistical Tests
import scipy.stats as stats
from scipy.stats import norm
# Visualizing
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.dates as dates
import seaborn as sns
# NOTE(review): second import from sklearn.model_selection (see train_test_split
# above); the two could be merged into one statement.
from sklearn.model_selection import learning_curve
import datetime
# Show floats with thousands separators and two decimals, right-aligned in a
# 20-character-wide field.
pd.options.display.float_format = '{:20,.2f}'.format

In [4]:
# Lift pandas' truncation limits so wide frames (31 columns here) show in full.
# NOTE(review): max_rows=None means displaying a whole frame prints every row;
# keep using .head()/.describe() on the 839k-row table.
pd.set_option("display.max_rows", None, "display.max_columns", None)

# plotting defaults
plt.rc('figure', figsize=(13, 7))
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so 'seaborn-whitegrid' raises OSError on
# current installs. Prefer the new name and fall back to the legacy one.
whitegrid_style = (
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
plt.style.use(whitegrid_style)
plt.rc('font', size=16)

# Acquire

In [2]:
# Read the raw asteroid table from asteroid.csv (path is relative to the
# notebook's working directory).
df = pd.read_csv('asteroid.csv')

In [5]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,name,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,n_obs_used,H,neo,pha,diameter,extent,albedo,rot_per,GM,BV,UB,IR,spec_B,spec_T,G,moid,class,n,per,ma
0,Ceres,2.77,0.08,10.59,80.31,73.6,2.56,2.98,4.61,8822.0,0,1002,3.34,N,N,939.4,964.4 x 964.2 x 891.8,0.09,9.07,62.63,0.71,0.43,,C,G,0.12,1.59,MBA,0.21,1683.15,77.37
1,Pallas,2.77,0.23,34.84,173.08,310.05,2.13,3.41,4.62,72318.0,0,8490,4.13,N,N,545.0,582x556x500,0.1,7.81,14.3,0.64,0.28,,B,B,0.11,1.23,MBA,0.21,1686.16,59.7
2,Juno,2.67,0.26,12.99,169.85,248.14,1.98,3.35,4.36,72684.0,0,7104,5.33,N,N,246.596,,0.21,7.21,,0.82,0.43,,Sk,S,0.32,1.03,MBA,0.23,1592.79,34.93
3,Vesta,2.36,0.09,7.14,103.81,150.73,2.15,2.57,3.63,24288.0,0,9325,3.2,N,N,525.4,572.6 x 557.2 x 446.4,0.42,5.34,17.8,0.78,0.49,,V,V,0.32,1.14,MBA,0.27,1325.43,95.86
4,Astraea,2.57,0.19,5.37,141.58,358.69,2.08,3.07,4.13,63507.0,0,2916,6.85,N,N,106.699,,0.27,16.81,,0.83,0.41,,S,S,,1.1,MBA,0.24,1508.6,282.37


In [6]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(839714, 31)

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,n_obs_used,H,albedo,rot_per,GM,BV,UB,IR,G,moid,n,per,ma
count,839712.0,839714.0,839714.0,839714.0,839714.0,839714.0,839708.0,839713.0,824240.0,839714.0,837025.0,136409.0,18796.0,14.0,1021.0,979.0,1.0,119.0,823272.0,839712.0,839708.0,839706.0
mean,2.76,0.16,8.95,168.5,181.08,2.4,3.39,6.86,5688.42,259.19,16.79,0.13,21.14,7.82,0.77,0.36,-0.33,0.18,1.42,0.24,2505.53,180.66
std,114.38,0.09,6.67,103.1,104.02,2.23,12.75,252.26,4208.18,371.76,1.82,0.11,73.13,16.79,0.09,0.1,,0.13,2.25,0.08,92139.79,106.56
min,-104279.22,0.0,0.01,0.0,0.0,0.07,0.77,0.0,0.0,2.0,-1.1,0.0,0.0,0.0,0.58,0.12,-0.33,-0.25,0.0,0.0,151.13,-67.14
25%,2.39,0.09,4.07,80.21,91.04,1.97,2.78,3.68,3608.0,50.0,15.9,0.05,4.21,0.0,0.7,0.29,-0.33,0.1,0.98,0.19,1345.56,86.64
50%,2.64,0.14,7.26,160.29,181.67,2.23,3.04,4.3,5806.0,118.0,16.8,0.08,6.65,0.62,0.74,0.36,-0.33,0.19,1.24,0.23,1570.52,181.52
75%,3.0,0.2,12.26,252.2,271.52,2.58,3.36,5.19,7270.0,296.0,17.6,0.19,12.62,6.5,0.85,0.44,-0.33,0.25,1.59,0.27,1894.18,274.3
max,3043.15,1.2,175.19,360.0,360.0,80.42,6081.84,167877.71,72684.0,9325.0,33.2,1.0,3240.0,62.63,1.08,0.66,-0.33,0.6,79.5,2.38,61317334.56,491.62


In [8]:
# Per-column dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 839714 entries, 0 to 839713
Data columns (total 31 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   name            21967 non-null   object 
 1   a               839712 non-null  float64
 2   e               839714 non-null  float64
 3   i               839714 non-null  float64
 4   om              839714 non-null  float64
 5   w               839714 non-null  float64
 6   q               839714 non-null  float64
 7   ad              839708 non-null  float64
 8   per_y           839713 non-null  float64
 9   data_arc        824240 non-null  float64
 10  condition_code  838847 non-null  object 
 11  n_obs_used      839714 non-null  int64  
 12  H               837025 non-null  float64
 13  neo             839708 non-null  object 
 14  pha             823272 non-null  object 
 15  diameter        137636 non-null  object 
 16  extent          18 non-null      object 
 17  albedo    

In [9]:
# Number of missing values in each column.
df.isnull().sum()

name              817747
a                      2
e                      0
i                      0
om                     0
w                      0
q                      0
ad                     6
per_y                  1
data_arc           15474
condition_code       867
n_obs_used             0
H                   2689
neo                    6
pha                16442
diameter          702078
extent            839696
albedo            703305
rot_per           820918
GM                839700
BV                838693
UB                838735
IR                839713
spec_B            838048
spec_T            838734
G                 839595
moid               16442
class                  0
n                      2
per                    6
ma                     8
dtype: int64

In [10]:
def acquire_asteroid(path='asteroid.csv', **read_csv_kwargs):
    '''
    Load the asteroid dataset from a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'asteroid.csv'
        Location of the CSV file. The default preserves the original
        behaviour of reading 'asteroid.csv' from the working directory.
    **read_csv_kwargs
        Extra keyword arguments forwarded unchanged to ``pd.read_csv``
        (for example ``nrows=1000`` to load a quick sample).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when ``path`` does not exist.
    '''
    return pd.read_csv(path, **read_csv_kwargs)

In [15]:
# Load the same file again through the helper to check it matches the direct read.
# NOTE(review): `df2` says nothing about the frame's role; a descriptive name would
# read better, but later cells reference it.
df2 = acquire_asteroid()

In [16]:
# Preview the first five rows returned by the helper.
df2.head()

Unnamed: 0,name,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,n_obs_used,H,neo,pha,diameter,extent,albedo,rot_per,GM,BV,UB,IR,spec_B,spec_T,G,moid,class,n,per,ma
0,Ceres,2.77,0.08,10.59,80.31,73.6,2.56,2.98,4.61,8822.0,0,1002,3.34,N,N,939.4,964.4 x 964.2 x 891.8,0.09,9.07,62.63,0.71,0.43,,C,G,0.12,1.59,MBA,0.21,1683.15,77.37
1,Pallas,2.77,0.23,34.84,173.08,310.05,2.13,3.41,4.62,72318.0,0,8490,4.13,N,N,545.0,582x556x500,0.1,7.81,14.3,0.64,0.28,,B,B,0.11,1.23,MBA,0.21,1686.16,59.7
2,Juno,2.67,0.26,12.99,169.85,248.14,1.98,3.35,4.36,72684.0,0,7104,5.33,N,N,246.596,,0.21,7.21,,0.82,0.43,,Sk,S,0.32,1.03,MBA,0.23,1592.79,34.93
3,Vesta,2.36,0.09,7.14,103.81,150.73,2.15,2.57,3.63,24288.0,0,9325,3.2,N,N,525.4,572.6 x 557.2 x 446.4,0.42,5.34,17.8,0.78,0.49,,V,V,0.32,1.14,MBA,0.27,1325.43,95.86
4,Astraea,2.57,0.19,5.37,141.58,358.69,2.08,3.07,4.13,63507.0,0,2916,6.85,N,N,106.699,,0.27,16.81,,0.83,0.41,,S,S,,1.1,MBA,0.24,1508.6,282.37


In [17]:
# Dimensions of the helper-loaded frame as (rows, columns).
df2.shape

(839714, 31)