In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the dataset

* `Vmag` - Visual Apparent Magnitude of the Star
* `Plx` - Parallax of the star, an inverse measure of the distance between the star and the Earth (a larger parallax means a closer star)
* `e_Plx` - Standard Error of Plx (Drop the Row if you find the e_Plx is too high!)
* `B-V` - B-V color index. (A hot star has a B-V color index close to 0 or negative, while a cool star has a B-V color index close to 2.0. Other stars are somewhere in between.)
* `SpType` - Stellar classification. (Roman numerals I, II, III are giants; IV, V, VI are dwarfs.)

In [2]:
# Location of the raw star catalogue, relative to the notebook's working directory
path = "./Resources/Star9999_raw.csv"

# Parse the CSV into a DataFrame; the bare name on the last line renders it as the cell output
star_raw_data = pd.read_csv(filepath_or_buffer=path)

star_raw_data

Unnamed: 0.1,Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,0,9.10,3.54,1.39,0.482,F5
1,1,9.27,21.90,3.10,0.999,K3V
2,2,6.61,2.81,0.63,-0.019,B9
3,3,8.06,7.75,0.97,0.370,F0V
4,4,8.55,2.87,1.11,0.902,G8III
...,...,...,...,...,...,...
9994,9994,8.45,-0.93,1.13,1.404,K5
9995,9995,7.84,4.26,1.00,1.140,K1IIICN...
9996,9996,9.38,3.61,1.36,0.507,G0
9997,9997,7.64,4.75,0.97,0.075,A2


# Pre-Processing the data

In [3]:
# Inspect the dtype and non-null count of every column of the frame as loaded,
# before any type conversion is applied
star_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  9999 non-null   int64  
 1   Vmag        9999 non-null   float64
 2   Plx         9999 non-null   object 
 3   e_Plx       9999 non-null   object 
 4   B-V         9999 non-null   object 
 5   SpType      9722 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 468.8+ KB


In [4]:
# Coerce the measurement columns to floating point. errors='coerce' turns any
# value that cannot be parsed as a number into NaN, and downcast="float" asks
# pandas for the smallest float dtype that can hold the values.
numeric_columns = ["Vmag", "Plx", "e_Plx", "B-V"]
for column in numeric_columns:
    star_raw_data[column] = pd.to_numeric(
        star_raw_data[column], downcast="float", errors='coerce'
    )

# Show the resulting dtypes and non-null counts
star_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  9999 non-null   int64  
 1   Vmag        9999 non-null   float32
 2   Plx         9984 non-null   float32
 3   e_Plx       9984 non-null   float32
 4   B-V         9907 non-null   float32
 5   SpType      9722 non-null   object 
dtypes: float32(4), int64(1), object(1)
memory usage: 312.6+ KB


In [5]:
# Re-inspect the dtypes and non-null counts after the numeric conversion
# (this repeats the info() call that ends the previous cell)
star_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  9999 non-null   int64  
 1   Vmag        9999 non-null   float32
 2   Plx         9984 non-null   float32
 3   e_Plx       9984 non-null   float32
 4   B-V         9907 non-null   float32
 5   SpType      9722 non-null   object 
dtypes: float32(4), int64(1), object(1)
memory usage: 312.6+ KB


In [6]:
# Display the frame again after the numeric conversion
star_raw_data

Unnamed: 0.1,Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,0,9.10,3.54,1.39,0.482,F5
1,1,9.27,21.90,3.10,0.999,K3V
2,2,6.61,2.81,0.63,-0.019,B9
3,3,8.06,7.75,0.97,0.370,F0V
4,4,8.55,2.87,1.11,0.902,G8III
...,...,...,...,...,...,...
9994,9994,8.45,-0.93,1.13,1.404,K5
9995,9995,7.84,4.26,1.00,1.140,K1IIICN...
9996,9996,9.38,3.61,1.36,0.507,G0
9997,9997,7.64,4.75,0.97,0.075,A2


In [7]:
# Summary statistics for every column; include='all' also reports
# count/unique/top/freq for the non-numeric SpType column
star_raw_data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
count,9999.0,9999.0,9984.0,9984.0,9907.0,9722
unique,,,,,,885
top,,,,,,K0
freq,,,,,,730
mean,4999.0,8.479595,7.662233,1.372422,0.745759,
std,2886.607005,1.306919,11.122961,1.438556,0.454104,
min,0.0,0.45,-18.17,0.47,-0.359,
25%,2499.5,7.74,2.7275,0.92,0.4225,
50%,4999.0,8.56,4.96,1.12,0.65,
75%,7498.5,9.25,8.93,1.39,1.084,


In [8]:
# Count the missing (NaN) entries in each column
star_raw_data.isna().sum()

Unnamed: 0      0
Vmag            0
Plx            15
e_Plx          15
B-V            92
SpType        277
dtype: int64

In [9]:
# Discard every row that has a missing value in any column
star_raw_data = star_raw_data.dropna(axis=0, how="any")
# Confirm that no missing values remain in any column
star_raw_data.isna().sum()

Unnamed: 0    0
Vmag          0
Plx           0
e_Plx         0
B-V           0
SpType        0
dtype: int64

In [10]:
# Remove the unwanted 'Unnamed: 0' column (it appears to be a saved row index:
# in the displayed output its values mirror the frame's own index).
# errors="ignore" keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
star_raw_data = star_raw_data.drop(columns="Unnamed: 0", errors="ignore")
star_raw_data.head()

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,9.1,3.54,1.39,0.482,F5
1,9.27,21.9,3.1,0.999,K3V
2,6.61,2.81,0.63,-0.019,B9
3,8.06,7.75,0.97,0.37,F0V
4,8.55,2.87,1.11,0.902,G8III


In [11]:
# List the distinct spectral-type labels that occur in the target column
star_raw_data.loc[:, "SpType"].unique()

array(['F5', 'K3V', 'B9', 'F0V', 'G8III', 'M0V:', 'G0', 'M6e-M8.5e Tc',
       'G5', 'F6V', 'A2', 'K4III', 'K0III', 'K0', 'K2', 'F3V', 'K5',
       'G8/K0III/IV', 'F2V', 'G0V', 'G3IV', 'F7V', 'G5V', 'F3/F5V', 'A0',
       'B8', 'F2', 'F7.5IV-V', 'G6V', 'B...', 'G9III-IV', 'K1III',
       'K0/K1III', 'G1IV', 'A4V', 'M:', 'G2IV/V', 'K2V', 'B5', 'F2IV',
       'B9p SiEu', 'K2III', 'G3/G5V', 'M0', 'A0V', 'G2V', 'F5IV/V', 'F8V',
       'G2', 'F5V', 'F6/F7V', 'C5p', 'M1III', 'F0', 'G7II-III', 'M2III',
       'K3II/III', 'M2', 'M1.5V:', 'G0IV', 'A2IV', 'K1IIICNIV', 'F0III',
       'Am...', 'F8', 'F5IV', 'K2III-IV', 'B9IIIp Mn', 'K0/K1IV', 'B0',
       'K3', 'B7III-IV', 'A9V', 'F3IV...', 'M3III', 'B8V', 'A3', 'B9V',
       'A3V', 'G9III', 'Ap...', 'G3V', 'R...', 'M0V', 'O7', 'G9II-III',
       'A1m...', 'B4V', 'A6V', 'M3', 'G8/K0III', 'K3III', 'G6/G8III',
       'A2V', 'G8', 'F0/F2V', 'F6/F7IV', 'G8IV', 'K0...', 'F2/F3V', 'M1',
       'K1IIICN...', 'K5III:', 'A2III', 'B3Ia', 'B2IVne+...', 'A5'

In [12]:
# Tally how many stars carry each SpType label, as a basis for binning
star_raw_data.loc[:, "SpType"].value_counts()

K0             727
G5             623
G0             385
A0             382
F5             368
              ... 
sdG8             1
B0Ib             1
K0II-IIIvar      1
F0Vw...          1
K1/K2II          1
Name: SpType, Length: 872, dtype: int64

# TO DO

SpType - Spectral type for Stellar classification.
(Luminosity class III and brighter, i.e. Roman numerals I, II and III, are giants; the fainter classes, IV and beyond, are dwarfs. No Roman numerals means they are some special stars (sorry I made a mistake ;_;))
I, II, III are giants, IV, V , VI are dwarfs
VII are usually white dwarfs, but they are so hard to observe that there are very few of them
No Roman Numerals means they are some special stars

## Create a new column that translates `SpType` according to the rules above, to help map the target class