In [4]:
import numpy as np
import pandas as pd

In [5]:
stardf = pd.read_csv("Star9999_raw.csv")
stardf.head()

Unnamed: 0.1,Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,0,9.1,3.54,1.39,0.482,F5
1,1,9.27,21.9,3.1,0.999,K3V
2,2,6.61,2.81,0.63,-0.019,B9
3,3,8.06,7.75,0.97,0.37,F0V
4,4,8.55,2.87,1.11,0.902,G8III


In [6]:
stardf.info ()
#pix, epix and bv need to be converted to float type data 
#spType is missing data
#noting my code as I go incase I go insane and someone has to retrace my steps lol :) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  9999 non-null   int64  
 1   Vmag        9999 non-null   float64
 2   Plx         9999 non-null   object 
 3   e_Plx       9999 non-null   object 
 4   B-V         9999 non-null   object 
 5   SpType      9722 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 468.8+ KB


Vmag - Visual Apparent Magnitude of the Star

Plx - Distance Between the Star and the Earth

e_Plx - Standard Error of Plx (Drop the Row if you find the e_Plx is too high!)

B-V - B-V color index. (A hot star has a B-V color index close to 0 or negative, while a cool star has a B-V color index close to 2.0. Other stars are somewhere in between.)

SpType - Stellar classification. (Roman Numerals >IV are giants. Otherwise are dwarfs)

In [7]:
stardf.describe ()

Unnamed: 0.1,Unnamed: 0,Vmag
count,9999.0,9999.0
mean,4999.0,8.47959
std,2886.607005,1.306924
min,0.0,0.45
25%,2499.5,7.74
50%,4999.0,8.56
75%,7498.5,9.25
max,9998.0,13.27


In [8]:
stardf.nunique()
#need to drop index column 

Unnamed: 0    9999
Vmag           801
Plx           2479
e_Plx          511
B-V           1859
SpType         885
dtype: int64

In [9]:
#first convert to correct data types and go from there. 
#fix_stardf=stardf.astype({'Plx': 'float', 'e_Plx': 'float', 'B-V': 'float'})
#must be letters in some null columns, will have to coerce. 

In [11]:
stardf["Plx"] = pd.to_numeric(stardf["Plx"], downcast="float",errors='coerce')
stardf["e_Plx"] = pd.to_numeric(stardf["e_Plx"], downcast="float",errors='coerce')
stardf["B-V"] = pd.to_numeric(stardf["B-V"], downcast="float",errors='coerce')
stardf["SpType"] = pd.to_numeric(stardf["SpType"], downcast="float",errors='coerce')
stardf.info ()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  9999 non-null   int64  
 1   Vmag        9999 non-null   float64
 2   Plx         9984 non-null   float32
 3   e_Plx       9984 non-null   float32
 4   B-V         0 non-null      float32
 5   SpType      0 non-null      float32
dtypes: float32(4), float64(1), int64(1)
memory usage: 312.6 KB


In [13]:
stardf.head ()

Unnamed: 0.1,Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,0,9.1,3.54,1.39,,
1,1,9.27,21.9,3.1,,
2,2,6.61,2.81,0.63,,
3,3,8.06,7.75,0.97,,
4,4,8.55,2.87,1.11,,


In [37]:
#find the null values and remove them 
null_val=stardf.isnull ().sum ()
null_val

Unnamed: 0      0
Vmag            0
Plx            15
e_Plx          15
B-V            92
SpType        277
dtype: int64

In [38]:
#find the missing percentage of data
#first find the total cells in the df
totalcells= np.product(stardf.shape)
totalcells

59994

In [39]:
totalmissing= null_val.sum ()
totalmissing

399

In [40]:
missing=(totalmissing/totalcells)*100
missing 

0.6650665066506651

We are only missing 0.665% of our data, this website says we can miss up to 5% of data without it impacting the machine learning. 
https://pressbooks.library.upei.ca/montelpare/chapter/working-with-missing-data/#:~:text=The%20overall%20percentage%20of%20data,to%20ignore%20them%20(REF).

In [41]:
clean_stardf= stardf.dropna ()
clean_stardf

Unnamed: 0.1,Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,0,9.10,3.54,1.39,0.482,F5
1,1,9.27,21.90,3.10,0.999,K3V
2,2,6.61,2.81,0.63,-0.019,B9
3,3,8.06,7.75,0.97,0.370,F0V
4,4,8.55,2.87,1.11,0.902,G8III
...,...,...,...,...,...,...
9994,9994,8.45,-0.93,1.13,1.404,K5
9995,9995,7.84,4.26,1.00,1.140,K1IIICN...
9996,9996,9.38,3.61,1.36,0.507,G0
9997,9997,7.64,4.75,0.97,0.075,A2


In [42]:
#because i dropped entire rows of data I need to figure out the new percentage loss of data 
dropped=stardf.shape[0]-clean_stardf.shape[0]
dropped
# given that this dataset is 10000 rows it's pretty clear, but just to check i did the below code too. 

320

In [43]:
totalpercentmissing= (dropped/10000)*100
totalpercentmissing

3.2

We are missing 3.2% of our data from the whole data set, which is still below the 5% so we are fine to simply drop the rows. 
The dataframe is now clean with the correct data types and with no null values. I just need to remove the first col because it's the same as the index.  

In [44]:
clean_stardf= clean_stardf. drop('Unnamed: 0', axis=1)
clean_stardf

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,9.10,3.54,1.39,0.482,F5
1,9.27,21.90,3.10,0.999,K3V
2,6.61,2.81,0.63,-0.019,B9
3,8.06,7.75,0.97,0.370,F0V
4,8.55,2.87,1.11,0.902,G8III
...,...,...,...,...,...
9994,8.45,-0.93,1.13,1.404,K5
9995,7.84,4.26,1.00,1.140,K1IIICN...
9996,9.38,3.61,1.36,0.507,G0
9997,7.64,4.75,0.97,0.075,A2


In [45]:
#index is off because of dropped columns 
clean_stardf=clean_stardf.reset_index(drop=True)

In [46]:
clean_stardf

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,9.10,3.54,1.39,0.482,F5
1,9.27,21.90,3.10,0.999,K3V
2,6.61,2.81,0.63,-0.019,B9
3,8.06,7.75,0.97,0.370,F0V
4,8.55,2.87,1.11,0.902,G8III
...,...,...,...,...,...
9674,8.45,-0.93,1.13,1.404,K5
9675,7.84,4.26,1.00,1.140,K1IIICN...
9676,9.38,3.61,1.36,0.507,G0
9677,7.64,4.75,0.97,0.075,A2


In [47]:
#index is fixed, going to check the data now 

In [48]:
clean_stardf.info ()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9679 entries, 0 to 9678
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Vmag    9679 non-null   float64
 1   Plx     9679 non-null   float32
 2   e_Plx   9679 non-null   float32
 3   B-V     9679 non-null   float32
 4   SpType  9679 non-null   object 
dtypes: float32(3), float64(1), object(1)
memory usage: 264.8+ KB


In [49]:
clean_stardf.describe ()

Unnamed: 0,Vmag,Plx,e_Plx,B-V
count,9679.0,9679.0,9679.0,9679.0
mean,8.407476,7.544123,1.296846,0.742994
std,1.253048,11.039949,1.187328,0.454989
min,0.45,-18.17,0.47,-0.359
25%,7.71,2.73,0.91,0.419
50%,8.52,4.92,1.11,0.644
75%,9.17,8.755,1.36,1.0835
max,12.81,280.269989,33.049999,2.835


In [50]:
# no missing data and the index lines up with the values :) 

In [51]:
clean_stardf.to_csv("clean_stars.csv", index=False)