Copyright (c) 2022, Hikmet Güner
All rights reserved.

This source code is licensed under the BSD-style license found in the
LICENSE file in the root directory of this source tree. 

<h1>Imports</h1>

In [284]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans


<h2>Reading and Analyzing Data</h2>

In [285]:
# Load the exoplanet table: the first line of the file supplies the column
# names and the first column becomes the row index.
df = pd.read_csv(
    "exoplanets.csv",
    header=0,
    index_col=0,
)
# Preview the first ten rows.
df.head(10)

Unnamed: 0,pl_name,hostname,default_flag,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,soltype,pl_controv_flag,...,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2,rowupdate,pl_pubdate,releasedate
0,11 Com b,11 Com,1,2,1,Radial Velocity,2007,Xinglong Station,Published Confirmed,0,...,-0.023,2.282,0.346,-0.346,4.44038,0.003848,-0.003848,2014-05-14,2008-01,2014-05-14
1,11 Com b,11 Com,0,2,1,Radial Velocity,2007,Xinglong Station,Published Confirmed,0,...,-0.023,2.282,0.346,-0.346,4.44038,0.003848,-0.003848,2014-07-23,2011-08,2014-07-23
2,11 UMi b,11 UMi,0,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,Published Confirmed,0,...,-0.005,1.939,0.27,-0.27,4.56216,0.003903,-0.003903,2018-04-25,2011-08,2014-07-23
3,11 UMi b,11 UMi,1,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,Published Confirmed,0,...,-0.005,1.939,0.27,-0.27,4.56216,0.003903,-0.003903,2018-09-04,2017-03,2018-09-06
4,11 UMi b,11 UMi,0,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,Published Confirmed,0,...,-0.005,1.939,0.27,-0.27,4.56216,0.003903,-0.003903,2018-04-25,2009-10,2014-05-14
5,14 And b,14 And,0,1,1,Radial Velocity,2008,Okayama Astrophysical Observatory,Published Confirmed,0,...,-0.023,2.331,0.24,-0.24,4.91781,0.002826,-0.002826,2014-07-23,2011-08,2014-07-23
6,14 And b,14 And,1,1,1,Radial Velocity,2008,Okayama Astrophysical Observatory,Published Confirmed,0,...,-0.023,2.331,0.24,-0.24,4.91781,0.002826,-0.002826,2014-05-14,2008-12,2014-05-14
7,14 Her b,14 Her,0,1,2,Radial Velocity,2002,W. M. Keck Observatory,Published Confirmed,0,...,-0.023,4.714,0.016,-0.016,6.383,0.000351,-0.000351,2021-09-20,2021-05,2021-09-20
8,14 Her b,14 Her,0,1,2,Radial Velocity,2002,W. M. Keck Observatory,Published Confirmed,0,...,-0.023,4.714,0.016,-0.016,6.383,0.000351,-0.000351,2018-04-25,2003-01,2014-08-21
9,14 Her b,14 Her,0,1,2,Radial Velocity,2002,W. M. Keck Observatory,Published Confirmed,0,...,-0.023,4.714,0.016,-0.016,6.383,0.000351,-0.000351,2018-04-25,2008-04,2014-08-21


<h2>Checking Null Values</h2>

Calculating the total number of null values in the dataset to get a sense of how much cleaning lies ahead.

In [286]:
# Count missing entries across the whole frame: the first sum() yields one
# count per column, the second collapses those counts into a single total.
total_null_values = df.isnull().sum().sum()
# End the cell with the value so the total is actually shown in the notebook
# instead of being computed and silently discarded.
total_null_values

Iterating over columns to find the ratio of null values to total values.
If the ratio is greater than 0.7, the column will be dropped.

In [287]:
# Share of missing values above which a column is discarded outright.
# Kept as a named constant so the code and the accompanying text agree on 0.7.
NULL_RATIO_DROP_THRESHOLD = 0.7

# The mean of the boolean null mask is the fraction of missing values in each
# column (missing count divided by row count), computed for all columns at once.
drop_columns = []
for name, na_ratio in df.isnull().mean().items():
    if na_ratio > NULL_RATIO_DROP_THRESHOLD:
        # Report each column that crosses the threshold alongside its ratio.
        print(f"{name:<16} {na_ratio:.5f}")
        drop_columns.append(name)

pl_orbsmaxerr1   0.87165
pl_orbsmaxerr2   0.87168
pl_radj          0.70245
pl_radjerr1      0.72020
pl_radjerr2      0.72020
pl_radjlim       0.70245
pl_bmasse        0.85565
pl_bmasseerr1    0.86913
pl_bmasseerr2    0.86913
pl_bmasselim     0.85565
pl_bmassj        0.85568
pl_bmassjerr1    0.86932
pl_bmassjerr2    0.86932
pl_bmassjlim     0.85568
pl_bmassprov     0.85565
pl_orbeccenerr1  0.92074
pl_orbeccenerr2  0.92077
pl_eqterr1       0.95367
pl_eqterr2       0.95367
st_spectype      0.94197


Adding the columns that are not useful for the model to the drop columns list

In [288]:
# Queue the columns the notebook regards as not useful for the model.
# In-place concatenation appends them after the sparse columns found earlier,
# in the order listed here.
drop_columns += [
    'default_flag', 'discoverymethod', 'disc_year', 'disc_facility',
    'soltype', 'pl_controv_flag',
    'pl_refname', 'sy_refname',
    'rastr', 'ra', 'decstr', 'dec',
    'rowupdate', 'pl_pubdate', 'releasedate',
]


Dropping the columns that relate to the parent star. This data could instead be used in a separate analysis of how to detect stars that might have habitable planets within their systems.

In [289]:
# Queue the st_* and sy_*mag* columns (described in the notebook text as
# relating to the parent star) for removal. One row per measurement family;
# the overall order matches the order in which the names are appended.
drop_columns += [
    'st_teff', 'st_tefferr1', 'st_tefferr2', 'st_tefflim',
    'st_rad', 'st_raderr1', 'st_raderr2', 'st_radlim',
    'st_mass', 'st_masserr1', 'st_masserr2', 'st_masslim',
    'st_met', 'st_meterr1', 'st_meterr2', 'st_metlim', 'st_metratio',
    'st_logg', 'st_loggerr1', 'st_loggerr2', 'st_logglim',
    'sy_vmag', 'sy_vmagerr1', 'sy_vmagerr2',
    'sy_kmag', 'sy_kmagerr1', 'sy_kmagerr2',
    'sy_gaiamag', 'sy_gaiamagerr1', 'sy_gaiamagerr2',
]


# Show the pending drop list three names per row, each padded to 20 characters.
# Every row is opened by a newline, so the output begins with an empty line
# and ends without a trailing newline.
for position, column_name in enumerate(drop_columns):
    row_break = '\n' if position % 3 == 0 else ''
    print(f'{row_break}{column_name:<20}', end='')


pl_orbsmaxerr1      pl_orbsmaxerr2      pl_radj             
pl_radjerr1         pl_radjerr2         pl_radjlim          
pl_bmasse           pl_bmasseerr1       pl_bmasseerr2       
pl_bmasselim        pl_bmassj           pl_bmassjerr1       
pl_bmassjerr2       pl_bmassjlim        pl_bmassprov        
pl_orbeccenerr1     pl_orbeccenerr2     pl_eqterr1          
pl_eqterr2          st_spectype         default_flag        
discoverymethod     disc_year           disc_facility       
soltype             pl_controv_flag     pl_refname          
sy_refname          rastr               ra                  
decstr              dec                 rowupdate           
pl_pubdate          releasedate         st_teff             
st_tefferr1         st_tefferr2         st_tefflim          
st_rad              st_raderr1          st_raderr2          
st_radlim           st_mass             st_masserr1         
st_masserr2         st_masslim          st_met              
st_meterr1          st_

Dropping columns from the dataset

In [290]:
# Remove every queued column and rebind df to the slimmer frame.
# NOTE: because df is rebound, re-running this cell on its own raises KeyError
# (the labels are already gone); re-run from the data-loading cell instead.
df = df.drop(drop_columns, axis="columns")
# Preview the remaining columns.
df.head()

Unnamed: 0,pl_name,hostname,sy_snum,sy_pnum,pl_orbper,pl_orbpererr1,pl_orbpererr2,pl_orbperlim,pl_orbsmax,pl_orbsmaxlim,...,pl_insolerr1,pl_insolerr2,pl_insollim,pl_eqt,pl_eqtlim,ttv_flag,st_refname,sy_dist,sy_disterr1,sy_disterr2
0,11 Com b,11 Com,2,1,326.03,0.32,-0.32,0.0,1.29,0.0,...,,,,,,0,<a refstr=LIU_ET_AL__2008 href=https://ui.adsa...,93.1846,1.9238,-1.9238
1,11 Com b,11 Com,2,1,,,,,1.21,0.0,...,,,,,,0,<a refstr=KUNITOMO_ET_AL__2011 href=https://ui...,93.1846,1.9238,-1.9238
2,11 UMi b,11 UMi,1,1,,,,,1.51,0.0,...,,,,,,0,<a refstr=KUNITOMO_ET_AL__2011 href=https://ui...,125.321,1.9765,-1.9765
3,11 UMi b,11 UMi,1,1,516.21997,3.2,-3.2,0.0,1.53,0.0,...,,,,,,0,<a refstr=STASSUN_ET_AL__2017 href=https://ui....,125.321,1.9765,-1.9765
4,11 UMi b,11 UMi,1,1,516.22,3.25,-3.25,0.0,1.54,0.0,...,,,,,,0,<a refstr=DOLLINGER_ET_AL__2009 href=https://u...,125.321,1.9765,-1.9765


In [293]:
# Recount the missing entries now that the queued columns have been removed:
# per-column counts first, then their grand total.
null_after_dropcol = df.isna().sum().sum()
null_after_dropcol

214945

In [292]:
# Gather the remaining columns in which more than 30% of the entries are
# missing, printing each name with its missing-value ratio as it is found.
fill_columns = []
row_total = len(df)
for column_label, column_values in df.items():
    missing_share = column_values.isnull().sum() / row_total
    if missing_share > 0.3:
        print(f"{column_label:<16} {missing_share:.5f}")
        fill_columns.append(column_label)

pl_orbsmax       0.44160
pl_orbsmaxlim    0.35844
pl_rade          0.30232
pl_radeerr1      0.32142
pl_radeerr2      0.32142
pl_orbeccen      0.48851
pl_orbeccenlim   0.40637
pl_insol         0.57883
pl_insolerr1     0.58516
pl_insolerr2     0.58516
pl_insollim      0.43248
pl_eqt           0.54749
pl_eqtlim        0.47091
