Copyright (c) 2022, Hikmet Güner
All rights reserved.

This source code is licensed under the BSD-style license found in the
LICENSE file in the root directory of this source tree. 

<h1>Imports</h1>

In [23]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler


<h2>Reading and Analyzing Data</h2>

In [7]:
df = pd.read_csv("exoplanets.csv",header=0, index_col=0)
df.head(10)

Unnamed: 0,pl_name,hostname,default_flag,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,soltype,pl_controv_flag,...,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2,rowupdate,pl_pubdate,releasedate
0,11 Com b,11 Com,1,2,1,Radial Velocity,2007,Xinglong Station,Published Confirmed,0,...,-0.023,2.282,0.346,-0.346,4.44038,0.003848,-0.003848,2014-05-14,2008-01,2014-05-14
1,11 Com b,11 Com,0,2,1,Radial Velocity,2007,Xinglong Station,Published Confirmed,0,...,-0.023,2.282,0.346,-0.346,4.44038,0.003848,-0.003848,2014-07-23,2011-08,2014-07-23
2,11 UMi b,11 UMi,0,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,Published Confirmed,0,...,-0.005,1.939,0.27,-0.27,4.56216,0.003903,-0.003903,2018-04-25,2011-08,2014-07-23
3,11 UMi b,11 UMi,1,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,Published Confirmed,0,...,-0.005,1.939,0.27,-0.27,4.56216,0.003903,-0.003903,2018-09-04,2017-03,2018-09-06
4,11 UMi b,11 UMi,0,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,Published Confirmed,0,...,-0.005,1.939,0.27,-0.27,4.56216,0.003903,-0.003903,2018-04-25,2009-10,2014-05-14
5,14 And b,14 And,0,1,1,Radial Velocity,2008,Okayama Astrophysical Observatory,Published Confirmed,0,...,-0.023,2.331,0.24,-0.24,4.91781,0.002826,-0.002826,2014-07-23,2011-08,2014-07-23
6,14 And b,14 And,1,1,1,Radial Velocity,2008,Okayama Astrophysical Observatory,Published Confirmed,0,...,-0.023,2.331,0.24,-0.24,4.91781,0.002826,-0.002826,2014-05-14,2008-12,2014-05-14
7,14 Her b,14 Her,0,1,2,Radial Velocity,2002,W. M. Keck Observatory,Published Confirmed,0,...,-0.023,4.714,0.016,-0.016,6.383,0.000351,-0.000351,2021-09-20,2021-05,2021-09-20
8,14 Her b,14 Her,0,1,2,Radial Velocity,2002,W. M. Keck Observatory,Published Confirmed,0,...,-0.023,4.714,0.016,-0.016,6.383,0.000351,-0.000351,2018-04-25,2003-01,2014-08-21
9,14 Her b,14 Her,0,1,2,Radial Velocity,2002,W. M. Keck Observatory,Published Confirmed,0,...,-0.023,4.714,0.016,-0.016,6.383,0.000351,-0.000351,2018-04-25,2008-04,2014-08-21


<h2>Editing Columns</h2>

In [8]:
df = df.drop(columns=['hostname','discoverymethod','disc_year','disc_facility','soltype','pl_refname'])
df.head(10)

Unnamed: 0,pl_name,default_flag,sy_snum,sy_pnum,pl_controv_flag,pl_orbper,pl_orbpererr1,pl_orbpererr2,pl_orbperlim,pl_orbsmax,...,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2,rowupdate,pl_pubdate,releasedate
0,11 Com b,1,2,1,0,326.03,0.32,-0.32,0.0,1.29,...,-0.023,2.282,0.346,-0.346,4.44038,0.003848,-0.003848,2014-05-14,2008-01,2014-05-14
1,11 Com b,0,2,1,0,,,,,1.21,...,-0.023,2.282,0.346,-0.346,4.44038,0.003848,-0.003848,2014-07-23,2011-08,2014-07-23
2,11 UMi b,0,1,1,0,,,,,1.51,...,-0.005,1.939,0.27,-0.27,4.56216,0.003903,-0.003903,2018-04-25,2011-08,2014-07-23
3,11 UMi b,1,1,1,0,516.21997,3.2,-3.2,0.0,1.53,...,-0.005,1.939,0.27,-0.27,4.56216,0.003903,-0.003903,2018-09-04,2017-03,2018-09-06
4,11 UMi b,0,1,1,0,516.22,3.25,-3.25,0.0,1.54,...,-0.005,1.939,0.27,-0.27,4.56216,0.003903,-0.003903,2018-04-25,2009-10,2014-05-14
5,14 And b,0,1,1,0,,,,,0.68,...,-0.023,2.331,0.24,-0.24,4.91781,0.002826,-0.002826,2014-07-23,2011-08,2014-07-23
6,14 And b,1,1,1,0,185.84,0.23,-0.23,0.0,0.83,...,-0.023,2.331,0.24,-0.24,4.91781,0.002826,-0.002826,2014-05-14,2008-12,2014-05-14
7,14 Her b,0,1,2,0,1766.41,0.67,-0.68,0.0,2.83,...,-0.023,4.714,0.016,-0.016,6.383,0.000351,-0.000351,2021-09-20,2021-05,2021-09-20
8,14 Her b,0,1,2,0,1724.0,50.0,-50.0,0.0,2.82,...,-0.023,4.714,0.016,-0.016,6.383,0.000351,-0.000351,2018-04-25,2003-01,2014-08-21
9,14 Her b,0,1,2,0,1766.0,,,0.0,2.864,...,-0.023,4.714,0.016,-0.016,6.383,0.000351,-0.000351,2018-04-25,2008-04,2014-08-21


<h2>Checking Null Values</h2>

In [4]:
df.isnull().sum().sum()

916679

In [13]:
columns_to_take = ["sy_snum","pl_orbper","pl_orbsmax","pl_rade","pl_insol","pl_orbeccen"]
num_of_null = []
for col_name in columns_to_take:
    num_of_null.append(df[col_name].isnull().sum())

print(num_of_null)
print(len(df))

[0, 2805, 14375, 9841, 18842, 15902]
32552


In [41]:
df_cluster = df[["sy_snum","pl_orbper","pl_orbsmax","pl_rade","pl_radj","pl_bmasse","pl_bmassj","pl_insol","pl_orbeccen"]].dropna()
df_cluster['pl_rade_calculated'] = df_cluster.apply(lambda row: row.pl_rade*6371, axis = 1)
df_cluster['pl_radj_calculated'] = df_cluster.apply(lambda row: row.pl_radj*69911, axis = 1)
df_cluster['pl_bmasse_calculated'] = df_cluster.apply(lambda row: row.pl_bmasse*5972E24, axis = 1)
df_cluster['pl_bmassj_calculated'] = df_cluster.apply(lambda row: row.pl_bmassj*1898E27, axis = 1)
df_cluster.head()


Unnamed: 0,sy_snum,pl_orbper,pl_orbsmax,pl_rade,pl_radj,pl_bmasse,pl_bmassj,pl_insol,pl_orbeccen,pl_rade_calculated,pl_radj_calculated,pl_bmasse_calculated,pl_bmassj_calculated
425,1,3650.0,4.5,12.442,1.11,4131.79,13.0,0.19,0.0,79267.982,77601.21,2.467505e+31,2.4674000000000002e+31
426,1,3.5951,0.047,1.95,0.174,8.75,0.02753,1037.0,0.06,12423.45,12164.514,5.2255e+28,5.225194e+28
427,1,15.624,0.13,3.67,0.327,14.67,0.04616,160.0,0.07,23381.57,22860.897,8.760924e+28,8.761168e+28
428,1,35.747,0.22,3.94,0.352,10.18,0.03203,53.0,0.15,25101.74,24608.672,6.079496e+28,6.079294e+28
449,1,1.580404,0.0149,2.742,0.245,8.17,0.02571,21.0,0.063,17469.282,17128.195,4.879124e+28,4.879758e+28


In [26]:

print(len(df_cluster))

10887


In [27]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_cluster)

In [28]:
scaled_features[:5]

array([[-2.02161797e-01, -9.42971619e-02, -6.84796718e-02,
        -4.14366212e-02, -2.21988979e-01,  2.15599434e+00],
       [-2.02161797e-01,  5.58810790e+01,  2.75718421e+01,
         6.72568184e-02, -2.24190340e-01, -1.10780911e-01],
       [-2.02161797e-01, -3.65391102e-01, -5.99902659e-01,
        -4.17483019e-02,  4.65354395e-01,  2.15599434e+00],
       [-2.02161797e-01, -1.79843094e-01, -7.48061359e-02,
        -2.38786101e-02, -1.17906504e-01,  2.53379022e+00],
       [-2.02161797e-01,  1.30557904e-01,  4.94575636e-01,
        -2.10734840e-02, -1.89068324e-01,  5.55615723e+00]])

In [30]:
kmeans = KMeans(
     init="random",
     n_clusters=3,
     n_init=10,     
     random_state=42
 )

In [31]:

kmeans.fit(scaled_features)

In [32]:
kmeans.inertia_

44648.72087509375

In [33]:
kmeans.cluster_centers_

array([[-2.02161797e-01, -1.78161306e-01, -2.11925108e-01,
         2.52070843e-03, -5.82611422e-03, -1.89669723e-02],
       [-1.61194343e-01,  2.26092500e+00,  2.70217616e+00,
        -1.71760315e-02, -2.20620822e-01,  2.20700122e-02],
       [ 4.63958770e+00, -7.88932768e-02, -1.16005539e-01,
        -2.45743490e-02,  5.08290836e-01,  3.70782337e-01]])