In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import warnings
import math

from matplotlib.lines import Line2D
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial import ConvexHull
from scipy import interpolate
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.preprocessing import normalize
from sklearn.neighbors import KernelDensity
from sklearn.metrics.cluster import rand_score
from sklearn.decomposition import PCA

In [2]:
# Turn off pandas' chained-assignment warning for the rest of the notebook.
# NOTE(review): this hides SettingWithCopyWarning everywhere below — confirm
# that is intended rather than fixing the assignments that trigger it.
pd.set_option('mode.chained_assignment', None)

# Load the table: row 0 is the header, column 0 becomes the index.
df = pd.read_csv('new.csv', index_col=0, header=0)

In [3]:
# (rows, columns) of the table as loaded from new.csv.
df.shape

(5243, 164)

In [4]:
# Column labels available in the loaded table.
df.columns

Index(['pl_name', 'hostname', 'pl_letter', 'hd_name', 'hip_name', 'tic_id',
       'gaia_id', 'sy_snum', 'sy_pnum', 'sy_mnum',
       ...
       'sy_tmag', 'sy_tmag_reflink', 'sy_kepmag', 'sy_kepmag_reflink',
       'pl_nnotes', 'st_nphot', 'st_nrvc', 'st_nspec', 'pl_nespec',
       'pl_ntranspec'],
      dtype='object', length=164)

In [5]:
# Keep only the 26 columns used from here on; the order of this list is the
# column order of the resulting frame.
df = df[[
    'pl_name',
    # sy_* columns and flags
    'sy_snum', 'sy_pnum', 'sy_mnum',
    'cb_flag', 'pl_controv_flag',
    # pl_* columns
    'pl_orbper', 'pl_orbsmax', 'pl_rade', 'pl_radj',
    'pl_bmasse', 'pl_bmassj', 'pl_bmassprov', 'pl_dens',
    'pl_orbeccen', 'pl_insol', 'pl_eqt', 'pl_rvamp',
    # st_* columns
    'st_spectype', 'st_teff', 'st_rad', 'st_mass',
    'st_met', 'st_lum', 'st_age', 'st_dens',
]]

In [6]:
# (rows, columns) after narrowing the table to the selected columns.
df.shape

(5243, 26)

In [7]:
# Missing-value count per column, shown as a one-column frame.
df.isna().sum().to_frame()

Unnamed: 0,0
pl_name,0
sy_snum,0
sy_pnum,0
sy_mnum,0
cb_flag,0
pl_controv_flag,0
pl_orbper,191
pl_orbsmax,286
pl_rade,17
pl_radj,18


In [8]:
# Group the rows by pl_name and take the size of each group.
# NOTE(review): df2 is not referenced again in the cells visible here — confirm it is still needed.
df2 = df.pivot_table(index = ['pl_name'], aggfunc ='size')


In [9]:
# Rows that exactly repeat an earlier row (all columns equal); the first
# occurrence of each row is not counted.
num_of_duplicates = int(df.duplicated().sum())
num_of_duplicates

0

In [10]:
# Keep a handle on the current column labels; the next cell iterates over
# them when screening columns for missing values.
all_columns = df.columns
all_columns

Index(['pl_name', 'sy_snum', 'sy_pnum', 'sy_mnum', 'cb_flag',
       'pl_controv_flag', 'pl_orbper', 'pl_orbsmax', 'pl_rade', 'pl_radj',
       'pl_bmasse', 'pl_bmassj', 'pl_bmassprov', 'pl_dens', 'pl_orbeccen',
       'pl_insol', 'pl_eqt', 'pl_rvamp', 'st_spectype', 'st_teff', 'st_rad',
       'st_mass', 'st_met', 'st_lum', 'st_age', 'st_dens'],
      dtype='object')

In [11]:
# Collect every column that has more than 1000 missing values, in column order.
col_will_drop = [
    name for name in all_columns
    if df[name].isnull().sum() > 1000
]

col_will_drop

['pl_insol', 'pl_eqt', 'pl_rvamp', 'st_spectype', 'st_age']

In [12]:
# Remove the columns listed in col_will_drop, then show the labels that remain.
df = df.drop(col_will_drop, axis='columns')
df.columns

Index(['pl_name', 'sy_snum', 'sy_pnum', 'sy_mnum', 'cb_flag',
       'pl_controv_flag', 'pl_orbper', 'pl_orbsmax', 'pl_rade', 'pl_radj',
       'pl_bmasse', 'pl_bmassj', 'pl_bmassprov', 'pl_dens', 'pl_orbeccen',
       'st_teff', 'st_rad', 'st_mass', 'st_met', 'st_lum', 'st_dens'],
      dtype='object')

#### Count the missing values (empty columns) in a single row

In [13]:
def get_empty_col(row_index, frame=None):
    """Count the missing (NaN/None) cells in one row.

    Parameters
    ----------
    row_index : label
        Index label of the row to inspect; looked up with ``.loc``.
        Assumes the label is unique in the index.
    frame : pandas.DataFrame, optional
        Table to inspect. Defaults to the notebook-level ``df``, so existing
        ``get_empty_col(i)`` calls behave as before.

    Returns
    -------
    int
        Number of columns whose value in that row is missing.
    """
    table = df if frame is None else frame
    # One explicit label lookup for the whole row, instead of the chained
    # df[col][row_index] lookup repeated per column in a Python loop.
    return int(table.loc[row_index].isna().sum())

#### Count the empty columns in every row and store the result in a new `empty_col_count` column

In [14]:
# Per-row count of missing values, stored as a new column.
# Computed in one vectorised pass. The earlier row-by-row form,
# df['empty_col_count'][i] = ..., was a chained assignment: pandas does not
# guarantee it writes back to df, and under Copy-on-Write it never does.
# Re-running this cell is safe: an existing empty_col_count column holds
# integers, so it contributes no missing values to the count.
df['empty_col_count'] = df.isna().sum(axis=1)

In [15]:
# Preview the first rows, including the new empty_col_count column.
df.head()

Unnamed: 0_level_0,pl_name,sy_snum,sy_pnum,sy_mnum,cb_flag,pl_controv_flag,pl_orbper,pl_orbsmax,pl_rade,pl_radj,...,pl_bmassprov,pl_dens,pl_orbeccen,st_teff,st_rad,st_mass,st_met,st_lum,st_dens,empty_col_count
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,11 Com b,2,1,0,0,0,326.03,1.29,12.1,1.08,...,Msini,19.1,0.231,4742.0,19.0,2.7,-0.35,2.243,,1
2,11 UMi b,1,1,0,0,0,516.21997,1.53,12.3,1.09,...,Msini,13.8,0.08,4213.0,29.79,2.78,-0.02,2.43,,1
3,14 And b,1,1,0,0,0,185.84,0.83,12.9,1.15,...,Msini,3.9,0.0,4813.0,11.0,2.2,-0.24,1.763,,1
4,14 Her b,1,2,0,0,0,1765.038901,2.773069,12.6,1.12,...,Mass,7.11,0.372689,5338.0,0.93,0.91,0.405,-0.153,1.27393,0
5,16 Cyg B b,3,1,0,0,0,798.5,1.66,13.5,1.2,...,Msini,1.26,0.68,5750.0,1.13,1.08,0.06,0.097,1.01103,0


In [16]:
# Largest and smallest per-row missing count, next to the frame's total number
# of columns (that total includes pl_name and empty_col_count itself).
empty_counts = df['empty_col_count']
print('Max:\t', empty_counts.max())
print('Min:\t', empty_counts.min())
print('Total:\t', df.shape[1])

Max:	 9
Min:	 0
Total:	 22


In [18]:
# Keep only the rows with fewer than 5 missing values, then report the new shape.
has_few_gaps = df['empty_col_count'].lt(5)
df = df[has_few_gaps]
df.shape

(5061, 22)

In [19]:
# Missing-value count per column after the sparse rows were removed.
df.isna().sum().to_frame()

Unnamed: 0,0
pl_name,0
sy_snum,0
sy_pnum,0
sy_mnum,0
cb_flag,0
pl_controv_flag,0
pl_orbper,39
pl_orbsmax,275
pl_rade,7
pl_radj,8
