In [9]:
import pandas as pd
import numpy as np

In [10]:
# Load the TOI table; '#' is passed as the comment marker so that
# commented text in the CSV is skipped by the parser instead of read as data
toi_csv_path = "data/TOI_2024.05.30_18.42.55.csv"
df = pd.read_csv(toi_csv_path, comment="#")
df.head()

Unnamed: 0,toi,tid,tfopwg_disp,ra,dec,st_pmra,st_pmdec,pl_tranmid,pl_orbper,pl_trandurh,pl_trandep,pl_rade,pl_insol,pl_eqt,st_tmag,st_dist,st_teff,st_logg,st_rad
0,278.01,244161191,,4.883562,-5.911561,108.144,-68.177,2458382.0,0.298731,0.621621,6605.102931,2.526862,188.331332,944.822863,13.1672,44.397,2955.0,5.13527,0.300639
1,6697.01,441546821,,197.317124,-60.308435,-39.61,-20.132,2459339.0,16.436173,14.647392,2626.770489,8.105287,192.22932,949.674238,7.7897,95.3906,6613.0,4.23,1.4927
2,1012.01,427508467,APC,117.095285,6.785064,-2.99,-0.029,2459253.0,0.884182,1.621,1890.0,24.8132,112144.0,5098.0,8.12718,295.959,8928.0,3.92,2.72
3,1039.01,461867584,APC,149.688545,-58.760453,-6.434,4.098,2460031.0,4.420125,3.956,2466.0,,,,8.5483,2864.34,,,
4,1060.01,101230735,APC,299.290995,-48.934476,2.224,-14.604,2460150.0,2.068563,0.373,502.0,3.89739,24.409,619.0,9.6652,128.491,5687.9,4.6,1.14


In [11]:
# Check for missing values and data types:
# info() lists each column's non-null count and dtype, so any column whose
# non-null count is below the number of index entries contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7147 entries, 0 to 7146
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   toi          7147 non-null   float64
 1   tid          7147 non-null   int64  
 2   tfopwg_disp  7145 non-null   object 
 3   ra           7147 non-null   float64
 4   dec          7147 non-null   float64
 5   st_pmra      7028 non-null   float64
 6   st_pmdec     7028 non-null   float64
 7   pl_tranmid   7147 non-null   float64
 8   pl_orbper    7047 non-null   float64
 9   pl_trandurh  7147 non-null   float64
 10  pl_trandep   7147 non-null   float64
 11  pl_rade      6677 non-null   float64
 12  pl_insol     6997 non-null   float64
 13  pl_eqt       6872 non-null   float64
 14  st_tmag      7147 non-null   float64
 15  st_dist      6941 non-null   float64
 16  st_teff      7016 non-null   float64
 17  st_logg      6352 non-null   float64
 18  st_rad       6677 non-null   float64
dtypes: float64(17), int64(1), object(1)

In [12]:
# Drop every row that has a missing value in any column, then count the
# remaining rows per TFOPWG Disposition designation.
# The result is rebound instead of using inplace=True: the statement stays
# chainable and any other reference to the pre-drop frame is left untouched.
df = df.dropna()
df.tfopwg_disp.value_counts()

tfopwg_disp
PC     3958
FP      796
KP      516
CP      443
APC     331
FA       61
Name: count, dtype: int64

In [13]:
# Partition the rows by TFOPWG disposition code:
#   unconfirmed -> "PC" / "APC"
#   confirmed   -> "FP" / "FA" / "KP" / "CP"
disposition = df["tfopwg_disp"]
unconfirmed = df[disposition.isin(["PC", "APC"])]
confirmed = df[disposition.isin(["FP", "FA", "KP", "CP"])]

# Within the confirmed rows, separate the "FP"/"FA" codes from the "KP"/"CP" codes
confirmed_code = confirmed["tfopwg_disp"]
confirmed_false = confirmed[confirmed_code.isin(["FP", "FA"])]
confirmed_true = confirmed[confirmed_code.isin(["KP", "CP"])]

# Report the size of each group as a quick sanity check on the split
print(f"Unconfirmed: {len(unconfirmed)}, Confirmed: {len(confirmed)}")
print(f"Confirmed False: {len(confirmed_false)}, Confirmed True: {len(confirmed_true)}")

Unconfirmed: 4289, Confirmed: 1816
Confirmed False: 857, Confirmed True: 959


In [14]:
# Replace tfopwg_disp values with a boolean planet flag:
# True when the disposition is "KP" or "CP", False otherwise.
# isin() computes the flag in one vectorized pass; tolist() keeps `condition`
# a plain list of bools, as before.
condition = confirmed["tfopwg_disp"].isin(["KP", "CP"]).tolist()

# DataFrame.insert raises ValueError when the column already exists, so the
# insert is guarded to keep this cell safe to re-run on a live kernel.
if "planet" not in confirmed.columns:
    confirmed.insert(2, "planet", condition)

# Drop the original label and the two identifier columns; drop() returns a
# new frame, so `confirmed` itself keeps those columns.
data = confirmed.drop(columns=["tfopwg_disp", "toi", "tid"])

In [15]:
# Check new dataframe: preview the first rows to confirm that the planet flag
# is present and that tfopwg_disp, toi and tid are no longer among the columns
data.head()

Unnamed: 0,planet,ra,dec,st_pmra,st_pmdec,pl_tranmid,pl_orbper,pl_trandurh,pl_trandep,pl_rade,pl_insol,pl_eqt,st_tmag,st_dist,st_teff,st_logg,st_rad
424,True,337.510274,-75.646561,-63.327,-1.942,2458333.0,9.139804,3.091294,358.41578,3.062985,243.011763,1006.993283,9.0197,129.804,5958.2,4.34,1.58
425,True,302.114174,-54.317501,1.594,-15.903,2460123.0,15.507786,4.565,513.0,2.43432,88.0718,853.0,8.4362,88.4343,6122.0,4.30881,1.17
426,True,293.286615,-54.532728,108.439,-82.632,2460129.0,17.471314,4.05,1118.0,3.1646,61.9515,781.0,8.0888,57.2651,5783.54,4.49925,0.97
427,True,38.120968,-78.023683,107.847,30.732,2460177.0,4.115058,1.08,487.0,2.30057,77.3573,826.0,9.478,82.1733,5394.0,4.50942,0.89
428,True,296.003935,-47.56203,-3.457,-100.891,2458657.0,6.443866,1.930945,1121.951227,2.460344,60.831305,712.281284,10.0059,68.0726,4803.0,4.52079,0.737189


In [16]:
# Compute the pairwise correlation matrix, keep only the column for the
# planet flag, and list the features from most positive to most negative
corr_matrix = data.corr()
correlations = corr_matrix["planet"].sort_values(ascending=False)
print('Correlations:\n', correlations)

Correlations:
 planet         1.000000
st_logg        0.278225
pl_tranmid     0.256396
pl_trandep     0.086659
pl_trandurh    0.067262
ra             0.014530
st_tmag       -0.010608
st_pmra       -0.014405
pl_orbper     -0.019032
dec           -0.033230
pl_rade       -0.070276
st_pmdec      -0.094459
st_dist       -0.179616
pl_insol      -0.203812
st_teff       -0.278900
st_rad        -0.316539
pl_eqt        -0.395028
Name: planet, dtype: float64
