In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load the dataset and read it with pandas
df = pd.read_csv(r"C:\Exoplanet-search\archive\cumulative.csv")

# Set pandas options to make debugging easier
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10) # Limit rows to 10 for readability

# Display the first few rows to understand the data
df.head()

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_teq_err1,koi_teq_err2,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_tce_delivname,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,-0.00216,0.146,0.318,-0.146,2.9575,0.0819,-0.0819,615.8,19.5,-19.5,2.26,0.26,-0.15,793.0,,,93.59,29.45,-16.65,35.8,1.0,q1_q17_dr25_tce,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,0.586,0.059,-0.443,4.507,0.116,-0.116,874.8,35.5,-35.5,2.83,0.32,-0.19,443.0,,,9.11,2.87,-1.62,25.8,2.0,q1_q17_dr25_tce,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10829.0,171.0,-171.0,14.6,3.92,-1.31,638.0,,,39.3,31.04,-10.49,76.3,1.0,q1_q17_dr25_tce,5853.0,158.0,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,8079.2,12.8,-12.8,33.46,8.5,-2.83,1395.0,,,891.96,668.95,-230.35,505.6,1.0,q1_q17_dr25_tce,5805.0,157.0,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,-0.00113,0.701,0.235,-0.478,1.6545,0.042,-0.042,603.3,16.9,-16.9,2.75,0.88,-0.35,1406.0,,,926.16,874.33,-314.24,40.9,1.0,q1_q17_dr25_tce,6031.0,169.0,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [None]:
print(df.columns)

In [None]:
df['koi_disposition'].value_counts().plot(kind='bar', color=['b', 'y', 'r'])
plt.title("Distribution of Planet Candidate Dispositions")
plt.xlabel("Class")
plt.ylabel("Count")
plt.xticks(rotation=0)
plt.show()

In [None]:
from sklearn.impute import SimpleImputer

# Define the features to use (ignore target)
features = [
    # STELLAR PARAMETERS - tell the model about the host star
    "koi_steff",  # Stellar effective temperature (K)
    "koi_slogg",      # Stellar surface gravity (log10(cm/s^2))
    "koi_srad",       # Stellar radius (Solar radii)
    # PLANETARY CANDIDATE PARAMETERS - tell the model about the 
    # planet's transit characteristics
    "koi_period",     # Orbital period (days)
    "koi_duration",   # Transit duration (hours)
    "koi_depth",      # Transit depth (ppm)
    "koi_prad",       # Planetary radius (Earth radii)
]

target = "koi_disposition"

# Impute missing values with the median of each column
imputer = SimpleImputer(strategy='median')
df[features] = imputer.fit_transform(df[features])


In [None]:
# Map classes to 0,1,2
target_mapping = {
    'CANDIDATE': 0,
    'CONFIRMED': 1,
    'FALSE POSITIVE': 2
}

df[target] = df[target].map(target_mapping)

In [None]:
# Separate features and target variable

# X will contain the values of the feature columns. This is what the model
# will use to learn patterns.
X = df[features].values

# y will contain the values of the target column. This is what the model
# is trying to predict.
y = df[target].values

# Decision Tree models expect two seperate arrays: one for input features (X)
# and one for target labels (y).

#.values converts the DataFrame columns to NumPy arrays, which sklearn
# works with.

In [None]:
# Scale features (standardisation) - optional for trees but neccessary for other models

# Rescale features to have mean 0 and standard deviation 1.
# This is important because features are on different scales (e.g.,
# temperature in Kelvin vs. radius in Earth radii). Scaling helps the
# model learn more effectively.

scaler = StandardScaler() # Create a StandardScaler object
X_scaled = scaler.fit_transform(X) # Fit scaler to X and transform X

# After this:
# Each feature in X_scaled will have mean approximately 0 and standard deviation 1.
# Helps the model converge faster and perform better.

In [None]:
# Split the dataset into training and test sets

# Divide the dataset so that the model learns patterns on the training set.
# Then we can evaluate how well it learned by testing on the test set.

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, # Features
    y, # Target
    test_size = 0.2, # 20% of data for testing
    random_state = 42, # Ensures reproducible splits
    stratify = y # Maintain class distribution in train/test splits
)

In [None]:
class_weights = compute_class_weight(
    class_weight='balanced', 
    classes=np.unique(y_train), 
    y=y_train
)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

In [None]:
# Covert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [None]:
# Create PyTorch Dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [None]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

print("Training samples:", len(train_dataset)) 
print("Testing samples:", len(test_dataset))

In [None]:
# Define the neural network

class ExoplanetClassifier(nn.Module):
    def __init__(self):
        super(ExoplanetClassifier, self).__init__()
        # First hidden layer
        self.fc1 = nn.Linear(7, 128)
        # Second hidden layer
        self.fc2 = nn.Linear(128, 64)
        # Output layer
        self.fc3 = nn.Linear(64, 32)   # Second hidden -> Third hidden (new)
        self.fc4 = nn.Linear(32, 3)    # Output layer
        self.relu = nn.ReLU()
        # Dropout layer
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [None]:
# Initialise the model
model = ExoplanetClassifier()
print(model)

In [None]:
# Training function
def train_model(model, train_loader, criterion, optimizer, epochs = 30):
    model.train()
    loss_history = []

    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()  # Zero the parameter gradients
            outputs = model(inputs)  # Forward pass
            loss = criterion(outputs, labels)  # Compute loss
            loss.backward()  # Backward pass
            optimizer.step()  # Update weights
            running_loss += loss.item()

        epoch_loss = running_loss / len(train_loader)
        loss_history.append(epoch_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

    return loss_history

In [None]:
# Make sure to import
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch


In [None]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train the model
loss_history = train_model(model, train_loader, nn.CrossEntropyLoss(), 
                            optim.Adam(model.parameters(), lr=0.001),
                            epochs=20)

In [None]:
plt.plot(range(1, len(loss_history) + 1), loss_history, marker='o', color='b')
plt.title("Training Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.grid()
plt.show()

In [None]:
def evaluate_model(model, test_loader):
    model.eval()  # evaluation mode
    all_preds = []
    all_labels = []
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    # Accuracy
    accuracy = 100 * correct / total
    print(f"Accuracy: {accuracy:.2f}%")

    # Classification report
    from sklearn.metrics import classification_report
    print("Classification Report:")
    print(classification_report(all_labels, all_preds, target_names=['CANDIDATE','CONFIRMED','FALSE POSITIVE']))

    # Confusion matrix
    from sklearn.metrics import confusion_matrix
    import seaborn as sns
    import matplotlib.pyplot as plt

    cm = confusion_matrix(all_labels, all_preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['CANDIDATE','CONFIRMED','FALSE POSITIVE'],
                yticklabels=['CANDIDATE','CONFIRMED','FALSE POSITIVE'])
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

# Evaluate your trained NN
evaluate_model(model, test_loader)


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import numpy as np

# ---------------------------
# 1️⃣ Define features and target
# ---------------------------
features = [
    "koi_steff", "koi_slogg", "koi_srad",
    "koi_period", "koi_duration", "koi_depth",
    "koi_prad"
]
target = "koi_disposition"

# Map classes to numbers
target_mapping = {'CANDIDATE':0, 'CONFIRMED':1, 'FALSE POSITIVE':2}
df[target] = df[target].map(target_mapping)

# ---------------------------
# 2️⃣ Impute missing values
# ---------------------------
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(df[features])
y = df[target].values

# ---------------------------
# 3️⃣ Scale features
# ---------------------------
scaler = StandardScaler()
X = scaler.fit_transform(X)

# ---------------------------
# 4️⃣ Split into train/test
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test  = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test  = torch.tensor(y_test, dtype=torch.long)

# ---------------------------
# 5️⃣ Compute class weights
# ---------------------------
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train.numpy())
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# ---------------------------
# 6️⃣ Define the NN
# ---------------------------
class ExoplanetClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(len(features), 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 3)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

model = ExoplanetClassifier()

# ---------------------------
# 7️⃣ Define loss and optimizer
# ---------------------------
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ---------------------------
# 8️⃣ Training loop
# ---------------------------
epochs = 30
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 5 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# ---------------------------
# 9️⃣ Evaluate
# ---------------------------
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    _, preds = torch.max(outputs, 1)
    accuracy = (preds == y_test).float().mean()
    print(f"Test Accuracy: {accuracy:.4f}")


Epoch 5/30, Loss: 1.0889
Epoch 10/30, Loss: 1.0754
Epoch 15/30, Loss: 1.0608
Epoch 20/30, Loss: 1.0433
Epoch 25/30, Loss: 1.0282
Epoch 30/30, Loss: 1.0159
Test Accuracy: 0.4971
