In [None]:
# Mount Google Drive into the Colab filesystem at '/content/drive' so that
# the CSV path used by the data-loading cell resolves.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer


In [None]:

# Path of the Kepler CSV inside the mounted Drive folder.
file_path = '/content/drive/My Drive/kepler_data.csv'

# Load the whole table into a DataFrame; the following cells work on `data`.
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Report the (rows, columns) of the freshly loaded table as a baseline
# to compare against after preprocessing.
initial_shape = data.shape
print("Initial data shape:", initial_shape)

Initial data shape: (9564, 49)


In [None]:
# Columns used as model inputs. Later cells select data[features], so the
# column order seen by the imputer and the model follows this list exactly.
features = [
    'koi_score',
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_period', 'koi_period_err1', 'koi_period_err2',
    'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2',
    'koi_impact', 'koi_impact_err1', 'koi_impact_err2',
    'koi_duration', 'koi_duration_err1', 'koi_duration_err2',
    'koi_depth', 'koi_depth_err1', 'koi_depth_err2',
    'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
    'koi_teq',
    'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
    'koi_model_snr',
    'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
    'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
    'koi_srad', 'koi_srad_err1', 'koi_srad_err2',
    'ra', 'dec', 'koi_kepmag',
]

# Column holding the class label to predict.
target = 'koi_disposition'

# Count NaNs per selected column (inputs plus label) and show only the
# columns that actually contain missing entries.
selected_columns = features + [target]
missing_values = data[selected_columns].isnull().sum()
has_missing = missing_values > 0
print("Missing values before imputation:")
print(missing_values[has_missing])


Missing values before imputation:
koi_score            1510
koi_period_err1       454
koi_period_err2       454
koi_time0bk_err1      454
koi_time0bk_err2      454
koi_impact            363
koi_impact_err1       454
koi_impact_err2       454
koi_duration_err1     454
koi_duration_err2     454
koi_depth             363
koi_depth_err1        454
koi_depth_err2        454
koi_prad              363
koi_prad_err1         363
koi_prad_err2         363
koi_teq               363
koi_insol             321
koi_insol_err1        321
koi_insol_err2        321
koi_model_snr         363
koi_steff             363
koi_steff_err1        468
koi_steff_err2        483
koi_slogg             363
koi_slogg_err1        468
koi_slogg_err2        468
koi_srad              363
koi_srad_err1         468
koi_srad_err2         468
koi_kepmag              1
dtype: int64


In [None]:
# Defensive filter: make sure the two koi_teq error columns are never passed
# to the imputer, whatever the feature list currently contains.
excluded_features = ['koi_teq_err1', 'koi_teq_err2']
features = [name for name in features if name not in excluded_features]

# Replace every NaN in the feature columns with that column's mean and write
# the filled values back into `data` (the modelling cell reads them from there).
# NOTE(review): the imputer is fit on all rows, i.e. before the train/test
# split performed in the modelling cell, so test rows contribute to the fill
# values. Consider fitting it on the training rows only.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(data[features])
data[features] = imputed_values

# Imputation fills values only; the frame's dimensions are printed to confirm.
print("Data shape after imputation:", data.shape)


Data shape after imputation: (9564, 49)


In [None]:
# Encode the target variable.
# The integer codes are kept in `y` rather than written back into
# data[target]: overwriting the column made this cell non-idempotent, because
# a second run would fit the encoder on integers and lose the class names.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(data[target]),
    index=data.index,
    name=target,
)

# Integer codes and their human-readable names, in encoder order. Passing both
# to the metrics below keeps rows/columns labelled and in a fixed order.
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]

# Feature matrix
X = data[features]

# NOTE(review): 'koi_score' and the 'koi_fpflag_*' columns are named like
# outputs of a vetting step - confirm they are not derived from the
# disposition itself, otherwise they leak the label into the inputs.

# Train-test split: 20% held out, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Model evaluation on the held-out rows
y_pred = model.predict(X_test)
print("Classes (row/column order):", class_names)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_ids))
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))
print("Accuracy:", accuracy_score(y_test, y_pred))




Confusion Matrix:
[[368  95  21]
 [ 63 423   4]
 [ 15   0 924]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.76      0.79       484
           1       0.82      0.86      0.84       490
           2       0.97      0.98      0.98       939

    accuracy                           0.90      1913
   macro avg       0.87      0.87      0.87      1913
weighted avg       0.90      0.90      0.90      1913

Accuracy: 0.8964976476738108


1. The Random Forest algorithm was chosen for its ability to handle both classification and regression tasks, its robustness to overfitting, and its capability to handle high-dimensional datasets with a large number of features.

 It builds multiple decision trees during training and outputs the mode of the classes for classification tasks or the average prediction for regression tasks.

 It typically performs well across various types of datasets and is relatively easy to use with minimal hyperparameter tuning.


2. No hyperparameter tuning has been implemented in the code. However, methods such as Grid Search Cross-Validation, Randomized Search Cross-Validation, and Bayesian Optimization can be used to tune the model.

3. Random Forest was chosen as the preferred algorithm for classifying exoplanets based on the "koi_disposition" column due to its robustness, scalability, and ability to handle high-dimensional data effectively. Compared to other algorithms such as Support Vector Machines, Gradient Boosting Machines, Logistic Regression, and k-Nearest Neighbors, Random Forest requires less parameter tuning while still achieving competitive performance. Its ensemble learning approach combines the strengths of decision trees, resulting in reliable predictions and ease of implementation.


4. The accuracy of the classification model on the held-out test set (1,913 samples, 20% of the data) is 0.8965, or approximately 89.65%.

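5. Confusion Matrix: It provides a tabular representation of the model's predictions. For this three-class problem it is a 3×3 table in which each row is a true class and each column is a predicted class, so the diagonal entries count correct predictions and the off-diagonal entries count misclassifications. The number of true positive, true negative, false positive, and false negative predictions for each class can be derived from it.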

 Classification Report: It includes precision, recall, F1-score, and support for each class in the classification problem. Precision measures the accuracy of positive predictions, recall measures the ability of the model to correctly identify positive instances, and F1-score is the harmonic mean of precision and recall.

 Accuracy Score: It calculates the accuracy of the model, which is the proportion of correctly classified instances out of the total number of instances.