# Import Packages and Define Functions

In [1]:
pip install --upgrade xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as pltticker
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier 
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score
from sklearn.metrics import make_scorer
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as IPipeline
from xgboost import XGBClassifier
from joblib import parallel_backend
import pickle
import re
import dataframe_image as dfi
from packages.model_evaluation import class_k_recall, class_k_precision, class_k_f1, class_k_acc
from packages.model_evaluation import evaluate_model

In [3]:
# Configures axis and title labels
def configure_axislabels_and_title(ax, xlabel, ylabel, title):
    # Set fonts, padding, and fontsize for axis labels and title
    ax.set_xlabel(xlabel,
                  fontfamily = 'Arial',
                  fontsize = 24,
                  labelpad = 5)

    ax.set_ylabel(ylabel,
                  fontfamily = 'Arial',
                  fontsize = 24,
                  labelpad = 10)

    ax.set_title(title,
                 fontfamily = 'Arial',
                 fontsize = 32,
                 pad = 10)
    
    return None

In [4]:
# Configures ticklabels and tick parameters
def configure_ticklabels_and_params(ax):
    # Set label sizes and tick lengths
    ax.tick_params(axis = 'both',
                   which = 'major',
                   labelsize = 16,
                   length = 8,
                   width = 1)

    # Set font for tick labels on both axes
    for tick in ax.get_xticklabels():
        tick.set_fontname("Arial")

    for tick in ax.get_yticklabels():
        tick.set_fontname("Arial")
        
    return None

In [5]:
# Takes in a feature frame and returns a bar plot of the top 10 features by largest absolute coefficient
def plot_feature_importances(feature_frame):
    fig, ax = plt.subplots(figsize = (16, 8))

    plt.tight_layout()
    
    # Subselect the top 10 coefficients by magnitude
    top_10_features = feature_frame.nlargest(10, 'coefficient')

    # Create horizontal seaborn barplot
    sns.barplot(data = top_10_features, x = 'coefficient', y = 'feature',
                orient = 'h', ax = ax);
    
    return fig, ax

In [6]:
# Takes in a list of tuples of the form (frame, target) and a feature column
# Plots boxplots of the feature for each frame, target pair in a row
# Returns fig, ax objects
def plot_boxplots(frame_targets, column):
    num_frames = len(frame_targets)
    
    # Define columns and figsize based on number of frames
    fig, ax = plt.subplots(1, num_frames, figsize = (8 * num_frames, 8))
    
    plt.tight_layout(pad = 4.0)
    
    # Plot boxplots based on feature column and target for each frame
    for index, (frame, target) in enumerate(frame_targets):
        # If num_frames == 1, ax object is not subscriptable
        if num_frames != 1:
            axis = ax[index]
        else:
            axis = ax
        
        # Plot seaborn boxplot of feature column and target
        sns.boxplot(data = frame,
                    x = frame[column],
                    y = frame[target],
                    orient = 'h',
                    ax = axis);
        
    return fig, ax

In [7]:
# Takes in X-, y-values and a fitted model
# Returns a fig, ax pair containing a seaborn heatmap of the confusion matrix
def plot_confusion_matrix_fancy(X_test, y_test, model):
    matrix = confusion_matrix(y_test, model.predict(X_test))
    
    fig, ax = plt.subplots(figsize = (12, 12))
    
    plt.tight_layout()
    
    # Plot seaborn heatmap, no decimals or scientific notation, no colorbar
    sns.heatmap(matrix, annot = True, fmt = 'g', cmap = 'viridis', linewidths = 1,
                cbar = False, annot_kws = {'fontsize': 24, 'fontfamily': 'Arial'}, ax = ax)
    
    return fig, ax

# Import Dataframes and Models

In [8]:
# Assuming you have the CSV files in your working directory
train_multi_path = 'train_multi.csv'
test_multi_path = 'test_multi.csv'
train_binary_path = 'train_binary.csv'
test_binary_path = 'test_binary.csv'

# Load the datasets
train_multi = pd.read_csv(train_multi_path)
test_multi = pd.read_csv(test_multi_path)
train_binary = pd.read_csv(train_binary_path)
test_binary = pd.read_csv(test_binary_path)

# Drop the 'Unnamed: 0.1' and 'Unnamed: 0' columns from each dataframe
train_multi = train_multi.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
test_multi = test_multi.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
train_binary = train_binary.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
test_binary = test_binary.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

# Save the modified dataframes back to CSV files
train_multi.to_csv('train_multi_modified.csv', index=False)
test_multi.to_csv('test_multi_modified.csv', index=False)
train_binary.to_csv('train_binary_modified.csv', index=False)
test_binary.to_csv('test_binary_modified.csv', index=False)

# Display the first few rows of the modified dataframes to verify
print(train_multi.head(), test_multi.head(), train_binary.head(), test_binary.head())

   num_of_objects_around_star  right_ascension  declination  g_band_mag  \
0                           2        284.75485    44.956032      14.293   
1                           1        299.18677    40.501701      14.266   
2                           1        282.90509    43.871769      16.325   
3                           1        294.14340    38.893162      15.596   
4                           1        293.91489    50.469330      14.253   

   r_band_mag  i_band_mag  z_band_mag  j_band_mag  h_band_mag  k_band_mag  \
0      13.907      13.802      13.775      12.941      12.678      12.607   
1      13.698      13.531      13.434      12.607      12.276      12.206   
2      15.267      14.898      14.726      13.619      13.044      12.908   
3      14.910      14.660      14.484      13.591      13.119      13.030   
4      13.824      13.724      13.680      12.835      12.574      12.465   

   ...  abs_gr_diff  abs_gi_diff  abs_gz_diff  abs_ri_diff  abs_rz_diff  \
0  ...     

In [13]:
# Load dataframes
train_multi = pd.read_csv('train_multi_modified.csv')
X_train_multi = train_multi.drop('classification', axis = 1)
y_train_multi = train_multi.classification

test_multi = pd.read_csv('test_multi_modified.csv')
X_test_multi = test_multi.drop('classification', axis = 1)
y_test_multi = test_multi.classification

full_multi = pd.concat([train_multi, test_multi], axis = 0)

train_binary = pd.read_csv('train_binary_modified.csv')
X_train_binary = train_binary.drop('disposition', axis = 1)
y_train_binary = train_binary.disposition

test_binary = pd.read_csv('test_binary_modified.csv')
X_test_binary = test_binary.drop('disposition', axis = 1)
y_test_binary = test_binary.disposition

full_binary = pd.concat([train_binary, test_binary], axis = 0)

In [14]:
print(f"Training data shape: {X_train_multi.shape}")
print(f"Prediction data shape: {X_test_multi.shape}")
print(f"Training feature names: {X_train_binary.columns}")
print(f"Prediction feature names: {X_test_binary.columns}")

Training data shape: (3458, 56)
Prediction data shape: (865, 56)
Training feature names: Index(['num_of_objects_around_star', 'right_ascension', 'declination',
       'g_band_mag', 'r_band_mag', 'i_band_mag', 'z_band_mag', 'j_band_mag',
       'h_band_mag', 'k_band_mag', 'kepler_band_mag', 'num_of_transits',
       'max_single_event_stat', 'max_multi_event_stat', 'odd_even_depth_stat',
       'limb_dark_co4', 'limb_dark_co3', 'limb_dark_co2', 'limb_dark_co1',
       'transit_signal_to_noise', 'planet_radius_earth',
       'orbit_semimajor_axis', 'impact', 'transit_duration', 'transit_depth',
       'orbital_period', 'planet_star_radius_ratio',
       'planet_star_distance_radius', 'inclination', 'planet_temp',
       'star_temp', 'star_surface_gravity', 'star_metallicity', 'star_radius',
       'star_mass', 'flux_weight_offset_sig', 'centroid_right_ascension',
       'centroid_declination', 'centroid_right_ascension_offset',
       'centroid_declination_offset', 'planet_star_right_asce

In [15]:
print(f"Shape of X_test_binary: {X_test_binary.shape}")
print(f"Features in X_test_binary: {X_test_binary.columns}")

# Load the model and inspect the expected input
with open('binary_base_logistic.pickle', 'rb') as f:
    base_binary_model = pickle.load(f)

# If the model is a pipeline, get the expected input features
if hasattr(base_binary_model, 'named_steps'):
    for step_name, step in base_binary_model.named_steps.items():
        if hasattr(step, 'get_feature_names_out'):
            print(f"Features expected by step '{step_name}': {step.get_feature_names_out()}")


Shape of X_test_binary: (1086, 56)
Features in X_test_binary: Index(['num_of_objects_around_star', 'right_ascension', 'declination',
       'g_band_mag', 'r_band_mag', 'i_band_mag', 'z_band_mag', 'j_band_mag',
       'h_band_mag', 'k_band_mag', 'kepler_band_mag', 'num_of_transits',
       'max_single_event_stat', 'max_multi_event_stat', 'odd_even_depth_stat',
       'limb_dark_co4', 'limb_dark_co3', 'limb_dark_co2', 'limb_dark_co1',
       'transit_signal_to_noise', 'planet_radius_earth',
       'orbit_semimajor_axis', 'impact', 'transit_duration', 'transit_depth',
       'orbital_period', 'planet_star_radius_ratio',
       'planet_star_distance_radius', 'inclination', 'planet_temp',
       'star_temp', 'star_surface_gravity', 'star_metallicity', 'star_radius',
       'star_mass', 'flux_weight_offset_sig', 'centroid_right_ascension',
       'centroid_declination', 'centroid_right_ascension_offset',
       'centroid_declination_offset', 'planet_star_right_ascension_offset',
       'plan

In [17]:
with open('binary_base_logistic.pickle', 'rb') as f:
    base_binary_model = pickle.load(f)

base_binary_class_report, base_binary_features = evaluate_model(X_test_binary, y_test_binary, base_binary_model)
base_binary_features.coefficient = base_binary_features.coefficient.abs()

with open('binary_xgboost.pickle', 'rb') as f:
    best_binary_model = pickle.load(f)

best_binary_class_report, best_binary_features = evaluate_model(X_test_binary, y_test_binary, best_binary_model)
best_binary_features.coefficient = best_binary_features.coefficient.abs()

with open('multi_base_tree.pickle', 'rb') as f:
    base_multi_model = pickle.load(f)

base_multi_class_report, base_multi_features = evaluate_model(X_test_multi, y_test_multi, base_multi_model)
base_multi_features.coefficient = base_multi_features.coefficient.abs()

with open('multi_xgboost.pickle', 'rb') as f:
    best_multi_model = pickle.load(f)

best_multi_class_report, best_multi_features = evaluate_model(X_test_multi, y_test_multi, best_multi_model)
best_multi_features.coefficient = best_multi_features.coefficient.abs()

ValueError: columns are missing: {'Unnamed: 0', 'Unnamed: 0.1'}

# Classification Reports

In [None]:
# Save classification reports for each model as .png images
# dfi.export(base_binary_class_report, '../images/base_binary_class_report.png', bbox_inches = 'tight')
# dfi.export(best_binary_class_report, '../images/best_binary_class_report.png', bbox_inches = 'tight')
# dfi.export(base_multi_class_report, '../images/base_multi_class_report.png', bbox_inches = 'tight')
# dfi.export(best_multi_class_report, '../images/best_multi_class_report.png', bbox_inches = 'tight')

# Feature Importances

In [None]:
# Create barplot of top 10 feature importances for best binary classification model (XGBoost)
# Create barplot
fig, ax = plot_feature_importances(best_binary_features)

# Configure axis labels, title, ticks, and ticklabels
# Labels selected by manual inspection of top 10 feature importances
configure_axislabels_and_title(ax, 'Absolute Value of Coefficient',
                               None, 'Best Binary Classification Feature Importances')

feature_labels = ['Flux-Weighted Offset Significance (percent)', 'Planet Radius (Earth Radii)',
                  'Number of Objects Around Star', 'Maximum Multiple Event Statistic', 'Sky Angular Shift',
                  'Planet-Star Distance Over Star Radius', 'Planet-Star Radius Ratio', 'Planet Temperature',
                  'Star Metallicity', 'Star Density']

configure_ticklabels_and_params(ax)

ax.set_yticklabels(feature_labels);

# Save figure as .png
plt.savefig('../images/best_binary_feature_importances.png', bbox_inches = 'tight')

In [None]:
# Create barplot of top 10 feature importances for best multi-class classification model (XGBoost)
# Create barplot
fig, ax = plot_feature_importances(best_multi_features)

# Configure axis labels, title, ticks, and ticklabels
# Labels selected by manual inspection of top 10 feature importances
configure_axislabels_and_title(ax, 'Absolute Value of Coefficient',
                               None, 'Best Multi-Classification Feature Importances')

feature_labels = ['Number of Objects Around Star', 'Sky Angular Shift', 'Orbit Semimajor Axis Length',
                  'Transit Depth', 'Maximum Multiple Event Statistic', 'Planet-Star Radius Ratio',
                  'Flux-Weighted Offset Significance (percent)', 'Maximum Single Event Statistic',
                  'Orbital Period (Days)', 'Kepler-Band Magnitude']

configure_ticklabels_and_params(ax)

ax.set_yticklabels(feature_labels);

plt.tight_layout()

# Save figure as .png
# plt.savefig('../images/best_multi_feature_importances.png', bbox_inches = 'tight')

# Feature Distributions

In [None]:
# Create boxplots for transit depth feature
fig, ax = plot_boxplots([(full_binary, 'disposition'), (full_multi, 'classification')], 'transit_depth')

# Configure axis labels, title, ticks, and ticklabels
for axis in ax:
    configure_axislabels_and_title(axis, 'Transit Depth (ppm)',
                               'Classification', None)
    axis.get_xaxis().set_major_formatter(pltticker.FuncFormatter(lambda x, p: format(int(x),',')))
    configure_ticklabels_and_params(axis)

ax[0].set_title('Binary Classification', fontfamily = 'Arial', fontsize = 32, pad = 10);
ax[1].set_title('Multi-class Classification', fontfamily = 'Arial', fontsize = 32, pad = 10);

# Save figure as .png
# plt.savefig('../images/transit_depth_boxplots.png', bbox_inches = 'tight')

In [None]:
# Create boxplots for planet-star radius ratio feature
fig, ax = plot_boxplots([(full_binary, 'disposition'), (full_multi, 'classification')], 'planet_star_radius_ratio')

# Configure axis labels, title, ticks, and ticklabels
for axis in ax:
    configure_axislabels_and_title(axis, 'Planet-Star Radius Ratio',
                               'Classification', None)
    axis.get_xaxis().set_major_formatter(pltticker.FuncFormatter(lambda x, p: format(int(x),',')))
    configure_ticklabels_and_params(axis)

ax[0].set_title('Binary Classification', fontfamily = 'Arial', fontsize = 32, pad = 10);
ax[1].set_title('Multi-class Classification', fontfamily = 'Arial', fontsize = 32, pad = 10);

# Save figure as .png
# plt.savefig('../images/planet_star_radius_ratio_boxplots.png', bbox_inches = 'tight')

In [None]:
# Create boxplots for orbit semimajor axis feature
fig, ax = plot_boxplots([(full_binary, 'disposition'), (full_multi, 'classification')], 'orbit_semimajor_axis')

# Configure axis labels, title, ticks, and ticklabels
for axis in ax:
    configure_axislabels_and_title(axis, 'Orbit Semimajor Axis',
                               'Classification', None)
    axis.get_xaxis().set_major_formatter(pltticker.FuncFormatter(lambda x, p: format(int(x),',')))
    configure_ticklabels_and_params(axis)

ax[0].set_title('Binary Classification', fontfamily = 'Arial', fontsize = 32, pad = 10);
ax[1].set_title('Multi-class Classification', fontfamily = 'Arial', fontsize = 32, pad = 10);

# Save figure as .png
# plt.savefig('../images/orbit_semimajor_axis_boxplots.png', bbox_inches = 'tight')

In [None]:
# Create boxplots for maximum multiple event statistic feature
fig, ax = plot_boxplots([(full_binary, 'disposition'), (full_multi, 'classification')], 'max_multi_event_stat')

# Configure axis labels, title, ticks, and ticklabels
for axis in ax:
    configure_axislabels_and_title(axis, 'Maximum Multiple Event Statistic',
                               'Classification', None)
    axis.get_xaxis().set_major_formatter(pltticker.FuncFormatter(lambda x, p: format(int(x),',')))
    configure_ticklabels_and_params(axis)

ax[0].set_title('Binary Classification', fontfamily = 'Arial', fontsize = 32, pad = 10);
ax[1].set_title('Multi-class Classification', fontfamily = 'Arial', fontsize = 32, pad = 10);

# Save figure as .png
# plt.savefig('../images/max_multi_event_stat_boxplots.png', bbox_inches = 'tight')

In [None]:
# Create boxplots for orbital period feature
fig, ax = plot_boxplots([(full_binary, 'disposition'), (full_multi, 'classification')], 'orbital_period')

# Configure axis labels, title, ticks, and ticklabels
for axis in ax:
    configure_axislabels_and_title(axis, 'Orbital Period (Earth Days)',
                               'Classification', None)
    axis.get_xaxis().set_major_formatter(pltticker.FuncFormatter(lambda x, p: format(int(x),',')))
    configure_ticklabels_and_params(axis)

ax[0].set_title('Binary Classification', fontfamily = 'Arial', fontsize = 32, pad = 10);
ax[1].set_title('Multi-class Classification', fontfamily = 'Arial', fontsize = 32, pad = 10);

# Save figure as .png
# plt.savefig('../images/orbital_period_boxplots.png', bbox_inches = 'tight')

In [None]:
# Create boxplots for angular sky offset feature
fig, ax = plot_boxplots([(full_binary, 'disposition'), (full_multi, 'classification')], 'angular_offset_sky')

# Configure axis labels, title, ticks, and ticklabels
for axis in ax:
    configure_axislabels_and_title(axis, 'Angular Sky Shift (arcseconds)',
                               'Classification', None)
    axis.get_xaxis().set_major_formatter(pltticker.FuncFormatter(lambda x, p: format(int(x),',')))
    configure_ticklabels_and_params(axis)

ax[0].set_title('Binary Classification', fontfamily = 'Arial', fontsize = 32, pad = 10);
ax[1].set_title('Multi-class Classification', fontfamily = 'Arial', fontsize = 32, pad = 10);

# Save figure as .png
# plt.savefig('../images/angular_offset_sky_boxplots.png', bbox_inches = 'tight')

# Confusion Matrices

In [None]:
# Create confusion matrix for base binary classification model (logistic regression)
fig, ax = plot_confusion_matrix_fancy(X_test_binary, y_test_binary, base_binary_model)

# Configure axis labels, title, ticks, and ticklabels
configure_axislabels_and_title(ax, 'Predicted Labels',
                               'Actual Labels', 'Base Binary Model Confusion Matrix')

configure_ticklabels_and_params(ax)

# Save figure as .png
# plt.savefig('../images/base_binary_confusion_matrix.png', bbox_inches = 'tight')

In [None]:
# Create confusion matrix for final binary classification model (XGBoost)
fig, ax = plot_confusion_matrix_fancy(X_test_binary, y_test_binary, best_binary_model)

# Configure axis labels, title, ticks, and ticklabels
configure_axislabels_and_title(ax, 'Predicted Labels',
                               'Actual Labels', 'Best Binary Model Confusion Matrix')

configure_ticklabels_and_params(ax)

# Save figure as .png
# plt.savefig('../images/best_binary_confusion_matrix.png', bbox_inches = 'tight')

In [None]:
# Create confusion matrix for base multi-class classification model (decision tree classifier)
fig, ax = plot_confusion_matrix_fancy(X_test_multi, y_test_multi, base_multi_model)

# Configure axis labels, title, ticks, and ticklabels
configure_axislabels_and_title(ax, 'Predicted Labels',
                               'Actual Labels', 'Base Multiclass Model Confusion Matrix')

configure_ticklabels_and_params(ax)

# Save figure as .png
# plt.savefig('../images/base_multi_confusion_matrix.png', bbox_inches = 'tight')

In [None]:
# Create confusion matrix for best multi-class classification model (XGBoost)
fig, ax = plot_confusion_matrix_fancy(X_test_multi, y_test_multi, best_multi_model)

# Configure axis labels, title, ticks, and ticklabels
configure_axislabels_and_title(ax, 'Predicted Labels',
                               'Actual Labels', 'Best Multiclass Model Confusion Matrix')

configure_ticklabels_and_params(ax)

# Save figure as .png
# plt.savefig('../images/best_multi_confusion_matrix.png', bbox_inches = 'tight')