In [12]:
import pandas

data = pandas.read_csv("data/Clean_Quasar_Data.csv")

In [13]:
# Selects relevant features, sets matrix and target vector
# X = data[['PSFFLUX_0', 'PSFFLUX_1', 'PSFFLUX_2', 'PSFFLUX_3', 'PSFFLUX_4']]
# X = data[['FUV', 'NUV']]
# X = data[['YFLUX', 'JFLUX', 'HFLUX', 'KFLUX']]
# X = data[['W1_FLUX', 'W2_FLUX']]
# X = data[['FIRST_FLUX']]
# X = data[['XMM_SOFT_FLUX', 'XMM_HARD_FLUX']]
# X = data[['GAIA_PARALLAX']]
# X = data[['GAIA_G_FLUX_SNR', 'GAIA_BP_FLUX_SNR', 'GAIA_RP_FLUX_SNR']]

X = data[['PSFFLUX_0', 'PSFFLUX_1', 'PSFFLUX_2', 'PSFFLUX_3', 'PSFFLUX_4',
          'FUV', 'NUV',
          'YFLUX', 'JFLUX', 'HFLUX', 'KFLUX',
          'W1_FLUX', 'W2_FLUX',
          'FIRST_FLUX',
          'XMM_SOFT_FLUX','XMM_HARD_FLUX',
          'GAIA_PARALLAX',
          'GAIA_G_FLUX_SNR', 'GAIA_BP_FLUX_SNR', 'GAIA_RP_FLUX_SNR']]

y = data['BAL_PROB']

In [14]:
# K-FOLD CROSS-VALIDATION

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

model = RandomForestClassifier(random_state=365)

kf = KFold(n_splits=5, shuffle=True, random_state=365)

fold_accuracies = []

# Loops through each fold
for train_index, val_index in kf.split(X):
    # Splits the data into training and validation sets
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Handles missing values and data cleaning
    X_train = pd.DataFrame(X_train)
    y_train = pd.Series(y_train)
    X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_train.dropna(inplace=True)
    y_train = y_train.loc[X_train.index]  # After NaN removal, aligns y_train with X_train's indices

    model.fit(X_train, y_train)
    
    # Predicts on the validation set
    val_predictions = model.predict(X_val)
    
    # Evaluates the model on this fold
    fold_accuracy = accuracy_score(y_val, val_predictions)
    fold_accuracies.append(fold_accuracy)
    
    print(f"Fold accuracy: {fold_accuracy}")

average_accuracy = np.mean(fold_accuracies)
print(f"Average accuracy across folds: {average_accuracy}")

Fold accuracy: 0.9318327018795037
Fold accuracy: 0.9310155263353391
Fold accuracy: 0.9320258524626699
Fold accuracy: 0.9316979674313562
Fold accuracy: 0.9310739331986212
Average accuracy across folds: 0.931529196261498


In [15]:
from sklearn.model_selection import train_test_split

# First split: 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Second split: 70% training, 10% validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=42)

In [16]:
import numpy

X_train = pandas.DataFrame(X_train)  # Ensures X_train is a DataFrame
y_train = pandas.Series(y_train)     # Ensures y_train is a Series

# Replaces inf and drop rows with NaN in X_train
X_train.replace([numpy.inf, -numpy.inf], numpy.nan, inplace=True)
X_train.dropna(inplace=True)

# Aligns y_train with X_train's indices
y_train = y_train.loc[X_train.index]

# Checks to ensure both have the same length
print(len(X_train), len(y_train))  # Should print the same number for both

126958 126958


In [17]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

In [18]:
# Evaluates on the validation set
val_predictions = rf_model.predict(X_val)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Accuracy
val_accuracy = accuracy_score(y_val, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')

# Confusion Matrix
val_conf_matrix = confusion_matrix(y_val, val_predictions)
print(f'Confusion Matrix:\n{val_conf_matrix}')

# Detailed classification report (Precision, Recall, F1-Score)
val_class_report = classification_report(y_val, val_predictions)
print(f'Classification Report:\n{val_class_report}')

Validation Accuracy: 0.9298
Confusion Matrix:
[[31222    86]
 [ 2275    70]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96     31308
         1.0       0.45      0.03      0.06      2345

    accuracy                           0.93     33653
   macro avg       0.69      0.51      0.51     33653
weighted avg       0.90      0.93      0.90     33653



In [19]:
feature_importances = rf_model.feature_importances_

importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print("Feature Importances:\n", importance_df)

Feature Importances:
              Feature  Importance
12           W2_FLUX    0.120979
0          PSFFLUX_0    0.095173
11           W1_FLUX    0.094985
1          PSFFLUX_1    0.090424
4          PSFFLUX_4    0.088195
2          PSFFLUX_2    0.085659
3          PSFFLUX_3    0.084978
6                NUV    0.074282
5                FUV    0.069433
17   GAIA_G_FLUX_SNR    0.055556
19  GAIA_RP_FLUX_SNR    0.048642
18  GAIA_BP_FLUX_SNR    0.048599
16     GAIA_PARALLAX    0.043096
9              HFLUX    0.000000
8              JFLUX    0.000000
7              YFLUX    0.000000
10             KFLUX    0.000000
15     XMM_HARD_FLUX    0.000000
13        FIRST_FLUX    0.000000
14     XMM_SOFT_FLUX    0.000000


Fitting 5 folds for each of 216 candidates, totalling 1080 fits


KeyboardInterrupt: 