In [1]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import pandas as pd

In [None]:
df = pd.read_csv("~/Documents/GitHub/LTY-Spectral-Classification/data files/lty_final.csv")

In [None]:
xgb = XGBRegressor(n_estimators=500,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
)

In [None]:
#split our data into training and testing sets
features = ["JH", "HK", "JK", "W1W2", "KW1", "KW2"]
y = df["spectral_type_code"]
X = df[features]

In [None]:
#stratify data so that the training and validation sets have a fair distribution of spectral types
y_bins = (y // 5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y_bins)

In [6]:
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

In [None]:
# Evaluate the model
mse = ((y_test - y_pred) ** 2).mean()
print(f"Mean Squared Error: {mse}") #3.7164334585889263
score = xgb.score(X_test, y_test)
print("R^2 on test:", score) #0.9116688046960163

Mean Squared Error: 3.7164334585889263
R^2 on test: 0.9116688046960163


In [8]:
#Build a model based off the data set that kept samples with at least 3 bands present.
df_3bands = pd.read_csv("~/Documents/GitHub/LTY-Spectral-Classification/data files/lty_final_highacc.csv")

In [9]:
xgb_3bands = XGBRegressor(n_estimators=500,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
)

In [10]:
#split our data into training and testing sets
y_3bands = df_3bands["spectral_type_code"]
X_3bands = df_3bands[features]

In [11]:
#stratify data so that the training and validation sets have a fair distribution of spectral types
y_3bands_bins = (y_3bands // 5)
X_3bands_train, X_3bands_test, y_3bands_train, y_3bands_test = train_test_split(X_3bands, y_3bands, test_size=0.2, random_state=42, stratify = y_3bands_bins)

In [12]:
xgb_3bands.fit(X_3bands_train, y_3bands_train)
y_3bands_pred = xgb.predict(X_3bands_test)

In [14]:
# Evaluate the model
print("=== XGB Regressor results for samples with at least 3 bands ===")
mse_3bands = ((y_3bands_test - y_3bands_pred) ** 2).mean()
print(f"Mean Squared Error: {mse_3bands}") #0.9159917957843763
score_3bands = xgb_3bands.score(X_3bands_test, y_3bands_test)
print("R^2 on test:", score_3bands) #0.8727215565099118

=== XGB Regressor results for samples with at least 3 bands ===
Mean Squared Error: 0.9159917957843763
R^2 on test: 0.8727215565099118
