In [11]:
import sklearn.linear_model
import sklearn.model_selection
from sklearn.metrics import roc_auc_score, confusion_matrix

import itertools
import pandas as pd

In [5]:
bpt_class_dict = {
    1:'Star Forming', 
    2:'Low S/N Star Forming',
    3:'Composite', 
    4:'AGN', 
    5:'Low S/N AGN', 
    -1:'Unclassifiable'
}

# Prelude

In [2]:
df = pd.read_csv('../data/agn_cleaned.csv')
df.columns

Index(['objID', 'DR7ObjID', 'specObjID', 'ra', 'dec', 'z', 'zErr', 'velDisp',
       'velDispErr', 'modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i',
       'modelMag_z', 'petroMag_r', 'petroR50_r', 'petroR90_r', 'bptclass',
       'oh_p50', 'lgm_tot_p50', 'sfr_tot_p50', 'nii_6584_flux',
       'nii_6584_flux_err', 'h_alpha_flux', 'h_alpha_flux_err',
       'oiii_5007_flux', 'oiii_5007_flux_err', 'h_beta_flux',
       'h_beta_flux_err', 'reliable'],
      dtype='object')

In [6]:
# relatively easy, separable problem (no composite or unclassified objects)
df = df[df.bptclass.isin([1, 2, 4, 5])].copy()

In [7]:
df['AGN'] = df.bptclass > 3

In [8]:
df.AGN.value_counts()

False    166155
True      44192
Name: AGN, dtype: int64

# Logistic regression

In [9]:
features = ['modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i', 'modelMag_z', 'petroMag_r', 'petroR50_r', 'petroR90_r']
label = ['AGN']

In [10]:
X = df[features]
y = df[label]

In [28]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X.values, y.values, test_size=0.2)

In [32]:
logreg = sklearn.linear_model.LogisticRegression(max_iter=1000)

logreg.fit(X_train, y_train.flatten())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
y_pred = logreg.predict(X_test)

In [39]:
roc_auc_score(y_test, y_pred)

0.7937358481041281

In [40]:
y_pred

array([False,  True, False, ..., False, False, False])

In [43]:
sum(y_test.flatten() == y_pred) / len(y_pred)

0.8803898264796768

In [46]:
confusion_matrix(y_test, y_pred)

array([[31317,  1865],
       [ 3167,  5721]])

# Using color features

In [63]:
color_features = list(f'{b1}m{b2}' for b1,b2 in itertools.combinations(bands, 2))
color_features

['umg', 'umr', 'umi', 'umz', 'gmr', 'gmi', 'gmz', 'rmi', 'rmz', 'imz']

In [51]:
bands = ['u', 'g', 'r', 'i', 'z']

for b1, b2 in itertools.combinations(bands, 2): 
    df[f'{b1}m{b2}'] = df[f'modelMag_{b1}'] - df[f'modelMag_{b2}']

In [54]:
features += color_features

In [55]:
X = df[features]
y = df[label]

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X.values, y.values, test_size=0.2)

In [56]:
logreg = sklearn.linear_model.LogisticRegression(max_iter=1000)

logreg.fit(X_train, y_train.flatten())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [57]:
y_pred = logreg.predict(X_test)

In [58]:
confusion_matrix(y_test, y_pred)

array([[31229,  1973],
       [ 3134,  5734]])

In [59]:
sum(y_test.flatten() == y_pred) / len(y_pred)

0.8786070834323746

In [60]:
roc_auc_score(y_test, y_pred)

0.7935851829958345