In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import StandardScaler
from alg_functions import *

# Obtain the Data

In [2]:
# Load the QSO/galaxy dataset, discard rows with missing values, and rename
# the label column from 'class' to 'type'.
qso_g_df = (
    pd.read_csv('datasets/corrected_QSO_GALAXY_Dataset.csv')
    .dropna()
    .rename(columns={'class': 'type'})
)

# Draw 2000 rows per class with a fixed seed so the two classes are balanced
# and the sample is reproducible.
is_galaxy = qso_g_df['type'] == 'GALAXY'
is_qso = qso_g_df['type'] == 'QSO'
galaxy_df = qso_g_df.loc[is_galaxy].sample(n=2000, random_state=42)
qso_df = qso_g_df.loc[is_qso].sample(n=2000, random_state=42)

# Stack the samples (GALAXY rows first, then QSO rows); the original row
# labels are kept, so the index is not 0..n-1.
qso_g_df = pd.concat([galaxy_df, qso_df])

display(qso_g_df.head())

Unnamed: 0,type,subclass,mag_u,mag_g,mag_r,mag_i,mag_z,redshift,flux_u,flux_g,...,flux_i,flux_z,photRA,specRA,photDec,specDec,petrosianRadius_u,deVEffectiveRadius_u,petrosianRadius_z,deVEffectiveRadius_z
4498,GALAXY,STARFORMING,20.20175,18.42369,17.361,16.85209,16.4214,0.116623,4.067745,11.13524,...,51.35847,72.29363,218.563632,218.56364,8.992736,8.992744,2.970685,7.054471,5.1916,2.716197
3646,GALAXY,STARFORMING,19.64754,18.41117,17.75667,17.36501,17.06143,0.073963,3.82784,8.99992,...,29.52955,37.28042,339.559695,339.55971,14.157285,14.15729,43.76405,26.74674,8.903262,10.20472
4242,GALAXY,STARFORMING,19.61266,17.91331,17.00628,16.59109,16.24816,0.107316,6.293722,25.02789,...,96.433,125.7509,120.83341,120.83341,12.104724,12.104733,8.076633,6.684969,3.399417,1.76148
2464,GALAXY,STARFORMING,18.55299,17.09966,16.39841,16.012,15.75324,0.066666,18.77991,47.22098,...,135.9257,166.6958,180.117997,180.11801,51.75606,51.756076,5.504677,4.108453,4.365156,2.409809
8715,GALAXY,STARFORMING,16.71037,15.61988,15.25153,15.01981,14.84542,0.027207,18.91865,32.80019,...,72.02156,87.63742,158.926025,158.92608,5.615982,5.615976,17.75235,29.67733,16.22432,28.17896


In [3]:
# Add one '<a>-<b>' column per band pair, holding flux_<a> minus flux_<b>.
# NOTE(review): the column names look like colour indices, but the values are
# differences of the flux_* columns, not the mag_* columns — confirm that this
# is intended.
band_pairs = [
    ('u', 'g'), ('g', 'r'), ('r', 'i'), ('i', 'z'),
    ('u', 'z'), ('u', 'r'), ('r', 'z'),
]
for first_band, second_band in band_pairs:
    qso_g_df[f'{first_band}-{second_band}'] = (
        qso_g_df[f'flux_{first_band}'] - qso_g_df[f'flux_{second_band}']
    )

In [4]:
# Columns used as predictors
pred_columns_qso_g = ['u-z', 'u-r', 'r-z', 'redshift']

# Standardise the predictors; the scaler is fitted on the rows of both classes
scaler = StandardScaler()
scaled_values = scaler.fit_transform(qso_g_df[pred_columns_qso_g])

# Wrap the scaled values in a new DataFrame; it gets a fresh default index
# rather than qso_g_df's row labels
qso_g_df_norm = pd.DataFrame(scaled_values, columns=pred_columns_qso_g)

# Re-attach the labels by position (.values), since the two frames no longer
# share an index
qso_g_df_norm['type'] = qso_g_df['type'].values

display(qso_g_df_norm.head())

Unnamed: 0,u-z,u-r,r-z,redshift,type
0,-0.305543,-0.156948,-0.435702,-0.815634,GALAXY
1,0.136068,0.155662,0.11633,-0.861102,GALAXY
2,-0.956166,-0.946658,-0.950819,-0.825554,GALAXY
3,-1.317584,-1.429763,-1.196716,-0.86888,GALAXY
4,-0.311803,-0.397975,-0.229181,-0.910937,GALAXY


In [5]:
# Train only on the QSO rows of the scaled data.
train_set = qso_g_df_norm[qso_g_df_norm['type'] == 'QSO']

# A row is flagged as an outlier only when EVERY predictor's |z-score|
# (computed within the QSO subset) exceeds the threshold.
# NOTE(review): a 0.5 cut-off combined with .all(axis=1) is an unusual outlier
# criterion — confirm that both the threshold and the all/any choice are intended.
ZSCORE_THRESHOLD = 0.5
feature_zscores = np.abs(stats.zscore(train_set[pred_columns_qso_g]))
is_outlier = np.asarray(feature_zscores > ZSCORE_THRESHOLD).all(axis=1)

outliers = train_set[is_outlier]

# Drop the flagged rows with the boolean mask directly. The previous
# isin(...) + dropna() roundtrip masked cells to NaN and then dropped rows,
# which would also discard any legitimate row that happened to contain a NaN.
train_set = train_set[~is_outlier]

# Algorithm 1: Frank-Wolfe (`frank_wolfe_alg_MEB`)

In [6]:
# Fit on the QSO training points at three tolerances and score each fit on
# the full scaled sample.
for epsilon in (0.01, 0.001, 0.0001):

    print('---------------')
    print('Epsilon: ', epsilon)

    # Run frank_wolfe_alg_MEB (from alg_functions) on the training predictors.
    # c and r are used below as a centre and a radius; k, chi and timer are
    # not used in this cell.
    training_points = train_set[pred_columns_qso_g].values
    c, r, k, chi, timer = frank_wolfe_alg_MEB(epsilon, training_points)

    # Flag every row (both classes) whose Euclidean distance to c is strictly
    # below r
    offsets = qso_g_df_norm[pred_columns_qso_g] - c
    distances = np.linalg.norm(offsets, axis=1)
    qso_g_df_norm['InsideBall'] = distances < r

    # measure_accuracy presumably reads the 'InsideBall' column — confirm
    # against alg_functions
    accuracy_report = measure_accuracy('GALAXY', 'QSO', qso_g_df_norm)
    print(accuracy_report)

---------------
Epsilon:  0.01
Center: [ 0.08379819  0.44748363 -0.24690729  1.81358104]
Radius: 2.8105779165185036
Iterations: 7
CPU time: 0.0999748706817627
Core Set Size: 4
                Metric        Count
0   True Positive (TP)  1991.000000
1  False Negative (FN)     9.000000
2   True Negative (TN)   864.000000
3  False Positive (FP)  1136.000000
4            Precision     0.636713
5               Recall     0.995500
6             F1 Score     0.776673
---------------
Epsilon:  0.001
Center: [ 0.09023357  0.4182748  -0.20827391  1.82671998]
Radius: 2.7896281543289363
Iterations: 58
CPU time: 0.7666573524475098
Core Set Size: 5
                Metric        Count
0   True Positive (TP)  1991.000000
1  False Negative (FN)     9.000000
2   True Negative (TN)  1017.000000
3  False Positive (FP)   983.000000
4            Precision     0.669469
5               Recall     0.995500
6             F1 Score     0.800563
---------------
Epsilon:  0.0001
Center: [ 0.09041035  0.40910637 -0.1

# Algorithm 2: Away-Step Frank-Wolfe

In [9]:
# Fit on the QSO training points at three tolerances and score each fit on
# the full scaled sample.
for epsilon in (0.01, 0.001, 0.0001):

    print('---------------')
    print('Epsilon: ', epsilon)

    # Run the Away Step Frank-Wolfe algorithm on the training predictors
    training_points = train_set[pred_columns_qso_g].values
    (center_as, radius_as, count_iterations_as,
     CPU_time_as, dual_val_list_as, dual_gap_list_as) = frankWolfe_AwayStep(training_points, epsilon)

    # Flag every row (both classes) whose Euclidean distance to the returned
    # center is strictly below the returned radius
    offsets = qso_g_df_norm[pred_columns_qso_g] - center_as
    distances = np.linalg.norm(offsets, axis=1)
    qso_g_df_norm['InsideBall'] = distances < radius_as

    # measure_accuracy presumably reads the 'InsideBall' column — confirm
    # against alg_functions
    accuracy_report = measure_accuracy('GALAXY', 'QSO', qso_g_df_norm)
    print(accuracy_report)

---------------
Epsilon:  0.01
Center: [ 0.08717084  0.39103003 -0.18938412  1.82355267]
Radius: 2.7871223695981717
Iterations: 85
CPU time: 5.127383470535278
Set Size: 4
                Metric        Count
0   True Positive (TP)  1990.000000
1  False Negative (FN)    10.000000
2   True Negative (TN)   974.000000
3  False Positive (FP)  1026.000000
4            Precision     0.659814
5               Recall     0.995000
6             F1 Score     0.793461
---------------
Epsilon:  0.001
Center: [ 0.09005861  0.40608878 -0.19756485  1.82597021]
Radius: 2.787191828163187
Iterations: 140
CPU time: 8.467014074325562
Set Size: 4
                Metric        Count
0   True Positive (TP)  1990.000000
1  False Negative (FN)    10.000000
2   True Negative (TN)  1011.000000
3  False Positive (FP)   989.000000
4            Precision     0.668009
5               Recall     0.995000
6             F1 Score     0.799357
---------------
Epsilon:  0.0001
Center: [ 0.09038474  0.40780891 -0.19850637  1.

# Algorithm 3: Pairwise Frank-Wolfe

In [10]:
# Fit on the QSO training points at three tolerances and score each fit on
# the full scaled sample.
for epsilon in (0.01, 0.001, 0.0001):

    print('---------------')
    print('Epsilon: ', epsilon)

    # Run the Pairwise Frank-Wolfe algorithm on the training predictors
    training_points = train_set[pred_columns_qso_g].values
    (center_pw, radius_pw, count_iterations_pw,
     CPU_time_pw, dual_val_list_pw, dual_gap_list_pw) = frankWolfe_Pairwise(training_points, epsilon)

    # Flag every row (both classes) whose Euclidean distance to the returned
    # center is strictly below the returned radius
    offsets = qso_g_df_norm[pred_columns_qso_g] - center_pw
    distances = np.linalg.norm(offsets, axis=1)
    qso_g_df_norm['InsideBall'] = distances < radius_pw

    # measure_accuracy presumably reads the 'InsideBall' column — confirm
    # against alg_functions
    accuracy_report = measure_accuracy('GALAXY', 'QSO', qso_g_df_norm)
    print(accuracy_report)

---------------
Epsilon:  0.01
Center: [ 0.0899033   0.40443691 -0.19636209  1.82549142]
Radius: 2.7871893552250246
Iterations: 36
CPU time: 2.3502140045166016
Set Size: 4
                Metric        Count
0   True Positive (TP)  1991.000000
1  False Negative (FN)     9.000000
2   True Negative (TN)  1005.000000
3  False Positive (FP)   995.000000
4            Precision     0.666778
5               Recall     0.995500
6             F1 Score     0.798636
---------------
Epsilon:  0.001
Center: [ 0.09037292  0.40766589 -0.19839918  1.82618789]
Radius: 2.787192710152655
Iterations: 71
CPU time: 4.591294050216675
Set Size: 4
                Metric        Count
0   True Positive (TP)  1991.000000
1  False Negative (FN)     9.000000
2   True Negative (TN)  1014.000000
3  False Positive (FP)   986.000000
4            Precision     0.668794
5               Recall     0.995500
6             F1 Score     0.800080
---------------
Epsilon:  0.0001
Center: [ 0.09042853  0.4080023  -0.19859876  1.