In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import StandardScaler
from alg_functions import *

# Obtain the Data

In [2]:
# Load the QSO / galaxy catalogue, discard incomplete rows and expose the
# label column under the name 'type' (it is called 'class' in the CSV).
qso_g_df = (
    pd.read_csv('datasets/corrected_QSO_GALAXY_Dataset.csv')
    .dropna()
    .rename(columns={'class': 'type'})
)

# Draw a reproducible, class-balanced subsample: 2000 rows per label,
# both drawn with the same fixed seed.
galaxy_df, qso_df = [
    qso_g_df.loc[qso_g_df['type'] == label].sample(n=2000, random_state=42)
    for label in ('GALAXY', 'QSO')
]

# Galaxies first, then QSOs; the original row labels are kept as the index.
qso_g_df = pd.concat([galaxy_df, qso_df])

display(qso_g_df.head())

Unnamed: 0,type,subclass,mag_u,mag_g,mag_r,mag_i,mag_z,redshift,flux_u,flux_g,...,flux_i,flux_z,photRA,specRA,photDec,specDec,petrosianRadius_u,deVEffectiveRadius_u,petrosianRadius_z,deVEffectiveRadius_z
4498,GALAXY,STARFORMING,20.20175,18.42369,17.361,16.85209,16.4214,0.116623,4.067745,11.13524,...,51.35847,72.29363,218.563632,218.56364,8.992736,8.992744,2.970685,7.054471,5.1916,2.716197
3646,GALAXY,STARFORMING,19.64754,18.41117,17.75667,17.36501,17.06143,0.073963,3.82784,8.99992,...,29.52955,37.28042,339.559695,339.55971,14.157285,14.15729,43.76405,26.74674,8.903262,10.20472
4242,GALAXY,STARFORMING,19.61266,17.91331,17.00628,16.59109,16.24816,0.107316,6.293722,25.02789,...,96.433,125.7509,120.83341,120.83341,12.104724,12.104733,8.076633,6.684969,3.399417,1.76148
2464,GALAXY,STARFORMING,18.55299,17.09966,16.39841,16.012,15.75324,0.066666,18.77991,47.22098,...,135.9257,166.6958,180.117997,180.11801,51.75606,51.756076,5.504677,4.108453,4.365156,2.409809
8715,GALAXY,STARFORMING,16.71037,15.61988,15.25153,15.01981,14.84542,0.027207,18.91865,32.80019,...,72.02156,87.63742,158.926025,158.92608,5.615982,5.615976,17.75235,29.67733,16.22432,28.17896


In [3]:
# Pairwise band differences, added as new columns in the order listed below.
# Each new column is (first flux column) minus (second flux column).
# NOTE(review): these are differences of the flux_* columns, not of the mag_*
# columns; colour indices are conventionally magnitude differences -- confirm
# that flux differences are what is intended here.
band_pairs = {
    'u-g': ('flux_u', 'flux_g'),
    'g-r': ('flux_g', 'flux_r'),
    'r-i': ('flux_r', 'flux_i'),
    'i-z': ('flux_i', 'flux_z'),
    'u-z': ('flux_u', 'flux_z'),
    'u-r': ('flux_u', 'flux_r'),
    'r-z': ('flux_r', 'flux_z'),
}

for new_column, (minuend, subtrahend) in band_pairs.items():
    qso_g_df[new_column] = qso_g_df[minuend] - qso_g_df[subtrahend]

In [4]:
# Columns fed to the scaler and to the enclosing-ball models. Defined here so
# this cell runs on a fresh kernel instead of relying on a later cell.
pred_columns_qso_g = ['u-z', 'u-r', 'r-z', 'redshift']

# create a scaler object
scaler = StandardScaler()

# Standardise only the numeric predictor columns. The full frame also holds
# string columns ('type', 'subclass') that cannot be scaled, and the scaled
# array must have exactly as many columns as the names supplied below.
qso_g_df_norm = pd.DataFrame(
    scaler.fit_transform(qso_g_df[pred_columns_qso_g]),
    columns=pred_columns_qso_g,
)

# add the 'type' column back to the normalized DataFrame; `.values` is used
# because the normalised frame has a fresh 0..n-1 index while qso_g_df keeps
# its original row labels, so the labels are attached by position.
qso_g_df_norm['type'] = qso_g_df['type'].values

display(qso_g_df_norm.head())

Unnamed: 0,u-z,u-r,r-z,redshift,type
0,-0.305543,-0.156948,-0.435702,-0.815634,GALAXY
1,0.136068,0.155662,0.11633,-0.861102,GALAXY
2,-0.956166,-0.946658,-0.950819,-0.825554,GALAXY
3,-1.317584,-1.429763,-1.196716,-0.86888,GALAXY
4,-0.311803,-0.397975,-0.229181,-0.910937,GALAXY


In [5]:
# Restrict to the QSO rows of the normalised table.
train_set = qso_g_df_norm[qso_g_df_norm['type'] == 'QSO']

# z-scores are only meaningful for numeric columns, so the string 'type'
# label (and any non-numeric helper column) is excluded before scoring.
feature_z = np.abs(
    stats.zscore(train_set.select_dtypes(include='number').to_numpy(), axis=0)
)

# A row is flagged only when EVERY numeric feature deviates by more than 0.5
# standard deviations from the QSO mean.
# NOTE(review): 0.5 combined with `.all` is an unusual outlier rule (a common
# choice is |z| > 3 on any feature) -- confirm the intended threshold.
is_outlier = (feature_z > 0.5).all(axis=1)

outliers = train_set[is_outlier]
train_set = train_set[~is_outlier]

# NOTE(review): the model cells visible in this notebook fit on the full QSO
# subset of qso_g_df_norm rather than on train_set -- confirm whether the
# trimmed set was meant to be used there.

# Algorithm 1: Frank-Wolfe (`frank_wolfe_alg_MEB`)

In [8]:
# Feature columns that define the space in which the ball is fitted.
pred_columns_qso_g = ['u-z', 'u-r', 'r-z', 'redshift']

# Row selector for the QSO class; it does not change between iterations.
qso_rows = qso_g_df_norm['type'] == 'QSO'

# Sweep the tolerance passed to frank_wolfe_alg_MEB and score each fitted ball.
for epsilon in (0.01, 0.001, 0.0001):

    print('---------------')
    print('Epsilon: ', epsilon)

    # Fit on the QSO points only (a fresh array is built for every run).
    qso_points = qso_g_df_norm.loc[qso_rows, pred_columns_qso_g].to_numpy()
    c, r, k, chi, timer = frank_wolfe_alg_MEB(epsilon, qso_points)

    # Flag every row (both classes) whose Euclidean distance to the centre
    # is strictly smaller than the radius.
    offsets = qso_g_df_norm[pred_columns_qso_g] - c
    distances = np.linalg.norm(offsets, axis=1)
    qso_g_df_norm['InsideBall'] = distances < r

    # presumably measure_accuracy reads the 'InsideBall' column -- verify in
    # alg_functions.
    print(measure_accuracy('GALAXY', 'QSO', qso_g_df_norm))

---------------
Epsilon:  0.01
Center: [-4.32607063 -4.69491034 -3.92875318  1.7985004 ]
Radius: 8.804429857557178
Iterations: 0
CPU time: 0.023183822631835938
Core Set Size: 2
                Metric        Count
0   True Positive (TP)  1999.000000
1  False Negative (FN)     1.000000
2   True Negative (TN)    32.000000
3  False Positive (FP)  1968.000000
4            Precision     0.503907
5               Recall     0.999500
6             F1 Score     0.670018
---------------
Epsilon:  0.001
Center: [-4.2975814  -4.66197541 -3.90470753  1.67302442]
Radius: 8.743426697997597
Iterations: 19
CPU time: 0.2714676856994629
Core Set Size: 4
                Metric        Count
0   True Positive (TP)  1999.000000
1  False Negative (FN)     1.000000
2   True Negative (TN)    18.000000
3  False Positive (FP)  1982.000000
4            Precision     0.502135
5               Recall     0.999500
6             F1 Score     0.668450
---------------
Epsilon:  0.0001
Center: [-4.29952872 -4.66386821 -3.9

# Algorithm 2: Away-Step Frank-Wolfe

In [9]:
# Row selector for the QSO class; it does not change between iterations.
qso_rows = qso_g_df_norm['type'] == 'QSO'

# Sweep the tolerance for the away-step Frank-Wolfe variant and score each
# fitted ball against the class labels.
for epsilon in (0.01, 0.001, 0.0001):

    print('---------------')
    print('Epsilon: ', epsilon)

    # Fit on the QSO points only (a fresh array is built for every run).
    qso_points = qso_g_df_norm.loc[qso_rows, pred_columns_qso_g].to_numpy()
    (
        center_as,
        radius_as,
        count_iterations_as,
        CPU_time_as,
        dual_val_list_as,
        dual_gap_list_as,
    ) = frankWolfe_AwayStep(qso_points, epsilon)

    # Flag every row (both classes) whose Euclidean distance to the centre
    # is strictly smaller than the radius.
    offsets = qso_g_df_norm[pred_columns_qso_g] - center_as
    distances = np.linalg.norm(offsets, axis=1)
    qso_g_df_norm['InsideBall'] = distances < radius_as

    # presumably measure_accuracy reads the 'InsideBall' column -- verify in
    # alg_functions.
    print(measure_accuracy('GALAXY', 'QSO', qso_g_df_norm))

---------------
Epsilon:  0.01
Center: [-4.29908072 -4.66337006 -3.90627979  1.65638457]
Radius: 8.735500610481294
Iterations: 42
CPU time: 2.8262200355529785
Set Size: 3
                Metric        Count
0   True Positive (TP)  1999.000000
1  False Negative (FN)     1.000000
2   True Negative (TN)    22.000000
3  False Positive (FP)  1978.000000
4            Precision     0.502640
5               Recall     0.999500
6             F1 Score     0.668897
---------------
Epsilon:  0.001
Center: [-4.29918434 -4.66346622 -3.90638865  1.65521888]
Radius: 8.735500705281122
Iterations: 62
CPU time: 4.191787481307983
Set Size: 3
                Metric        Count
0   True Positive (TP)  1999.000000
1  False Negative (FN)     1.000000
2   True Negative (TN)    22.000000
3  False Positive (FP)  1978.000000
4            Precision     0.502640
5               Recall     0.999500
6             F1 Score     0.668897
---------------
Epsilon:  0.0001
Center: [-4.29919312 -4.66347437 -3.90639789  1.6

# Algorithm 3: Pairwise Frank-Wolfe

In [12]:
# Row selector for the QSO class; it does not change between iterations.
qso_rows = qso_g_df_norm['type'] == 'QSO'

# Sweep the tolerance for the pairwise Frank-Wolfe variant and score each
# fitted ball against the class labels.
for epsilon in (0.01, 0.001, 0.0001):

    print('---------------')
    print('Epsilon: ', epsilon)

    # Fit on the QSO points only (a fresh array is built for every run).
    qso_points = qso_g_df_norm.loc[qso_rows, pred_columns_qso_g].to_numpy()
    (
        center_pw,
        radius_pw,
        count_iterations_pw,
        CPU_time_pw,
        dual_val_list_pw,
        dual_gap_list_pw,
    ) = frankWolfe_Pairwise(qso_points, epsilon)

    # Flag every row (both classes) whose Euclidean distance to the centre
    # is strictly smaller than the radius.
    offsets = qso_g_df_norm[pred_columns_qso_g] - center_pw
    distances = np.linalg.norm(offsets, axis=1)
    qso_g_df_norm['InsideBall'] = distances < radius_pw

    # presumably measure_accuracy reads the 'InsideBall' column -- verify in
    # alg_functions.
    print(measure_accuracy('GALAXY', 'QSO', qso_g_df_norm))

---------------
Epsilon:  0.01
Center: [-4.29917014 -4.66344066 -3.90638496  1.65452841]
Radius: 8.73550068669475
Iterations: 4
CPU time: 0.3409140110015869
Set Size: 3
                Metric        Count
0   True Positive (TP)  1999.000000
1  False Negative (FN)     1.000000
2   True Negative (TN)    22.000000
3  False Positive (FP)  1978.000000
4            Precision     0.502640
5               Recall     0.999500
6             F1 Score     0.668897
---------------
Epsilon:  0.001
Center: [-4.29919999 -4.66348167 -3.90640425  1.65510665]
Radius: 8.735500706001165
Iterations: 5
CPU time: 0.3544306755065918
Set Size: 3
                Metric        Count
0   True Positive (TP)  1998.000000
1  False Negative (FN)     2.000000
2   True Negative (TN)    22.000000
3  False Positive (FP)  1978.000000
4            Precision     0.502515
5               Recall     0.999000
6             F1 Score     0.668675
---------------
Epsilon:  0.0001
Center: [-4.29919462 -4.66347585 -3.90639937  1.655