# Imports

In [1]:
import warnings
warnings.filterwarnings(action='once')

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC #???????????????//
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, kendalltau

import astropy.table
from astropy.table import QTable, join
from time import time

In [3]:
master_cat = pd.read_csv('./catdata/master_catalog_jan_2023.csv')
cat_files = ['cat1_50.pk','cat51_100.pk','cat101_150.pk','cat151_200.pk','cat201_235.pk',
             'cat236_257.pk','cat258_279.pk','cat280_320.pk','cat321_360.pk','cat361_406.pk']

### Load Training Data

In [4]:
# select most recent training data
train_file = 'training_data_0802.pk' # training data with 3 classes
train_file = 'training_data_1702.pk' # training data with only gcs and galaxies
#train_file = 'training_data_1902_with_stars.pk' # training data with gcs galaxies and stars, classed as 'gc' and 'non-gc'
#train_file = 'training_data_2702_j.pk'
train_file = 'training_data_0203_jhk.pk'

with open(f'./pickle/training_data/{train_file}','rb') as f:
    training_data = pickle.load(f)

#training_data = training_data[training_data['j_acc']==True]

In [5]:
training_data

Unnamed: 0,obj_id,class,i,g,di,dg,ra,dec,field,pdidx,rbcidx,nearby,i-g,j,h,k,ph_qual,2mass_acc
0,HM33-A,gc,22.424000,22.940001,0.026,0.017,23.923733,28.821186,5,39800,2647,49,-0.516001,13.028,12.881,12.805,AAA,False
1,C30,galaxy,18.049000,19.500999,0.001,0.002,12.105896,29.267633,11,118854,2431,24,-1.452000,15.869,15.216,14.756,AAB,True
2,LAMOST-C22,galaxy,17.628000,19.153000,0.001,0.001,11.738621,29.693506,11,24692,2319,17,-1.525000,15.251,14.503,13.918,AAA,True
3,HM33-B,gc,19.538000,20.386000,0.003,0.003,24.008787,29.963625,13,43246,2648,45,-0.848000,16.429,15.589,15.154,BBC,False
4,LAMOST-C18,galaxy,17.177999,18.388000,0.001,0.001,23.842129,29.552473,14,122860,2644,12,-1.210001,14.838,14.277,13.847,AAA,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1329,PA-N185,gc,20.188000,21.688999,0.005,0.007,9.578100,48.367985,398,256376,306,106,-1.500999,11.881,11.436,11.314,AAA,False
1330,FJJ-V,gc,17.434999,18.451000,0.001,0.001,9.806167,48.384743,398,168423,372,156,-1.016001,13.321,12.966,12.889,AAA,False
1331,FJJ-VII,gc,19.523001,20.520000,0.003,0.004,9.826713,48.384266,398,168594,377,163,-0.997000,14.687,14.317,14.254,AAA,False
1332,PA-N147-3,gc,19.910000,20.820999,0.004,0.004,8.542029,49.044243,402,101369,91,36,-0.910999,14.378,13.857,13.694,AAA,False


### Add i-g to training data (17/02)

In [217]:
training_data['i-g'] = training_data['i']-training_data['g']

# Training Data Generator

In [6]:
def load_cat(field):
    bounds = [50,100,150,200,235,257,279,320,360,406]
    for b in range(len(bounds)):
        if field <= bounds[b]:
            to_load = cat_files[b]
            break
    with open(f'./pickle/{to_load}','rb') as f:
        catalogue = pickle.load(f)
    return catalogue

In [14]:
# 23.01.26 18:29
def generate_training_data(matches:dict, crowding=300) -> pd.DataFrame:
    cat = load_cat(1)
    
    columns = ['obj_id','class','i','g','di','dg','ra','dec','field','pdidx','rbcidx','nearby']
    values = []
    object_ids = []
    
    #TEMP
    crowded_objects = []
    
    
    for field in matches: # iterate through each field ID
        working_field = matches[field] # take the list of matches e.g. working_field = [(166727, 2642), (159637, 2646)]
        if field not in cat: # load the correct catalogue
            cat = load_cat(field)
        for m in working_field: # iterate through each match (a tuple) and grab values from catalogues
            
            if m[2] > crowding: # testing
                crowded_objects.append(m[1:])
                continue
            
            obj_id = master_cat.loc[m[1]].ID
            class_ = master_cat.loc[m[1]].CLASS
            
            if obj_id in object_ids: continue # if we've already added the object then skip
            else: object_ids.append(obj_id)   # else add it to the list of ids
            
            if class_ == 1: class_str = 'gc' # convert class numbers into strings
            elif class_ == 8: class_str = 'gc' # include extended clusters
            elif class_ == 4: class_str = 'galaxy'
           # elif class_ == 6: class_str = 'star'
            else: continue # skip non-gc/gal objects
            
            # collect required data
            row = cat[field][m[0]]
            ra = row['RA']
            dec = row['Dec']
            g = row['g']
            i = row['i']
            dg = row['dg']
            di = row['di']
            
            values.append([obj_id,class_str,i,g,di,dg,ra,dec,field,m[0],m[1],m[2]])
    
    training_data_dict = dict(zip(columns,zip(*values))) # zip values and columns together into a dict (columns as keys)
    training_data_df = pd.DataFrame(training_data_dict) # put into pd Dataframe
    return training_data_df, crowded_objects

#### Generate training data from object matches (17/02)

In [35]:
with open(f'./pickle/matches/matches_delta005_1702.pk','rb') as f:
    obj_mat = pickle.load(f)

In [40]:
new_training_data, crowded_obj = generate_training_data(obj_mat,crowding=350)

Loading cat1_50.pk ...
Loading cat51_100.pk ...
Loading cat101_150.pk ...
Loading cat151_200.pk ...
Loading cat201_235.pk ...
Loading cat236_257.pk ...
Loading cat258_279.pk ...
Loading cat280_320.pk ...
Loading cat321_360.pk ...
Loading cat361_406.pk ...


In [82]:
with open(f'./pickle/training_data/training_data_1702.pk','wb') as f:
    pickle.dump(new_training_data,f)

# Statistics

In [7]:
def calc_correlations(pred,true):
    correlations = {}
    correlations['mse'] = mean_squared_error(pred,true)
    correlations['ktau'] = kendalltau(pred,true)[0]
    correlations['pval-ktau'] = kendalltau(pred,true)[1]
    correlations['pearsonr'] = pearsonr(pred,true)[0]
    correlations['pval-pearsonr'] = pearsonr(pred,true)[1]
    correlations['r2'] = r2_score(true, pred)
    return correlations

In [8]:
def pretty_corr(c): # pretty print output from calc_correlations()
    print(f"""
    Mean squared error (RMS): \t{c['mse']:.5f}\t({(c['mse']**.5):.5})
    Kendall Tau: \t\t{c['ktau']:.5}
    \tKtau p-value: \t\t{c['pval-ktau']:.5}
    Pearson's r: \t\t{c['pearsonr']:.5}
    \tPearson's r p-value: \t{c['pval-pearsonr']:.5}
    Coef. of determination \t{c['r2']:.5}
    """)

# Machine Learning Models

## Random Forest

In [7]:
# generate the classifier and return (with optional returning of train and test values)
def ranfor(df,train_size=0.8,n_estimators=50,criterion='gini',features=['i','g','i-g','j'], max_depth=None, max_leaf_nodes=None, min_samples_leaf=1, stats=False, scale=False):
    # select features for training
    X = df[features]
    y = df['class']
    # split the data
    if scale:
        # scale the data
        scaler = preprocessing.StandardScaler().fit(X)
        X_scaled = scaler.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, train_size=train_size) # X_scaled
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size) # X
    # train the regressor model
    ran_for_class = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                    criterion=criterion, max_leaf_nodes=max_leaf_nodes,
                                    min_samples_leaf=min_samples_leaf                                        
                                   ).fit(X_train,y_train)
    train_pred = ran_for_class.predict(X_train)
    test_pred = ran_for_class.predict(X_test)
    
    acc = ran_for_class.score(X_test,y_test)
    
    true = y_test.to_numpy()
    if stats:
        return ran_for_class, test_pred, y_test, train_pred, y_train
    else: return ran_for_class

In [9]:
# returns predictions for a given field, allowing a crowding parameter to filter training values
def rf_pred(field:int,train:pd.DataFrame,crowding=300,n_estimators=50,max_depth=None,max_leaf_nodes=None,min_samples_leaf=1,features=['i','g','i-g','j'],scale=False):
    training_data_ = train[train['nearby'] <= crowding]
    cat = load_cat(field)[field]
    
    # drop rows with high delta g/i values
    cat_d = cat[cat['dg']+cat['di'] < 0.05]
    # drop stars & saturated points
    cat_candidate = cat_d[(cat_d['ig'] == 1) & (cat_d['ii'] == 1)]
    # add in i-g feature
    cat_candidate['i-g'] = cat_candidate['i']-cat_candidate['g']
    
    X = cat_candidate[features]
    X = X.to_pandas()
    if scale:
        X_scaled = preprocessing.StandardScaler().fit(X).transform(X)
        res = ranfor(training_data_,train_size=0.8,n_estimators=n_estimators,criterion='gini',
                     features=features, max_depth=max_depth, max_leaf_nodes=max_leaf_nodes,
                     min_samples_leaf=1, scale=True).predict(X_scaled)
    else:
        res = ranfor(training_data_,train_size=0.8,n_estimators=n_estimators,criterion='gini',
                     features=features, max_depth=max_depth, max_leaf_nodes=max_leaf_nodes,
                     min_samples_leaf=1).predict(X)
    
    cat_pred = cat_candidate[['RA','Dec','iccd','xg','yg','g','dg','ig','xi','yi','i','di','ii','field']]
    cat_pred['pred'] = res
    return cat_pred

SyntaxError: invalid syntax (1847631071.py, line 14)

In [15]:
# make a plot of the different statistics from a dictionary
def plot_stats(stats: dict, xlabel: str):
    keys_ = stats.keys()
    acc = [stats[k]['acc'] for k in keys_]
    prec = [stats[k]['prec'] for k in keys_]
    rec = [stats[k]['rec'] for k in keys_]
    plt.plot(keys_, acc, label='accuracy')
    plt.plot(keys_, prec, label='precision')
    plt.plot(keys_, rec, label='recall')
    plt.xlabel(xlabel)
    plt.ylabel('score')
    plt.legend()
    plt.show()


# Make Predictions

In [10]:
# select most recent training data
train_file = 'training_data_0802.pk' # training data with 3 classes
train_file = 'training_data_1702.pk' # training data with only gcs and galaxies
#train_file = 'training_data_1902_with_stars.pk' # training data with gcs galaxies and stars, classed as 'gc' and 'non-gc'
#train_file = 'temp/train_plus_35.pk'
#train_file = 'temp/train_plus_148.pk'
#train_file = 'training_data_2702_j.pk'
# load training data and filter out stars
with open(f'./pickle/training_data/{train_file}','rb') as f:
    training_data = pickle.load(f)

#training_data = training_data[training_data['j_acc']==True]

In [28]:
field = 35
n_trees = 30
max_depth_ = 9
max_leaf_nodes_ = 12
min_samples_leaf_ = 15
features_ = ['i','g','i-g','j']
features_ = ['i','g','i-g']
predictions_list = []
gc_filter = []

print(f'Field {field}')
for i in range(1):
    predictions_list.append( rf_pred(field,training_data,crowding=250,n_estimators=n_trees,max_depth=max_depth_, max_leaf_nodes=max_leaf_nodes_,min_samples_leaf=min_samples_leaf_, features=features_) )
print('Filtering...')
for i in range(len(predictions_list[0])):
    gc_candidate = all([ p[i]['pred']=='gc' for p in predictions_list ])
    if gc_candidate: gc_filter.append(True)
    else: gc_filter.append(False)
gc_candidates = predictions_list[0][gc_filter]
with open(f'pickle/predictions/predictionsf{field}.pk','wb') as f:
    pickle.dump(gc_candidates,f)
print(len(gc_candidates))
print('\n')

Field 35
Filtering...
1678




In [29]:
gc_candidates

RA,Dec,iccd,xg,yg,g,dg,ig,xi,yi,i,di,ii,field,pred
float32,float32,uint8,float32,float32,float32,float32,int8,float32,float32,float32,float32,int8,uint16,object
15.026317,31.7554,1,272.72,699.69,20.595,0.003,1,257.47,695.57,19.593,0.003,1,35,gc
15.023812,31.755617,1,231.09,695.96,22.677,0.012,1,215.83,691.9,22.085,0.018,1,35,gc
15.030167,31.618309,1,326.22,3376.22,22.707,0.012,1,312.38,3371.73,22.352,0.023,1,35,gc
15.024075,31.756283,1,235.5,682.76,22.762,0.013,1,220.24,678.7,22.191,0.02,1,35,gc
15.045329,31.651045,1,580.17,2735.91,22.834,0.013,1,566.0,2731.44,22.152,0.019,1,35,gc
15.044471,31.593712,1,562.7,3852.42,22.846,0.014,1,549.12,3847.73,22.153,0.019,1,35,gc
15.103271,31.658052,1,1545.53,2588.06,22.855,0.014,1,1531.3,2583.36,22.129,0.019,1,35,gc
15.0551,31.71865,1,748.04,1413.6,23.02,0.016,1,733.18,1409.27,22.077,0.018,1,35,gc
15.093621,31.75924,1,1393.99,609.62,23.032,0.016,1,1378.71,605.23,22.575,0.028,1,35,gc
15.012479,31.643328,1,33.84,2891.97,23.089,0.016,1,19.74,2887.59,22.194,0.02,1,35,gc


In [31]:
field = 80

#fields = [9,21,32,35,37,41,53,56,59,63,73,78,80,79,97,104,103,122,126,121,117,118,135]
#fields = [34,35,36,53,56,59,63,78,80,103,101,99,148,146,85,88,86,5,162,135,188,186,184,185,169]
fields = [196,240,220,241,243,310,333]
n_trees = 30
max_depth_ = 5
max_leaf_nodes_ = 12
min_samples_leaf_ = 15
features_ = ['i','g','i-g','j']

n_trees = 30
max_depth_ = 9
max_leaf_nodes_ = 12
min_samples_leaf_ = 20
features_ = ['i','g','i-g']

predictions_list = []
print('Start:')
for field in fields:
    predictions_list = []
    gc_filter = []
    print(f'Field {field}')
    for i in range(1): # iterate to take the intersection of all predictions
        predictions_list.append( rf_pred(field,training_data,n_estimators=n_trees,max_depth=max_depth_, max_leaf_nodes=max_leaf_nodes_,min_samples_leaf=min_samples_leaf_, features=features_) )
    print('Filtering...')
    for i in range(len(predictions_list[0])):
        gc_candidate = all([ p[i]['pred']=='gc' for p in predictions_list ]) # select only gcs that were predicted on all iterations
        if gc_candidate: gc_filter.append(True)
        else: gc_filter.append(False)
    gc_candidates = predictions_list[0][gc_filter]
    with open(f'pickle/predictions/2802/predictionsf{field}.pk','wb') as f:
        pickle.dump(gc_candidates,f)
    print(len(gc_candidates))
    print('\n')

Start:
Field 196
Filtering...
3217


Field 240
Filtering...
1080


Field 220
Filtering...
713


Field 241
Filtering...
1726


Field 243
Filtering...
896


Field 310
Filtering...
881


Field 333
Filtering...
1296




In [202]:
gc_filter = []
for i in range(len(predictions_list[0])):
    gc_candidate = all([ p[i]['pred']=='gc' for p in predictions_list ])
    if gc_candidate: gc_filter.append(True)
    else: gc_filter.append(False)

In [203]:
gc_candidates = predictions_list[0][gc_filter]

In [206]:
# Save predictions to pickle file and print how many GCs were found
with open(f'pickle/predictionsf{field}.pk','wb') as f:
    pickle.dump(gc_candidates,f)
len(gc_candidates)

150