# Import

In [11]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC #???????????????//
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, kendalltau


## Load required data

In [2]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
# The context manager guarantees the file handle is closed after loading
# (the previous `pickle.load(open(...))` form left it open).
with open('./pickle/bounds.pk', 'rb') as bounds_file:
    bounds = pickle.load(bounds_file)
master_cat = pd.read_csv('./catdata/master_catalog_jan_2023.csv')
# File names of the per-range catalogue pickles.
cat_files = ['cat1_50.pk','cat51_100.pk','cat101_150.pk','cat151_200.pk','cat201_235.pk',
             'cat236_257.pk','cat258_279.pk','cat280_320.pk','cat321_360.pk','cat361_406.pk']

In [34]:
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# Open via a context manager so the file handle is closed once the data is read.
with open('./pickle/training_data_d006_c300.pk', 'rb') as training_file:
    training_data = pickle.load(training_file)

In [35]:
# Display the loaded training table as the cell output.
training_data

Unnamed: 0,obj_id,class,i,g,di,dg,ra,dec,field,pdidx
0,HM33-A,gc,22.424000,22.940001,0.026,0.017,23.923733,28.821186,5,39800
1,C30,galaxy,18.049000,19.500999,0.001,0.002,12.105896,29.267633,11,118854
2,LAMOST-C22,galaxy,17.628000,19.153000,0.001,0.001,11.738621,29.693506,11,24692
3,HM33-B,gc,19.538000,20.386000,0.003,0.003,24.008787,29.963625,13,43246
4,LAMOST-C18,galaxy,17.177999,18.388000,0.001,0.001,23.842129,29.552473,14,122860
...,...,...,...,...,...,...,...,...,...,...
2289,FJJ-IV,gc,18.417999,19.346001,0.002,0.002,9.801158,48.380100,398,168491
2290,PA-N185,gc,20.188000,21.688999,0.005,0.007,9.578100,48.367985,398,256376
2291,FJJ-V,gc,17.434999,18.451000,0.001,0.001,9.806167,48.384743,398,168423
2292,FJJ-VII,gc,19.523001,20.520000,0.003,0.004,9.826713,48.384266,398,168594


# Statistics

In [6]:
def calc_correlations(pred,true):
    """Compute agreement statistics between predicted and true values.

    Parameters
    ----------
    pred : array-like
        Predicted values.
    true : array-like
        Reference values, same length as `pred`.

    Returns
    -------
    dict
        Keys: 'mse', 'ktau', 'pval-ktau', 'pearsonr', 'pval-pearsonr', 'r2'.
    """
    # Each test is evaluated once and both the statistic ([0]) and the
    # p-value ([1]) are read from the same result, instead of recomputing
    # the whole test for every field.
    ktau_result = kendalltau(pred,true)
    pearson_result = pearsonr(pred,true)

    correlations = {}
    # MSE is symmetric in its arguments; the (true, pred) order matches r2_score.
    correlations['mse'] = mean_squared_error(true, pred)
    correlations['ktau'] = ktau_result[0]
    correlations['pval-ktau'] = ktau_result[1]
    correlations['pearsonr'] = pearson_result[0]
    correlations['pval-pearsonr'] = pearson_result[1]
    # r2_score is NOT symmetric: the reference values must come first.
    correlations['r2'] = r2_score(true, pred)
    return correlations

In [7]:
def pretty_corr(c): # pretty print output from calc_correlations()
    """Print the statistics dict produced by calc_correlations() as a small table.

    `c` must provide the keys 'mse', 'ktau', 'pval-ktau', 'pearsonr',
    'pval-pearsonr' and 'r2'. Nothing is returned; the report goes to stdout.
    """
    pad = "    "
    # Root of the mean squared error, shown in parentheses next to the MSE.
    root_mse = c['mse']**.5
    report_lines = [
        "",  # the report opens with an empty line
        f"{pad}Mean squared error (RMS): \t{c['mse']:.5f}\t({root_mse:.5})",
        f"{pad}Kendall Tau: \t\t{c['ktau']:.5}",
        f"{pad}\tKtau p-value: \t\t{c['pval-ktau']:.5}",
        f"{pad}Pearson's r: \t\t{c['pearsonr']:.5}",
        f"{pad}\tPearson's r p-value: \t{c['pval-pearsonr']:.5}",
        f"{pad}Coef. of determination \t{c['r2']:.5}",
        pad,  # trailing indentation-only line closes the report
    ]
    print("\n".join(report_lines))

# Random Forest

In [8]:
# Preview the first rows of the training table.
# NOTE(review): in this output 'class' holds numeric codes (e.g. 1.0, 4.0),
# while the earlier full display of training_data shows string labels
# ('gc', 'galaxy'). The cell performing that conversion is not visible in
# this notebook -- confirm it exists and runs before this point on a fresh kernel.
training_data.head()

Unnamed: 0,obj_id,class,i,g,di,dg,ra,dec,field,pdidx
0,HM33-A,1.0,22.424,22.940001,0.026,0.017,23.923733,28.821186,5,39800
1,C30,4.0,18.049,19.500999,0.001,0.002,12.105896,29.267633,11,118854
2,LAMOST-C22,4.0,17.628,19.153,0.001,0.001,11.738621,29.693506,11,24692
3,HM33-B,1.0,19.538,20.386,0.003,0.003,24.008787,29.963625,13,43246
4,LAMOST-C18,4.0,17.177999,18.388,0.001,0.001,23.842129,29.552473,14,122860


In [218]:
def ranfor(df,train_size=0.7,n_estimators=50,criterion='gini',features=('i','g'),random_state=None):
    """Train a random-forest classifier on `df` and return held-out predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named in `features` and a 'class' column.
    train_size : float or int
        Forwarded to `train_test_split`.
    n_estimators : int
        Number of trees, forwarded to `RandomForestClassifier`.
    criterion : str
        Split criterion, forwarded to `RandomForestClassifier`.
    features : iterable of str
        Names of the columns used as model inputs.
    random_state : int or None
        Seed forwarded to both the split and the forest. The default (None)
        keeps the previous unseeded behaviour; pass an int for reproducible runs.

    Returns
    -------
    (pred, true) : tuple of numpy arrays
        Predicted labels for the test split and the matching true labels.
    """
    # select features for training (list() so a tuple default is treated as
    # a list of column names rather than a single key)
    X = df[list(features)]
    y = df['class']
    # split the data BEFORE scaling so the test rows never influence the scaler
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state)
    # scale the data: statistics come from the training split only and are
    # then applied unchanged to the test split
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # train the classifier model
    ran_for_class = RandomForestClassifier(n_estimators=n_estimators,
                                    criterion=criterion,
                                    random_state=random_state
                                   ).fit(X_train_scaled,y_train)
    pred = ran_for_class.predict(X_test_scaled)
    true = y_test.to_numpy()
    return (pred,true)

In [225]:
# results is the (pred, true) tuple returned by ranfor: predicted and true
# labels for the test split. ranfor calls train_test_split without a seed,
# so the split -- and any score derived from it -- changes between runs.
results = ranfor(training_data)

In [226]:
# Accuracy on the test split: fraction of objects whose predicted class
# matches the true class.
pred, true = results
tot = len(true)
# Both arrays are numpy arrays of equal length, so an element-wise comparison
# replaces the index loop (which also left a stray global `i` behind).
correct_count = int(np.sum(pred == true))

print(correct_count/tot)

0.5979680696661829
