In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
from astropy.io import ascii
from utils import col_names, normalise_sdss_class, ellipticity, filling_factor
import matplotlib.pyplot as plt
%matplotlib inline


In [14]:
relevant_indices = [6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]

datasets = "../datasets/SuperCOSMOS/"
colnames_relevant=[col_names[i] for i in relevant_indices]
uki823_df = ascii.read(datasets + "UKI823/sssedrpair.dat", guess=False, Reader=ascii.FastNoHeader).to_pandas()
data=uki823_df.iloc[:, relevant_indices]
data = pd.DataFrame(data.values, columns = colnames_relevant)   
data['Ellipticity']=  ellipticity(uki823_df.iloc[:,15],uki823_df.iloc[:,16])
data['Filling Factor']= filling_factor(data['AREA'], uki823_df.iloc[:,12],uki823_df.iloc[:,13])
data=normalise_sdss_class(data)

In [15]:
#Seperate data into variables and classification
data_x=data.loc[:,data.columns!='CLASS_SDSS']
data_y=data['CLASS_SDSS']

In [16]:
#Split dataset into train, val, test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss

X_train,X_test,y_train,y_test = train_test_split(data_x,data_y,test_size=0.1,random_state=1)
X_train,X_val,y_train,y_val = train_test_split(X_train,y_train,test_size=2./9,random_state=1)

sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_val_sc = sc.transform(X_val)
X_test_sc = sc.transform(X_test)

random_state = 1

In [17]:
# data.info()


In [18]:
print("There are {} entries and {} columns in the uki823 DataFramewhen we select the relevant columns"\
      .format(data.shape[0], data.shape[1]))


There are 15645 entries and 25 columns in the uki823 DataFramewhen we select the relevant columns


In [19]:
data.describe()

Unnamed: 0,AREA,IPEAK,COSMAG,ISKY,A_U,B_U,THETA_U,A_I,B_I,THETA_I,...,C_COSMAG,C_PRFMAG,RA_SDSS,DEC_SDSS,GMAG_SDSS,RMAG_SDSS,IMAG_SDSS,CLASS_SDSS,Ellipticity,Filling Factor
count,15645.0,15645.0,15645.0,15645.0,15645.0,15645.0,15645.0,15645.0,15645.0,15645.0,...,15645.0,15645.0,15645.0,15645.0,15645.0,15645.0,15645.0,15645.0,15645.0,15645.0
mean,92.001151,13193640.0,-20152.272867,18080980.0,5423.954618,4149.977245,84.416874,4291.177181,3267.532886,81.601214,...,17.481817,17.367207,174.007529,0.048635,19.528314,18.495397,17.893314,1.628444,0.25424,8.414883e-07
std,314.990581,18171660.0,1971.753396,444152.0,3550.485593,3341.520804,57.212674,1935.854217,1881.680318,55.967697,...,1.285655,1.894035,179.404607,0.724706,2.194382,1.931732,1.773068,0.483236,0.182793,1.267339e-07
min,8.0,1297920.0,-28893.0,17099830.0,1633.0,538.0,0.0,1551.0,524.0,0.0,...,11.154,4.781,0.000165,-1.243447,8.98526,6.618781,6.940934,1.0,0.001402,6.56284e-08
25%,17.0,2348600.0,-21393.0,17698420.0,3434.0,2121.0,35.0,3235.0,2035.0,30.0,...,16.776,16.536,0.502112,-0.569497,18.465448,17.573668,17.055212,1.0,0.094976,7.727523e-07
50%,37.0,3912512.0,-19703.0,18029930.0,4537.0,3286.0,79.0,4006.0,3000.0,73.0,...,17.858,18.019,0.96911,0.075373,19.967171,18.960104,18.355196,2.0,0.214364,8.778205e-07
75%,92.0,14174210.0,-18561.0,18459080.0,6325.0,5203.0,135.0,4777.0,4040.0,134.0,...,18.471,18.748,359.492904,0.682301,21.087143,19.793398,19.087181,2.0,0.378731,9.381813e-07
max,24006.0,121054900.0,-17410.0,18995850.0,101032.0,90696.0,999.0,59866.0,55313.0,179.0,...,19.292,19.399,359.999933,1.273383,22.999374,24.80249,27.166468,2.0,0.889707,1.138855e-06


In [21]:
from scipy.stats import skew, kurtosis
print('Skewness:\n{}'.format(skew(X_test)[:-1])) # Get rid of label column 
print('Kurtosis:\n{}'.format(kurtosis(X_test)[:-1])) # Get rid of label column

Skewness:
[12.04458771  1.72759049 -0.80813136  0.19190038  4.35562322  4.0609034
  0.18955967  4.9320142   3.06486617  0.28905228  0.67206806 12.55641504
  4.47334514  1.84671865 -1.63862394 -1.05465903 -1.60588994  0.06264994
 -0.11194782 -1.07431057 -1.05309208 -1.04999504  0.68272406]
Kurtosis:
[ 2.12749116e+02  1.75741610e+00 -1.21567889e-01 -1.02374078e+00
  3.55802978e+01  3.49679494e+01 -1.29799969e+00  4.64992426e+01
  2.28726436e+01 -1.31425011e+00  8.10748256e-01  3.86577688e+02
  1.80301079e+01  9.87093110e+00  3.12913311e+00  4.30543369e-01
  2.93198973e+00 -1.99606461e+00 -1.16813850e+00  1.01090075e+00
  1.11874500e+00  1.73216842e+00 -3.93280064e-01]
