In [1]:
# import libraries

import pandas as pd
import numpy as np

In [2]:
# configure pandas display limits

# None removes the cap, so frames are rendered with every row and every column
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)

In [3]:
# specify columns, datatypes, and load full consolidated sdss dataset

# single source of truth for the dataset location (used for both reads below)
SDSS_FULL_PATH = '../data/processed/sdss_full.csv.gz'

# load column names from file into a list (nrows=0 parses only the header row)
# NOTE(review): index_col=0 is used here but not in the full load below, so the
# first CSV column is absent from sdss_headers -- confirm which one is intended
sdss_headers = pd.read_csv(SDSS_FULL_PATH, compression='gzip', header=0, index_col=0, nrows=0).columns.tolist()

# specify groups of columns
object_identifiers = ['dr7objid', 'dr8objid', 'specobjid', 'gz2_filenumber']
object_locations = ['ra', 'dec']
object_class_labels = ['sdss_clean_class_name', 'sdss_clean_subclass_name', 'gz2_subclass_name', 'elodie_spectral_type']
# the gz1 flags are listed explicitly AND match the 'flag' substring filter, so
# de-duplicate (dict.fromkeys keeps first-seen order) to avoid repeated column
# names when this list is used to select columns
object_binary_labels = list(dict.fromkeys(
    ['gz1_flag_spiral', 'gz1_flag_elliptical', 'gz1_flag_uncertain']
    + [label for label in sdss_headers if 'flag' in label]
))
object_filters = ['score', 'petroR90_r']
object_features = ['redshift_final', 'redshift_noqso', 'redshift_elodie',
                   'u_s', 'g_s', 'r_s', 'i_s', 'z_s',
                   'u_p', 'g_p', 'r_p', 'i_p', 'z_p',
                   'elodie_color_index', 'elodie_temperature', 'elodie_metallicity']

# create a datatype dictionary for object_identifiers
# (identifiers are read as strings rather than being parsed as numbers)
object_dtypes = {col: 'string' for col in object_identifiers}

# load full consolidated sdss dataset
sdss_full = pd.read_csv(SDSS_FULL_PATH, compression='gzip', header=0, dtype=object_dtypes)

  sdss_full = pd.read_csv('../data/processed/sdss_full.csv.gz', compression='gzip', header=0, dtype=object_dtypes)


In [4]:
# inspect the imported data

# display some data: first rows, last rows, and a random sample
display(sdss_full.head(3))
display(sdss_full.tail(3))
display(sdss_full.sample(10))

# inspect dataframe
print(sdss_full.shape)
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output
sdss_full.info()
# show the five most frequent values of each identifier column
for identifier in object_identifiers:
    print(f"\nTop Value Counts:  {sdss_full[identifier].value_counts().nlargest(5)}")

Unnamed: 0,dr7objid,dr8objid,specobjid,gz2_filenumber,ra,dec,sdss_clean_class_name,sdss_clean_subclass_name,gz2_subclass_name,elodie_spectral_type,gz1_flag_spiral,gz1_flag_elliptical,gz1_flag_uncertain,gz2_flag_smooth,gz2_flag_features_or_disk,gz2_flag_star_or_artifact,gz2_flag_edgeon_yes,gz2_flag_edgeon_no,gz2_flag_bar_yes,gz2_flag_bar_no,gz2_flag_spiral_yes,gz2_flag_spiral_no,gz2_flag_bulge_none1,gz2_flag_bulge_small,gz2_flag_bulge_medium,gz2_flag_bulge_large,gz2_flag_odd_yes,gz2_flag_odd_no,gz2_flag_round_circular,gz2_flag_round_medium,gz2_flag_round_cigar,gz2_flag_feature_ring,gz2_flag_feature_lens_or_arc,gz2_flag_feature_disturbed,gz2_flag_feature_irregular,gz2_flag_feature_other,gz2_flag_feature_merger,gz2_flag_feature_dust_lane,gz2_flag_bulge_round,gz2_flag_bulge_boxy,gz2_flag_bulge_none2,gz2_flag_arms_tight,gz2_flag_arms_medium,gz2_flag_arms_loose,gz2_flag_arms_1,gz2_flag_arms_2,gz2_flag_arms_3,gz2_flag_arms_4,gz2_flag_arms_many,gz2_flag_arms_unsure,score,petroR90_r,redshift_final,redshift_noqso,redshift_elodie,u_s,g_s,r_s,i_s,z_s,u_p,g_p,r_p,i_p,z_p,elodie_color_index,elodie_temperature,elodie_metallicity
0,588007006336254064,1237651252584448100,693716010653476864,192933.0,236.35553,54.820911,galaxy,STARFORMING,SBc2l,unclassified,True,False,False,False,True,False,False,True,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,0.850861,10.06485,0.080127,0.0,0.0,6.013997,19.66714,43.23034,61.21994,73.57887,18.91253,17.53665,16.81467,16.43992,16.18453,,,
1,588007006336254083,1237651252584448112,693718759432546304,,236.342,54.80283,galaxy,unclassified,unclassified,unclassified,False,False,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.850861,6.995347,0.193371,0.0,0.0,4.234163,11.95987,31.28475,46.43085,56.40972,19.66911,18.16584,17.27259,16.84296,16.5585,,,
2,587729226614112406,1237651252584251555,693718209676732416,32264.0,235.891464,55.133472,galaxy,AGN,SBc?t,unclassified,True,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.86336,10.95925,0.040399,0.0,0.0,18.20395,60.73151,129.9764,179.2148,215.1861,17.46699,15.83618,15.09113,14.70903,14.41906,,,


Unnamed: 0,dr7objid,dr8objid,specobjid,gz2_filenumber,ra,dec,sdss_clean_class_name,sdss_clean_subclass_name,gz2_subclass_name,elodie_spectral_type,gz1_flag_spiral,gz1_flag_elliptical,gz1_flag_uncertain,gz2_flag_smooth,gz2_flag_features_or_disk,gz2_flag_star_or_artifact,gz2_flag_edgeon_yes,gz2_flag_edgeon_no,gz2_flag_bar_yes,gz2_flag_bar_no,gz2_flag_spiral_yes,gz2_flag_spiral_no,gz2_flag_bulge_none1,gz2_flag_bulge_small,gz2_flag_bulge_medium,gz2_flag_bulge_large,gz2_flag_odd_yes,gz2_flag_odd_no,gz2_flag_round_circular,gz2_flag_round_medium,gz2_flag_round_cigar,gz2_flag_feature_ring,gz2_flag_feature_lens_or_arc,gz2_flag_feature_disturbed,gz2_flag_feature_irregular,gz2_flag_feature_other,gz2_flag_feature_merger,gz2_flag_feature_dust_lane,gz2_flag_bulge_round,gz2_flag_bulge_boxy,gz2_flag_bulge_none2,gz2_flag_arms_tight,gz2_flag_arms_medium,gz2_flag_arms_loose,gz2_flag_arms_1,gz2_flag_arms_2,gz2_flag_arms_3,gz2_flag_arms_4,gz2_flag_arms_many,gz2_flag_arms_unsure,score,petroR90_r,redshift_final,redshift_noqso,redshift_elodie,u_s,g_s,r_s,i_s,z_s,u_p,g_p,r_p,i_p,z_p,elodie_color_index,elodie_temperature,elodie_metallicity
1654165,,1237658613593997533,9259471051523641344,,177.50007,50.211641,quasar,unclassified,unclassified,unclassified,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.807279,,1.076416,0.231599,0.0,0.765539,1.052253,1.874948,1.917074,2.357079,21.84895,21.91097,21.59686,21.60826,21.09701,0.0,0.0,0.0
1654166,,1237658613594063054,9259459231773642752,,177.79391,50.208232,unclassified,unclassified,unclassified,unclassified,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.819832,,2.060357,-0.004214,0.0,7.819283,5.829438,6.240019,6.629135,8.703038,20.83662,20.683,20.50494,20.39749,20.39611,0.0,0.0,0.0
1654167,,1237658613594063058,9259460331285270528,,177.79678,50.375821,quasar,BROADLINE,unclassified,unclassified,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.819832,,1.856644,0.123349,0.0,0.191132,1.780057,1.906872,2.832188,3.034446,22.0247,21.56175,21.59132,21.17899,21.09313,0.0,0.0,0.0


Unnamed: 0,dr7objid,dr8objid,specobjid,gz2_filenumber,ra,dec,sdss_clean_class_name,sdss_clean_subclass_name,gz2_subclass_name,elodie_spectral_type,gz1_flag_spiral,gz1_flag_elliptical,gz1_flag_uncertain,gz2_flag_smooth,gz2_flag_features_or_disk,gz2_flag_star_or_artifact,gz2_flag_edgeon_yes,gz2_flag_edgeon_no,gz2_flag_bar_yes,gz2_flag_bar_no,gz2_flag_spiral_yes,gz2_flag_spiral_no,gz2_flag_bulge_none1,gz2_flag_bulge_small,gz2_flag_bulge_medium,gz2_flag_bulge_large,gz2_flag_odd_yes,gz2_flag_odd_no,gz2_flag_round_circular,gz2_flag_round_medium,gz2_flag_round_cigar,gz2_flag_feature_ring,gz2_flag_feature_lens_or_arc,gz2_flag_feature_disturbed,gz2_flag_feature_irregular,gz2_flag_feature_other,gz2_flag_feature_merger,gz2_flag_feature_dust_lane,gz2_flag_bulge_round,gz2_flag_bulge_boxy,gz2_flag_bulge_none2,gz2_flag_arms_tight,gz2_flag_arms_medium,gz2_flag_arms_loose,gz2_flag_arms_1,gz2_flag_arms_2,gz2_flag_arms_3,gz2_flag_arms_4,gz2_flag_arms_many,gz2_flag_arms_unsure,score,petroR90_r,redshift_final,redshift_noqso,redshift_elodie,u_s,g_s,r_s,i_s,z_s,u_p,g_p,r_p,i_p,z_p,elodie_color_index,elodie_temperature,elodie_metallicity
47572,5.880070057678605e+17,1237651252016054629,486564438346852352,,116.907,40.13042,galaxy,STARFORMING,unclassified,unclassified,False,False,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.854485,5.041833,0.097347,0.0,0.0,7.37118,18.23594,37.00751,59.78032,75.78127,19.67094,18.10534,17.35745,16.93291,16.63924,,,
507687,5.877420138135553e+17,1237667735038460021,2801414135059867648,163652.0,167.08812,21.413565,galaxy,unclassified,Ec,unclassified,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,0.888292,4.280978,0.078183,0.0,0.0,11.13113,43.32649,102.2366,147.3402,185.9559,19.51984,17.6609,16.77121,16.36727,16.02495,,,
1147807,,1237666185108324599,8738295954514661376,,353.87663,31.109044,quasar,BROADLINE,unclassified,unclassified,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.938691,,1.306339,0.414791,0.0,13.76163,10.56174,16.60096,17.6696,19.20623,20.19659,20.32275,19.82977,19.82003,19.82268,0.0,0.0,0.0
166765,5.877318916478732e+17,1237657771787419898,1021311926719768576,49189.0,163.214966,56.403904,galaxy,unclassified,Sb(r),unclassified,False,False,True,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,0.890322,5.20126,0.075332,0.0,0.0,13.50597,40.36794,96.39004,143.7275,188.4565,19.60236,17.6347,16.71468,16.27759,15.94906,,,
5753,5.877293882114049e+17,1237654380900450464,859146878569179136,,129.523,45.71931,galaxy,unclassified,unclassified,unclassified,False,False,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.814789,6.505565,0.144733,0.0,0.0,3.748132,16.30452,48.50946,73.13473,99.2142,20.43709,18.33979,17.19519,16.73663,16.36428,,,
10576,5.877255903853284e+17,1237656529994056042,403212659761113088,,263.7379,56.78886,unclassified,unclassified,unclassified,unclassified,True,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.860084,5.091026,0.057891,0.0,0.0,11.63709,27.71627,52.07922,76.0608,92.43719,19.50139,17.66855,17.06127,16.67248,16.38801,,,
116505,5.877307728172812e+17,1237656494042185933,847868912891619328,,5.339042,13.90064,galaxy,unclassified,unclassified,unclassified,False,False,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.885892,4.15155,0.08374,0.0,0.0,3.526451,15.53632,40.61706,64.73015,88.63673,20.49458,18.66137,17.6406,17.11238,16.73773,,,
141509,5.877241972118653e+17,1237649918436769985,481885466598074368,,33.46212,12.13547,galaxy,unclassified,unclassified,unclassified,False,True,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.838314,11.14471,0.207283,0.0,0.0,1.040788,8.294634,36.8667,68.79487,105.1652,20.93645,18.4796,16.97113,16.36002,15.95005,,,
1280425,,1237666273663647999,8585224676690907136,,17.611049,23.280371,quasar,BROADLINE,unclassified,unclassified,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.926004,,1.30176,1.94459,0.0,18.9924,15.77405,21.66826,21.93114,20.47312,19.6192,19.64216,19.23118,19.17639,19.14392,0.0,0.0,0.0
1554643,,1237655129845662739,5626380590872023040,,258.34731,31.822081,star,M4.5:III (123657),unclassified,G2V,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.904426,,0.000159,0.000159,0.0,0.803294,2.441131,7.589796,15.56832,25.37063,22.31522,20.68083,19.47939,18.71225,18.23027,0.539,5373.0,-1.08


(1654168, 68)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1654168 entries, 0 to 1654167
Data columns (total 68 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   dr7objid                      658334 non-null   string 
 1   dr8objid                      1654168 non-null  string 
 2   specobjid                     1654168 non-null  string 
 3   gz2_filenumber                236466 non-null   string 
 4   ra                            1654168 non-null  float64
 5   dec                           1654168 non-null  float64
 6   sdss_clean_class_name         1654168 non-null  object 
 7   sdss_clean_subclass_name      1654168 non-null  object 
 8   gz2_subclass_name             1654168 non-null  object 
 9   elodie_spectral_type          1654168 non-null  object 
 10  gz1_flag_spiral               658334 non-null   object 
 11  gz1_flag_elliptical           658334 non-null   object 
 12  gz1_flag_uncer

In [12]:
# investigate classes (categorical labels)

# create a helper function for investigating a column w/ categorical data
def investigate_classes(df, column):
    """Print a short summary of one categorical column of a DataFrame.

    The summary consists of a header with the column name, the number of
    missing values, the number of distinct labels (a missing value counts as
    one label), and the percentage share of the ten most frequent labels
    relative to the total number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column to summarize.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    values = df[column]
    # use the `column` argument for the header; the previous version read a
    # global named `label` and failed unless the caller happened to define it
    print(f"\n----------{column}----------")
    print(f"\tThe number of rows with empty values is: {values.isna().sum()}")
    # dropna=False keeps NaN as a label, matching len(values.unique())
    print(f"\tThe number of unique labels is: {values.nunique(dropna=False)}")
    print(f"\tThe % of the most common labels for column: {100 * values.value_counts().head(10) / len(values)}")

# print the categorical summary for every class-label column
# (the loop variable keeps the name `label`, which the helper has relied on as a global)
for label in object_class_labels:
    investigate_classes(df=sdss_full, column=label)




----------sdss_clean_class_name----------
	The number of rows with empty values is: 0
	The number of unique labels is: 4
	The % of the most common labels for column: sdss_clean_class_name
galaxy          34.326380
unclassified    28.910304
star            24.431013
quasar          12.332302
Name: count, dtype: float64

----------sdss_clean_subclass_name----------
	The number of rows with empty values is: 0
	The number of unique labels is: 171
	The % of the most common labels for column: sdss_clean_subclass_name
unclassified      54.664399
STARFORMING        9.275600
BROADLINE          8.997816
F9                 3.774949
F5                 3.246889
A0                 1.914497
F3/F5V (30743)     1.720744
STARBURST          1.514417
G2                 0.983999
K3                 0.927354
Name: count, dtype: float64

----------gz2_subclass_name----------
	The number of rows with empty values is: 0
	The number of unique labels is: 815
	The % of the most common labels for column: gz2_subcl