# Extracción de caracteristicas proyecto:
# Clasificación de supernovas de ALeRCE - ZTF
### Por Joaquin Zepeda y Benjamin Irarrazabal - Tutor: Pablo Montero


# Instalación de librerias ALeRCE

In [None]:
# pyarrow might be needed to read the data
!python -m pip install Cython
!python -m pip install -e git+https://git@github.com/alercebroker/turbo-fats#egg=turbofats
!python -m pip install -e git+https://git@github.com/alercebroker/mhps#egg=mhps
!python -m pip install -e git+https://git@github.com/alercebroker/P4J#egg=P4J
!python -m pip install pyarrow
!python -m pip install -e git+https://git@github.com/alercebroker/lc_classifier#egg=lc_classifier

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import time
inicio_ejecucion = time.time()
# Se imprime la version de Tensorflow
print('Tensorflow version', tf.__version__)

Tensorflow version 2.4.0


# Se importan los extractores de caracteristicas provistos por lc_classifier ALeRCE

In [4]:
from lc_classifier.features import MHPSExtractor, PeriodExtractor, GPDRWExtractor
from lc_classifier.features import FoldedKimExtractor
from lc_classifier.features import HarmonicsExtractor, IQRExtractor, SupernovaeDetectionFeatureExtractor
from lc_classifier.features import PowerRateExtractor
from lc_classifier.features import TurboFatsFeatureExtractor,SNParametricModelExtractor

from lc_classifier.features import FeatureExtractorComposer

In [16]:
'''
2.- Los extractores de características recomendados son:

SupernovaeDetectionFeatureExtractor(band_names),
SNParametricModelExtractor(band_names),
IQRExtractor(band_names),
MHPSExtractor(band_names),
TurboFatsFeatureExtractor(band_names),
GPDRWExtractor(band_names),
PowerRateExtractor(band_names),
PeriodExtractor(band_names),
HarmonicsExtractor(band_names),

'''

# fid = {1 2} Indica la banda a la que corresponde (green, red) 
# bands = ['u', 'g', 'r']
bands = [1,2]
feature_extractor = FeatureExtractorComposer(
    [
        SupernovaeDetectionFeatureExtractor(bands),
        PeriodExtractor(bands),
        MHPSExtractor(bands),
        GPDRWExtractor(bands),
        FoldedKimExtractor(bands),
        HarmonicsExtractor(bands),
        PowerRateExtractor(bands),
        TurboFatsFeatureExtractor(bands)
    ]
)


# Se leen los datos de las detecciones de supernovas

In [7]:
url_det='https://raw.githubusercontent.com/joaquinzepeda/Datos/main/datos%20supernovas/detections_SNe_v7.0.1.csv'
url_dfcross='https://raw.githubusercontent.com/joaquinzepeda/Datos/main/datos%20supernovas/dfcrossmatches_prioritized_v7.0.1.csv'
detections = pd.read_csv(url_det, index_col=None)
dfcrossmatches = pd.read_csv(url_dfcross, index_col=None)

# Eliminación de outliers

In [8]:
from scipy import stats

Conjunto = pd.merge(detections, dfcrossmatches, left_on='objectId', right_on='oid')
slice_cjto = Conjunto[['oid','fid', 'isdiffpos', 'sigmapsf','sigmapsf_corr', 'field',
       'fwhm', 'dec_x', 'magpsf', 'magpsf_corr', 'rcid', 'ra_x', 'sky', 'rb', 'ssmagnr',
       'distpsnr3', 'sgscore2', 'maggaiabright', 'distpsnr2', 'distpsnr1',
       'maggaia', 'exptime', 'drb', 'sgscore3', 'neargaia', 'sgscore1', 'mjd', 'has_stamp', 'ra_y', 'dec_y', 'classALeRCE']]
clean_data = slice_cjto[((np.abs(stats.zscore(slice_cjto['magpsf']))<3)&(slice_cjto['sigmapsf']<1))]

n = len(clean_data)
copy_cleandata = clean_data.copy()


In [10]:
# Se obtienen los indices de las columnas magpsf_corr, sigmapsf_corr, esto permitira que 
# aunque se cambien de posición estas columnas siga funcionando el código, siempre y cuando las versiones no
# corregidas se encuentren a la izquierda de la columna corregida

j=0
indice_magpsf_corr = 0
indice_sigmapsf_corr = 0
indice_fid = 0
for columna in copy_cleandata.columns:
    if columna=='magpsf_corr':
        print(columna)
        indice_magpsf_corr = j
    if columna=='sigmapsf_corr':
        print(columna)
        indice_sigmapsf_corr = j
    if columna=='fid':
        print(columna)
        indice_fid = j
    j+=1
indice_magpsf_corr,indice_sigmapsf_corr 

fid
sigmapsf_corr
magpsf_corr


(9, 4)

# Arreglo de los datos corregidos

In [11]:
for i in range(0,n):
  if pd.isna(clean_data.iloc[i,indice_magpsf_corr]) == True:
    copy_cleandata.iloc[i,indice_magpsf_corr] = copy_cleandata.iloc[i,indice_magpsf_corr-1]
  if pd.isna(clean_data.iloc[i,indice_sigmapsf_corr ]) == True:
    copy_cleandata.iloc[i,indice_sigmapsf_corr ] = copy_cleandata.iloc[i,indice_sigmapsf_corr-1]

In [12]:
copy_cleandata.head()

Unnamed: 0,oid,fid,isdiffpos,sigmapsf,sigmapsf_corr,field,fwhm,dec_x,magpsf,magpsf_corr,...,exptime,drb,sgscore3,neargaia,sgscore1,mjd,has_stamp,ra_y,dec_y,classALeRCE
2,ZTF18aahvndq,1,1.0,0.032161,0.032161,626,1.98,27.008647,16.072975,16.072975,...,30.0,0.999939,0.5,78.11131,0.175976,59004.192674,True,198.722653,27.008637,SNIa
3,ZTF18aahvndq,1,1.0,0.049931,0.049931,626,2.75,27.008624,17.708502,17.708502,...,30.0,0.999956,0.5,78.077545,0.175976,59021.227604,True,198.722653,27.008637,SNIa
6,ZTF18aahvndq,1,1.0,0.044748,0.044748,626,3.49,27.008622,16.626165,16.626165,...,30.0,0.99915,0.5,78.05823,0.175976,59009.258669,True,198.722653,27.008637,SNIa
8,ZTF18aahvndq,1,1.0,0.078094,0.078094,626,2.24,27.008641,18.192377,18.192377,...,30.0,0.999988,0.5,78.038506,0.175976,59036.19809,True,198.722653,27.008637,SNIa
9,ZTF18aahvndq,1,1.0,0.083104,0.083104,626,2.28,27.008652,18.481302,18.481302,...,30.0,0.999997,0.5,78.0757,0.175976,59045.23809,False,198.722653,27.008637,SNIa


In [13]:
# Ahora, tomaremos un ejemplo de cada supernova y graficaremos su curva de luz.
datos_final = copy_cleandata #Las muestras con los NaN reemplazados y sin outliers

In [14]:
try:
    datos_final = datos_final.set_index('oid')
except:
    print('oid is already the index')
datos_final.head()

Unnamed: 0_level_0,fid,isdiffpos,sigmapsf,sigmapsf_corr,field,fwhm,dec_x,magpsf,magpsf_corr,rcid,...,exptime,drb,sgscore3,neargaia,sgscore1,mjd,has_stamp,ra_y,dec_y,classALeRCE
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF18aahvndq,1,1.0,0.032161,0.032161,626,1.98,27.008647,16.072975,16.072975,34,...,30.0,0.999939,0.5,78.11131,0.175976,59004.192674,True,198.722653,27.008637,SNIa
ZTF18aahvndq,1,1.0,0.049931,0.049931,626,2.75,27.008624,17.708502,17.708502,34,...,30.0,0.999956,0.5,78.077545,0.175976,59021.227604,True,198.722653,27.008637,SNIa
ZTF18aahvndq,1,1.0,0.044748,0.044748,626,3.49,27.008622,16.626165,16.626165,34,...,30.0,0.99915,0.5,78.05823,0.175976,59009.258669,True,198.722653,27.008637,SNIa
ZTF18aahvndq,1,1.0,0.078094,0.078094,626,2.24,27.008641,18.192377,18.192377,34,...,30.0,0.999988,0.5,78.038506,0.175976,59036.19809,True,198.722653,27.008637,SNIa
ZTF18aahvndq,1,1.0,0.083104,0.083104,626,2.28,27.008652,18.481302,18.481302,34,...,30.0,0.999997,0.5,78.0757,0.175976,59045.23809,False,198.722653,27.008637,SNIa


# Extracción de caracteristicas
## Esta sección se demora entre 15 y 25 minutos, dependiendo de los recursos computacionales que se tengan disponibles.

In [17]:
import warnings
import time
warnings.filterwarnings("ignore")
inicio = time.time()

detections_df= datos_final.rename(columns={'magpsf':'magnitude','mjd':'time', 'fid':'band','sigmapsf':'error'})
features_ = feature_extractor.compute_features(detections_df)
fin = time.time()
features_

ERROR:root:TypeError exception in PeriodExtractor: oid ZTF19aawgopm
'NoneType' object is not iterable
ERROR:root:TypeError exception in PeriodExtractor: oid ZTF19acihlft
'NoneType' object is not iterable
ERROR:root:TypeError exception in PeriodExtractor: oid ZTF20aamuqwn
'NoneType' object is not iterable
ERROR:root:TypeError exception in PeriodExtractor: oid ZTF20aavpnlv
'NoneType' object is not iterable
ERROR:root:TypeError exception in PeriodExtractor: oid ZTF20aaynrrh
'NoneType' object is not iterable
ERROR:root:TypeError exception in PeriodExtractor: oid ZTF19aawgopm
'NoneType' object is not iterable
ERROR:root:TypeError exception in PeriodExtractor: oid ZTF19acihlft
'NoneType' object is not iterable
ERROR:root:TypeError exception in PeriodExtractor: oid ZTF20aamuqwn
'NoneType' object is not iterable
ERROR:root:TypeError exception in PeriodExtractor: oid ZTF20aavpnlv
'NoneType' object is not iterable
ERROR:root:TypeError exception in PeriodExtractor: oid ZTF20aaynrrh
'NoneType' obj

Unnamed: 0_level_0,delta_mag_fid_1,delta_mjd_fid_1,first_mag_1,mean_mag_1,min_mag_1,n_det_1,n_neg_1,n_pos_1,positive_fraction_1,delta_mag_fid_2,...,Skew_2,SmallKurtosis_2,Std_2,StetsonK_2,Pvar_2,ExcessVar_2,SF_ML_amplitude_2,SF_ML_gamma_2,IAR_phi_2,LinearTrend_2
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF17aadlxmv,2.244800,42.055613,19.879980,19.087656,17.947500,18.0,0.0,18.0,1.0,1.847730,...,0.767211,1.142780,0.466788,0.893531,1.000000,0.000594,1.874733,0.348850,0.838971,0.001989
ZTF18aaaqexr,0.896138,18.870671,19.133900,18.977915,18.447520,9.0,0.0,9.0,1.0,0.191736,...,,,,,,,,,,
ZTF18aacdbzx,0.495977,19.991725,19.741877,19.620142,19.432064,14.0,0.0,14.0,1.0,0.786995,...,1.301440,2.497958,0.212669,0.743787,0.999385,0.000068,6.512830,0.948480,0.924340,0.017806
ZTF18aadmssd,3.293953,59.951100,16.887047,18.581230,16.887047,16.0,0.0,16.0,1.0,2.170098,...,-0.066124,-0.621467,0.662524,0.917680,1.000000,0.001411,8.845398,0.926170,0.988949,0.027800
ZTF18aadzfso,1.371363,32.999224,19.840900,19.273643,18.476837,24.0,0.0,24.0,1.0,0.797367,...,-0.444814,-0.743503,0.238186,0.886096,1.000000,0.000121,0.144780,-0.214761,0.899290,0.008069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZTF20abgbxfm,2.981503,68.950197,18.893476,19.555296,18.296597,58.0,0.0,58.0,1.0,2.216758,...,0.100401,-1.245222,0.664534,0.839586,1.000000,0.001109,11.378320,0.987949,0.991831,0.034059
ZTF20abgdtmv,2.669654,33.020729,18.728144,20.046770,18.663546,26.0,0.0,26.0,1.0,1.719614,...,0.070929,-0.974009,0.528035,0.858666,1.000000,0.000697,7.053065,0.740276,0.979279,0.039661
ZTF20abgfekk,1.627867,31.928044,19.095242,17.742374,17.467375,32.0,0.0,32.0,1.0,1.534458,...,2.443501,6.694064,0.364999,0.679766,1.000000,0.000406,15.000000,1.116547,0.952584,-0.026970
ZTF20abgfljj,1.344170,21.932766,19.791739,19.019873,18.447569,26.0,0.0,26.0,1.0,2.227105,...,1.158201,1.331404,0.622109,0.826843,1.000000,0.000997,5.794355,0.578048,0.932772,-0.034009


In [19]:
# Falta utilizar el SNParametricModelExtractor(bands)
# 'time', 'magpsf', 'sigmapsf', 'band'

SNP_feature_extractor = SNParametricModelExtractor(bands)
inicio = time.time()

detections_df= datos_final.rename(columns={'mjd':'time', 'fid':'band'})
SNP_features = SNP_feature_extractor.compute_features(detections_df)
fin = time.time()
SNP_features

Unnamed: 0_level_0,SPM_A_1,SPM_t0_1,SPM_gamma_1,SPM_beta_1,SPM_tau_rise_1,SPM_tau_fall_1,SPM_chi_1,SPM_A_2,SPM_t0_2,SPM_gamma_2,SPM_beta_2,SPM_tau_rise_2,SPM_tau_fall_2,SPM_chi_2
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ZTF17aadlxmv,0.339168,5.039545,21.779965,0.642131,2.329822,11.688865,0.007554,0.311686,6.958040,21.231911,0.416776,3.677212,30.091547,0.594410
ZTF18aaaqexr,0.198096,0.504447,2.074468,0.112886,1.205330,17.585404,0.281999,0.292699,-8.394148,9.801738,0.447687,8.599661,32.414717,
ZTF18aacdbzx,0.183777,-7.254933,1.000737,0.333585,3.344742,24.821418,0.033287,0.114158,-5.114811,13.845734,0.063725,10.514372,20.820992,0.044529
ZTF18aadmssd,1.462568,-20.995956,28.830308,0.763144,2.939173,22.204705,0.010855,2.067952,-22.606529,39.255637,0.755130,17.087419,46.760685,1.111489
ZTF18aadzfso,0.211340,5.369283,5.263504,0.073814,3.554814,14.225585,0.028317,0.257985,4.309858,18.030784,0.606824,3.959666,85.046677,0.027387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZTF20abgbxfm,0.242148,1.301380,28.888657,0.813951,3.176701,24.208459,0.077253,0.287074,4.316309,14.243013,0.570416,4.845103,28.891700,0.040567
ZTF20abgdtmv,0.254616,4.286965,7.378327,0.633743,6.300816,10.058062,0.013802,0.245710,12.105234,2.379414,0.340936,6.101452,14.828348,0.039756
ZTF20abgfekk,0.448833,5.252311,18.219169,0.269451,3.415255,19.974884,0.045213,0.401462,5.696664,35.855792,0.521981,3.882842,30.064385,0.056440
ZTF20abgfljj,0.139119,-2.485712,20.374524,0.003264,21.405151,4.324015,1.353607,0.145629,4.926030,8.235378,0.014030,2.534057,19.834071,0.047055


In [29]:
features_ = features_.reset_index()
SNP_features = SNP_features.reset_index()

In [27]:
features = pd.merge(features_,SNP_features, left_on='oid', right_on='oid')
features = features.set_index('oid')
features 



Unnamed: 0,oid,index_x,delta_mag_fid_1,delta_mjd_fid_1,first_mag_1,mean_mag_1,min_mag_1,n_det_1,n_neg_1,n_pos_1,...,SPM_tau_rise_1_y,SPM_tau_fall_1_y,SPM_chi_1_y,SPM_A_2_y,SPM_t0_2_y,SPM_gamma_2_y,SPM_beta_2_y,SPM_tau_rise_2_y,SPM_tau_fall_2_y,SPM_chi_2_y
0,ZTF17aadlxmv,0,2.244800,42.055613,19.879980,19.087656,17.947500,18.0,0.0,18.0,...,2.329822,11.688865,0.007554,0.311686,6.958040,21.231911,0.416776,3.677212,30.091547,0.594410
1,ZTF18aaaqexr,1,0.896138,18.870671,19.133900,18.977915,18.447520,9.0,0.0,9.0,...,1.205330,17.585404,0.281999,0.292699,-8.394148,9.801738,0.447687,8.599661,32.414717,
2,ZTF18aacdbzx,2,0.495977,19.991725,19.741877,19.620142,19.432064,14.0,0.0,14.0,...,3.344742,24.821418,0.033287,0.114158,-5.114811,13.845734,0.063725,10.514372,20.820992,0.044529
3,ZTF18aadmssd,3,3.293953,59.951100,16.887047,18.581230,16.887047,16.0,0.0,16.0,...,2.939173,22.204705,0.010855,2.067952,-22.606529,39.255637,0.755130,17.087419,46.760685,1.111489
4,ZTF18aadzfso,4,1.371363,32.999224,19.840900,19.273643,18.476837,24.0,0.0,24.0,...,3.554814,14.225585,0.028317,0.257985,4.309858,18.030784,0.606824,3.959666,85.046677,0.027387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2063,ZTF20abgbxfm,2063,2.981503,68.950197,18.893476,19.555296,18.296597,58.0,0.0,58.0,...,3.176701,24.208459,0.077253,0.287074,4.316309,14.243013,0.570416,4.845103,28.891700,0.040567
2064,ZTF20abgdtmv,2064,2.669654,33.020729,18.728144,20.046770,18.663546,26.0,0.0,26.0,...,6.300816,10.058062,0.013802,0.245710,12.105234,2.379414,0.340936,6.101452,14.828348,0.039756
2065,ZTF20abgfekk,2065,1.627867,31.928044,19.095242,17.742374,17.467375,32.0,0.0,32.0,...,3.415255,19.974884,0.045213,0.401462,5.696664,35.855792,0.521981,3.882842,30.064385,0.056440
2066,ZTF20abgfljj,2066,1.344170,21.932766,19.791739,19.019873,18.447569,26.0,0.0,26.0,...,21.405151,4.324015,1.353607,0.145629,4.926030,8.235378,0.014030,2.534057,19.834071,0.047055


In [41]:
try:
    #features.drop(['oid'],axis=1)
    features = features.set_index('oid')
except:
    pass
features

Unnamed: 0_level_0,delta_mag_fid_1,delta_mjd_fid_1,first_mag_1,mean_mag_1,min_mag_1,n_det_1,n_neg_1,n_pos_1,positive_fraction_1,delta_mag_fid_2,...,SPM_tau_rise_1_y,SPM_tau_fall_1_y,SPM_chi_1_y,SPM_A_2_y,SPM_t0_2_y,SPM_gamma_2_y,SPM_beta_2_y,SPM_tau_rise_2_y,SPM_tau_fall_2_y,SPM_chi_2_y
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF17aadlxmv,2.244800,42.055613,19.879980,19.087656,17.947500,18.0,0.0,18.0,1.0,1.847730,...,2.329822,11.688865,0.007554,0.311686,6.958040,21.231911,0.416776,3.677212,30.091547,0.594410
ZTF18aaaqexr,0.896138,18.870671,19.133900,18.977915,18.447520,9.0,0.0,9.0,1.0,0.191736,...,1.205330,17.585404,0.281999,0.292699,-8.394148,9.801738,0.447687,8.599661,32.414717,
ZTF18aacdbzx,0.495977,19.991725,19.741877,19.620142,19.432064,14.0,0.0,14.0,1.0,0.786995,...,3.344742,24.821418,0.033287,0.114158,-5.114811,13.845734,0.063725,10.514372,20.820992,0.044529
ZTF18aadmssd,3.293953,59.951100,16.887047,18.581230,16.887047,16.0,0.0,16.0,1.0,2.170098,...,2.939173,22.204705,0.010855,2.067952,-22.606529,39.255637,0.755130,17.087419,46.760685,1.111489
ZTF18aadzfso,1.371363,32.999224,19.840900,19.273643,18.476837,24.0,0.0,24.0,1.0,0.797367,...,3.554814,14.225585,0.028317,0.257985,4.309858,18.030784,0.606824,3.959666,85.046677,0.027387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZTF20abgbxfm,2.981503,68.950197,18.893476,19.555296,18.296597,58.0,0.0,58.0,1.0,2.216758,...,3.176701,24.208459,0.077253,0.287074,4.316309,14.243013,0.570416,4.845103,28.891700,0.040567
ZTF20abgdtmv,2.669654,33.020729,18.728144,20.046770,18.663546,26.0,0.0,26.0,1.0,1.719614,...,6.300816,10.058062,0.013802,0.245710,12.105234,2.379414,0.340936,6.101452,14.828348,0.039756
ZTF20abgfekk,1.627867,31.928044,19.095242,17.742374,17.467375,32.0,0.0,32.0,1.0,1.534458,...,3.415255,19.974884,0.045213,0.401462,5.696664,35.855792,0.521981,3.882842,30.064385,0.056440
ZTF20abgfljj,1.344170,21.932766,19.791739,19.019873,18.447569,26.0,0.0,26.0,1.0,2.227105,...,21.405151,4.324015,1.353607,0.145629,4.926030,8.235378,0.014030,2.534057,19.834071,0.047055


In [40]:
#features=features.drop(['index_x'],axis=1)

In [32]:
tiempo_de_extraccion = (fin-inicio)/60
print(f'La extracción de caracteristicas se demoro {"{:.2f}".format(tiempo_de_extraccion)} minutos')

La extracción de caracteristicas se demoro 0.64 minutos


In [42]:
#Ejecutar una sola vez
ejecutado = False
if ejecutado == False:
    features = features.reset_index()
    ejecutado = True
else:
    print('ya fue reseteado el indice una vez, si quiere realizarlo nuevamente porfavor cambie el valor de ejecutado')
features

Unnamed: 0,oid,delta_mag_fid_1,delta_mjd_fid_1,first_mag_1,mean_mag_1,min_mag_1,n_det_1,n_neg_1,n_pos_1,positive_fraction_1,...,SPM_tau_rise_1_y,SPM_tau_fall_1_y,SPM_chi_1_y,SPM_A_2_y,SPM_t0_2_y,SPM_gamma_2_y,SPM_beta_2_y,SPM_tau_rise_2_y,SPM_tau_fall_2_y,SPM_chi_2_y
0,ZTF17aadlxmv,2.244800,42.055613,19.879980,19.087656,17.947500,18.0,0.0,18.0,1.0,...,2.329822,11.688865,0.007554,0.311686,6.958040,21.231911,0.416776,3.677212,30.091547,0.594410
1,ZTF18aaaqexr,0.896138,18.870671,19.133900,18.977915,18.447520,9.0,0.0,9.0,1.0,...,1.205330,17.585404,0.281999,0.292699,-8.394148,9.801738,0.447687,8.599661,32.414717,
2,ZTF18aacdbzx,0.495977,19.991725,19.741877,19.620142,19.432064,14.0,0.0,14.0,1.0,...,3.344742,24.821418,0.033287,0.114158,-5.114811,13.845734,0.063725,10.514372,20.820992,0.044529
3,ZTF18aadmssd,3.293953,59.951100,16.887047,18.581230,16.887047,16.0,0.0,16.0,1.0,...,2.939173,22.204705,0.010855,2.067952,-22.606529,39.255637,0.755130,17.087419,46.760685,1.111489
4,ZTF18aadzfso,1.371363,32.999224,19.840900,19.273643,18.476837,24.0,0.0,24.0,1.0,...,3.554814,14.225585,0.028317,0.257985,4.309858,18.030784,0.606824,3.959666,85.046677,0.027387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2063,ZTF20abgbxfm,2.981503,68.950197,18.893476,19.555296,18.296597,58.0,0.0,58.0,1.0,...,3.176701,24.208459,0.077253,0.287074,4.316309,14.243013,0.570416,4.845103,28.891700,0.040567
2064,ZTF20abgdtmv,2.669654,33.020729,18.728144,20.046770,18.663546,26.0,0.0,26.0,1.0,...,6.300816,10.058062,0.013802,0.245710,12.105234,2.379414,0.340936,6.101452,14.828348,0.039756
2065,ZTF20abgfekk,1.627867,31.928044,19.095242,17.742374,17.467375,32.0,0.0,32.0,1.0,...,3.415255,19.974884,0.045213,0.401462,5.696664,35.855792,0.521981,3.882842,30.064385,0.056440
2066,ZTF20abgfljj,1.344170,21.932766,19.791739,19.019873,18.447569,26.0,0.0,26.0,1.0,...,21.405151,4.324015,1.353607,0.145629,4.926030,8.235378,0.014030,2.534057,19.834071,0.047055


# Se eliminan las caracteristicas no recomendadas

In [43]:
"""
banned_features = [
   'W1', 'W2', 'W3', 'W4',
   'iqr_1','iqr_2',
   'last_mjd_before_fid_1',
   'last_mjd_before_fid_2',
   'g-r_ml',
   'MHAOV_Period_1', 'MHAOV_Period_2'
]
"""


banned_features1 = ['mean_mag_1',
   'mean_mag_2',
   'min_mag_1',
   'min_mag_2',
   'delta_mjd_fid_1',
   'delta_mjd_fid_2',
   'Mean_1',
   'Mean_2',
   'n_det_1',
   'n_det_2',
   'n_pos_1',
   'n_pos_2',
   'n_neg_1',
   'n_neg_2',
   'first_mag_1',
   'first_mag_2',
   'MHPS_non_zero_1',
   'MHPS_non_zero_2',
   'MHPS_PN_flag_1',
   'MHPS_PN_flag_2']

# además se eliminan estas caracteristicas pues la mayoria de sus celdas son Nan
features_clean = features.drop(['Eta_e_1','MaxSlope_1','Eta_e_2','MaxSlope_2'], axis=1)
features_clean = features_clean.drop(banned_features1, axis=1)
features_plus_target = pd.merge(features_clean, dfcrossmatches[['oid','classALeRCE']], left_on='oid', right_on='oid')
features_plus_target 

Unnamed: 0,oid,delta_mag_fid_1,positive_fraction_1,delta_mag_fid_2,positive_fraction_2,Multiband_period,PPE,Period_band_1,delta_period_1,Period_band_2,...,SPM_tau_fall_1_y,SPM_chi_1_y,SPM_A_2_y,SPM_t0_2_y,SPM_gamma_2_y,SPM_beta_2_y,SPM_tau_rise_2_y,SPM_tau_fall_2_y,SPM_chi_2_y,classALeRCE
0,ZTF17aadlxmv,2.244800,1.0,1.847730,1.0,51.630429,0.018797,0.200240,51.430189,0.069774,...,11.688865,0.007554,0.311686,6.958040,21.231911,0.416776,3.677212,30.091547,0.594410,SNIa
1,ZTF18aaaqexr,0.896138,1.0,0.191736,1.0,29.230770,0.000936,29.230770,0.000000,,...,17.585404,0.281999,0.292699,-8.394148,9.801738,0.447687,8.599661,32.414717,,SNIa
2,ZTF18aacdbzx,0.495977,1.0,0.786995,1.0,0.156563,0.004384,0.165508,0.008945,0.185710,...,24.821418,0.033287,0.114158,-5.114811,13.845734,0.063725,10.514372,20.820992,0.044529,SNIbc
3,ZTF18aadmssd,3.293953,1.0,2.170098,1.0,1.005344,0.056146,0.065053,0.940291,999.999953,...,22.204705,0.010855,2.067952,-22.606529,39.255637,0.755130,17.087419,46.760685,1.111489,SNIIn
4,ZTF18aadzfso,1.371363,1.0,0.797367,1.0,1.028361,0.034738,1.029866,0.001505,37.037036,...,14.225585,0.028317,0.257985,4.309858,18.030784,0.606824,3.959666,85.046677,0.027387,SNIa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2063,ZTF20abgbxfm,2.981503,1.0,2.216758,1.0,104.972373,0.126945,90.909084,14.063289,142.857138,...,24.208459,0.077253,0.287074,4.316309,14.243013,0.570416,4.845103,28.891700,0.040567,SNIa
2064,ZTF20abgdtmv,2.669654,1.0,1.719614,1.0,73.929961,0.074820,66.666664,7.263297,76.923075,...,10.058062,0.013802,0.245710,12.105234,2.379414,0.340936,6.101452,14.828348,0.039756,SNIa
2065,ZTF20abgfekk,1.627867,1.0,1.534458,1.0,1.016858,0.042308,999.999953,998.983094,999.999953,...,19.974884,0.045213,0.401462,5.696664,35.855792,0.521981,3.882842,30.064385,0.056440,SNIa
2066,ZTF20abgfljj,1.344170,1.0,2.227105,1.0,1.024148,0.016782,999.999953,998.975804,999.999953,...,4.324015,1.353607,0.145629,4.926030,8.235378,0.014030,2.534057,19.834071,0.047055,SNIa


# EL valor del thresh es la cota de valores que se necesitan no nulos para que la fila no sea eliminada

In [44]:
thresh = features_plus_target.shape[1]
thresh

165

# Mantenga solo las filas con al menos 106 valores que no sean NA.

In [45]:
features_plus_target_thresh = features_plus_target.dropna(thresh=thresh)
features_plus_target_thresh 

Unnamed: 0,oid,delta_mag_fid_1,positive_fraction_1,delta_mag_fid_2,positive_fraction_2,Multiband_period,PPE,Period_band_1,delta_period_1,Period_band_2,...,SPM_tau_fall_1_y,SPM_chi_1_y,SPM_A_2_y,SPM_t0_2_y,SPM_gamma_2_y,SPM_beta_2_y,SPM_tau_rise_2_y,SPM_tau_fall_2_y,SPM_chi_2_y,classALeRCE
0,ZTF17aadlxmv,2.244800,1.0,1.847730,1.0,51.630429,0.018797,0.200240,51.430189,0.069774,...,11.688865,0.007554,0.311686,6.958040,21.231911,0.416776,3.677212,30.091547,0.594410,SNIa
2,ZTF18aacdbzx,0.495977,1.0,0.786995,1.0,0.156563,0.004384,0.165508,0.008945,0.185710,...,24.821418,0.033287,0.114158,-5.114811,13.845734,0.063725,10.514372,20.820992,0.044529,SNIbc
3,ZTF18aadmssd,3.293953,1.0,2.170098,1.0,1.005344,0.056146,0.065053,0.940291,999.999953,...,22.204705,0.010855,2.067952,-22.606529,39.255637,0.755130,17.087419,46.760685,1.111489,SNIIn
4,ZTF18aadzfso,1.371363,1.0,0.797367,1.0,1.028361,0.034738,1.029866,0.001505,37.037036,...,14.225585,0.028317,0.257985,4.309858,18.030784,0.606824,3.959666,85.046677,0.027387,SNIa
5,ZTF18aaermez,0.728745,1.0,1.682007,1.0,0.117252,0.008665,0.159770,0.042518,0.183438,...,21.426885,0.123988,0.456477,-10.767997,44.214095,0.483826,25.675861,25.166046,17.122697,SNIa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2063,ZTF20abgbxfm,2.981503,1.0,2.216758,1.0,104.972373,0.126945,90.909084,14.063289,142.857138,...,24.208459,0.077253,0.287074,4.316309,14.243013,0.570416,4.845103,28.891700,0.040567,SNIa
2064,ZTF20abgdtmv,2.669654,1.0,1.719614,1.0,73.929961,0.074820,66.666664,7.263297,76.923075,...,10.058062,0.013802,0.245710,12.105234,2.379414,0.340936,6.101452,14.828348,0.039756,SNIa
2065,ZTF20abgfekk,1.627867,1.0,1.534458,1.0,1.016858,0.042308,999.999953,998.983094,999.999953,...,19.974884,0.045213,0.401462,5.696664,35.855792,0.521981,3.882842,30.064385,0.056440,SNIa
2066,ZTF20abgfljj,1.344170,1.0,2.227105,1.0,1.024148,0.016782,999.999953,998.975804,999.999953,...,4.324015,1.353607,0.145629,4.926030,8.235378,0.014030,2.534057,19.834071,0.047055,SNIa


In [46]:
features_plus_target = features_plus_target_thresh 

# Guardamos las caracteristicas sin balancear

In [47]:
features_plus_target = features_plus_target.set_index("oid")
features_plus_target.to_csv('features_clean_unbalanced.csv',index=False)

In [50]:
features_plus_target = features_plus_target.reset_index()